change prepare worker to use fork instead of threads (#1685)

Co-authored-by: Marcin S <marcin@realemail.net>
This commit is contained in:
jserrat
2023-11-14 14:50:18 -03:00
committed by GitHub
parent 3a87390b30
commit 54f84285bf
24 changed files with 1468 additions and 534 deletions
@@ -9,6 +9,9 @@ license.workspace = true
[dependencies]
cpu-time = "1.0.0"
gum = { package = "tracing-gum", path = "../../../gum" }
os_pipe = "1.1.4"
nix = { version = "0.27.1", features = ["resource", "process"]}
libc = "0.2.139"
parity-scale-codec = { version = "3.6.1", default-features = false, features = ["derive"] }
+313 -86
View File
@@ -25,23 +25,33 @@ pub use polkadot_node_core_pvf_common::{
const LOG_TARGET: &str = "parachain::pvf-execute-worker";
use cpu_time::ProcessTime;
use nix::{
errno::Errno,
sys::{
resource::{Usage, UsageWho},
wait::WaitStatus,
},
unistd::{ForkResult, Pid},
};
use os_pipe::{self, PipeReader, PipeWriter};
use parity_scale_codec::{Decode, Encode};
use polkadot_node_core_pvf_common::{
error::InternalValidationError,
execute::{Handshake, Response},
execute::{Handshake, JobError, JobResponse, JobResult, WorkerResponse},
framed_recv_blocking, framed_send_blocking,
worker::{
cpu_time_monitor_loop, stringify_panic_payload,
cpu_time_monitor_loop, run_worker, stringify_panic_payload,
thread::{self, WaitOutcome},
worker_event_loop, WorkerKind,
WorkerKind,
},
};
use polkadot_parachain_primitives::primitives::ValidationResult;
use polkadot_primitives::{executor_params::DEFAULT_NATIVE_STACK_MAX, ExecutorParams};
use std::{
io,
io::{self, Read},
os::unix::net::UnixStream,
path::PathBuf,
process,
sync::{mpsc::channel, Arc},
time::Duration,
};
@@ -105,7 +115,7 @@ fn recv_request(stream: &mut UnixStream) -> io::Result<(Vec<u8>, Duration)> {
Ok((params, execution_timeout))
}
fn send_response(stream: &mut UnixStream, response: Response) -> io::Result<()> {
fn send_response(stream: &mut UnixStream, response: WorkerResponse) -> io::Result<()> {
framed_send_blocking(stream, &response.encode())
}
@@ -131,7 +141,7 @@ pub fn worker_entrypoint(
worker_version: Option<&str>,
security_status: SecurityStatus,
) {
worker_event_loop(
run_worker(
WorkerKind::Execute,
socket_path,
worker_dir_path,
@@ -139,7 +149,7 @@ pub fn worker_entrypoint(
worker_version,
&security_status,
|mut stream, worker_dir_path| {
let worker_pid = std::process::id();
let worker_pid = process::id();
let artifact_path = worker_dir::execute_artifact(&worker_dir_path);
let Handshake { executor_params } = recv_handshake(&mut stream)?;
@@ -157,7 +167,7 @@ pub fn worker_entrypoint(
let compiled_artifact_blob = match std::fs::read(&artifact_path) {
Ok(bytes) => bytes,
Err(err) => {
let response = Response::InternalError(
let response = WorkerResponse::InternalError(
InternalValidationError::CouldNotOpenFile(err.to_string()),
);
send_response(&mut stream, response)?;
@@ -165,82 +175,51 @@ pub fn worker_entrypoint(
},
};
// Conditional variable to notify us when a thread is done.
let condvar = thread::get_condvar();
let (pipe_reader, pipe_writer) = os_pipe::pipe()?;
let cpu_time_start = ProcessTime::now();
let usage_before = match nix::sys::resource::getrusage(UsageWho::RUSAGE_CHILDREN) {
Ok(usage) => usage,
Err(errno) => {
let response = internal_error_from_errno("getrusage before", errno);
send_response(&mut stream, response)?;
continue
},
};
// Spawn a new thread that runs the CPU time monitor.
let (cpu_time_monitor_tx, cpu_time_monitor_rx) = channel::<()>();
let cpu_time_monitor_thread = thread::spawn_worker_thread(
"cpu time monitor thread",
move || {
cpu_time_monitor_loop(
cpu_time_start,
// SAFETY: new process is spawned within a single threaded process. This invariant
// is enforced by tests.
let response = match unsafe { nix::unistd::fork() } {
Err(errno) => internal_error_from_errno("fork", errno),
Ok(ForkResult::Child) => {
// Dropping the stream closes the underlying socket. We want to make sure
// that the sandboxed child can't get any kind of information from the
// outside world. The only IPC it should be able to do is sending its
// response over the pipe.
drop(stream);
// Drop the read end so we don't have too many FDs open.
drop(pipe_reader);
handle_child_process(
pipe_writer,
compiled_artifact_blob,
executor_params,
params,
execution_timeout,
cpu_time_monitor_rx,
)
},
Arc::clone(&condvar),
WaitOutcome::TimedOut,
)?;
Ok(ForkResult::Parent { child }) => {
// the read end will wait until all write ends have been closed,
// this drop is necessary to avoid deadlock
drop(pipe_writer);
let executor_params_2 = executor_params.clone();
let execute_thread = thread::spawn_worker_thread_with_stack_size(
"execute thread",
move || {
validate_using_artifact(
&compiled_artifact_blob,
&executor_params_2,
&params,
cpu_time_start,
)
handle_parent_process(
pipe_reader,
child,
worker_pid,
usage_before,
execution_timeout,
)?
},
Arc::clone(&condvar),
WaitOutcome::Finished,
EXECUTE_THREAD_STACK_SIZE,
)?;
let outcome = thread::wait_for_threads(condvar);
let response = match outcome {
WaitOutcome::Finished => {
let _ = cpu_time_monitor_tx.send(());
execute_thread
.join()
.unwrap_or_else(|e| Response::Panic(stringify_panic_payload(e)))
},
// If the CPU thread is not selected, we signal it to end, the join handle is
// dropped and the thread will finish in the background.
WaitOutcome::TimedOut => {
match cpu_time_monitor_thread.join() {
Ok(Some(cpu_time_elapsed)) => {
// Log if we exceed the timeout and the other thread hasn't
// finished.
gum::warn!(
target: LOG_TARGET,
%worker_pid,
"execute job took {}ms cpu time, exceeded execute timeout {}ms",
cpu_time_elapsed.as_millis(),
execution_timeout.as_millis(),
);
Response::TimedOut
},
Ok(None) => Response::InternalError(
InternalValidationError::CpuTimeMonitorThread(
"error communicating over finished channel".into(),
),
),
Err(e) => Response::InternalError(
InternalValidationError::CpuTimeMonitorThread(
stringify_panic_payload(e),
),
),
}
},
WaitOutcome::Pending => unreachable!(
"we run wait_while until the outcome is no longer pending; qed"
),
};
gum::trace!(
@@ -259,27 +238,275 @@ fn validate_using_artifact(
compiled_artifact_blob: &[u8],
executor_params: &ExecutorParams,
params: &[u8],
cpu_time_start: ProcessTime,
) -> Response {
) -> JobResponse {
let descriptor_bytes = match unsafe {
// SAFETY: this should be safe since the compiled artifact passed here comes from the
// file created by the prepare workers. These files are obtained by calling
// [`executor_intf::prepare`].
execute_artifact(compiled_artifact_blob, executor_params, params)
} {
Err(err) => return Response::format_invalid("execute", &err),
Err(err) => return JobResponse::format_invalid("execute", &err),
Ok(d) => d,
};
let result_descriptor = match ValidationResult::decode(&mut &descriptor_bytes[..]) {
Err(err) =>
return Response::format_invalid("validation result decoding failed", &err.to_string()),
return JobResponse::format_invalid(
"validation result decoding failed",
&err.to_string(),
),
Ok(r) => r,
};
// Include the decoding in the measured time, to prevent any potential attacks exploiting some
// bug in decoding.
let duration = cpu_time_start.elapsed();
Response::Ok { result_descriptor, duration }
JobResponse::Ok { result_descriptor }
}
/// This is used to handle child process during pvf execute worker.
/// It execute the artifact and pipes back the response to the parent process
///
/// # Arguments
///
/// - `pipe_write`: A `PipeWriter` structure, the writing end of a pipe.
///
/// - `compiled_artifact_blob`: The artifact bytes from compiled by the prepare worker`.
///
/// - `executor_params`: Deterministically serialized execution environment semantics.
///
/// - `params`: Validation parameters.
///
/// - `execution_timeout`: The timeout in `Duration`.
///
/// # Returns
///
/// - pipe back `JobResponse` to the parent process.
fn handle_child_process(
mut pipe_write: PipeWriter,
compiled_artifact_blob: Vec<u8>,
executor_params: ExecutorParams,
params: Vec<u8>,
execution_timeout: Duration,
) -> ! {
gum::debug!(
target: LOG_TARGET,
worker_job_pid = %process::id(),
"worker job: executing artifact",
);
// Conditional variable to notify us when a thread is done.
let condvar = thread::get_condvar();
let cpu_time_start = ProcessTime::now();
// Spawn a new thread that runs the CPU time monitor.
let (cpu_time_monitor_tx, cpu_time_monitor_rx) = channel::<()>();
let cpu_time_monitor_thread = thread::spawn_worker_thread(
"cpu time monitor thread",
move || cpu_time_monitor_loop(cpu_time_start, execution_timeout, cpu_time_monitor_rx),
Arc::clone(&condvar),
WaitOutcome::TimedOut,
)
.unwrap_or_else(|err| {
send_child_response(&mut pipe_write, Err(JobError::CouldNotSpawnThread(err.to_string())))
});
let executor_params_2 = executor_params.clone();
let execute_thread = thread::spawn_worker_thread_with_stack_size(
"execute thread",
move || validate_using_artifact(&compiled_artifact_blob, &executor_params_2, &params),
Arc::clone(&condvar),
WaitOutcome::Finished,
EXECUTE_THREAD_STACK_SIZE,
)
.unwrap_or_else(|err| {
send_child_response(&mut pipe_write, Err(JobError::CouldNotSpawnThread(err.to_string())))
});
let outcome = thread::wait_for_threads(condvar);
let response = match outcome {
WaitOutcome::Finished => {
let _ = cpu_time_monitor_tx.send(());
execute_thread.join().map_err(|e| JobError::Panic(stringify_panic_payload(e)))
},
// If the CPU thread is not selected, we signal it to end, the join handle is
// dropped and the thread will finish in the background.
WaitOutcome::TimedOut => match cpu_time_monitor_thread.join() {
Ok(Some(_cpu_time_elapsed)) => Err(JobError::TimedOut),
Ok(None) => Err(JobError::CpuTimeMonitorThread(
"error communicating over finished channel".into(),
)),
Err(e) => Err(JobError::CpuTimeMonitorThread(stringify_panic_payload(e))),
},
WaitOutcome::Pending =>
unreachable!("we run wait_while until the outcome is no longer pending; qed"),
};
send_child_response(&mut pipe_write, response);
}
/// Waits for child process to finish and handle child response from pipe.
///
/// # Arguments
///
/// - `pipe_read`: A `PipeReader` used to read data from the child process.
///
/// - `child`: The child pid.
///
/// - `usage_before`: Resource usage statistics before executing the child process.
///
/// - `timeout`: The maximum allowed time for the child process to finish, in `Duration`.
///
/// # Returns
///
/// - The response, either `Ok` or some error state.
fn handle_parent_process(
mut pipe_read: PipeReader,
child: Pid,
worker_pid: u32,
usage_before: Usage,
timeout: Duration,
) -> io::Result<WorkerResponse> {
// Read from the child. Don't decode unless the process exited normally, which we check later.
let mut received_data = Vec::new();
pipe_read
.read_to_end(&mut received_data)
// Could not decode job response. There is either a bug or the job was hijacked.
// Should retry at any rate.
.map_err(|err| io::Error::new(io::ErrorKind::Other, err.to_string()))?;
let status = nix::sys::wait::waitpid(child, None);
gum::trace!(
target: LOG_TARGET,
%worker_pid,
"execute worker received wait status from job: {:?}",
status,
);
let usage_after = match nix::sys::resource::getrusage(UsageWho::RUSAGE_CHILDREN) {
Ok(usage) => usage,
Err(errno) => return Ok(internal_error_from_errno("getrusage after", errno)),
};
// Using `getrusage` is needed to check whether child has timedout since we cannot rely on
// child to report its own time.
// As `getrusage` returns resource usage from all terminated child processes,
// it is necessary to subtract the usage before the current child process to isolate its cpu
// time
let cpu_tv = get_total_cpu_usage(usage_after) - get_total_cpu_usage(usage_before);
if cpu_tv >= timeout {
gum::warn!(
target: LOG_TARGET,
%worker_pid,
"execute job took {}ms cpu time, exceeded execute timeout {}ms",
cpu_tv.as_millis(),
timeout.as_millis(),
);
return Ok(WorkerResponse::JobTimedOut)
}
match status {
Ok(WaitStatus::Exited(_, exit_status)) => {
let mut reader = io::BufReader::new(received_data.as_slice());
let result = match recv_child_response(&mut reader) {
Ok(result) => result,
Err(err) => return Ok(WorkerResponse::JobError(err.to_string())),
};
match result {
Ok(JobResponse::Ok { result_descriptor }) => {
// The exit status should have been zero if no error occurred.
if exit_status != 0 {
return Ok(WorkerResponse::JobError(format!(
"unexpected exit status: {}",
exit_status
)))
}
Ok(WorkerResponse::Ok { result_descriptor, duration: cpu_tv })
},
Ok(JobResponse::InvalidCandidate(err)) => Ok(WorkerResponse::InvalidCandidate(err)),
Err(job_error) => {
gum::warn!(
target: LOG_TARGET,
%worker_pid,
"execute job error: {}",
job_error,
);
if matches!(job_error, JobError::TimedOut) {
Ok(WorkerResponse::JobTimedOut)
} else {
Ok(WorkerResponse::JobError(job_error.to_string()))
}
},
}
},
// The job was killed by the given signal.
//
// The job gets SIGSYS on seccomp violations, but this signal may have been sent for some
// other reason, so we still need to check for seccomp violations elsewhere.
Ok(WaitStatus::Signaled(_pid, signal, _core_dump)) =>
Ok(WorkerResponse::JobDied(format!("received signal: {signal:?}"))),
Err(errno) => Ok(internal_error_from_errno("waitpid", errno)),
// It is within an attacker's power to send an unexpected exit status. So we cannot treat
// this as an internal error (which would make us abstain), but must vote against.
Ok(unexpected_wait_status) => Ok(WorkerResponse::JobDied(format!(
"unexpected status from wait: {unexpected_wait_status:?}"
))),
}
}
/// Calculate the total CPU time from the given `usage` structure, returned from
/// [`nix::sys::resource::getrusage`], and calculates the total CPU time spent, including both user
/// and system time.
///
/// # Arguments
///
/// - `rusage`: Contains resource usage information.
///
/// # Returns
///
/// Returns a `Duration` representing the total CPU time.
fn get_total_cpu_usage(rusage: Usage) -> Duration {
let micros = (((rusage.user_time().tv_sec() + rusage.system_time().tv_sec()) * 1_000_000) +
(rusage.system_time().tv_usec() + rusage.user_time().tv_usec()) as i64) as u64;
return Duration::from_micros(micros)
}
/// Get a job response.
fn recv_child_response(received_data: &mut io::BufReader<&[u8]>) -> io::Result<JobResult> {
let response_bytes = framed_recv_blocking(received_data)?;
JobResult::decode(&mut response_bytes.as_slice()).map_err(|e| {
io::Error::new(
io::ErrorKind::Other,
format!("execute pvf recv_child_response: decode error: {:?}", e),
)
})
}
/// Write response to the pipe and exit process after.
///
/// # Arguments
///
/// - `pipe_write`: A `PipeWriter` structure, the writing end of a pipe.
///
/// - `response`: Child process response, or error.
fn send_child_response(pipe_write: &mut PipeWriter, response: JobResult) -> ! {
framed_send_blocking(pipe_write, response.encode().as_slice())
.unwrap_or_else(|_| process::exit(libc::EXIT_FAILURE));
if response.is_ok() {
process::exit(libc::EXIT_SUCCESS)
} else {
process::exit(libc::EXIT_FAILURE)
}
}
fn internal_error_from_errno(context: &'static str, errno: Errno) -> WorkerResponse {
WorkerResponse::InternalError(InternalValidationError::Kernel(format!(
"{}: {}: {}",
context,
errno,
io::Error::last_os_error()
)))
}