mirror of
https://github.com/pezkuwichain/pezkuwi-subxt.git
synced 2026-06-14 23:51:05 +00:00
change prepare worker to use fork instead of threads (#1685)
Co-authored-by: Marcin S <marcin@realemail.net>
This commit is contained in:
@@ -9,6 +9,9 @@ license.workspace = true
|
||||
[dependencies]
|
||||
cpu-time = "1.0.0"
|
||||
gum = { package = "tracing-gum", path = "../../../gum" }
|
||||
os_pipe = "1.1.4"
|
||||
nix = { version = "0.27.1", features = ["resource", "process"]}
|
||||
libc = "0.2.139"
|
||||
|
||||
parity-scale-codec = { version = "3.6.1", default-features = false, features = ["derive"] }
|
||||
|
||||
|
||||
@@ -25,23 +25,33 @@ pub use polkadot_node_core_pvf_common::{
|
||||
const LOG_TARGET: &str = "parachain::pvf-execute-worker";
|
||||
|
||||
use cpu_time::ProcessTime;
|
||||
use nix::{
|
||||
errno::Errno,
|
||||
sys::{
|
||||
resource::{Usage, UsageWho},
|
||||
wait::WaitStatus,
|
||||
},
|
||||
unistd::{ForkResult, Pid},
|
||||
};
|
||||
use os_pipe::{self, PipeReader, PipeWriter};
|
||||
use parity_scale_codec::{Decode, Encode};
|
||||
use polkadot_node_core_pvf_common::{
|
||||
error::InternalValidationError,
|
||||
execute::{Handshake, Response},
|
||||
execute::{Handshake, JobError, JobResponse, JobResult, WorkerResponse},
|
||||
framed_recv_blocking, framed_send_blocking,
|
||||
worker::{
|
||||
cpu_time_monitor_loop, stringify_panic_payload,
|
||||
cpu_time_monitor_loop, run_worker, stringify_panic_payload,
|
||||
thread::{self, WaitOutcome},
|
||||
worker_event_loop, WorkerKind,
|
||||
WorkerKind,
|
||||
},
|
||||
};
|
||||
use polkadot_parachain_primitives::primitives::ValidationResult;
|
||||
use polkadot_primitives::{executor_params::DEFAULT_NATIVE_STACK_MAX, ExecutorParams};
|
||||
use std::{
|
||||
io,
|
||||
io::{self, Read},
|
||||
os::unix::net::UnixStream,
|
||||
path::PathBuf,
|
||||
process,
|
||||
sync::{mpsc::channel, Arc},
|
||||
time::Duration,
|
||||
};
|
||||
@@ -105,7 +115,7 @@ fn recv_request(stream: &mut UnixStream) -> io::Result<(Vec<u8>, Duration)> {
|
||||
Ok((params, execution_timeout))
|
||||
}
|
||||
|
||||
fn send_response(stream: &mut UnixStream, response: Response) -> io::Result<()> {
|
||||
fn send_response(stream: &mut UnixStream, response: WorkerResponse) -> io::Result<()> {
|
||||
framed_send_blocking(stream, &response.encode())
|
||||
}
|
||||
|
||||
@@ -131,7 +141,7 @@ pub fn worker_entrypoint(
|
||||
worker_version: Option<&str>,
|
||||
security_status: SecurityStatus,
|
||||
) {
|
||||
worker_event_loop(
|
||||
run_worker(
|
||||
WorkerKind::Execute,
|
||||
socket_path,
|
||||
worker_dir_path,
|
||||
@@ -139,7 +149,7 @@ pub fn worker_entrypoint(
|
||||
worker_version,
|
||||
&security_status,
|
||||
|mut stream, worker_dir_path| {
|
||||
let worker_pid = std::process::id();
|
||||
let worker_pid = process::id();
|
||||
let artifact_path = worker_dir::execute_artifact(&worker_dir_path);
|
||||
|
||||
let Handshake { executor_params } = recv_handshake(&mut stream)?;
|
||||
@@ -157,7 +167,7 @@ pub fn worker_entrypoint(
|
||||
let compiled_artifact_blob = match std::fs::read(&artifact_path) {
|
||||
Ok(bytes) => bytes,
|
||||
Err(err) => {
|
||||
let response = Response::InternalError(
|
||||
let response = WorkerResponse::InternalError(
|
||||
InternalValidationError::CouldNotOpenFile(err.to_string()),
|
||||
);
|
||||
send_response(&mut stream, response)?;
|
||||
@@ -165,82 +175,51 @@ pub fn worker_entrypoint(
|
||||
},
|
||||
};
|
||||
|
||||
// Conditional variable to notify us when a thread is done.
|
||||
let condvar = thread::get_condvar();
|
||||
let (pipe_reader, pipe_writer) = os_pipe::pipe()?;
|
||||
|
||||
let cpu_time_start = ProcessTime::now();
|
||||
let usage_before = match nix::sys::resource::getrusage(UsageWho::RUSAGE_CHILDREN) {
|
||||
Ok(usage) => usage,
|
||||
Err(errno) => {
|
||||
let response = internal_error_from_errno("getrusage before", errno);
|
||||
send_response(&mut stream, response)?;
|
||||
continue
|
||||
},
|
||||
};
|
||||
|
||||
// Spawn a new thread that runs the CPU time monitor.
|
||||
let (cpu_time_monitor_tx, cpu_time_monitor_rx) = channel::<()>();
|
||||
let cpu_time_monitor_thread = thread::spawn_worker_thread(
|
||||
"cpu time monitor thread",
|
||||
move || {
|
||||
cpu_time_monitor_loop(
|
||||
cpu_time_start,
|
||||
// SAFETY: new process is spawned within a single threaded process. This invariant
|
||||
// is enforced by tests.
|
||||
let response = match unsafe { nix::unistd::fork() } {
|
||||
Err(errno) => internal_error_from_errno("fork", errno),
|
||||
Ok(ForkResult::Child) => {
|
||||
// Dropping the stream closes the underlying socket. We want to make sure
|
||||
// that the sandboxed child can't get any kind of information from the
|
||||
// outside world. The only IPC it should be able to do is sending its
|
||||
// response over the pipe.
|
||||
drop(stream);
|
||||
// Drop the read end so we don't have too many FDs open.
|
||||
drop(pipe_reader);
|
||||
|
||||
handle_child_process(
|
||||
pipe_writer,
|
||||
compiled_artifact_blob,
|
||||
executor_params,
|
||||
params,
|
||||
execution_timeout,
|
||||
cpu_time_monitor_rx,
|
||||
)
|
||||
},
|
||||
Arc::clone(&condvar),
|
||||
WaitOutcome::TimedOut,
|
||||
)?;
|
||||
Ok(ForkResult::Parent { child }) => {
|
||||
// the read end will wait until all write ends have been closed,
|
||||
// this drop is necessary to avoid deadlock
|
||||
drop(pipe_writer);
|
||||
|
||||
let executor_params_2 = executor_params.clone();
|
||||
let execute_thread = thread::spawn_worker_thread_with_stack_size(
|
||||
"execute thread",
|
||||
move || {
|
||||
validate_using_artifact(
|
||||
&compiled_artifact_blob,
|
||||
&executor_params_2,
|
||||
¶ms,
|
||||
cpu_time_start,
|
||||
)
|
||||
handle_parent_process(
|
||||
pipe_reader,
|
||||
child,
|
||||
worker_pid,
|
||||
usage_before,
|
||||
execution_timeout,
|
||||
)?
|
||||
},
|
||||
Arc::clone(&condvar),
|
||||
WaitOutcome::Finished,
|
||||
EXECUTE_THREAD_STACK_SIZE,
|
||||
)?;
|
||||
|
||||
let outcome = thread::wait_for_threads(condvar);
|
||||
|
||||
let response = match outcome {
|
||||
WaitOutcome::Finished => {
|
||||
let _ = cpu_time_monitor_tx.send(());
|
||||
execute_thread
|
||||
.join()
|
||||
.unwrap_or_else(|e| Response::Panic(stringify_panic_payload(e)))
|
||||
},
|
||||
// If the CPU thread is not selected, we signal it to end, the join handle is
|
||||
// dropped and the thread will finish in the background.
|
||||
WaitOutcome::TimedOut => {
|
||||
match cpu_time_monitor_thread.join() {
|
||||
Ok(Some(cpu_time_elapsed)) => {
|
||||
// Log if we exceed the timeout and the other thread hasn't
|
||||
// finished.
|
||||
gum::warn!(
|
||||
target: LOG_TARGET,
|
||||
%worker_pid,
|
||||
"execute job took {}ms cpu time, exceeded execute timeout {}ms",
|
||||
cpu_time_elapsed.as_millis(),
|
||||
execution_timeout.as_millis(),
|
||||
);
|
||||
Response::TimedOut
|
||||
},
|
||||
Ok(None) => Response::InternalError(
|
||||
InternalValidationError::CpuTimeMonitorThread(
|
||||
"error communicating over finished channel".into(),
|
||||
),
|
||||
),
|
||||
Err(e) => Response::InternalError(
|
||||
InternalValidationError::CpuTimeMonitorThread(
|
||||
stringify_panic_payload(e),
|
||||
),
|
||||
),
|
||||
}
|
||||
},
|
||||
WaitOutcome::Pending => unreachable!(
|
||||
"we run wait_while until the outcome is no longer pending; qed"
|
||||
),
|
||||
};
|
||||
|
||||
gum::trace!(
|
||||
@@ -259,27 +238,275 @@ fn validate_using_artifact(
|
||||
compiled_artifact_blob: &[u8],
|
||||
executor_params: &ExecutorParams,
|
||||
params: &[u8],
|
||||
cpu_time_start: ProcessTime,
|
||||
) -> Response {
|
||||
) -> JobResponse {
|
||||
let descriptor_bytes = match unsafe {
|
||||
// SAFETY: this should be safe since the compiled artifact passed here comes from the
|
||||
// file created by the prepare workers. These files are obtained by calling
|
||||
// [`executor_intf::prepare`].
|
||||
execute_artifact(compiled_artifact_blob, executor_params, params)
|
||||
} {
|
||||
Err(err) => return Response::format_invalid("execute", &err),
|
||||
Err(err) => return JobResponse::format_invalid("execute", &err),
|
||||
Ok(d) => d,
|
||||
};
|
||||
|
||||
let result_descriptor = match ValidationResult::decode(&mut &descriptor_bytes[..]) {
|
||||
Err(err) =>
|
||||
return Response::format_invalid("validation result decoding failed", &err.to_string()),
|
||||
return JobResponse::format_invalid(
|
||||
"validation result decoding failed",
|
||||
&err.to_string(),
|
||||
),
|
||||
Ok(r) => r,
|
||||
};
|
||||
|
||||
// Include the decoding in the measured time, to prevent any potential attacks exploiting some
|
||||
// bug in decoding.
|
||||
let duration = cpu_time_start.elapsed();
|
||||
|
||||
Response::Ok { result_descriptor, duration }
|
||||
JobResponse::Ok { result_descriptor }
|
||||
}
|
||||
|
||||
/// This is used to handle child process during pvf execute worker.
|
||||
/// It execute the artifact and pipes back the response to the parent process
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// - `pipe_write`: A `PipeWriter` structure, the writing end of a pipe.
|
||||
///
|
||||
/// - `compiled_artifact_blob`: The artifact bytes from compiled by the prepare worker`.
|
||||
///
|
||||
/// - `executor_params`: Deterministically serialized execution environment semantics.
|
||||
///
|
||||
/// - `params`: Validation parameters.
|
||||
///
|
||||
/// - `execution_timeout`: The timeout in `Duration`.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// - pipe back `JobResponse` to the parent process.
|
||||
fn handle_child_process(
|
||||
mut pipe_write: PipeWriter,
|
||||
compiled_artifact_blob: Vec<u8>,
|
||||
executor_params: ExecutorParams,
|
||||
params: Vec<u8>,
|
||||
execution_timeout: Duration,
|
||||
) -> ! {
|
||||
gum::debug!(
|
||||
target: LOG_TARGET,
|
||||
worker_job_pid = %process::id(),
|
||||
"worker job: executing artifact",
|
||||
);
|
||||
|
||||
// Conditional variable to notify us when a thread is done.
|
||||
let condvar = thread::get_condvar();
|
||||
let cpu_time_start = ProcessTime::now();
|
||||
|
||||
// Spawn a new thread that runs the CPU time monitor.
|
||||
let (cpu_time_monitor_tx, cpu_time_monitor_rx) = channel::<()>();
|
||||
let cpu_time_monitor_thread = thread::spawn_worker_thread(
|
||||
"cpu time monitor thread",
|
||||
move || cpu_time_monitor_loop(cpu_time_start, execution_timeout, cpu_time_monitor_rx),
|
||||
Arc::clone(&condvar),
|
||||
WaitOutcome::TimedOut,
|
||||
)
|
||||
.unwrap_or_else(|err| {
|
||||
send_child_response(&mut pipe_write, Err(JobError::CouldNotSpawnThread(err.to_string())))
|
||||
});
|
||||
|
||||
let executor_params_2 = executor_params.clone();
|
||||
let execute_thread = thread::spawn_worker_thread_with_stack_size(
|
||||
"execute thread",
|
||||
move || validate_using_artifact(&compiled_artifact_blob, &executor_params_2, ¶ms),
|
||||
Arc::clone(&condvar),
|
||||
WaitOutcome::Finished,
|
||||
EXECUTE_THREAD_STACK_SIZE,
|
||||
)
|
||||
.unwrap_or_else(|err| {
|
||||
send_child_response(&mut pipe_write, Err(JobError::CouldNotSpawnThread(err.to_string())))
|
||||
});
|
||||
|
||||
let outcome = thread::wait_for_threads(condvar);
|
||||
|
||||
let response = match outcome {
|
||||
WaitOutcome::Finished => {
|
||||
let _ = cpu_time_monitor_tx.send(());
|
||||
execute_thread.join().map_err(|e| JobError::Panic(stringify_panic_payload(e)))
|
||||
},
|
||||
// If the CPU thread is not selected, we signal it to end, the join handle is
|
||||
// dropped and the thread will finish in the background.
|
||||
WaitOutcome::TimedOut => match cpu_time_monitor_thread.join() {
|
||||
Ok(Some(_cpu_time_elapsed)) => Err(JobError::TimedOut),
|
||||
Ok(None) => Err(JobError::CpuTimeMonitorThread(
|
||||
"error communicating over finished channel".into(),
|
||||
)),
|
||||
Err(e) => Err(JobError::CpuTimeMonitorThread(stringify_panic_payload(e))),
|
||||
},
|
||||
WaitOutcome::Pending =>
|
||||
unreachable!("we run wait_while until the outcome is no longer pending; qed"),
|
||||
};
|
||||
|
||||
send_child_response(&mut pipe_write, response);
|
||||
}
|
||||
|
||||
/// Waits for child process to finish and handle child response from pipe.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// - `pipe_read`: A `PipeReader` used to read data from the child process.
|
||||
///
|
||||
/// - `child`: The child pid.
|
||||
///
|
||||
/// - `usage_before`: Resource usage statistics before executing the child process.
|
||||
///
|
||||
/// - `timeout`: The maximum allowed time for the child process to finish, in `Duration`.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// - The response, either `Ok` or some error state.
|
||||
fn handle_parent_process(
|
||||
mut pipe_read: PipeReader,
|
||||
child: Pid,
|
||||
worker_pid: u32,
|
||||
usage_before: Usage,
|
||||
timeout: Duration,
|
||||
) -> io::Result<WorkerResponse> {
|
||||
// Read from the child. Don't decode unless the process exited normally, which we check later.
|
||||
let mut received_data = Vec::new();
|
||||
pipe_read
|
||||
.read_to_end(&mut received_data)
|
||||
// Could not decode job response. There is either a bug or the job was hijacked.
|
||||
// Should retry at any rate.
|
||||
.map_err(|err| io::Error::new(io::ErrorKind::Other, err.to_string()))?;
|
||||
|
||||
let status = nix::sys::wait::waitpid(child, None);
|
||||
gum::trace!(
|
||||
target: LOG_TARGET,
|
||||
%worker_pid,
|
||||
"execute worker received wait status from job: {:?}",
|
||||
status,
|
||||
);
|
||||
|
||||
let usage_after = match nix::sys::resource::getrusage(UsageWho::RUSAGE_CHILDREN) {
|
||||
Ok(usage) => usage,
|
||||
Err(errno) => return Ok(internal_error_from_errno("getrusage after", errno)),
|
||||
};
|
||||
|
||||
// Using `getrusage` is needed to check whether child has timedout since we cannot rely on
|
||||
// child to report its own time.
|
||||
// As `getrusage` returns resource usage from all terminated child processes,
|
||||
// it is necessary to subtract the usage before the current child process to isolate its cpu
|
||||
// time
|
||||
let cpu_tv = get_total_cpu_usage(usage_after) - get_total_cpu_usage(usage_before);
|
||||
if cpu_tv >= timeout {
|
||||
gum::warn!(
|
||||
target: LOG_TARGET,
|
||||
%worker_pid,
|
||||
"execute job took {}ms cpu time, exceeded execute timeout {}ms",
|
||||
cpu_tv.as_millis(),
|
||||
timeout.as_millis(),
|
||||
);
|
||||
return Ok(WorkerResponse::JobTimedOut)
|
||||
}
|
||||
|
||||
match status {
|
||||
Ok(WaitStatus::Exited(_, exit_status)) => {
|
||||
let mut reader = io::BufReader::new(received_data.as_slice());
|
||||
let result = match recv_child_response(&mut reader) {
|
||||
Ok(result) => result,
|
||||
Err(err) => return Ok(WorkerResponse::JobError(err.to_string())),
|
||||
};
|
||||
|
||||
match result {
|
||||
Ok(JobResponse::Ok { result_descriptor }) => {
|
||||
// The exit status should have been zero if no error occurred.
|
||||
if exit_status != 0 {
|
||||
return Ok(WorkerResponse::JobError(format!(
|
||||
"unexpected exit status: {}",
|
||||
exit_status
|
||||
)))
|
||||
}
|
||||
|
||||
Ok(WorkerResponse::Ok { result_descriptor, duration: cpu_tv })
|
||||
},
|
||||
Ok(JobResponse::InvalidCandidate(err)) => Ok(WorkerResponse::InvalidCandidate(err)),
|
||||
Err(job_error) => {
|
||||
gum::warn!(
|
||||
target: LOG_TARGET,
|
||||
%worker_pid,
|
||||
"execute job error: {}",
|
||||
job_error,
|
||||
);
|
||||
if matches!(job_error, JobError::TimedOut) {
|
||||
Ok(WorkerResponse::JobTimedOut)
|
||||
} else {
|
||||
Ok(WorkerResponse::JobError(job_error.to_string()))
|
||||
}
|
||||
},
|
||||
}
|
||||
},
|
||||
// The job was killed by the given signal.
|
||||
//
|
||||
// The job gets SIGSYS on seccomp violations, but this signal may have been sent for some
|
||||
// other reason, so we still need to check for seccomp violations elsewhere.
|
||||
Ok(WaitStatus::Signaled(_pid, signal, _core_dump)) =>
|
||||
Ok(WorkerResponse::JobDied(format!("received signal: {signal:?}"))),
|
||||
Err(errno) => Ok(internal_error_from_errno("waitpid", errno)),
|
||||
|
||||
// It is within an attacker's power to send an unexpected exit status. So we cannot treat
|
||||
// this as an internal error (which would make us abstain), but must vote against.
|
||||
Ok(unexpected_wait_status) => Ok(WorkerResponse::JobDied(format!(
|
||||
"unexpected status from wait: {unexpected_wait_status:?}"
|
||||
))),
|
||||
}
|
||||
}
|
||||
|
||||
/// Calculate the total CPU time from the given `usage` structure, returned from
|
||||
/// [`nix::sys::resource::getrusage`], and calculates the total CPU time spent, including both user
|
||||
/// and system time.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// - `rusage`: Contains resource usage information.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns a `Duration` representing the total CPU time.
|
||||
fn get_total_cpu_usage(rusage: Usage) -> Duration {
|
||||
let micros = (((rusage.user_time().tv_sec() + rusage.system_time().tv_sec()) * 1_000_000) +
|
||||
(rusage.system_time().tv_usec() + rusage.user_time().tv_usec()) as i64) as u64;
|
||||
|
||||
return Duration::from_micros(micros)
|
||||
}
|
||||
|
||||
/// Get a job response.
|
||||
fn recv_child_response(received_data: &mut io::BufReader<&[u8]>) -> io::Result<JobResult> {
|
||||
let response_bytes = framed_recv_blocking(received_data)?;
|
||||
JobResult::decode(&mut response_bytes.as_slice()).map_err(|e| {
|
||||
io::Error::new(
|
||||
io::ErrorKind::Other,
|
||||
format!("execute pvf recv_child_response: decode error: {:?}", e),
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
/// Write response to the pipe and exit process after.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// - `pipe_write`: A `PipeWriter` structure, the writing end of a pipe.
|
||||
///
|
||||
/// - `response`: Child process response, or error.
|
||||
fn send_child_response(pipe_write: &mut PipeWriter, response: JobResult) -> ! {
|
||||
framed_send_blocking(pipe_write, response.encode().as_slice())
|
||||
.unwrap_or_else(|_| process::exit(libc::EXIT_FAILURE));
|
||||
|
||||
if response.is_ok() {
|
||||
process::exit(libc::EXIT_SUCCESS)
|
||||
} else {
|
||||
process::exit(libc::EXIT_FAILURE)
|
||||
}
|
||||
}
|
||||
|
||||
fn internal_error_from_errno(context: &'static str, errno: Errno) -> WorkerResponse {
|
||||
WorkerResponse::InternalError(InternalValidationError::Kernel(format!(
|
||||
"{}: {}: {}",
|
||||
context,
|
||||
errno,
|
||||
io::Error::last_os_error()
|
||||
)))
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user