feat: initialize Kurdistan SDK - independent fork of Polkadot SDK
This commit is contained in:
@@ -0,0 +1,38 @@
|
||||
[package]
|
||||
name = "pezkuwi-node-core-pvf-execute-worker"
|
||||
description = "Pezkuwi crate that contains the logic for executing PVFs. Used by the pezkuwi-execute-worker binary."
|
||||
version = "7.0.0"
|
||||
authors.workspace = true
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
homepage.workspace = true
|
||||
repository.workspace = true
|
||||
|
||||
[lints]
|
||||
workspace = true
|
||||
|
||||
[dependencies]
|
||||
cfg-if = { workspace = true }
|
||||
cpu-time = { workspace = true }
|
||||
gum = { workspace = true, default-features = true }
|
||||
libc = { workspace = true }
|
||||
nix = { features = ["process", "resource", "sched"], workspace = true }
|
||||
|
||||
codec = { features = ["derive"], workspace = true }
|
||||
|
||||
pezkuwi-node-core-pvf-common = { workspace = true, default-features = true }
|
||||
pezkuwi-node-primitives = { workspace = true, default-features = true }
|
||||
pezkuwi-primitives = { workspace = true, default-features = true }
|
||||
pezkuwi-teyrchain-primitives = { workspace = true, default-features = true }
|
||||
|
||||
sp-maybe-compressed-blob = { workspace = true, default-features = true }
|
||||
|
||||
[features]
|
||||
builder = []
|
||||
runtime-benchmarks = [
|
||||
"gum/runtime-benchmarks",
|
||||
"pezkuwi-node-core-pvf-common/runtime-benchmarks",
|
||||
"pezkuwi-node-primitives/runtime-benchmarks",
|
||||
"pezkuwi-primitives/runtime-benchmarks",
|
||||
"pezkuwi-teyrchain-primitives/runtime-benchmarks",
|
||||
]
|
||||
@@ -0,0 +1,696 @@
|
||||
// Copyright (C) Parity Technologies (UK) Ltd.
|
||||
// This file is part of Pezkuwi.
|
||||
|
||||
// Pezkuwi is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
|
||||
// Pezkuwi is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
//! Contains the logic for executing PVFs. Used by the pezkuwi-execute-worker binary.
|
||||
|
||||
#![deny(unused_crate_dependencies)]
|
||||
#![warn(missing_docs)]
|
||||
|
||||
pub use pezkuwi_node_core_pvf_common::{error::ExecuteError, executor_interface::execute_artifact};
|
||||
use pezkuwi_teyrchain_primitives::primitives::ValidationParams;
|
||||
|
||||
// NOTE: Initializing logging in e.g. tests will not have an effect in the workers, as they are
|
||||
// separate spawned processes. Run with e.g. `RUST_LOG=teyrchain::pvf-execute-worker=trace`.
|
||||
const LOG_TARGET: &str = "teyrchain::pvf-execute-worker";
|
||||
|
||||
use codec::{Decode, Encode};
|
||||
use cpu_time::ProcessTime;
|
||||
use nix::{
|
||||
errno::Errno,
|
||||
sys::{
|
||||
resource::{Usage, UsageWho},
|
||||
wait::WaitStatus,
|
||||
},
|
||||
unistd::{ForkResult, Pid},
|
||||
};
|
||||
use pezkuwi_node_core_pvf_common::{
|
||||
compute_checksum,
|
||||
error::InternalValidationError,
|
||||
execute::{
|
||||
ExecuteRequest, Handshake, JobError, JobResponse, JobResult, WorkerError, WorkerResponse,
|
||||
},
|
||||
executor_interface::params_to_wasmtime_semantics,
|
||||
framed_recv_blocking, framed_send_blocking,
|
||||
worker::{
|
||||
cpu_time_monitor_loop, get_total_cpu_usage, pipe2_cloexec, recv_child_response, run_worker,
|
||||
send_result, stringify_errno, stringify_panic_payload,
|
||||
thread::{self, WaitOutcome},
|
||||
PipeFd, WorkerInfo, WorkerKind,
|
||||
},
|
||||
worker_dir, ArtifactChecksum,
|
||||
};
|
||||
use pezkuwi_node_primitives::{BlockData, PoV, POV_BOMB_LIMIT};
|
||||
use pezkuwi_primitives::{ExecutorParams, PersistedValidationData};
|
||||
use pezkuwi_teyrchain_primitives::primitives::ValidationResult;
|
||||
use std::{
|
||||
io::{self, Read},
|
||||
os::{
|
||||
fd::{AsRawFd, FromRawFd},
|
||||
unix::net::UnixStream,
|
||||
},
|
||||
path::PathBuf,
|
||||
process,
|
||||
sync::{mpsc::channel, Arc},
|
||||
time::Duration,
|
||||
};
|
||||
|
||||
/// The number of threads for the child process:
|
||||
/// 1 - Main thread
|
||||
/// 2 - Cpu monitor thread
|
||||
/// 3 - Execute thread
|
||||
///
|
||||
/// NOTE: The correctness of this value is enforced by a test. If the number of threads inside
|
||||
/// the child process changes in the future, this value must be changed as well.
|
||||
pub const EXECUTE_WORKER_THREAD_NUMBER: u32 = 3;
|
||||
|
||||
/// Receives a handshake with information specific to the execute worker.
|
||||
fn recv_execute_handshake(stream: &mut UnixStream) -> io::Result<Handshake> {
|
||||
let handshake_enc = framed_recv_blocking(stream)?;
|
||||
let handshake = Handshake::decode(&mut &handshake_enc[..]).map_err(|_| {
|
||||
io::Error::new(
|
||||
io::ErrorKind::Other,
|
||||
"execute pvf recv_execute_handshake: failed to decode Handshake".to_owned(),
|
||||
)
|
||||
})?;
|
||||
Ok(handshake)
|
||||
}
|
||||
|
||||
fn recv_request(
|
||||
stream: &mut UnixStream,
|
||||
) -> io::Result<(PersistedValidationData, PoV, Duration, ArtifactChecksum)> {
|
||||
let request_bytes = framed_recv_blocking(stream)?;
|
||||
let request = ExecuteRequest::decode(&mut &request_bytes[..]).map_err(|_| {
|
||||
io::Error::new(
|
||||
io::ErrorKind::Other,
|
||||
"execute pvf recv_request: failed to decode ExecuteRequest".to_string(),
|
||||
)
|
||||
})?;
|
||||
|
||||
Ok((request.pvd, request.pov, request.execution_timeout, request.artifact_checksum))
|
||||
}
|
||||
|
||||
/// Sends an error to the host and returns the original error wrapped in `io::Error`.
|
||||
macro_rules! map_and_send_err {
|
||||
($error:expr, $err_constructor:expr, $stream:expr, $worker_info:expr) => {{
|
||||
let err: WorkerError = $err_constructor($error.to_string()).into();
|
||||
let io_err = io::Error::new(io::ErrorKind::Other, err.to_string());
|
||||
let _ = send_result::<WorkerResponse, WorkerError>($stream, Err(err), $worker_info);
|
||||
io_err
|
||||
}};
|
||||
}
|
||||
|
||||
/// The entrypoint that the spawned execute worker should start with.
|
||||
///
|
||||
/// # Parameters
|
||||
///
|
||||
/// - `socket_path`: specifies the path to the socket used to communicate with the host.
|
||||
///
|
||||
/// - `worker_dir_path`: specifies the path to the worker-specific temporary directory.
|
||||
///
|
||||
/// - `node_version`: if `Some`, is checked against the `worker_version`. A mismatch results in
|
||||
/// immediate worker termination. `None` is used for tests and in other situations when version
|
||||
/// check is not necessary.
|
||||
///
|
||||
/// - `worker_version`: see above
|
||||
pub fn worker_entrypoint(
|
||||
socket_path: PathBuf,
|
||||
worker_dir_path: PathBuf,
|
||||
node_version: Option<&str>,
|
||||
worker_version: Option<&str>,
|
||||
) {
|
||||
run_worker(
|
||||
WorkerKind::Execute,
|
||||
socket_path,
|
||||
worker_dir_path,
|
||||
node_version,
|
||||
worker_version,
|
||||
|mut stream, worker_info, security_status| {
|
||||
let artifact_path = worker_dir::execute_artifact(&worker_info.worker_dir_path);
|
||||
|
||||
let Handshake { executor_params } =
|
||||
recv_execute_handshake(&mut stream).map_err(|e| {
|
||||
map_and_send_err!(
|
||||
e,
|
||||
InternalValidationError::HostCommunication,
|
||||
&mut stream,
|
||||
worker_info
|
||||
)
|
||||
})?;
|
||||
|
||||
let executor_params: Arc<ExecutorParams> = Arc::new(executor_params);
|
||||
let execute_thread_stack_size = max_stack_size(&executor_params);
|
||||
|
||||
loop {
|
||||
let (pvd, pov, execution_timeout, artifact_checksum) = recv_request(&mut stream)
|
||||
.map_err(|e| {
|
||||
map_and_send_err!(
|
||||
e,
|
||||
InternalValidationError::HostCommunication,
|
||||
&mut stream,
|
||||
worker_info
|
||||
)
|
||||
})?;
|
||||
gum::debug!(
|
||||
target: LOG_TARGET,
|
||||
?worker_info,
|
||||
?security_status,
|
||||
"worker: validating artifact {}",
|
||||
artifact_path.display(),
|
||||
);
|
||||
|
||||
// Get the artifact bytes.
|
||||
let compiled_artifact_blob = std::fs::read(&artifact_path).map_err(|e| {
|
||||
map_and_send_err!(
|
||||
e,
|
||||
InternalValidationError::CouldNotOpenFile,
|
||||
&mut stream,
|
||||
worker_info
|
||||
)
|
||||
})?;
|
||||
|
||||
if artifact_checksum != compute_checksum(&compiled_artifact_blob) {
|
||||
send_result::<WorkerResponse, WorkerError>(
|
||||
&mut stream,
|
||||
Ok(WorkerResponse {
|
||||
job_response: JobResponse::CorruptedArtifact,
|
||||
duration: Duration::ZERO,
|
||||
pov_size: 0,
|
||||
}),
|
||||
worker_info,
|
||||
)?;
|
||||
continue;
|
||||
}
|
||||
|
||||
let (pipe_read_fd, pipe_write_fd) = pipe2_cloexec().map_err(|e| {
|
||||
map_and_send_err!(
|
||||
e,
|
||||
InternalValidationError::CouldNotCreatePipe,
|
||||
&mut stream,
|
||||
worker_info
|
||||
)
|
||||
})?;
|
||||
|
||||
let usage_before = nix::sys::resource::getrusage(UsageWho::RUSAGE_CHILDREN)
|
||||
.map_err(|errno| {
|
||||
let e = stringify_errno("getrusage before", errno);
|
||||
map_and_send_err!(
|
||||
e,
|
||||
InternalValidationError::Kernel,
|
||||
&mut stream,
|
||||
worker_info
|
||||
)
|
||||
})?;
|
||||
let stream_fd = stream.as_raw_fd();
|
||||
|
||||
let compiled_artifact_blob = Arc::new(compiled_artifact_blob);
|
||||
|
||||
let raw_block_data =
|
||||
match sp_maybe_compressed_blob::decompress(&pov.block_data.0, POV_BOMB_LIMIT) {
|
||||
Ok(data) => data,
|
||||
Err(_) => {
|
||||
send_result::<WorkerResponse, WorkerError>(
|
||||
&mut stream,
|
||||
Ok(WorkerResponse {
|
||||
job_response: JobResponse::PoVDecompressionFailure,
|
||||
duration: Duration::ZERO,
|
||||
pov_size: 0,
|
||||
}),
|
||||
worker_info,
|
||||
)?;
|
||||
continue;
|
||||
},
|
||||
};
|
||||
|
||||
let pov_size = raw_block_data.len() as u32;
|
||||
|
||||
let params = ValidationParams {
|
||||
parent_head: pvd.parent_head.clone(),
|
||||
block_data: BlockData(raw_block_data.to_vec()),
|
||||
relay_parent_number: pvd.relay_parent_number,
|
||||
relay_parent_storage_root: pvd.relay_parent_storage_root,
|
||||
};
|
||||
let params = Arc::new(params.encode());
|
||||
|
||||
cfg_if::cfg_if! {
|
||||
if #[cfg(target_os = "linux")] {
|
||||
let result = if security_status.can_do_secure_clone {
|
||||
handle_clone(
|
||||
pipe_write_fd,
|
||||
pipe_read_fd,
|
||||
stream_fd,
|
||||
&compiled_artifact_blob,
|
||||
&executor_params,
|
||||
¶ms,
|
||||
execution_timeout,
|
||||
execute_thread_stack_size,
|
||||
worker_info,
|
||||
security_status.can_unshare_user_namespace_and_change_root,
|
||||
usage_before,
|
||||
pov_size,
|
||||
)?
|
||||
} else {
|
||||
// Fall back to using fork.
|
||||
handle_fork(
|
||||
pipe_write_fd,
|
||||
pipe_read_fd,
|
||||
stream_fd,
|
||||
&compiled_artifact_blob,
|
||||
&executor_params,
|
||||
¶ms,
|
||||
execution_timeout,
|
||||
execute_thread_stack_size,
|
||||
worker_info,
|
||||
usage_before,
|
||||
pov_size,
|
||||
)?
|
||||
};
|
||||
} else {
|
||||
let result = handle_fork(
|
||||
pipe_write_fd,
|
||||
pipe_read_fd,
|
||||
stream_fd,
|
||||
&compiled_artifact_blob,
|
||||
&executor_params,
|
||||
¶ms,
|
||||
execution_timeout,
|
||||
execute_thread_stack_size,
|
||||
worker_info,
|
||||
usage_before,
|
||||
pov_size,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
|
||||
gum::trace!(
|
||||
target: LOG_TARGET,
|
||||
?worker_info,
|
||||
"worker: sending result to host: {:?}",
|
||||
result
|
||||
);
|
||||
send_result(&mut stream, result, worker_info)?;
|
||||
}
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
fn validate_using_artifact(
|
||||
compiled_artifact_blob: &[u8],
|
||||
executor_params: &ExecutorParams,
|
||||
params: &[u8],
|
||||
) -> JobResponse {
|
||||
let descriptor_bytes = match unsafe {
|
||||
// SAFETY: this should be safe since the compiled artifact passed here comes from the
|
||||
// file created by the prepare workers. These files are obtained by calling
|
||||
// [`executor_interface::prepare`].
|
||||
execute_artifact(compiled_artifact_blob, executor_params, params)
|
||||
} {
|
||||
Err(ExecuteError::RuntimeConstruction(wasmerr)) =>
|
||||
return JobResponse::runtime_construction("execute", &wasmerr.to_string()),
|
||||
Err(err) => return JobResponse::format_invalid("execute", &err.to_string()),
|
||||
Ok(d) => d,
|
||||
};
|
||||
|
||||
let result_descriptor = match ValidationResult::decode(&mut &descriptor_bytes[..]) {
|
||||
Err(err) =>
|
||||
return JobResponse::format_invalid(
|
||||
"validation result decoding failed",
|
||||
&err.to_string(),
|
||||
),
|
||||
Ok(r) => r,
|
||||
};
|
||||
|
||||
JobResponse::Ok { result_descriptor }
|
||||
}
|
||||
|
||||
#[cfg(target_os = "linux")]
|
||||
fn handle_clone(
|
||||
pipe_write_fd: i32,
|
||||
pipe_read_fd: i32,
|
||||
stream_fd: i32,
|
||||
compiled_artifact_blob: &Arc<Vec<u8>>,
|
||||
executor_params: &Arc<ExecutorParams>,
|
||||
params: &Arc<Vec<u8>>,
|
||||
execution_timeout: Duration,
|
||||
execute_stack_size: usize,
|
||||
worker_info: &WorkerInfo,
|
||||
have_unshare_newuser: bool,
|
||||
usage_before: Usage,
|
||||
pov_size: u32,
|
||||
) -> io::Result<Result<WorkerResponse, WorkerError>> {
|
||||
use pezkuwi_node_core_pvf_common::worker::security;
|
||||
|
||||
// SAFETY: new process is spawned within a single threaded process. This invariant
|
||||
// is enforced by tests. Stack size being specified to ensure child doesn't overflow
|
||||
match unsafe {
|
||||
security::clone::clone_on_worker(
|
||||
worker_info,
|
||||
have_unshare_newuser,
|
||||
Box::new(|| {
|
||||
handle_child_process(
|
||||
pipe_write_fd,
|
||||
pipe_read_fd,
|
||||
stream_fd,
|
||||
Arc::clone(compiled_artifact_blob),
|
||||
Arc::clone(executor_params),
|
||||
Arc::clone(params),
|
||||
execution_timeout,
|
||||
execute_stack_size,
|
||||
)
|
||||
}),
|
||||
)
|
||||
} {
|
||||
Ok(child) => handle_parent_process(
|
||||
pipe_read_fd,
|
||||
pipe_write_fd,
|
||||
worker_info,
|
||||
child,
|
||||
usage_before,
|
||||
pov_size,
|
||||
execution_timeout,
|
||||
),
|
||||
Err(security::clone::Error::Clone(errno)) =>
|
||||
Ok(Err(internal_error_from_errno("clone", errno))),
|
||||
}
|
||||
}
|
||||
|
||||
fn handle_fork(
|
||||
pipe_write_fd: i32,
|
||||
pipe_read_fd: i32,
|
||||
stream_fd: i32,
|
||||
compiled_artifact_blob: &Arc<Vec<u8>>,
|
||||
executor_params: &Arc<ExecutorParams>,
|
||||
params: &Arc<Vec<u8>>,
|
||||
execution_timeout: Duration,
|
||||
execute_worker_stack_size: usize,
|
||||
worker_info: &WorkerInfo,
|
||||
usage_before: Usage,
|
||||
pov_size: u32,
|
||||
) -> io::Result<Result<WorkerResponse, WorkerError>> {
|
||||
// SAFETY: new process is spawned within a single threaded process. This invariant
|
||||
// is enforced by tests.
|
||||
match unsafe { nix::unistd::fork() } {
|
||||
Ok(ForkResult::Child) => handle_child_process(
|
||||
pipe_write_fd,
|
||||
pipe_read_fd,
|
||||
stream_fd,
|
||||
Arc::clone(compiled_artifact_blob),
|
||||
Arc::clone(executor_params),
|
||||
Arc::clone(params),
|
||||
execution_timeout,
|
||||
execute_worker_stack_size,
|
||||
),
|
||||
Ok(ForkResult::Parent { child }) => handle_parent_process(
|
||||
pipe_read_fd,
|
||||
pipe_write_fd,
|
||||
worker_info,
|
||||
child,
|
||||
usage_before,
|
||||
pov_size,
|
||||
execution_timeout,
|
||||
),
|
||||
Err(errno) => Ok(Err(internal_error_from_errno("fork", errno))),
|
||||
}
|
||||
}
|
||||
|
||||
/// This is used to handle child process during pvf execute worker.
|
||||
/// It executes the artifact and pipes back the response to the parent process.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// - pipe back `JobResponse` to the parent process.
|
||||
fn handle_child_process(
|
||||
pipe_write_fd: i32,
|
||||
pipe_read_fd: i32,
|
||||
stream_fd: i32,
|
||||
compiled_artifact_blob: Arc<Vec<u8>>,
|
||||
executor_params: Arc<ExecutorParams>,
|
||||
params: Arc<Vec<u8>>,
|
||||
execution_timeout: Duration,
|
||||
execute_thread_stack_size: usize,
|
||||
) -> ! {
|
||||
// SAFETY: this is an open and owned file descriptor at this point.
|
||||
let mut pipe_write = unsafe { PipeFd::from_raw_fd(pipe_write_fd) };
|
||||
|
||||
// Drop the read end so we don't have too many FDs open.
|
||||
if let Err(errno) = nix::unistd::close(pipe_read_fd) {
|
||||
send_child_response(&mut pipe_write, job_error_from_errno("closing pipe", errno));
|
||||
}
|
||||
|
||||
// Dropping the stream closes the underlying socket. We want to make sure
|
||||
// that the sandboxed child can't get any kind of information from the
|
||||
// outside world. The only IPC it should be able to do is sending its
|
||||
// response over the pipe.
|
||||
if let Err(errno) = nix::unistd::close(stream_fd) {
|
||||
send_child_response(&mut pipe_write, job_error_from_errno("closing stream", errno));
|
||||
}
|
||||
|
||||
gum::debug!(
|
||||
target: LOG_TARGET,
|
||||
worker_job_pid = %process::id(),
|
||||
"worker job: executing artifact",
|
||||
);
|
||||
|
||||
// Conditional variable to notify us when a thread is done.
|
||||
let condvar = thread::get_condvar();
|
||||
let cpu_time_start = ProcessTime::now();
|
||||
|
||||
// Spawn a new thread that runs the CPU time monitor.
|
||||
let (cpu_time_monitor_tx, cpu_time_monitor_rx) = channel::<()>();
|
||||
let cpu_time_monitor_thread = thread::spawn_worker_thread(
|
||||
"cpu time monitor thread",
|
||||
move || cpu_time_monitor_loop(cpu_time_start, execution_timeout, cpu_time_monitor_rx),
|
||||
Arc::clone(&condvar),
|
||||
WaitOutcome::TimedOut,
|
||||
)
|
||||
.unwrap_or_else(|err| {
|
||||
send_child_response(&mut pipe_write, Err(JobError::CouldNotSpawnThread(err.to_string())))
|
||||
});
|
||||
|
||||
let execute_thread = thread::spawn_worker_thread_with_stack_size(
|
||||
"execute thread",
|
||||
move || validate_using_artifact(&compiled_artifact_blob, &executor_params, ¶ms),
|
||||
Arc::clone(&condvar),
|
||||
WaitOutcome::Finished,
|
||||
execute_thread_stack_size,
|
||||
)
|
||||
.unwrap_or_else(|err| {
|
||||
send_child_response(&mut pipe_write, Err(JobError::CouldNotSpawnThread(err.to_string())))
|
||||
});
|
||||
|
||||
let outcome = thread::wait_for_threads(condvar);
|
||||
|
||||
let response = match outcome {
|
||||
WaitOutcome::Finished => {
|
||||
let _ = cpu_time_monitor_tx.send(());
|
||||
execute_thread.join().map_err(|e| JobError::Panic(stringify_panic_payload(e)))
|
||||
},
|
||||
// If the CPU thread is not selected, we signal it to end, the join handle is
|
||||
// dropped and the thread will finish in the background.
|
||||
WaitOutcome::TimedOut => match cpu_time_monitor_thread.join() {
|
||||
Ok(Some(_cpu_time_elapsed)) => Err(JobError::TimedOut),
|
||||
Ok(None) => Err(JobError::CpuTimeMonitorThread(
|
||||
"error communicating over finished channel".into(),
|
||||
)),
|
||||
Err(e) => Err(JobError::CpuTimeMonitorThread(stringify_panic_payload(e))),
|
||||
},
|
||||
WaitOutcome::Pending => {
|
||||
unreachable!("we run wait_while until the outcome is no longer pending; qed")
|
||||
},
|
||||
};
|
||||
|
||||
send_child_response(&mut pipe_write, response);
|
||||
}
|
||||
|
||||
/// Returns stack size based on the number of threads.
|
||||
/// The stack size is represented by 2MiB * number_of_threads + native stack;
|
||||
///
|
||||
/// # Background
|
||||
///
|
||||
/// Wasmtime powers the Substrate Executor. It compiles the wasm bytecode into native code.
|
||||
/// That native code does not create any stacks and just reuses the stack of the thread that
|
||||
/// wasmtime was invoked from.
|
||||
///
|
||||
/// Also, we configure the executor to provide the deterministic stack and that requires
|
||||
/// supplying the amount of the native stack space that wasm is allowed to use. This is
|
||||
/// realized by supplying the limit into `wasmtime::Config::max_wasm_stack`.
|
||||
///
|
||||
/// There are quirks to that configuration knob:
|
||||
///
|
||||
/// 1. It only limits the amount of stack space consumed by wasm but does not ensure nor check that
|
||||
/// the stack space is actually available.
|
||||
///
|
||||
/// That means, if the calling thread has 1 MiB of stack space left and the wasm code consumes
|
||||
/// more, then the wasmtime limit will **not** trigger. Instead, the wasm code will hit the
|
||||
/// guard page and the Rust stack overflow handler will be triggered. That leads to an
|
||||
/// **abort**.
|
||||
///
|
||||
/// 2. It cannot and does not limit the stack space consumed by Rust code.
|
||||
///
|
||||
/// Meaning that if the wasm code leaves no stack space for Rust code, then the Rust code
|
||||
/// will abort and that will abort the process as well.
|
||||
///
|
||||
/// Typically on Linux the main thread gets the stack size specified by the `ulimit` and
|
||||
/// typically it's configured to 8 MiB. Rust's spawned threads are 2 MiB. OTOH, the
|
||||
/// DEFAULT_NATIVE_STACK_MAX is set to 256 MiB. Not nearly enough.
|
||||
///
|
||||
/// Hence we need to increase it. The simplest way to fix that is to spawn an execute thread with
|
||||
/// the desired stack limit. We must also make sure the job process has enough stack for *all* its
|
||||
/// threads. This function can be used to get the stack size of either the execute thread or execute
|
||||
/// job process.
|
||||
fn max_stack_size(executor_params: &ExecutorParams) -> usize {
|
||||
let (_sem, deterministic_stack_limit) = params_to_wasmtime_semantics(executor_params);
|
||||
return (2 * 1024 * 1024 + deterministic_stack_limit.native_stack_max) as usize;
|
||||
}
|
||||
|
||||
/// Waits for child process to finish and handle child response from pipe.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// - The response, either `Ok` or some error state.
|
||||
fn handle_parent_process(
|
||||
pipe_read_fd: i32,
|
||||
pipe_write_fd: i32,
|
||||
worker_info: &WorkerInfo,
|
||||
job_pid: Pid,
|
||||
usage_before: Usage,
|
||||
pov_size: u32,
|
||||
timeout: Duration,
|
||||
) -> io::Result<Result<WorkerResponse, WorkerError>> {
|
||||
// the read end will wait until all write ends have been closed,
|
||||
// this drop is necessary to avoid deadlock
|
||||
if let Err(errno) = nix::unistd::close(pipe_write_fd) {
|
||||
return Ok(Err(internal_error_from_errno("closing pipe write fd", errno)));
|
||||
};
|
||||
|
||||
// SAFETY: pipe_read_fd is an open and owned file descriptor at this point.
|
||||
let mut pipe_read = unsafe { PipeFd::from_raw_fd(pipe_read_fd) };
|
||||
|
||||
// Read from the child. Don't decode unless the process exited normally, which we check later.
|
||||
let mut received_data = Vec::new();
|
||||
pipe_read
|
||||
.read_to_end(&mut received_data)
|
||||
// Could not decode job response. There is either a bug or the job was hijacked.
|
||||
// Should retry at any rate.
|
||||
.map_err(|err| io::Error::new(io::ErrorKind::Other, err.to_string()))?;
|
||||
|
||||
let status = nix::sys::wait::waitpid(job_pid, None);
|
||||
gum::trace!(
|
||||
target: LOG_TARGET,
|
||||
?worker_info,
|
||||
%job_pid,
|
||||
"execute worker received wait status from job: {:?}",
|
||||
status,
|
||||
);
|
||||
|
||||
let usage_after = match nix::sys::resource::getrusage(UsageWho::RUSAGE_CHILDREN) {
|
||||
Ok(usage) => usage,
|
||||
Err(errno) => return Ok(Err(internal_error_from_errno("getrusage after", errno))),
|
||||
};
|
||||
|
||||
// Using `getrusage` is needed to check whether child has timedout since we cannot rely on
|
||||
// child to report its own time.
|
||||
// As `getrusage` returns resource usage from all terminated child processes,
|
||||
// it is necessary to subtract the usage before the current child process to isolate its cpu
|
||||
// time
|
||||
let cpu_tv = get_total_cpu_usage(usage_after) - get_total_cpu_usage(usage_before);
|
||||
if cpu_tv >= timeout {
|
||||
gum::warn!(
|
||||
target: LOG_TARGET,
|
||||
?worker_info,
|
||||
%job_pid,
|
||||
"execute job took {}ms cpu time, exceeded execute timeout {}ms",
|
||||
cpu_tv.as_millis(),
|
||||
timeout.as_millis(),
|
||||
);
|
||||
return Ok(Err(WorkerError::JobTimedOut));
|
||||
}
|
||||
|
||||
match status {
|
||||
Ok(WaitStatus::Exited(_, exit_status)) => {
|
||||
let mut reader = io::BufReader::new(received_data.as_slice());
|
||||
let result = recv_child_response(&mut reader, "execute")?;
|
||||
|
||||
match result {
|
||||
Ok(job_response) => {
|
||||
// The exit status should have been zero if no error occurred.
|
||||
if exit_status != 0 {
|
||||
return Ok(Err(WorkerError::JobError(JobError::UnexpectedExitStatus(
|
||||
exit_status,
|
||||
))));
|
||||
}
|
||||
|
||||
Ok(Ok(WorkerResponse { job_response, pov_size, duration: cpu_tv }))
|
||||
},
|
||||
Err(job_error) => {
|
||||
gum::warn!(
|
||||
target: LOG_TARGET,
|
||||
?worker_info,
|
||||
%job_pid,
|
||||
"execute job error: {}",
|
||||
job_error,
|
||||
);
|
||||
if matches!(job_error, JobError::TimedOut) {
|
||||
Ok(Err(WorkerError::JobTimedOut))
|
||||
} else {
|
||||
Ok(Err(WorkerError::JobError(job_error.into())))
|
||||
}
|
||||
},
|
||||
}
|
||||
},
|
||||
// The job was killed by the given signal.
|
||||
//
|
||||
// The job gets SIGSYS on seccomp violations, but this signal may have been sent for some
|
||||
// other reason, so we still need to check for seccomp violations elsewhere.
|
||||
Ok(WaitStatus::Signaled(_pid, signal, _core_dump)) => Ok(Err(WorkerError::JobDied {
|
||||
err: format!("received signal: {signal:?}"),
|
||||
job_pid: job_pid.as_raw(),
|
||||
})),
|
||||
Err(errno) => Ok(Err(internal_error_from_errno("waitpid", errno))),
|
||||
|
||||
// It is within an attacker's power to send an unexpected exit status. So we cannot treat
|
||||
// this as an internal error (which would make us abstain), but must vote against.
|
||||
Ok(unexpected_wait_status) => Ok(Err(WorkerError::JobDied {
|
||||
err: format!("unexpected status from wait: {unexpected_wait_status:?}"),
|
||||
job_pid: job_pid.as_raw(),
|
||||
})),
|
||||
}
|
||||
}
|
||||
|
||||
/// Write a job response to the pipe and exit process after.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// - `pipe_write`: A `PipeFd` structure, the writing end of a pipe.
|
||||
///
|
||||
/// - `response`: Child process response
|
||||
fn send_child_response(pipe_write: &mut PipeFd, response: JobResult) -> ! {
|
||||
framed_send_blocking(pipe_write, response.encode().as_slice())
|
||||
.unwrap_or_else(|_| process::exit(libc::EXIT_FAILURE));
|
||||
|
||||
if response.is_ok() {
|
||||
process::exit(libc::EXIT_SUCCESS)
|
||||
} else {
|
||||
process::exit(libc::EXIT_FAILURE)
|
||||
}
|
||||
}
|
||||
|
||||
fn internal_error_from_errno(context: &'static str, errno: Errno) -> WorkerError {
|
||||
WorkerError::InternalError(InternalValidationError::Kernel(stringify_errno(context, errno)))
|
||||
}
|
||||
|
||||
fn job_error_from_errno(context: &'static str, errno: Errno) -> JobResult {
|
||||
Err(JobError::Kernel(stringify_errno(context, errno)))
|
||||
}
|
||||
Reference in New Issue
Block a user