feat: initialize Kurdistan SDK - independent fork of Polkadot SDK
This commit is contained in:
@@ -0,0 +1,164 @@
|
||||
// Copyright (C) Parity Technologies (UK) Ltd.
|
||||
// This file is part of Pezkuwi.
|
||||
|
||||
// Pezkuwi is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
|
||||
// Pezkuwi is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
use crate::prepare::{PrepareSuccess, PrepareWorkerSuccess};
|
||||
use codec::{Decode, Encode};
|
||||
pub use sc_executor_common::error::Error as ExecuteError;
|
||||
|
||||
/// Result of PVF preparation from a worker, with checksum of the compiled PVF and stats of the
/// preparation if successful.
pub type PrepareWorkerResult = Result<PrepareWorkerSuccess, PrepareError>;

/// Result of PVF preparation propagated all the way back to the host, with path to the concluded
/// artifact and stats of the preparation if successful.
pub type PrepareResult = Result<PrepareSuccess, PrepareError>;

/// Result of prechecking PVF performed by the validation host. Contains stats about the preparation
/// if successful. On success carries no payload (`()`), only the absence of a [`PrepareError`].
pub type PrecheckResult = Result<(), PrepareError>;
|
||||
|
||||
/// An error that occurred during the prepare part of the PVF pipeline.
// Codec indexes are intended to stabilize pre-encoded payloads (see `OOM_PAYLOAD`), so never
// reorder, remove, or reuse an index when modifying this enum.
#[derive(thiserror::Error, Debug, Clone, Encode, Decode)]
pub enum PrepareError {
	/// During the prevalidation stage of preparation an issue was found with the PVF.
	#[codec(index = 0)]
	#[error("prepare: prevalidation error: {0}")]
	Prevalidation(String),
	/// Compilation failed for the given PVF.
	#[codec(index = 1)]
	#[error("prepare: preparation error: {0}")]
	Preparation(String),
	/// Instantiation of the WASM module instance failed.
	#[codec(index = 2)]
	#[error("prepare: runtime construction: {0}")]
	RuntimeConstruction(String),
	/// An unexpected error has occurred in the preparation job.
	#[codec(index = 3)]
	#[error("prepare: job error: {0}")]
	JobError(String),
	/// Failed to prepare the PVF due to the time limit.
	#[codec(index = 4)]
	#[error("prepare: timeout")]
	TimedOut,
	/// An IO error occurred. This state is reported by either the validation host or by the
	/// worker.
	#[codec(index = 5)]
	#[error("prepare: io error while receiving response: {0}")]
	IoErr(String),
	/// The temporary file for the artifact could not be created at the given cache path. This
	/// state is reported by the validation host (not by the worker).
	#[codec(index = 6)]
	#[error("prepare: error creating tmp file: {0}")]
	CreateTmpFile(String),
	/// The response from the worker is received, but the file cannot be renamed (moved) to the
	/// final destination location. This state is reported by the validation host (not by the
	/// worker).
	#[codec(index = 7)]
	#[error("prepare: error renaming tmp file ({src:?} -> {dest:?}): {err}")]
	RenameTmpFile {
		err: String,
		// Unfortunately `PathBuf` doesn't implement `Encode`/`Decode`, so we do a fallible
		// conversion to `Option<String>`.
		src: Option<String>,
		dest: Option<String>,
	},
	/// Memory limit reached
	#[codec(index = 8)]
	#[error("prepare: out of memory")]
	OutOfMemory,
	/// The response from the worker is received, but the worker cache could not be cleared. The
	/// worker has to be killed to avoid jobs having access to data from other jobs. This state is
	/// reported by the validation host (not by the worker).
	#[codec(index = 9)]
	#[error("prepare: error clearing worker cache: {0}")]
	ClearWorkerDir(String),
	/// The preparation job process died, due to OOM, a seccomp violation, or some other factor.
	#[codec(index = 10)]
	#[error("prepare: prepare job with pid {job_pid} died: {err}")]
	JobDied { err: String, job_pid: i32 },
	/// Some error occurred when interfacing with the kernel.
	#[codec(index = 11)]
	#[error("prepare: error interfacing with the kernel: {0}")]
	Kernel(String),
	/// Code blob failed to decompress
	#[codec(index = 12)]
	#[error("prepare: could not decompress code blob: {0}")]
	CouldNotDecompressCodeBlob(String),
}
|
||||
|
||||
impl PrepareError {
|
||||
/// Returns whether this is a deterministic error, i.e. one that should trigger reliably. Those
|
||||
/// errors depend on the PVF itself and the sc-executor/wasmtime logic.
|
||||
///
|
||||
/// Non-deterministic errors can happen spuriously. Typically, they occur due to resource
|
||||
/// starvation, e.g. under heavy load or memory pressure. Those errors are typically transient
|
||||
/// but may persist e.g. if the node is run by overwhelmingly underpowered machine.
|
||||
pub fn is_deterministic(&self) -> bool {
|
||||
use PrepareError::*;
|
||||
match self {
|
||||
Prevalidation(_) |
|
||||
Preparation(_) |
|
||||
JobError(_) |
|
||||
OutOfMemory |
|
||||
CouldNotDecompressCodeBlob(_) => true,
|
||||
IoErr(_) |
|
||||
JobDied { .. } |
|
||||
CreateTmpFile(_) |
|
||||
RenameTmpFile { .. } |
|
||||
ClearWorkerDir(_) |
|
||||
Kernel(_) => false,
|
||||
// Can occur due to issues with the PVF, but also due to factors like local load.
|
||||
TimedOut => false,
|
||||
// Can occur due to issues with the PVF, but also due to local errors.
|
||||
RuntimeConstruction(_) => false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Some internal error occurred.
///
/// Should only ever be used for validation errors independent of the candidate and PVF, or for
/// errors we ruled out during pre-checking (so preparation errors are fine).
#[derive(thiserror::Error, Debug, Clone, Encode, Decode)]
pub enum InternalValidationError {
	/// Some communication error occurred with the host.
	#[error("validation: some communication error occurred with the host: {0}")]
	HostCommunication(String),
	/// Host could not create a hard link to the artifact path.
	#[error("validation: host could not create a hard link to the artifact path: {0}")]
	CouldNotCreateLink(String),
	/// Could not find or open compiled artifact file.
	#[error("validation: could not find or open compiled artifact file: {0}")]
	CouldNotOpenFile(String),
	/// Could not create a pipe between the worker and a child process.
	#[error("validation: could not create pipe: {0}")]
	CouldNotCreatePipe(String),
	/// Host could not clear the worker cache after a job.
	#[error("validation: host could not clear the worker cache ({path:?}) after a job: {err}")]
	CouldNotClearWorkerDir {
		err: String,
		// Unfortunately `PathBuf` doesn't implement `Encode`/`Decode`, so we do a fallible
		// conversion to `Option<String>`.
		path: Option<String>,
	},
	/// Some error occurred when interfacing with the kernel.
	#[error("validation: error interfacing with the kernel: {0}")]
	Kernel(String),
	/// Some non-deterministic preparation error occurred.
	#[error("validation: prepare: {0}")]
	NonDeterministicPrepareError(PrepareError),
}
|
||||
@@ -0,0 +1,141 @@
|
||||
// Copyright (C) Parity Technologies (UK) Ltd.
|
||||
// This file is part of Pezkuwi.
|
||||
|
||||
// Pezkuwi is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
|
||||
// Pezkuwi is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
use crate::{error::InternalValidationError, ArtifactChecksum};
|
||||
use codec::{Decode, Encode};
|
||||
use pezkuwi_node_primitives::PoV;
|
||||
use pezkuwi_primitives::{ExecutorParams, PersistedValidationData};
|
||||
use pezkuwi_teyrchain_primitives::primitives::ValidationResult;
|
||||
use std::time::Duration;
|
||||
|
||||
/// The payload of the one-time handshake that is done when a worker process is created. Carries
/// data from the host to the worker.
#[derive(Encode, Decode)]
pub struct Handshake {
	/// The executor parameters.
	pub executor_params: ExecutorParams,
}
|
||||
|
||||
/// A request to execute a PVF
#[derive(Encode, Decode)]
pub struct ExecuteRequest {
	/// Persisted validation data.
	pub pvd: PersistedValidationData,
	/// Proof-of-validity.
	pub pov: PoV,
	/// Execution timeout.
	pub execution_timeout: Duration,
	/// Checksum of the artifact to execute.
	pub artifact_checksum: ArtifactChecksum,
}
|
||||
|
||||
/// The response from the execution worker.
#[derive(Debug, Encode, Decode)]
pub struct WorkerResponse {
	/// The response from the execute job process.
	pub job_response: JobResponse,
	/// The amount of CPU time taken by the job.
	pub duration: Duration,
	/// The uncompressed PoV size.
	pub pov_size: u32,
}
|
||||
|
||||
/// An error occurred in the worker process.
#[derive(thiserror::Error, Debug, Clone, Encode, Decode)]
pub enum WorkerError {
	/// The job timed out.
	#[error("The job timed out")]
	JobTimedOut,
	/// The job process has died. We must kill the worker just in case.
	///
	/// We cannot treat this as an internal error because malicious code may have killed the job.
	/// We still retry it, because in the non-malicious case it is likely spurious.
	#[error("The job process (pid {job_pid}) has died: {err}")]
	JobDied { err: String, job_pid: i32 },
	/// An unexpected error occurred in the job process, e.g. failing to spawn a thread, panic,
	/// etc.
	///
	/// Because malicious code can cause a job error, we must not treat it as an internal error. We
	/// still retry it, because in the non-malicious case it is likely spurious.
	#[error("An unexpected error occurred in the job process: {0}")]
	JobError(#[from] JobError),

	/// Some internal error occurred.
	#[error("An internal error occurred: {0}")]
	InternalError(#[from] InternalValidationError),
}
|
||||
|
||||
/// The result of a job on the execution worker: either a [`JobResponse`] or a [`JobError`].
pub type JobResult = Result<JobResponse, JobError>;
|
||||
|
||||
/// The successful response from a job on the execution worker.
#[derive(Debug, Encode, Decode)]
pub enum JobResponse {
	/// Execution completed and produced a validation result.
	Ok {
		/// The result of teyrchain validation.
		result_descriptor: ValidationResult,
	},
	/// A possibly transient runtime instantiation error happened during the execution; may be
	/// retried with re-preparation
	RuntimeConstruction(String),
	/// The candidate is invalid.
	InvalidCandidate(String),
	/// PoV decompression failed
	PoVDecompressionFailure,
	/// The artifact is corrupted, re-prepare the artifact and try again.
	CorruptedArtifact,
}
|
||||
|
||||
impl JobResponse {
|
||||
/// Creates an invalid response from a context `ctx` and a message `msg` (which can be empty).
|
||||
pub fn format_invalid(ctx: &'static str, msg: &str) -> Self {
|
||||
if msg.is_empty() {
|
||||
Self::InvalidCandidate(ctx.to_string())
|
||||
} else {
|
||||
Self::InvalidCandidate(format!("{}: {}", ctx, msg))
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a may retry response from a context `ctx` and a message `msg` (which can be empty).
|
||||
pub fn runtime_construction(ctx: &'static str, msg: &str) -> Self {
|
||||
if msg.is_empty() {
|
||||
Self::RuntimeConstruction(ctx.to_string())
|
||||
} else {
|
||||
Self::RuntimeConstruction(format!("{}: {}", ctx, msg))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// An unexpected error occurred in the execution job process. Because this comes from the job,
/// which executes untrusted code, this error must likewise be treated as untrusted. That is, we
/// cannot raise an internal error based on this.
#[derive(thiserror::Error, Clone, Debug, Encode, Decode)]
pub enum JobError {
	/// The job timed out.
	#[error("The job timed out")]
	TimedOut,
	/// An unexpected panic occurred in the execution job.
	#[error("An unexpected panic has occurred in the execution job: {0}")]
	Panic(String),
	/// Some error occurred when interfacing with the kernel.
	#[error("Error interfacing with the kernel: {0}")]
	Kernel(String),
	/// A thread required by the job could not be spawned.
	#[error("Could not spawn the requested thread: {0}")]
	CouldNotSpawnThread(String),
	/// An error occurred in the CPU time monitor thread.
	#[error("An error occurred in the CPU time monitor thread: {0}")]
	CpuTimeMonitorThread(String),
	/// Since the job can return any exit status it wants, we have to treat this as untrusted.
	#[error("Unexpected exit status: {0}")]
	UnexpectedExitStatus(i32),
}
|
||||
@@ -0,0 +1,495 @@
|
||||
// Copyright (C) Parity Technologies (UK) Ltd.
|
||||
// This file is part of Pezkuwi.
|
||||
|
||||
// Pezkuwi is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
|
||||
// Pezkuwi is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
//! Interface to the Substrate Executor
|
||||
|
||||
use crate::error::ExecuteError;
|
||||
use pezkuwi_primitives::{
|
||||
executor_params::{DEFAULT_LOGICAL_STACK_MAX, DEFAULT_NATIVE_STACK_MAX},
|
||||
ExecutorParam, ExecutorParams,
|
||||
};
|
||||
use sc_executor_common::{
|
||||
error::WasmError,
|
||||
runtime_blob::RuntimeBlob,
|
||||
wasm_runtime::{HeapAllocStrategy, WasmModule as _},
|
||||
};
|
||||
use sc_executor_wasmtime::{Config, DeterministicStackLimit, Semantics, WasmtimeRuntime};
|
||||
use sp_core::storage::{ChildInfo, TrackedStorageKey};
|
||||
use sp_externalities::MultiRemovalResults;
|
||||
use std::any::{Any, TypeId};
|
||||
|
||||
// Memory configuration
//
// When Substrate Runtime is instantiated, a number of WASM pages are allocated for the Substrate
// Runtime instance's linear memory. The exact number of pages is a sum of whatever the WASM blob
// itself requests (by default at least enough to hold the data section as well as have some space
// left for the stack; this is, of course, overridable at link time when compiling the runtime)
// plus the number of pages specified in the `extra_heap_pages` passed to the executor.
//
// By default, rustc (or `lld` specifically) should allocate 1 MiB for the shadow stack, or 16
// pages. The data section for runtimes are typically rather small and can fit in a single digit
// number of WASM pages, so let's say an extra 16 pages. Thus let's assume that 32 pages or 2 MiB
// are used for these needs by default.
const DEFAULT_HEAP_PAGES_ESTIMATE: u32 = 32;
// Extra heap pages granted on top of the estimate above; together they bound the default
// dynamic heap allocation strategy below.
const EXTRA_HEAP_PAGES: u32 = 2048;

// VALUES OF THE DEFAULT CONFIGURATION SHOULD NEVER BE CHANGED
// They are used as base values for the execution environment parametrization.
// To overwrite them, add new ones to `EXECUTOR_PARAMS` in the `session_info` pallet and perform
// a runtime upgrade to make them active.
pub const DEFAULT_CONFIG: Config = Config {
	allow_missing_func_imports: true,
	cache_path: None,
	semantics: Semantics {
		heap_alloc_strategy: sc_executor_common::wasm_runtime::HeapAllocStrategy::Dynamic {
			maximum_pages: Some(DEFAULT_HEAP_PAGES_ESTIMATE + EXTRA_HEAP_PAGES),
		},

		instantiation_strategy:
			sc_executor_wasmtime::InstantiationStrategy::RecreateInstanceCopyOnWrite,

		// Enable deterministic stack limit to pin down the exact number of items the wasmtime stack
		// can contain before it traps with stack overflow.
		//
		// Here is how the values below were chosen.
		//
		// At the moment of writing, the default native stack size limit is 1 MiB. Assuming a
		// logical item (see the docs about the field and the instrumentation algorithm) is 8 bytes,
		// 1 MiB can fit 2x 65536 logical items.
		//
		// Since reaching the native stack limit is undesirable, we halve the logical item limit and
		// also increase the native 256x. This hopefully should preclude wasm code from reaching
		// the stack limit set by the wasmtime.
		deterministic_stack_limit: Some(DeterministicStackLimit {
			logical_max: DEFAULT_LOGICAL_STACK_MAX,
			native_stack_max: DEFAULT_NATIVE_STACK_MAX,
		}),
		canonicalize_nans: true,
		// Rationale for turning the multi-threaded compilation off is to make the preparation time
		// easily reproducible and as deterministic as possible.
		//
		// Currently the prepare queue doesn't distinguish between precheck and prepare requests.
		// On the one hand, it simplifies the code, on the other, however, slows down compile times
		// for execute requests. This behavior may change in future.
		parallel_compilation: false,

		// WASM extensions. Only those that are meaningful to us may be controlled here. By default,
		// we're using WASM MVP, which means all the extensions are disabled. Nevertheless, some
		// extensions (e.g., sign extension ops) are enabled by Wasmtime and cannot be disabled.
		wasm_reference_types: false,
		wasm_simd: false,
		wasm_bulk_memory: false,
		wasm_multi_value: false,
	},
};
|
||||
|
||||
/// Executes the given PVF in the form of a compiled artifact and returns the result of
/// execution upon success.
///
/// # Safety
///
/// The caller must ensure that the compiled artifact passed here was:
/// 1) produced by `prepare`,
/// 2) was not modified,
///
/// Failure to adhere to these requirements might lead to crashes and arbitrary code execution.
pub unsafe fn execute_artifact(
	compiled_artifact_blob: &[u8],
	executor_params: &ExecutorParams,
	params: &[u8],
) -> Result<Vec<u8>, ExecuteError> {
	let mut extensions = sp_externalities::Extensions::new();

	// The only extension registered: lets the runtime query an embedded runtime version via
	// `ReadRuntimeVersion` without any storage externalities.
	extensions.register(sp_core::traits::ReadRuntimeVersionExt::new(ReadRuntimeVersion));

	// All storage accesses panic (see `ValidationExternalities`), so the call below runs with
	// extensions only.
	let mut ext = ValidationExternalities(extensions);

	match sc_executor::with_externalities_safe(&mut ext, || {
		let runtime = create_runtime_from_artifact_bytes(compiled_artifact_blob, executor_params)?;
		runtime.new_instance()?.call("validate_block", params)
	}) {
		// Flatten the nested `Result`: the outer layer comes from `with_externalities_safe`,
		// the inner one from runtime construction / the `validate_block` call itself.
		Ok(Ok(ok)) => Ok(ok),
		Ok(Err(err)) | Err(err) => Err(err),
	}
}
|
||||
|
||||
/// Constructs the runtime for the given PVF, given the artifact bytes.
|
||||
///
|
||||
/// # Safety
|
||||
///
|
||||
/// The caller must ensure that the compiled artifact passed here was:
|
||||
/// 1) produced by `prepare`,
|
||||
/// 2) was not modified,
|
||||
///
|
||||
/// Failure to adhere to these requirements might lead to crashes and arbitrary code execution.
|
||||
pub unsafe fn create_runtime_from_artifact_bytes(
|
||||
compiled_artifact_blob: &[u8],
|
||||
executor_params: &ExecutorParams,
|
||||
) -> Result<WasmtimeRuntime, WasmError> {
|
||||
let mut config = DEFAULT_CONFIG.clone();
|
||||
config.semantics = params_to_wasmtime_semantics(executor_params).0;
|
||||
|
||||
sc_executor_wasmtime::create_runtime_from_artifact_bytes::<HostFunctions>(
|
||||
compiled_artifact_blob,
|
||||
config,
|
||||
)
|
||||
}
|
||||
|
||||
/// Takes the default config and overwrites any settings with existing executor parameters.
|
||||
///
|
||||
/// Returns the semantics as well as the stack limit (since we are guaranteed to have it).
|
||||
pub fn params_to_wasmtime_semantics(par: &ExecutorParams) -> (Semantics, DeterministicStackLimit) {
|
||||
let mut sem = DEFAULT_CONFIG.semantics.clone();
|
||||
let mut stack_limit = sem
|
||||
.deterministic_stack_limit
|
||||
.expect("There is a comment to not change the default stack limit; it should always be available; qed")
|
||||
.clone();
|
||||
|
||||
for p in par.iter() {
|
||||
match p {
|
||||
ExecutorParam::MaxMemoryPages(max_pages) =>
|
||||
sem.heap_alloc_strategy = HeapAllocStrategy::Dynamic {
|
||||
maximum_pages: Some((*max_pages).saturating_add(DEFAULT_HEAP_PAGES_ESTIMATE)),
|
||||
},
|
||||
ExecutorParam::StackLogicalMax(slm) => stack_limit.logical_max = *slm,
|
||||
ExecutorParam::StackNativeMax(snm) => stack_limit.native_stack_max = *snm,
|
||||
ExecutorParam::WasmExtBulkMemory => sem.wasm_bulk_memory = true,
|
||||
ExecutorParam::PrecheckingMaxMemory(_) |
|
||||
ExecutorParam::PvfPrepTimeout(_, _) |
|
||||
ExecutorParam::PvfExecTimeout(_, _) => (), /* Not used here */
|
||||
}
|
||||
}
|
||||
sem.deterministic_stack_limit = Some(stack_limit.clone());
|
||||
(sem, stack_limit)
|
||||
}
|
||||
|
||||
/// Runs the prevalidation on the given code. Returns a [`RuntimeBlob`] if it succeeds.
|
||||
pub fn prevalidate(code: &[u8]) -> Result<RuntimeBlob, sc_executor_common::error::WasmError> {
|
||||
// Construct the runtime blob and do some basic checks for consistency.
|
||||
let blob = RuntimeBlob::new(code)?;
|
||||
// In the future this function should take care of any further prevalidation logic.
|
||||
Ok(blob)
|
||||
}
|
||||
|
||||
/// Runs preparation on the given runtime blob. If successful, it returns a serialized compiled
|
||||
/// artifact which can then be used to pass into `Executor::execute` after writing it to the disk.
|
||||
pub fn prepare(
|
||||
blob: RuntimeBlob,
|
||||
executor_params: &ExecutorParams,
|
||||
) -> Result<Vec<u8>, sc_executor_common::error::WasmError> {
|
||||
let (semantics, _) = params_to_wasmtime_semantics(executor_params);
|
||||
sc_executor_wasmtime::prepare_runtime_artifact(blob, &semantics)
|
||||
}
|
||||
|
||||
/// Available host functions. We leave out:
///
/// 1. storage related stuff (PVF doesn't have a notion of a persistent storage/trie)
/// 2. tracing
/// 3. off chain workers (PVFs do not have such a notion)
/// 4. runtime tasks
/// 5. sandbox
type HostFunctions = (
	sp_io::misc::HostFunctions,
	sp_io::crypto::HostFunctions,
	sp_io::hashing::HostFunctions,
	sp_io::allocator::HostFunctions,
	sp_io::logging::HostFunctions,
	sp_io::trie::HostFunctions,
);
|
||||
|
||||
/// The validation externalities that will panic on any storage related access. (PVFs should not
/// have a notion of a persistent storage/trie.)
struct ValidationExternalities(sp_externalities::Extensions);

// Every `Externalities` method below deliberately panics: a PVF must not access persistent
// storage, so any call landing here indicates a misbehaving PVF or a host bug. Only the
// extensions carried in the tuple field (see the `ExtensionStore` impl) are usable.
impl sp_externalities::Externalities for ValidationExternalities {
	fn storage(&mut self, _: &[u8]) -> Option<Vec<u8>> {
		panic!("storage: unsupported feature for teyrchain validation")
	}

	fn storage_hash(&mut self, _: &[u8]) -> Option<Vec<u8>> {
		panic!("storage_hash: unsupported feature for teyrchain validation")
	}

	fn child_storage_hash(&mut self, _: &ChildInfo, _: &[u8]) -> Option<Vec<u8>> {
		panic!("child_storage_hash: unsupported feature for teyrchain validation")
	}

	fn child_storage(&mut self, _: &ChildInfo, _: &[u8]) -> Option<Vec<u8>> {
		panic!("child_storage: unsupported feature for teyrchain validation")
	}

	fn kill_child_storage(
		&mut self,
		_child_info: &ChildInfo,
		_maybe_limit: Option<u32>,
		_maybe_cursor: Option<&[u8]>,
	) -> MultiRemovalResults {
		panic!("kill_child_storage: unsupported feature for teyrchain validation")
	}

	fn clear_prefix(
		&mut self,
		_prefix: &[u8],
		_maybe_limit: Option<u32>,
		_maybe_cursor: Option<&[u8]>,
	) -> MultiRemovalResults {
		panic!("clear_prefix: unsupported feature for teyrchain validation")
	}

	fn clear_child_prefix(
		&mut self,
		_child_info: &ChildInfo,
		_prefix: &[u8],
		_maybe_limit: Option<u32>,
		_maybe_cursor: Option<&[u8]>,
	) -> MultiRemovalResults {
		panic!("clear_child_prefix: unsupported feature for teyrchain validation")
	}

	fn place_storage(&mut self, _: Vec<u8>, _: Option<Vec<u8>>) {
		panic!("place_storage: unsupported feature for teyrchain validation")
	}

	fn place_child_storage(&mut self, _: &ChildInfo, _: Vec<u8>, _: Option<Vec<u8>>) {
		panic!("place_child_storage: unsupported feature for teyrchain validation")
	}

	fn storage_root(&mut self, _: sp_core::storage::StateVersion) -> Vec<u8> {
		panic!("storage_root: unsupported feature for teyrchain validation")
	}

	fn child_storage_root(&mut self, _: &ChildInfo, _: sp_core::storage::StateVersion) -> Vec<u8> {
		panic!("child_storage_root: unsupported feature for teyrchain validation")
	}

	fn next_child_storage_key(&mut self, _: &ChildInfo, _: &[u8]) -> Option<Vec<u8>> {
		panic!("next_child_storage_key: unsupported feature for teyrchain validation")
	}

	fn next_storage_key(&mut self, _: &[u8]) -> Option<Vec<u8>> {
		panic!("next_storage_key: unsupported feature for teyrchain validation")
	}

	fn storage_append(&mut self, _key: Vec<u8>, _value: Vec<u8>) {
		panic!("storage_append: unsupported feature for teyrchain validation")
	}

	fn storage_start_transaction(&mut self) {
		panic!("storage_start_transaction: unsupported feature for teyrchain validation")
	}

	fn storage_rollback_transaction(&mut self) -> Result<(), ()> {
		panic!("storage_rollback_transaction: unsupported feature for teyrchain validation")
	}

	fn storage_commit_transaction(&mut self) -> Result<(), ()> {
		panic!("storage_commit_transaction: unsupported feature for teyrchain validation")
	}

	fn wipe(&mut self) {
		panic!("wipe: unsupported feature for teyrchain validation")
	}

	fn commit(&mut self) {
		panic!("commit: unsupported feature for teyrchain validation")
	}

	fn read_write_count(&self) -> (u32, u32, u32, u32) {
		panic!("read_write_count: unsupported feature for teyrchain validation")
	}

	fn reset_read_write_count(&mut self) {
		panic!("reset_read_write_count: unsupported feature for teyrchain validation")
	}

	fn get_whitelist(&self) -> Vec<TrackedStorageKey> {
		panic!("get_whitelist: unsupported feature for teyrchain validation")
	}

	fn set_whitelist(&mut self, _: Vec<TrackedStorageKey>) {
		panic!("set_whitelist: unsupported feature for teyrchain validation")
	}

	fn set_offchain_storage(&mut self, _: &[u8], _: std::option::Option<&[u8]>) {
		panic!("set_offchain_storage: unsupported feature for teyrchain validation")
	}

	fn get_read_and_written_keys(&self) -> Vec<(Vec<u8>, u32, u32, bool)> {
		panic!("get_read_and_written_keys: unsupported feature for teyrchain validation")
	}
}
|
||||
|
||||
impl sp_externalities::ExtensionStore for ValidationExternalities {
|
||||
fn extension_by_type_id(&mut self, type_id: TypeId) -> Option<&mut dyn Any> {
|
||||
self.0.get_mut(type_id)
|
||||
}
|
||||
|
||||
fn register_extension_with_type_id(
|
||||
&mut self,
|
||||
type_id: TypeId,
|
||||
extension: Box<dyn sp_externalities::Extension>,
|
||||
) -> Result<(), sp_externalities::Error> {
|
||||
self.0.register_with_type_id(type_id, extension)
|
||||
}
|
||||
|
||||
fn deregister_extension_by_type_id(
|
||||
&mut self,
|
||||
type_id: TypeId,
|
||||
) -> Result<(), sp_externalities::Error> {
|
||||
if self.0.deregister(type_id) {
|
||||
Ok(())
|
||||
} else {
|
||||
Err(sp_externalities::Error::ExtensionIsNotRegistered(type_id))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct ReadRuntimeVersion;
|
||||
|
||||
impl sp_core::traits::ReadRuntimeVersion for ReadRuntimeVersion {
|
||||
fn read_runtime_version(
|
||||
&self,
|
||||
wasm_code: &[u8],
|
||||
_ext: &mut dyn sp_externalities::Externalities,
|
||||
) -> Result<Vec<u8>, String> {
|
||||
let blob = RuntimeBlob::uncompress_if_needed(wasm_code)
|
||||
.map_err(|e| format!("Failed to read the PVF runtime blob: {:?}", e))?;
|
||||
|
||||
match sc_executor::read_embedded_version(&blob)
|
||||
.map_err(|e| format!("Failed to read the static section from the PVF blob: {:?}", e))?
|
||||
{
|
||||
Some(version) => {
|
||||
use codec::Encode;
|
||||
Ok(version.encode())
|
||||
},
|
||||
None => Err("runtime version section is not found".to_string()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
	use super::*;

	// Invariant under test: toggling an `ExecutorParam` changes the compiled artifact if and
	// only if it also changes `ExecutorParams::prep_hash()`. Otherwise artifacts would either
	// be wrongly reused or wrongly recompiled.
	#[test]
	fn prep_hash_matches_artifact_effect_of_executor_params() {
		use ExecutorParam::*;

		// If you're adding a new ExecutorParam, please add it to the `cases` below.
		// (This closure exists only so an added variant breaks compilation here.)
		let _coverage_check = |param: &ExecutorParam| match param {
			MaxMemoryPages(_) => true,
			StackLogicalMax(_) => true,
			StackNativeMax(_) => true,
			PrecheckingMaxMemory(_) => true,
			PvfPrepTimeout(_, _) => true,
			PvfExecTimeout(_, _) => true,
			WasmExtBulkMemory => true,
		};

		// A minimal module with memory and an exported `validate_block` function.
		let wat = r#"(module
			(memory 1)
			(func (export "validate_block") (param i32 i32))
		)"#;
		let wasm = wat::parse_str(wat).expect("wat parsing failed");
		let blob = prevalidate(&wasm).expect("valid runtime blob");

		let base = ExecutorParams::default();

		let prepare_with = |params: &ExecutorParams| -> Vec<u8> {
			prepare(blob.clone(), params).expect("prepare should succeed")
		};

		// Define pairs that toggle exactly one parameter.
		let cases: Vec<(&str, ExecutorParams, ExecutorParams)> = vec![
			(
				"MaxMemoryPages",
				base.clone(),
				ExecutorParams::from(&[ExecutorParam::MaxMemoryPages(128)][..]),
			),
			(
				"StackLogicalMax",
				base.clone(),
				ExecutorParams::from(
					&[ExecutorParam::StackLogicalMax(DEFAULT_LOGICAL_STACK_MAX + 1)][..],
				),
			),
			(
				"StackNativeMax",
				base.clone(),
				ExecutorParams::from(
					&[ExecutorParam::StackNativeMax(DEFAULT_NATIVE_STACK_MAX + 1024)][..],
				),
			),
			(
				"PrecheckingMaxMemory",
				base.clone(),
				ExecutorParams::from(&[ExecutorParam::PrecheckingMaxMemory(300 * 1024 * 1024)][..]),
			),
			(
				"PvfPrepTimeout(Precheck)",
				base.clone(),
				ExecutorParams::from(
					&[ExecutorParam::PvfPrepTimeout(pezkuwi_primitives::PvfPrepKind::Precheck, 1)]
						[..],
				),
			),
			(
				"PvfPrepTimeout(Prepare)",
				base.clone(),
				ExecutorParams::from(
					&[ExecutorParam::PvfPrepTimeout(pezkuwi_primitives::PvfPrepKind::Prepare, 2)][..],
				),
			),
			(
				"PvfExecTimeout(Backing)",
				base.clone(),
				ExecutorParams::from(
					&[ExecutorParam::PvfExecTimeout(pezkuwi_primitives::PvfExecKind::Backing, 1)][..],
				),
			),
			(
				"PvfExecTimeout(Approval)",
				base.clone(),
				ExecutorParams::from(
					&[ExecutorParam::PvfExecTimeout(pezkuwi_primitives::PvfExecKind::Approval, 2)]
						[..],
				),
			),
			(
				"WasmExtBulkMemory",
				base.clone(),
				ExecutorParams::from(&[ExecutorParam::WasmExtBulkMemory][..]),
			),
		];

		// Compare the compiled artifact bytes and the prep hash for each toggled parameter;
		// the two notions of "changed" must agree.
		for (name, a, b) in cases.into_iter() {
			let art_a = prepare_with(&a);
			let art_b = prepare_with(&b);
			let artifact_changed = art_a != art_b;
			let prep_hash_changed = a.prep_hash() != b.prep_hash();
			assert_eq!(
				artifact_changed,
				prep_hash_changed,
				"ExecutorParam classification mismatch for {}: artifact_changed={}, prep_hash_changed={}",
				name,
				artifact_changed,
				prep_hash_changed,
			);
		}
	}
}
|
||||
@@ -0,0 +1,129 @@
|
||||
// Copyright (C) Parity Technologies (UK) Ltd.
|
||||
// This file is part of Pezkuwi.
|
||||
|
||||
// Pezkuwi is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
|
||||
// Pezkuwi is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
//! Contains functionality related to PVFs that is shared by the PVF host and the PVF workers.
|
||||
#![deny(unused_crate_dependencies)]
|
||||
|
||||
pub mod error;
|
||||
pub mod execute;
|
||||
pub mod executor_interface;
|
||||
pub mod prepare;
|
||||
pub mod pvf;
|
||||
pub mod worker;
|
||||
pub mod worker_dir;
|
||||
|
||||
pub use cpu_time::ProcessTime;
|
||||
|
||||
// Used by `decl_worker_main!`.
|
||||
pub use sp_tracing;
|
||||
|
||||
const LOG_TARGET: &str = "teyrchain::pvf-common";
|
||||
|
||||
use codec::{Decode, Encode};
|
||||
use sp_core::H256;
|
||||
use std::{
|
||||
io::{self, Read, Write},
|
||||
mem,
|
||||
};
|
||||
|
||||
/// Timeouts shared by PVF host/worker tests. Only compiled with the `test-utils` feature.
#[cfg(feature = "test-utils")]
pub mod tests {
	use std::time::Duration;

	/// Execution timeout used by tests.
	pub const TEST_EXECUTION_TIMEOUT: Duration = Duration::from_secs(3);
	/// Preparation timeout used by tests (generous, since compilation is slow in CI).
	pub const TEST_PREPARATION_TIMEOUT: Duration = Duration::from_secs(30);
}
|
||||
|
||||
/// Status of security features on the current system.
// NOTE: this derives SCALE `Encode`/`Decode`, so the field order is part of the wire format
// exchanged with workers — do not reorder fields.
#[derive(Debug, Clone, Default, PartialEq, Eq, Encode, Decode)]
pub struct SecurityStatus {
	/// Whether Secure Validator Mode is enabled. This mode enforces that all required security
	/// features are present. All features are enabled on a best-effort basis regardless.
	pub secure_validator_mode: bool,
	/// Whether the landlock features we use are fully available on this system.
	pub can_enable_landlock: bool,
	/// Whether the seccomp features we use are fully available on this system.
	pub can_enable_seccomp: bool,
	/// Whether we are able to unshare the user namespace and change the filesystem root.
	pub can_unshare_user_namespace_and_change_root: bool,
	/// Whether we are able to call `clone` with all sandboxing flags.
	pub can_do_secure_clone: bool,
}
|
||||
|
||||
/// A handshake with information for the worker.
#[derive(Debug, Encode, Decode)]
pub struct WorkerHandshake {
	/// Security features as determined by the host; the worker uses these to decide which
	/// sandboxing features to enable.
	pub security_status: SecurityStatus,
}
|
||||
|
||||
/// Write some data prefixed by its length into `w`. Sync version of `framed_send` to avoid
/// dependency on tokio.
///
/// The prefix is the payload length as a little-endian `usize`, so the frame format is
/// platform-width-dependent and must be decoded by the matching [`framed_recv_blocking`].
pub fn framed_send_blocking(w: &mut (impl Write + Unpin), buf: &[u8]) -> io::Result<()> {
	// Length prefix first, then the payload itself.
	w.write_all(&buf.len().to_le_bytes())?;
	w.write_all(buf)
}
|
||||
|
||||
/// Read some data prefixed by its length from `r`. Sync version of `framed_recv` to avoid
/// dependency on tokio.
///
/// Counterpart of [`framed_send_blocking`]: expects a little-endian `usize` length prefix
/// followed by exactly that many payload bytes; errors with `UnexpectedEof` on a short read.
pub fn framed_recv_blocking(r: &mut (impl Read + Unpin)) -> io::Result<Vec<u8>> {
	// Read the fixed-width length prefix.
	let mut len_buf = [0u8; mem::size_of::<usize>()];
	r.read_exact(&mut len_buf)?;
	let len = usize::from_le_bytes(len_buf);
	// NOTE(review): `len` is taken from the peer and used for allocation unchecked; the
	// host<->worker channel appears trusted, but confirm no untrusted peer reaches this.
	let mut payload = vec![0u8; len];
	r.read_exact(&mut payload)?;
	Ok(payload)
}
|
||||
|
||||
/// Checksum of a compiled PVF artifact — a 256-bit twox hash of the artifact bytes
/// (see [`compute_checksum`]). Newtype over `H256` with identical layout.
#[derive(Debug, Default, Clone, Copy, Encode, Decode, PartialEq, Eq)]
#[repr(transparent)]
pub struct ArtifactChecksum(H256);
|
||||
|
||||
/// Compute the checksum of the given artifact.
|
||||
pub fn compute_checksum(data: &[u8]) -> ArtifactChecksum {
|
||||
ArtifactChecksum(H256::from_slice(&sp_crypto_hashing::twox_256(data)))
|
||||
}
|
||||
|
||||
#[cfg(all(test, not(feature = "test-utils")))]
mod tests {
	use super::*;

	// The handshake protocol relies on `SecurityStatus::default()` meaning "no feature
	// available/enabled", so every flag must default to `false`.
	#[test]
	fn default_secure_status() {
		let status = SecurityStatus::default();
		assert!(
			!status.secure_validator_mode,
			"secure_validator_mode is false for default security status"
		);
		assert!(
			!status.can_enable_landlock,
			"can_enable_landlock is false for default security status"
		);
		assert!(
			!status.can_enable_seccomp,
			"can_enable_seccomp is false for default security status"
		);
		assert!(
			!status.can_unshare_user_namespace_and_change_root,
			"can_unshare_user_namespace_and_change_root is false for default security status"
		);
		assert!(
			!status.can_do_secure_clone,
			"can_do_secure_clone is false for default security status"
		);
	}
}
|
||||
@@ -0,0 +1,85 @@
|
||||
// Copyright (C) Parity Technologies (UK) Ltd.
|
||||
// This file is part of Pezkuwi.
|
||||
|
||||
// Pezkuwi is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
|
||||
// Pezkuwi is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
use crate::ArtifactChecksum;
|
||||
use codec::{Decode, Encode};
|
||||
use std::path::PathBuf;
|
||||
|
||||
/// Result from prepare worker if successful.
// NOTE: SCALE-encoded across the host/worker boundary; field order is part of the format.
#[derive(Debug, Clone, Default, Encode, Decode)]
pub struct PrepareWorkerSuccess {
	/// Checksum of the compiled PVF.
	pub checksum: ArtifactChecksum,
	/// Stats of the current preparation run.
	pub stats: PrepareStats,
}
|
||||
|
||||
/// Result of PVF preparation if successful.
///
/// Host-side only (no `Encode`/`Decode`): carries a local filesystem path.
#[derive(Debug, Clone, Default)]
pub struct PrepareSuccess {
	/// Checksum of the compiled PVF.
	pub checksum: ArtifactChecksum,
	/// Canonical path to the compiled artifact.
	pub path: PathBuf,
	/// Size of the compiled artifact in bytes.
	pub size: u64,
	/// Stats of the current preparation run.
	pub stats: PrepareStats,
}
|
||||
|
||||
/// Preparation statistics, including the CPU time and memory taken.
#[derive(Debug, Clone, Default, Encode, Decode)]
pub struct PrepareStats {
	/// The CPU time that elapsed for the preparation job.
	pub cpu_time_elapsed: std::time::Duration,
	/// The observed memory statistics for the preparation job.
	pub memory_stats: MemoryStats,
	/// The decompressed Wasm code length observed during the preparation.
	pub observed_wasm_code_len: u32,
}
|
||||
|
||||
/// Helper struct to contain all the memory stats, including `MemoryAllocationStats` and, if
/// supported by the OS, `ru_maxrss`.
// NOTE: the `cfg`-gated fields mean the SCALE encoding of this struct differs between target
// platforms/features; host and worker must be built with matching configuration.
#[derive(Clone, Debug, Default, Encode, Decode)]
pub struct MemoryStats {
	/// Memory stats from `tikv_jemalloc_ctl`, polling-based and not very precise.
	#[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))]
	pub memory_tracker_stats: Option<MemoryAllocationStats>,
	/// `ru_maxrss` from `getrusage`. `None` if an error occurred.
	#[cfg(target_os = "linux")]
	pub max_rss: Option<i64>,
	/// Peak allocation in bytes measured by tracking allocator
	pub peak_tracked_alloc: u64,
}
|
||||
|
||||
/// Statistics of collected memory metrics.
#[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))]
#[derive(Clone, Debug, Default, Encode, Decode)]
pub struct MemoryAllocationStats {
	/// Total resident memory, in bytes.
	pub resident: u64,
	/// Total allocated memory, in bytes.
	pub allocated: u64,
}
|
||||
|
||||
/// The kind of prepare job.
#[derive(Copy, Clone, Debug, Encode, Decode)]
pub enum PrepareJobKind {
	/// Compilation triggered by a candidate validation request.
	Compilation,
	/// A prechecking job.
	Prechecking,
}
|
||||
@@ -0,0 +1,141 @@
|
||||
// Copyright (C) Parity Technologies (UK) Ltd.
|
||||
// This file is part of Pezkuwi.
|
||||
|
||||
// Pezkuwi is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
|
||||
// Pezkuwi is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
use crate::prepare::PrepareJobKind;
|
||||
use codec::{Decode, Encode};
|
||||
use pezkuwi_primitives::ExecutorParams;
|
||||
use pezkuwi_teyrchain_primitives::primitives::ValidationCodeHash;
|
||||
use std::{fmt, sync::Arc, time::Duration};
|
||||
|
||||
/// A struct that carries the exhaustive set of data to prepare an artifact out of plain
/// Wasm binary
///
/// Should be cheap to clone.
// All heavy payloads (`code`, `executor_params`) are behind `Arc`, so cloning only bumps
// refcounts.
#[derive(Clone, Encode, Decode)]
pub struct PvfPrepData {
	/// Wasm code (maybe compressed)
	maybe_compressed_code: Arc<Vec<u8>>,
	/// Maximum uncompressed code size.
	validation_code_bomb_limit: u32,
	/// Wasm code hash.
	code_hash: ValidationCodeHash,
	/// Executor environment parameters for the session for which artifact is prepared
	executor_params: Arc<ExecutorParams>,
	/// Preparation timeout
	prep_timeout: Duration,
	/// The kind of preparation job.
	prep_kind: PrepareJobKind,
}
|
||||
|
||||
impl PvfPrepData {
	/// Returns an instance of the PVF out of the given PVF code and executor params.
	pub fn from_code(
		code: Vec<u8>,
		executor_params: ExecutorParams,
		prep_timeout: Duration,
		prep_kind: PrepareJobKind,
		validation_code_bomb_limit: u32,
	) -> Self {
		let maybe_compressed_code = Arc::new(code);
		// NOTE: the hash is computed over the code exactly as passed in, i.e. over the
		// possibly-compressed bytes, not the decompressed Wasm.
		let code_hash = sp_crypto_hashing::blake2_256(&maybe_compressed_code).into();
		let executor_params = Arc::new(executor_params);
		Self {
			maybe_compressed_code,
			code_hash,
			executor_params,
			prep_timeout,
			prep_kind,
			validation_code_bomb_limit,
		}
	}

	/// Returns validation code hash
	pub fn code_hash(&self) -> ValidationCodeHash {
		self.code_hash
	}

	/// Returns PVF code blob (cheap: clones the `Arc`, not the bytes)
	pub fn maybe_compressed_code(&self) -> Arc<Vec<u8>> {
		self.maybe_compressed_code.clone()
	}

	/// Returns executor params (cheap: clones the `Arc`)
	pub fn executor_params(&self) -> Arc<ExecutorParams> {
		self.executor_params.clone()
	}

	/// Returns preparation timeout.
	pub fn prep_timeout(&self) -> Duration {
		self.prep_timeout
	}

	/// Returns preparation kind.
	pub fn prep_kind(&self) -> PrepareJobKind {
		self.prep_kind
	}

	/// Returns validation code bomb limit.
	pub fn validation_code_bomb_limit(&self) -> u32 {
		self.validation_code_bomb_limit
	}

	/// Creates a structure for tests. The "code" is just the discriminator's LE bytes, so
	/// distinct discriminators yield distinct code hashes.
	#[cfg(feature = "test-utils")]
	pub fn from_discriminator_and_timeout(num: u32, timeout: Duration) -> Self {
		let discriminator_buf = num.to_le_bytes().to_vec();
		Self::from_code(
			discriminator_buf,
			ExecutorParams::default(),
			timeout,
			PrepareJobKind::Compilation,
			30 * 1024 * 1024,
		)
	}

	/// Creates a structure for tests.
	#[cfg(feature = "test-utils")]
	pub fn from_discriminator(num: u32) -> Self {
		Self::from_discriminator_and_timeout(num, crate::tests::TEST_PREPARATION_TIMEOUT)
	}

	/// Creates a structure for tests, with the job kind set to prechecking.
	#[cfg(feature = "test-utils")]
	pub fn from_discriminator_precheck(num: u32) -> Self {
		let mut pvf =
			Self::from_discriminator_and_timeout(num, crate::tests::TEST_PREPARATION_TIMEOUT);
		pvf.prep_kind = PrepareJobKind::Prechecking;
		pvf
	}
}
|
||||
|
||||
impl fmt::Debug for PvfPrepData {
	// Manual impl so the (potentially large) code blob is elided as `[...]` instead of being
	// dumped into logs.
	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
		write!(
			f,
			"Pvf {{ code: [...], code_hash: {:?}, executor_params: {:?}, prep_timeout: {:?} }}",
			self.code_hash, self.executor_params, self.prep_timeout
		)
	}
}
|
||||
|
||||
impl PartialEq for PvfPrepData {
	// Equality is defined by code hash + executor-params hash only; `prep_timeout`,
	// `prep_kind` and `validation_code_bomb_limit` are deliberately NOT compared.
	fn eq(&self, other: &Self) -> bool {
		self.code_hash == other.code_hash &&
			self.executor_params.hash() == other.executor_params.hash()
	}
}
|
||||
|
||||
// `PartialEq` above delegates to hash equality, which is reflexive, so `Eq` is sound.
impl Eq for PvfPrepData {}
|
||||
@@ -0,0 +1,839 @@
|
||||
// Copyright (C) Parity Technologies (UK) Ltd.
|
||||
// This file is part of Pezkuwi.
|
||||
|
||||
// Pezkuwi is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
|
||||
// Pezkuwi is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
//! Functionality common to both prepare and execute workers.
|
||||
|
||||
pub mod security;
|
||||
|
||||
use crate::{
|
||||
framed_recv_blocking, framed_send_blocking, SecurityStatus, WorkerHandshake, LOG_TARGET,
|
||||
};
|
||||
use codec::{Decode, Encode};
|
||||
use cpu_time::ProcessTime;
|
||||
use futures::never::Never;
|
||||
use nix::{errno::Errno, sys::resource::Usage};
|
||||
use std::{
|
||||
any::Any,
|
||||
fmt::{self},
|
||||
fs::File,
|
||||
io::{self, Read, Write},
|
||||
os::{
|
||||
fd::{AsRawFd, FromRawFd, RawFd},
|
||||
unix::net::UnixStream,
|
||||
},
|
||||
path::PathBuf,
|
||||
sync::mpsc::{Receiver, RecvTimeoutError},
|
||||
time::Duration,
|
||||
};
|
||||
|
||||
/// Use this macro to declare a `fn main() {}` that will create an executable that can be used for
/// spawning the desired worker.
///
/// `$expected_command` is the subcommand this binary accepts, `$entrypoint` is the worker's
/// event-loop function, and the version/version-hash pair is printed by the `--version` /
/// `--full-version` flags and passed on to the entrypoint.
#[macro_export]
macro_rules! decl_worker_main {
	($expected_command:expr, $entrypoint:expr, $worker_version:expr, $worker_version_hash:expr $(,)*) => {
		fn get_full_version() -> String {
			format!("{}-{}", $worker_version, $worker_version_hash)
		}

		fn print_help(expected_command: &str) {
			println!("{} {}", expected_command, $worker_version);
			println!("commit: {}", $worker_version_hash);
			println!();
			println!("PVF worker that is called by pezkuwi.");
		}

		fn main() {
			#[cfg(target_os = "linux")]
			use $crate::worker::security;

			$crate::sp_tracing::try_init_simple();

			let args = std::env::args().collect::<Vec<_>>();
			if args.len() == 1 {
				print_help($expected_command);
				return;
			}

			// First positional arg selects either an informational flag, a security
			// capability probe (exits 0 if supported, -1 otherwise), or the worker
			// subcommand itself.
			match args[1].as_ref() {
				"--help" | "-h" => {
					print_help($expected_command);
					return;
				},
				"--version" | "-v" => {
					println!("{}", $worker_version);
					return;
				},
				// Useful for debugging. --version is used for version checks.
				"--full-version" => {
					println!("{}", get_full_version());
					return;
				},

				"--check-can-enable-landlock" => {
					#[cfg(target_os = "linux")]
					let status = if let Err(err) = security::landlock::check_can_fully_enable() {
						// Write the error to stderr, log it on the host-side.
						eprintln!("{}", err);
						-1
					} else {
						0
					};
					#[cfg(not(target_os = "linux"))]
					let status = -1;
					std::process::exit(status)
				},
				"--check-can-enable-seccomp" => {
					#[cfg(all(target_os = "linux", target_arch = "x86_64"))]
					let status = if let Err(err) = security::seccomp::check_can_fully_enable() {
						// Write the error to stderr, log it on the host-side.
						eprintln!("{}", err);
						-1
					} else {
						0
					};
					#[cfg(not(all(target_os = "linux", target_arch = "x86_64")))]
					let status = -1;
					std::process::exit(status)
				},
				"--check-can-unshare-user-namespace-and-change-root" => {
					// NOTE(review): `args[2]` (the cache path) is indexed without a length
					// check; the host presumably always passes it — confirm against callers.
					#[cfg(target_os = "linux")]
					let cache_path_tempdir = std::path::Path::new(&args[2]);
					#[cfg(target_os = "linux")]
					let status = if let Err(err) =
						security::change_root::check_can_fully_enable(&cache_path_tempdir)
					{
						// Write the error to stderr, log it on the host-side.
						eprintln!("{}", err);
						-1
					} else {
						0
					};
					#[cfg(not(target_os = "linux"))]
					let status = -1;
					std::process::exit(status)
				},
				"--check-can-do-secure-clone" => {
					#[cfg(target_os = "linux")]
					// SAFETY: new process is spawned within a single threaded process. This
					// invariant is enforced by tests.
					let status = if let Err(err) = unsafe { security::clone::check_can_fully_clone() } {
						// Write the error to stderr, log it on the host-side.
						eprintln!("{}", err);
						-1
					} else {
						0
					};
					#[cfg(not(target_os = "linux"))]
					let status = -1;
					std::process::exit(status)
				},

				"test-sleep" => {
					std::thread::sleep(std::time::Duration::from_secs(5));
					return;
				},

				subcommand => {
					// Must be passed for compatibility with the single-binary test workers.
					if subcommand != $expected_command {
						panic!(
							"trying to run {} binary with the {} subcommand",
							$expected_command, subcommand
						)
					}
				},
			}

			// Parse the remaining `--flag value` pairs; unknown flags are fatal.
			let mut socket_path = None;
			let mut worker_dir_path = None;
			let mut node_version = None;

			let mut i = 2;
			while i < args.len() {
				match args[i].as_ref() {
					"--socket-path" => {
						socket_path = Some(args[i + 1].as_str());
						i += 1
					},
					"--worker-dir-path" => {
						worker_dir_path = Some(args[i + 1].as_str());
						i += 1
					},
					"--node-impl-version" => {
						node_version = Some(args[i + 1].as_str());
						i += 1
					},
					arg => panic!("Unexpected argument found: {}", arg),
				}
				i += 1;
			}
			let socket_path = socket_path.expect("the --socket-path argument is required");
			let worker_dir_path =
				worker_dir_path.expect("the --worker-dir-path argument is required");

			let socket_path = std::path::Path::new(socket_path).to_owned();
			let worker_dir_path = std::path::Path::new(worker_dir_path).to_owned();

			$entrypoint(socket_path, worker_dir_path, node_version, Some($worker_version));
		}
	};
}
|
||||
|
||||
// Taken from the os_pipe crate. Copied here to reduce one dependency and
// because its type-safe abstractions do not play well with nix's clone.
#[cfg(not(target_os = "macos"))]
pub fn pipe2_cloexec() -> io::Result<(libc::c_int, libc::c_int)> {
	let mut fds: [libc::c_int; 2] = [0; 2];
	// SAFETY: `fds` is valid writable storage for the two descriptors `pipe2(2)` fills in.
	// O_CLOEXEC is set atomically at creation, avoiding a leak into concurrently exec'd
	// children.
	let res = unsafe { libc::pipe2(fds.as_mut_ptr(), libc::O_CLOEXEC) };
	if res != 0 {
		return Err(io::Error::last_os_error());
	}
	// Returns (read end, write end).
	Ok((fds[0], fds[1]))
}
|
||||
|
||||
/// macOS fallback: there is no `pipe2(2)`, so create the pipe first and then set
/// `FD_CLOEXEC` on each end separately. (Unlike the `pipe2` path, this is not atomic with
/// respect to a concurrent `fork`+`exec`.)
#[cfg(target_os = "macos")]
pub fn pipe2_cloexec() -> io::Result<(libc::c_int, libc::c_int)> {
	let mut fds: [libc::c_int; 2] = [0; 2];
	// SAFETY: `fds` is valid writable storage for the two descriptors `pipe(2)` fills in.
	if unsafe { libc::pipe(fds.as_mut_ptr()) } != 0 {
		return Err(io::Error::last_os_error());
	}
	for &fd in &fds {
		// SAFETY: `fd` is a valid, just-created pipe descriptor owned by this process.
		if unsafe { libc::fcntl(fd, libc::F_SETFD, libc::FD_CLOEXEC) } != 0 {
			return Err(io::Error::last_os_error());
		}
	}
	// Returns (read end, write end).
	Ok((fds[0], fds[1]))
}
|
||||
|
||||
/// A wrapper around a file descriptor used to encapsulate and restrict
/// functionality for pipe operations.
pub struct PipeFd {
	// Owns the descriptor; closing happens when the `File` is dropped.
	file: File,
}
|
||||
|
||||
impl AsRawFd for PipeFd {
	/// Returns the raw file descriptor associated with this `PipeFd`
	/// (ownership is retained; the fd is closed when the `PipeFd` is dropped).
	fn as_raw_fd(&self) -> RawFd {
		self.file.as_raw_fd()
	}
}
|
||||
|
||||
impl FromRawFd for PipeFd {
	/// Creates a new `PipeFd` instance from a raw file descriptor.
	///
	/// # Safety
	///
	/// The fd passed in must be an owned file descriptor; in particular, it must be open.
	unsafe fn from_raw_fd(fd: RawFd) -> Self {
		// Takes ownership: the fd will be closed when the wrapped `File` drops.
		PipeFd { file: File::from_raw_fd(fd) }
	}
}
|
||||
|
||||
impl Read for PipeFd {
	// Plain forwarding to the inner `File`; `read_to_end` is forwarded too so the
	// `File`-optimized implementation is used instead of the trait default.
	fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
		self.file.read(buf)
	}

	fn read_to_end(&mut self, buf: &mut Vec<u8>) -> io::Result<usize> {
		self.file.read_to_end(buf)
	}
}
|
||||
|
||||
impl Write for PipeFd {
	// Plain forwarding to the inner `File`; `write_all` is forwarded too so the
	// `File`-optimized implementation is used instead of the trait default.
	fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
		self.file.write(buf)
	}

	fn flush(&mut self) -> io::Result<()> {
		self.file.flush()
	}

	fn write_all(&mut self, buf: &[u8]) -> io::Result<()> {
		self.file.write_all(buf)
	}
}
|
||||
|
||||
/// Some allowed overhead that we account for in the "CPU time monitor" thread's sleeps, on the
/// child process.
pub const JOB_TIMEOUT_OVERHEAD: Duration = Duration::from_millis(50);
|
||||
|
||||
/// The role a spawned worker process plays.
#[derive(Debug, Clone, Copy)]
pub enum WorkerKind {
	/// Compiles a PVF into an artifact.
	Prepare,
	/// Executes a prepared artifact.
	Execute,
	/// One-off process used only to probe the pivot-root security capability.
	CheckPivotRoot,
}
|
||||
|
||||
impl fmt::Display for WorkerKind {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match self {
|
||||
Self::Prepare => write!(f, "prepare"),
|
||||
Self::Execute => write!(f, "execute"),
|
||||
Self::CheckPivotRoot => write!(f, "check pivot root"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Identifying information about a running worker, threaded through logging and the event loop.
#[derive(Debug)]
pub struct WorkerInfo {
	/// OS process id of this worker.
	pub pid: u32,
	/// Which role this worker plays.
	pub kind: WorkerKind,
	/// The worker binary's version string, if one was provided at spawn time.
	pub version: Option<String>,
	/// Path to this worker's dedicated directory. After a successful change-root this is
	/// rewritten to `/` (see `run_worker`).
	pub worker_dir_path: PathBuf,
}
|
||||
|
||||
// NOTE: The worker version must be passed in so that we accurately get the version of the worker,
|
||||
// and not the version that this crate was compiled with.
|
||||
//
|
||||
// NOTE: This must not spawn any threads due to safety requirements in `event_loop` and to avoid
|
||||
// errors in [`security::change_root::try_restrict`].
|
||||
//
|
||||
/// Initializes the worker process, then runs the given event loop, which spawns a new job process
|
||||
/// to securely handle each incoming request.
|
||||
pub fn run_worker<F>(
	worker_kind: WorkerKind,
	socket_path: PathBuf,
	worker_dir_path: PathBuf,
	node_version: Option<&str>,
	worker_version: Option<&str>,
	mut event_loop: F,
) where
	F: FnMut(UnixStream, &WorkerInfo, SecurityStatus) -> io::Result<Never>,
{
	// `mut` is only needed on Linux, where `worker_dir_path` is rewritten after change-root.
	#[cfg_attr(not(target_os = "linux"), allow(unused_mut))]
	let mut worker_info = WorkerInfo {
		pid: std::process::id(),
		kind: worker_kind,
		version: worker_version.map(|v| v.to_string()),
		worker_dir_path,
	};
	gum::debug!(
		target: LOG_TARGET,
		?worker_info,
		?socket_path,
		"starting pvf worker ({})",
		worker_info.kind
	);

	// Check for a mismatch between the node and worker versions.
	if let (Some(node_version), Some(worker_version)) = (node_version, &worker_info.version) {
		if node_version != worker_version {
			gum::error!(
				target: LOG_TARGET,
				?worker_info,
				%node_version,
				"Node and worker version mismatch, node needs restarting, forcing shutdown",
			);
			kill_parent_node_in_emergency();
			worker_shutdown(worker_info, "Version mismatch");
		}
	}

	// Make sure that we can read the worker dir path, and log its contents.
	let entries: io::Result<Vec<_>> = std::fs::read_dir(&worker_info.worker_dir_path)
		.and_then(|d| d.map(|res| res.map(|e| e.file_name())).collect());
	match entries {
		Ok(entries) => {
			gum::trace!(target: LOG_TARGET, ?worker_info, "content of worker dir: {:?}", entries)
		},
		Err(err) => {
			let err = format!("Could not read worker dir: {}", err.to_string());
			worker_shutdown_error(worker_info, &err);
		},
	}

	// Connect to the socket. The socket file is removed right after connecting: the
	// connection stays alive, and nothing else should be able to connect afterwards.
	let stream = || -> io::Result<UnixStream> {
		let stream = UnixStream::connect(&socket_path)?;
		let _ = std::fs::remove_file(&socket_path);
		Ok(stream)
	}();
	let mut stream = match stream {
		Ok(ok) => ok,
		Err(err) => worker_shutdown_error(worker_info, &err.to_string()),
	};

	// The host tells us which security features it determined are available.
	let WorkerHandshake { security_status } = match recv_worker_handshake(&mut stream) {
		Ok(ok) => ok,
		Err(err) => worker_shutdown_error(worker_info, &err.to_string()),
	};

	// Enable some security features.
	{
		gum::trace!(target: LOG_TARGET, ?security_status, "Enabling security features");

		// First, make sure env vars were cleared, to match the environment we perform the checks
		// within. (In theory, running checks with different env vars could result in different
		// outcomes of the checks.)
		if !security::check_env_vars_were_cleared(&worker_info) {
			let err = "not all env vars were cleared when spawning the process";
			gum::error!(
				target: LOG_TARGET,
				?worker_info,
				"{}",
				err
			);
			// Only fatal in secure validator mode; best-effort otherwise.
			if security_status.secure_validator_mode {
				worker_shutdown(worker_info, err);
			}
		}

		// Call based on whether we can change root. Error out if it should work but fails.
		//
		// NOTE: This should not be called in a multi-threaded context (i.e. inside the tokio
		// runtime). `unshare(2)`:
		//
		// > CLONE_NEWUSER requires that the calling process is not threaded.
		#[cfg(target_os = "linux")]
		if security_status.can_unshare_user_namespace_and_change_root {
			if let Err(err) = security::change_root::enable_for_worker(&worker_info) {
				// The filesystem may be in an inconsistent state, always bail out.
				let err = format!("Could not change root to be the worker cache path: {}", err);
				worker_shutdown_error(worker_info, &err);
			}
			// After pivoting, the worker dir IS the root of our filesystem view.
			worker_info.worker_dir_path = std::path::Path::new("/").to_owned();
		}

		#[cfg(target_os = "linux")]
		if security_status.can_enable_landlock {
			if let Err(err) = security::landlock::enable_for_worker(&worker_info) {
				// We previously were able to enable, so this should never happen. Shutdown if
				// running in secure mode.
				let err = format!("could not fully enable landlock: {:?}", err);
				gum::error!(
					target: LOG_TARGET,
					?worker_info,
					"{}. This should not happen, please report an issue",
					err
				);
				if security_status.secure_validator_mode {
					worker_shutdown(worker_info, &err);
				}
			}
		}

		// TODO: We can enable the seccomp networking blacklist on aarch64 as well, but we need a CI
		// job to catch regressions. See issue ci_cd/issues/609.
		#[cfg(all(target_os = "linux", target_arch = "x86_64"))]
		if security_status.can_enable_seccomp {
			if let Err(err) = security::seccomp::enable_for_worker(&worker_info) {
				// We previously were able to enable, so this should never happen. Shutdown if
				// running in secure mode.
				let err = format!("could not fully enable seccomp: {:?}", err);
				gum::error!(
					target: LOG_TARGET,
					?worker_info,
					"{}. This should not happen, please report an issue",
					err
				);
				if security_status.secure_validator_mode {
					worker_shutdown(worker_info, &err);
				}
			}
		}
	}

	// Run the main worker loop.
	let err = event_loop(stream, &worker_info, security_status)
		// It's never `Ok` because it's `Ok(Never)`.
		.unwrap_err();

	worker_shutdown(worker_info, &err.to_string());
}
|
||||
|
||||
/// Provide a consistent message on unexpected worker shutdown. Never returns; always exits
/// the process with status 1.
fn worker_shutdown(worker_info: WorkerInfo, err: &str) -> ! {
	gum::warn!(target: LOG_TARGET, ?worker_info, "quitting pvf worker ({}): {}", worker_info.kind, err);
	std::process::exit(1);
}
|
||||
|
||||
/// Provide a consistent error on unexpected worker shutdown.
|
||||
fn worker_shutdown_error(worker_info: WorkerInfo, err: &str) -> ! {
|
||||
gum::error!(target: LOG_TARGET, ?worker_info, "quitting pvf worker ({}): {}", worker_info.kind, err);
|
||||
std::process::exit(1);
|
||||
}
|
||||
|
||||
/// Loop that runs in the CPU time monitor thread on prepare and execute jobs. Continuously wakes up
|
||||
/// and then either blocks for the remaining CPU time, or returns if we exceed the CPU timeout.
|
||||
///
|
||||
/// Returning `Some` indicates that we should send a `TimedOut` error to the host. Will return
|
||||
/// `None` if the other thread finishes first, without us timing out.
|
||||
///
|
||||
/// NOTE: Sending a `TimedOut` error to the host will cause the worker, whether preparation or
|
||||
/// execution, to be killed by the host. We do not kill the process here because it would interfere
|
||||
/// with the proper handling of this error.
|
||||
pub fn cpu_time_monitor_loop(
|
||||
cpu_time_start: ProcessTime,
|
||||
timeout: Duration,
|
||||
finished_rx: Receiver<()>,
|
||||
) -> Option<Duration> {
|
||||
loop {
|
||||
let cpu_time_elapsed = cpu_time_start.elapsed();
|
||||
|
||||
// Treat the timeout as CPU time, which is less subject to variance due to load.
|
||||
if cpu_time_elapsed <= timeout {
|
||||
// Sleep for the remaining CPU time, plus a bit to account for overhead. (And we don't
|
||||
// want to wake up too often -- so, since we just want to halt the worker thread if it
|
||||
// stalled, we can sleep longer than necessary.) Note that the sleep is wall clock time.
|
||||
// The CPU clock may be slower than the wall clock.
|
||||
let sleep_interval = timeout.saturating_sub(cpu_time_elapsed) + JOB_TIMEOUT_OVERHEAD;
|
||||
match finished_rx.recv_timeout(sleep_interval) {
|
||||
// Received finish signal.
|
||||
Ok(()) => return None,
|
||||
// Timed out, restart loop.
|
||||
Err(RecvTimeoutError::Timeout) => continue,
|
||||
Err(RecvTimeoutError::Disconnected) => return None,
|
||||
}
|
||||
}
|
||||
|
||||
return Some(cpu_time_elapsed);
|
||||
}
|
||||
}
|
||||
|
||||
/// Attempt to convert an opaque panic payload to a string.
///
/// This is a best effort, and is not guaranteed to provide the most accurate value.
pub fn stringify_panic_payload(payload: Box<dyn Any + Send + 'static>) -> String {
	// Panic payloads are almost always either a `&'static str` (e.g. `panic!("literal")`) or a
	// `String` (e.g. `panic!("{}", x)`). Try both in turn before giving up.
	let payload = match payload.downcast::<&'static str>() {
		Ok(msg) => return msg.to_string(),
		Err(other) => other,
	};
	match payload.downcast::<String>() {
		Ok(msg) => *msg,
		// At least we tried...
		Err(_) => "unknown panic payload".to_string(),
	}
}
|
||||
|
||||
/// In case of node and worker version mismatch (as a result of in-place upgrade), send `SIGTERM`
/// to the node to tear it down and prevent it from raising disputes on valid candidates. Node
/// restart should be handled by the node owner. As node exits, Unix sockets opened to workers
/// get closed by the OS and other workers receive error on socket read and also exit. Preparation
/// jobs are written to the temporary files that are renamed to real artifacts on the node side, so
/// no leftover artifacts are possible.
fn kill_parent_node_in_emergency() {
	unsafe {
		// SAFETY: `getppid()` never fails but may return "no-parent" (0) or "parent-init" (1) in
		// some corner cases, which is why the result is checked against `> 1` below. `kill()`
		// cannot violate memory safety; its return value is deliberately ignored since this is a
		// best-effort teardown.
		let ppid = libc::getppid();
		if ppid > 1 {
			libc::kill(ppid, libc::SIGTERM);
		}
	}
}
|
||||
|
||||
/// Receives a handshake with information for the worker.
|
||||
fn recv_worker_handshake(stream: &mut UnixStream) -> io::Result<WorkerHandshake> {
|
||||
let worker_handshake = framed_recv_blocking(stream)?;
|
||||
let worker_handshake = WorkerHandshake::decode(&mut &worker_handshake[..]).map_err(|e| {
|
||||
io::Error::new(
|
||||
io::ErrorKind::Other,
|
||||
format!("recv_worker_handshake: failed to decode WorkerHandshake: {}", e),
|
||||
)
|
||||
})?;
|
||||
Ok(worker_handshake)
|
||||
}
|
||||
|
||||
/// Calculate the total CPU time from the given `usage` structure, returned from
|
||||
/// [`nix::sys::resource::getrusage`], and calculates the total CPU time spent, including both user
|
||||
/// and system time.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// - `rusage`: Contains resource usage information.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns a `Duration` representing the total CPU time.
|
||||
pub fn get_total_cpu_usage(rusage: Usage) -> Duration {
|
||||
let micros = (((rusage.user_time().tv_sec() + rusage.system_time().tv_sec()) * 1_000_000) +
|
||||
(rusage.system_time().tv_usec() + rusage.user_time().tv_usec()) as i64) as u64;
|
||||
|
||||
return Duration::from_micros(micros);
|
||||
}
|
||||
|
||||
/// Get a job response.
|
||||
pub fn recv_child_response<T>(
|
||||
received_data: &mut io::BufReader<&[u8]>,
|
||||
context: &'static str,
|
||||
) -> io::Result<T>
|
||||
where
|
||||
T: Decode,
|
||||
{
|
||||
let response_bytes = framed_recv_blocking(received_data)?;
|
||||
T::decode(&mut response_bytes.as_slice()).map_err(|e| {
|
||||
io::Error::new(
|
||||
io::ErrorKind::Other,
|
||||
format!("{} pvf recv_child_response: decode error: {}", context, e),
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
pub fn send_result<T, E>(
|
||||
stream: &mut UnixStream,
|
||||
result: Result<T, E>,
|
||||
worker_info: &WorkerInfo,
|
||||
) -> io::Result<()>
|
||||
where
|
||||
T: std::fmt::Debug,
|
||||
E: std::fmt::Debug + std::fmt::Display,
|
||||
Result<T, E>: Encode,
|
||||
{
|
||||
if let Err(ref err) = result {
|
||||
gum::warn!(
|
||||
target: LOG_TARGET,
|
||||
?worker_info,
|
||||
"worker: error occurred: {}",
|
||||
err
|
||||
);
|
||||
}
|
||||
gum::trace!(
|
||||
target: LOG_TARGET,
|
||||
?worker_info,
|
||||
"worker: sending result to host: {:?}",
|
||||
result
|
||||
);
|
||||
|
||||
framed_send_blocking(stream, &result.encode()).map_err(|err| {
|
||||
gum::warn!(
|
||||
target: LOG_TARGET,
|
||||
?worker_info,
|
||||
"worker: error occurred sending result to host: {}",
|
||||
err
|
||||
);
|
||||
err
|
||||
})
|
||||
}
|
||||
|
||||
pub fn stringify_errno(context: &'static str, errno: Errno) -> String {
|
||||
format!("{}: {}: {}", context, errno, io::Error::last_os_error())
|
||||
}
|
||||
|
||||
/// Functionality related to threads spawned by the workers.
///
/// The motivation for this module is to coordinate worker threads without using async Rust.
pub mod thread {
	use std::{
		io, panic,
		sync::{Arc, Condvar, Mutex},
		thread,
		time::Duration,
	};

	/// Contains the outcome of waiting on threads, or `Pending` if none are ready.
	#[derive(Debug, Clone, Copy)]
	pub enum WaitOutcome {
		/// A thread finished its work and triggered the condvar.
		Finished,
		/// A thread timed out and triggered the condvar.
		TimedOut,
		/// No thread has triggered the condvar yet.
		Pending,
	}

	impl WaitOutcome {
		/// Whether no outcome has been recorded yet.
		pub fn is_pending(&self) -> bool {
			matches!(self, Self::Pending)
		}
	}

	/// Helper type.
	pub type Cond = Arc<(Mutex<WaitOutcome>, Condvar)>;

	/// Gets a condvar initialized to `Pending`.
	pub fn get_condvar() -> Cond {
		Arc::new((Mutex::new(WaitOutcome::Pending), Condvar::new()))
	}

	/// Runs a worker thread. Will run the requested function, and afterwards notify the threads
	/// waiting on the condvar. Catches panics during execution and resumes the panics after
	/// triggering the condvar, so that the waiting thread is notified on panics.
	///
	/// # Returns
	///
	/// Returns the thread's join handle. Calling `.join()` on it returns the result of executing
	/// `f()`, as well as whether we were able to enable sandboxing.
	pub fn spawn_worker_thread<F, R>(
		name: &str,
		f: F,
		cond: Cond,
		outcome: WaitOutcome,
	) -> io::Result<thread::JoinHandle<R>>
	where
		F: FnOnce() -> R,
		F: Send + 'static + panic::UnwindSafe,
		R: Send + 'static,
	{
		thread::Builder::new()
			.name(name.into())
			.spawn(move || cond_notify_on_done(f, cond, outcome))
	}

	/// Runs a worker thread with the given stack size. See [`spawn_worker_thread`].
	pub fn spawn_worker_thread_with_stack_size<F, R>(
		name: &str,
		f: F,
		cond: Cond,
		outcome: WaitOutcome,
		stack_size: usize,
	) -> io::Result<thread::JoinHandle<R>>
	where
		F: FnOnce() -> R,
		F: Send + 'static + panic::UnwindSafe,
		R: Send + 'static,
	{
		thread::Builder::new()
			.name(name.into())
			.stack_size(stack_size)
			.spawn(move || cond_notify_on_done(f, cond, outcome))
	}

	/// Runs a function, afterwards notifying the threads waiting on the condvar. Catches panics and
	/// resumes them after triggering the condvar, so that the waiting thread is notified on panics.
	fn cond_notify_on_done<F, R>(f: F, cond: Cond, outcome: WaitOutcome) -> R
	where
		F: FnOnce() -> R,
		F: panic::UnwindSafe,
	{
		let result = panic::catch_unwind(|| f());
		// Notify BEFORE propagating a possible panic, so the waiting thread always wakes up.
		cond_notify_all(cond, outcome);
		match result {
			Ok(inner) => return inner,
			Err(err) => panic::resume_unwind(err),
		}
	}

	/// Helper function to notify all threads waiting on this condvar.
	fn cond_notify_all(cond: Cond, outcome: WaitOutcome) {
		let (lock, cvar) = &*cond;
		let mut flag = lock.lock().unwrap();
		// First writer wins: once an outcome is set it is never overwritten.
		if !flag.is_pending() {
			// Someone else already triggered the condvar.
			return;
		}
		*flag = outcome;
		cvar.notify_all();
	}

	/// Block the thread while it waits on the condvar.
	pub fn wait_for_threads(cond: Cond) -> WaitOutcome {
		let (lock, cvar) = &*cond;
		let guard = cvar.wait_while(lock.lock().unwrap(), |flag| flag.is_pending()).unwrap();
		*guard
	}

	/// Block the thread while it waits on the condvar or on a timeout. If the timeout is hit,
	/// returns `None`.
	#[cfg_attr(not(any(target_os = "linux", feature = "jemalloc-allocator")), allow(dead_code))]
	pub fn wait_for_threads_with_timeout(cond: &Cond, dur: Duration) -> Option<WaitOutcome> {
		let (lock, cvar) = &**cond;
		let result = cvar
			.wait_timeout_while(lock.lock().unwrap(), dur, |flag| flag.is_pending())
			.unwrap();
		if result.1.timed_out() {
			None
		} else {
			Some(*result.0)
		}
	}

	#[cfg(test)]
	mod tests {
		use super::*;
		use assert_matches::assert_matches;

		#[test]
		fn get_condvar_should_be_pending() {
			let condvar = get_condvar();
			let outcome = *condvar.0.lock().unwrap();
			assert!(outcome.is_pending());
		}

		#[test]
		fn wait_for_threads_with_timeout_return_none_on_time_out() {
			let condvar = Arc::new((Mutex::new(WaitOutcome::Pending), Condvar::new()));
			let outcome = wait_for_threads_with_timeout(&condvar, Duration::from_millis(100));
			assert!(outcome.is_none());
		}

		#[test]
		fn wait_for_threads_with_timeout_returns_outcome() {
			let condvar = Arc::new((Mutex::new(WaitOutcome::Pending), Condvar::new()));
			let condvar2 = condvar.clone();
			cond_notify_all(condvar2, WaitOutcome::Finished);
			let outcome = wait_for_threads_with_timeout(&condvar, Duration::from_secs(2));
			assert_matches!(outcome.unwrap(), WaitOutcome::Finished);
		}

		#[test]
		fn spawn_worker_thread_should_notify_on_done() {
			let condvar = Arc::new((Mutex::new(WaitOutcome::Pending), Condvar::new()));
			let response =
				spawn_worker_thread("thread", || 2, condvar.clone(), WaitOutcome::TimedOut);
			let (lock, _) = &*condvar;
			let r = response.unwrap().join().unwrap();
			assert_eq!(r, 2);
			assert_matches!(*lock.lock().unwrap(), WaitOutcome::TimedOut);
		}

		#[test]
		fn spawn_worker_should_not_change_finished_outcome() {
			let condvar = Arc::new((Mutex::new(WaitOutcome::Finished), Condvar::new()));
			let response =
				spawn_worker_thread("thread", move || 2, condvar.clone(), WaitOutcome::TimedOut);

			let r = response.unwrap().join().unwrap();
			assert_eq!(r, 2);
			assert_matches!(*condvar.0.lock().unwrap(), WaitOutcome::Finished);
		}

		#[test]
		fn cond_notify_on_done_should_update_wait_outcome_when_panic() {
			let condvar = Arc::new((Mutex::new(WaitOutcome::Pending), Condvar::new()));
			let err = panic::catch_unwind(panic::AssertUnwindSafe(|| {
				cond_notify_on_done(|| panic!("test"), condvar.clone(), WaitOutcome::Finished)
			}));

			assert_matches!(*condvar.0.lock().unwrap(), WaitOutcome::Finished);
			assert!(err.is_err());
		}
	}
}
|
||||
|
||||
#[cfg(test)]
mod tests {
	use super::*;
	use std::sync::mpsc::channel;

	#[test]
	fn cpu_time_monitor_loop_should_return_time_elapsed() {
		// A zero timeout is always exceeded, so the monitor must report the elapsed time.
		let (_tx, rx) = channel();
		let start = ProcessTime::now();
		let result = cpu_time_monitor_loop(start, Duration::from_secs(0), rx);
		assert_ne!(result, None);
	}

	#[test]
	fn cpu_time_monitor_loop_should_return_none() {
		// Signalling completion before the (generous) timeout makes the monitor return `None`.
		let (tx, rx) = channel();
		tx.send(()).unwrap();
		let start = ProcessTime::now();
		let result = cpu_time_monitor_loop(start, Duration::from_secs(10), rx);
		assert_eq!(result, None);
	}
}
|
||||
@@ -0,0 +1,168 @@
|
||||
// Copyright (C) Parity Technologies (UK) Ltd.
|
||||
// This file is part of Pezkuwi.
|
||||
|
||||
// Pezkuwi is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
|
||||
// Pezkuwi is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
//! Functionality for securing workers by unsharing some namespaces from other processes and
|
||||
//! changing the root.
|
||||
|
||||
use crate::{
|
||||
worker::{WorkerInfo, WorkerKind},
|
||||
LOG_TARGET,
|
||||
};
|
||||
use std::{env, ffi::CString, io, os::unix::ffi::OsStrExt, path::Path, ptr};
|
||||
|
||||
/// Errors that can occur while unsharing namespaces and changing the root.
#[derive(thiserror::Error, Debug)]
pub enum Error {
	/// A raw OS call failed; the message already carries the context of which call it was.
	#[error("{0}")]
	OsErrWithContext(String),
	/// A standard I/O error, e.g. from the post-pivot `env::current_dir` assertions.
	#[error(transparent)]
	Io(#[from] io::Error),
	/// A sanity check on the resulting environment failed.
	#[error("assertion failed: {0}")]
	AssertionFailed(String),
}

/// Convenience alias for results in this module.
pub type Result<T> = std::result::Result<T, Error>;
|
||||
|
||||
/// Try to enable for the given kind of worker.
|
||||
///
|
||||
/// NOTE: This should not be called in a multi-threaded context. `unshare(2)`:
|
||||
/// "CLONE_NEWUSER requires that the calling process is not threaded."
|
||||
pub fn enable_for_worker(worker_info: &WorkerInfo) -> Result<()> {
|
||||
gum::trace!(
|
||||
target: LOG_TARGET,
|
||||
?worker_info,
|
||||
"enabling change-root",
|
||||
);
|
||||
|
||||
try_restrict(worker_info)
|
||||
}
|
||||
|
||||
/// Runs a check for unshare-and-change-root and returns an error indicating whether it can be fully
|
||||
/// enabled on the current Linux environment.
|
||||
///
|
||||
/// NOTE: This should not be called in a multi-threaded context. `unshare(2)`:
|
||||
/// "CLONE_NEWUSER requires that the calling process is not threaded."
|
||||
pub fn check_can_fully_enable(tempdir: &Path) -> Result<()> {
|
||||
let worker_dir_path = tempdir.to_owned();
|
||||
try_restrict(&WorkerInfo {
|
||||
pid: std::process::id(),
|
||||
kind: WorkerKind::CheckPivotRoot,
|
||||
version: None,
|
||||
worker_dir_path,
|
||||
})
|
||||
}
|
||||
|
||||
/// Unshare the user namespace and change root to be the worker directory.
///
/// NOTE: This should not be called in a multi-threaded context. `unshare(2)`:
/// "CLONE_NEWUSER requires that the calling process is not threaded."
fn try_restrict(worker_info: &WorkerInfo) -> Result<()> {
	// TODO: Remove this once this is stable: https://github.com/rust-lang/rust/issues/105723
	macro_rules! cstr_ptr {
		($e:expr) => {
			concat!($e, "\0").as_ptr().cast::<core::ffi::c_char>()
		};
	}

	let worker_dir_path_c = CString::new(worker_info.worker_dir_path.as_os_str().as_bytes())
		.expect("on unix; the path will never contain 0 bytes; qed");

	// Wrapper around all the work to prevent repetitive error handling.
	//
	// # Errors
	//
	// It's the caller's responsibility to call `Error::last_os_error`. Note that that alone does
	// not give the context of which call failed, so we return a &str error.
	|| -> std::result::Result<(), &'static str> {
		// SAFETY: We pass null-terminated C strings and use the APIs as documented. In fact, steps
		// (2) and (3) are adapted from the example in pivot_root(2), with the additional
		// change described in the `pivot_root(".", ".")` section.
		unsafe {
			// 1. `unshare` the user and the mount namespaces.
			if libc::unshare(libc::CLONE_NEWUSER | libc::CLONE_NEWNS) < 0 {
				return Err("unshare user and mount namespaces");
			}

			// 2. Setup mounts.
			//
			// Ensure that new root and its parent mount don't have shared propagation (which would
			// cause pivot_root() to return an error), and prevent propagation of mount events to
			// the initial mount namespace.
			if libc::mount(
				ptr::null(),
				cstr_ptr!("/"),
				ptr::null(),
				libc::MS_REC | libc::MS_PRIVATE,
				ptr::null(),
			) < 0
			{
				return Err("mount MS_PRIVATE");
			}
			// Ensure that the new root is a mount point.
			// Execute and pivot-root-check workers get a read-only bind mount; other kinds (i.e.
			// prepare) keep it writable.
			let additional_flags =
				if let WorkerKind::Execute | WorkerKind::CheckPivotRoot = worker_info.kind {
					libc::MS_RDONLY
				} else {
					0
				};
			if libc::mount(
				worker_dir_path_c.as_ptr(),
				worker_dir_path_c.as_ptr(),
				ptr::null(), // ignored when MS_BIND is used
				libc::MS_BIND |
					libc::MS_REC | libc::MS_NOEXEC |
					libc::MS_NODEV | libc::MS_NOSUID |
					libc::MS_NOATIME |
					additional_flags,
				ptr::null(), // ignored when MS_BIND is used
			) < 0
			{
				return Err("mount MS_BIND");
			}

			// 3. `pivot_root` to the artifact directory.
			if libc::chdir(worker_dir_path_c.as_ptr()) < 0 {
				return Err("chdir to worker dir path");
			}
			// `pivot_root(".", ".")` stacks the old root on top of the new one; the detaching
			// umount below then removes the old root from the stack.
			if libc::syscall(libc::SYS_pivot_root, cstr_ptr!("."), cstr_ptr!(".")) < 0 {
				return Err("pivot_root");
			}
			if libc::umount2(cstr_ptr!("."), libc::MNT_DETACH) < 0 {
				return Err("umount the old root mount point");
			}
		}

		Ok(())
	}()
	.map_err(|err_ctx| {
		let err = io::Error::last_os_error();
		Error::OsErrWithContext(format!("{}: {}", err_ctx, err))
	})?;

	// Do some assertions.
	if env::current_dir()? != Path::new("/") {
		return Err(Error::AssertionFailed(
			"expected current dir after pivot_root to be `/`".into(),
		));
	}
	// After a successful pivot, `..` from the new root must resolve to the root itself.
	env::set_current_dir("..")?;
	if env::current_dir()? != Path::new("/") {
		return Err(Error::AssertionFailed(
			"expected not to be able to break out of new root by doing `..`".into(),
		));
	}

	Ok(())
}
|
||||
@@ -0,0 +1,93 @@
|
||||
// Copyright (C) Parity Technologies (UK) Ltd.
|
||||
// This file is part of Pezkuwi.
|
||||
|
||||
// Pezkuwi is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
|
||||
// Pezkuwi is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
//! Functionality for securing the job processes spawned by the workers using `clone`. If
|
||||
//! unsupported, falls back to `fork`.
|
||||
|
||||
use crate::{worker::WorkerInfo, LOG_TARGET};
|
||||
use nix::{
|
||||
errno::Errno,
|
||||
sched::{CloneCb, CloneFlags},
|
||||
unistd::Pid,
|
||||
};
|
||||
|
||||
/// Errors that can occur when spawning a job process via `clone(2)`.
#[derive(thiserror::Error, Debug)]
pub enum Error {
	/// The `clone(2)` call itself failed with the given errno.
	#[error("could not clone, errno: {0}")]
	Clone(Errno),
}

/// Convenience alias for results in this module.
pub type Result<T> = std::result::Result<T, Error>;
|
||||
|
||||
/// Try to run clone(2) on the current worker.
|
||||
///
|
||||
/// SAFETY: new process should be either spawned within a single threaded process, or use only
|
||||
/// async-signal-safe functions.
|
||||
pub unsafe fn clone_on_worker(
|
||||
worker_info: &WorkerInfo,
|
||||
have_unshare_newuser: bool,
|
||||
cb: CloneCb,
|
||||
) -> Result<Pid> {
|
||||
let flags = clone_flags(have_unshare_newuser);
|
||||
|
||||
gum::trace!(
|
||||
target: LOG_TARGET,
|
||||
?worker_info,
|
||||
"calling clone with flags: {:?}",
|
||||
flags
|
||||
);
|
||||
|
||||
try_clone(cb, flags)
|
||||
}
|
||||
|
||||
/// Runs a check for clone(2) with all sandboxing flags and returns an error indicating whether it
|
||||
/// can be fully enabled on the current Linux environment.
|
||||
///
|
||||
/// SAFETY: new process should be either spawned within a single threaded process, or use only
|
||||
/// async-signal-safe functions.
|
||||
pub unsafe fn check_can_fully_clone() -> Result<()> {
|
||||
try_clone(Box::new(|| 0), clone_flags(false)).map(|_pid| ())
|
||||
}
|
||||
|
||||
/// Runs clone(2) with all sandboxing flags.
|
||||
///
|
||||
/// SAFETY: new process should be either spawned within a single threaded process, or use only
|
||||
/// async-signal-safe functions.
|
||||
unsafe fn try_clone(cb: CloneCb, flags: CloneFlags) -> Result<Pid> {
|
||||
let mut stack = [0u8; 2 * 1024 * 1024];
|
||||
|
||||
nix::sched::clone(cb, stack.as_mut_slice(), flags, None).map_err(|errno| Error::Clone(errno))
|
||||
}
|
||||
|
||||
/// Returns flags for `clone(2)`, including all the sandbox-related ones.
|
||||
fn clone_flags(have_unshare_newuser: bool) -> CloneFlags {
|
||||
// NOTE: CLONE_NEWUSER does not work in `clone` if we previously called `unshare` with this
|
||||
// flag. On the other hand, if we did not call `unshare` we need this flag for the CAP_SYS_ADMIN
|
||||
// capability.
|
||||
let maybe_clone_newuser =
|
||||
if have_unshare_newuser { CloneFlags::empty() } else { CloneFlags::CLONE_NEWUSER };
|
||||
// SIGCHLD flag is used to inform clone that the parent process is
|
||||
// expecting a child termination signal, without this flag `waitpid` function
|
||||
// return `ECHILD` error.
|
||||
maybe_clone_newuser |
|
||||
CloneFlags::CLONE_NEWCGROUP |
|
||||
CloneFlags::CLONE_NEWIPC |
|
||||
CloneFlags::CLONE_NEWNET |
|
||||
CloneFlags::CLONE_NEWNS |
|
||||
CloneFlags::CLONE_NEWPID |
|
||||
CloneFlags::CLONE_NEWUTS |
|
||||
CloneFlags::from_bits_retain(libc::SIGCHLD)
|
||||
}
|
||||
@@ -0,0 +1,323 @@
|
||||
// Copyright (C) Parity Technologies (UK) Ltd.
|
||||
// This file is part of Pezkuwi.
|
||||
|
||||
// Pezkuwi is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
|
||||
// Pezkuwi is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
//! The [landlock] docs say it best:
|
||||
//!
|
||||
//! > "Landlock is a security feature available since Linux 5.13. The goal is to enable to restrict
|
||||
//! ambient rights (e.g., global filesystem access) for a set of processes by creating safe security
|
||||
//! sandboxes as new security layers in addition to the existing system-wide access-controls. This
|
||||
//! kind of sandbox is expected to help mitigate the security impact of bugs, unexpected or
|
||||
//! malicious behaviors in applications. Landlock empowers any process, including unprivileged ones,
|
||||
//! to securely restrict themselves."
|
||||
//!
|
||||
//! [landlock]: https://docs.rs/landlock/latest/landlock/index.html
|
||||
|
||||
pub use landlock::RulesetStatus;
|
||||
|
||||
use crate::{
|
||||
worker::{stringify_panic_payload, WorkerInfo, WorkerKind},
|
||||
LOG_TARGET,
|
||||
};
|
||||
use landlock::*;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
/// Landlock ABI version. We use ABI V1 because:
|
||||
///
|
||||
/// 1. It is supported by our reference kernel version.
|
||||
/// 2. Later versions do not (yet) provide additional security that would benefit us.
|
||||
///
|
||||
/// # Versions (as of October 2023)
|
||||
///
|
||||
/// - Pezkuwi reference kernel version: 5.16+
|
||||
///
|
||||
/// - ABI V1: kernel 5.13 - Introduces landlock, including full restrictions on file reads.
|
||||
///
|
||||
/// - ABI V2: kernel 5.19 - Adds ability to prevent file renaming. Does not help us. During
|
||||
/// execution an attacker can only affect the name of a symlinked artifact and not the original
|
||||
/// one.
|
||||
///
|
||||
/// - ABI V3: kernel 6.2 - Adds ability to prevent file truncation. During execution, can
|
||||
/// prevent attackers from affecting a symlinked artifact. We don't strictly need this as we
|
||||
/// plan to check for file integrity anyway; see
|
||||
/// <https://github.com/pezkuwichain/pezkuwi-sdk/issues/107>.
|
||||
///
|
||||
/// # Determinism
|
||||
///
|
||||
/// You may wonder whether we could always use the latest ABI instead of only the ABI supported
|
||||
/// by the reference kernel version. It seems plausible, since landlock provides a best-effort
|
||||
/// approach to enabling sandboxing. For example, if the reference version only supported V1 and
|
||||
/// we were on V2, then landlock would use V2 if it was supported on the current machine, and
|
||||
/// just fall back to V1 if not.
|
||||
///
|
||||
/// The issue with this is indeterminacy. If half of validators were on V2 and half were on V1,
|
||||
/// they may have different semantics on some PVFs. So a malicious PVF now has a new attack
|
||||
/// vector: they can exploit this indeterminism between landlock ABIs!
|
||||
///
|
||||
/// On the other hand we do want validators to be as secure as possible and protect their keys
|
||||
/// from attackers. And, the risk with indeterminacy is low and there are other indeterminacy
|
||||
/// vectors anyway. So we will only upgrade to a new ABI if either the reference kernel version
|
||||
/// supports it or if it introduces some new feature that is beneficial to security.
|
||||
pub const LANDLOCK_ABI: ABI = ABI::V1;
|
||||
|
||||
/// Errors that can occur while enabling the landlock sandbox.
#[derive(thiserror::Error, Debug)]
pub enum Error {
	/// The ruleset was only partially enforced (or not at all).
	#[error("Could not fully enable: {0:?}")]
	NotFullyEnabled(RulesetStatus),
	/// An exception path did not yield any rule (e.g. the path does not exist).
	#[error("Invalid exception path: {0:?}")]
	InvalidExceptionPath(PathBuf),
	/// An error from the `landlock` crate while building or applying the ruleset.
	#[error(transparent)]
	RulesetError(#[from] RulesetError),
	/// The thread running the landlock check panicked.
	#[error("A panic occurred in try_restrict: {0}")]
	Panic(String),
}

/// Convenience alias for results in this module.
pub type Result<T> = std::result::Result<T, Error>;
|
||||
|
||||
/// Try to enable landlock for the given kind of worker.
|
||||
pub fn enable_for_worker(worker_info: &WorkerInfo) -> Result<()> {
|
||||
let exceptions: Vec<(PathBuf, BitFlags<AccessFs>)> = match worker_info.kind {
|
||||
WorkerKind::Prepare => {
|
||||
vec![(worker_info.worker_dir_path.to_owned(), AccessFs::WriteFile.into())]
|
||||
},
|
||||
WorkerKind::Execute => {
|
||||
vec![(worker_info.worker_dir_path.to_owned(), AccessFs::ReadFile.into())]
|
||||
},
|
||||
WorkerKind::CheckPivotRoot => {
|
||||
panic!("this should only be passed for checking pivot_root; qed")
|
||||
},
|
||||
};
|
||||
|
||||
gum::trace!(
|
||||
target: LOG_TARGET,
|
||||
?worker_info,
|
||||
"enabling landlock with exceptions: {:?}",
|
||||
exceptions,
|
||||
);
|
||||
|
||||
try_restrict(exceptions)
|
||||
}
|
||||
|
||||
// TODO: <https://github.com/landlock-lsm/rust-landlock/issues/36>
|
||||
/// Runs a check for landlock in its own thread, and returns an error indicating whether the given
|
||||
/// landlock ABI is fully enabled on the current Linux environment.
|
||||
pub fn check_can_fully_enable() -> Result<()> {
|
||||
match std::thread::spawn(|| try_restrict(std::iter::empty::<(PathBuf, AccessFs)>())).join() {
|
||||
Ok(Ok(())) => Ok(()),
|
||||
Ok(Err(err)) => Err(err),
|
||||
Err(err) => Err(Error::Panic(stringify_panic_payload(err))),
|
||||
}
|
||||
}
|
||||
|
||||
/// Tries to restrict the current thread (should only be called in a process' main thread) with
|
||||
/// the following landlock access controls:
|
||||
///
|
||||
/// 1. all global filesystem access restricted, with optional exceptions
|
||||
/// 2. ... more sandbox types (e.g. networking) may be supported in the future.
|
||||
///
|
||||
/// If landlock is not supported in the current environment this is simply a noop.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// The status of the restriction (whether it was fully, partially, or not-at-all enforced).
|
||||
fn try_restrict<I, P, A>(fs_exceptions: I) -> Result<()>
|
||||
where
|
||||
I: IntoIterator<Item = (P, A)>,
|
||||
P: AsRef<Path>,
|
||||
A: Into<BitFlags<AccessFs>>,
|
||||
{
|
||||
let mut ruleset =
|
||||
Ruleset::default().handle_access(AccessFs::from_all(LANDLOCK_ABI))?.create()?;
|
||||
for (fs_path, access_bits) in fs_exceptions {
|
||||
let paths = &[fs_path.as_ref().to_owned()];
|
||||
let mut rules = path_beneath_rules(paths, access_bits).peekable();
|
||||
if rules.peek().is_none() {
|
||||
// `path_beneath_rules` silently ignores missing paths, so check for it manually.
|
||||
return Err(Error::InvalidExceptionPath(fs_path.as_ref().to_owned()));
|
||||
}
|
||||
ruleset = ruleset.add_rules(rules)?;
|
||||
}
|
||||
|
||||
let status = ruleset.restrict_self()?;
|
||||
if !matches!(status.ruleset, RulesetStatus::FullyEnforced) {
|
||||
return Err(Error::NotFullyEnabled(status.ruleset));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use std::{fs, io::ErrorKind, thread};
|
||||
|
||||
	#[test]
	fn restricted_thread_cannot_read_file() {
		// TODO: This would be nice: <https://github.com/rust-lang/rust/issues/68007>.
		if check_can_fully_enable().is_err() {
			return;
		}

		// Restricted thread cannot read from FS. Run the restricted part on its own thread so
		// the test harness thread itself is not sandboxed.
		let handle = thread::spawn(|| {
			// Create, write, and read two tmp files. This should succeed before any
			// landlock restrictions are applied.
			const TEXT: &str = "foo";
			let tmpfile1 = tempfile::NamedTempFile::new().unwrap();
			let path1 = tmpfile1.path();
			let tmpfile2 = tempfile::NamedTempFile::new().unwrap();
			let path2 = tmpfile2.path();

			fs::write(path1, TEXT).unwrap();
			let s = fs::read_to_string(path1).unwrap();
			assert_eq!(s, TEXT);
			fs::write(path2, TEXT).unwrap();
			let s = fs::read_to_string(path2).unwrap();
			assert_eq!(s, TEXT);

			// Apply Landlock with a read exception for only one of the files.
			let status = try_restrict(vec![(path1, AccessFs::ReadFile)]);
			if !matches!(status, Ok(())) {
				panic!(
					"Ruleset should be enforced since we checked if landlock is enabled: {:?}",
					status
				);
			}

			// Try to read from both files, only tmpfile1 should succeed.
			let result = fs::read_to_string(path1);
			assert!(matches!(
				result,
				Ok(s) if s == TEXT
			));
			let result = fs::read_to_string(path2);
			assert!(matches!(
				result,
				Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied)
			));

			// Apply Landlock for all files. Rulesets stack: this second, stricter ruleset is
			// layered on top of the first one, so even path1 loses its read exception below.
			let status = try_restrict(std::iter::empty::<(PathBuf, AccessFs)>());
			if !matches!(status, Ok(())) {
				panic!(
					"Ruleset should be enforced since we checked if landlock is enabled: {:?}",
					status
				);
			}

			// Try to read from tmpfile1 after landlock, it should fail.
			let result = fs::read_to_string(path1);
			assert!(matches!(
				result,
				Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied)
			));
		});

		assert!(handle.join().is_ok());
	}
|
||||
|
||||
#[test]
|
||||
fn restricted_thread_cannot_write_file() {
|
||||
// TODO: This would be nice: <https://github.com/rust-lang/rust/issues/68007>.
|
||||
if check_can_fully_enable().is_err() {
|
||||
return;
|
||||
}
|
||||
|
||||
// Restricted thread cannot write to FS.
|
||||
let handle = thread::spawn(|| {
|
||||
// Create and write two tmp files. This should succeed before any landlock
|
||||
// restrictions are applied.
|
||||
const TEXT: &str = "foo";
|
||||
let tmpfile1 = tempfile::NamedTempFile::new().unwrap();
|
||||
let path1 = tmpfile1.path();
|
||||
let tmpfile2 = tempfile::NamedTempFile::new().unwrap();
|
||||
let path2 = tmpfile2.path();
|
||||
|
||||
fs::write(path1, TEXT).unwrap();
|
||||
fs::write(path2, TEXT).unwrap();
|
||||
|
||||
// Apply Landlock with a write exception for only one of the files.
|
||||
let status = try_restrict(vec![(path1, AccessFs::WriteFile)]);
|
||||
if !matches!(status, Ok(())) {
|
||||
panic!(
|
||||
"Ruleset should be enforced since we checked if landlock is enabled: {:?}",
|
||||
status
|
||||
);
|
||||
}
|
||||
|
||||
// Try to write to both files, only tmpfile1 should succeed.
|
||||
let result = fs::write(path1, TEXT);
|
||||
assert!(matches!(result, Ok(_)));
|
||||
let result = fs::write(path2, TEXT);
|
||||
assert!(matches!(
|
||||
result,
|
||||
Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied)
|
||||
));
|
||||
|
||||
// Apply Landlock for all files.
|
||||
let status = try_restrict(std::iter::empty::<(PathBuf, AccessFs)>());
|
||||
if !matches!(status, Ok(())) {
|
||||
panic!(
|
||||
"Ruleset should be enforced since we checked if landlock is enabled: {:?}",
|
||||
status
|
||||
);
|
||||
}
|
||||
|
||||
// Try to write to tmpfile1 after landlock, it should fail.
|
||||
let result = fs::write(path1, TEXT);
|
||||
assert!(matches!(
|
||||
result,
|
||||
Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied)
|
||||
));
|
||||
});
|
||||
|
||||
assert!(handle.join().is_ok());
|
||||
}
|
||||
|
||||
// Test that checks whether landlock under our ABI version is able to truncate files.
|
||||
#[test]
|
||||
fn restricted_thread_can_truncate_file() {
|
||||
// TODO: This would be nice: <https://github.com/rust-lang/rust/issues/68007>.
|
||||
if check_can_fully_enable().is_err() {
|
||||
return;
|
||||
}
|
||||
|
||||
// Restricted thread can truncate file.
|
||||
let handle = thread::spawn(|| {
|
||||
// Create and write a file. This should succeed before any landlock
|
||||
// restrictions are applied.
|
||||
const TEXT: &str = "foo";
|
||||
let tmpfile = tempfile::NamedTempFile::new().unwrap();
|
||||
let path = tmpfile.path();
|
||||
|
||||
fs::write(path, TEXT).unwrap();
|
||||
|
||||
// Apply Landlock with all exceptions under the current ABI.
|
||||
let status = try_restrict(vec![(path, AccessFs::from_all(LANDLOCK_ABI))]);
|
||||
if !matches!(status, Ok(())) {
|
||||
panic!(
|
||||
"Ruleset should be enforced since we checked if landlock is enabled: {:?}",
|
||||
status
|
||||
);
|
||||
}
|
||||
|
||||
// Try to truncate the file.
|
||||
let result = tmpfile.as_file().set_len(0);
|
||||
assert!(result.is_ok());
|
||||
});
|
||||
|
||||
assert!(handle.join().is_ok());
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,77 @@
|
||||
// Copyright (C) Parity Technologies (UK) Ltd.
|
||||
// This file is part of Pezkuwi.
|
||||
|
||||
// Pezkuwi is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
|
||||
// Pezkuwi is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
//! Functionality for securing workers.
|
||||
//!
|
||||
//! This is needed because workers are used to compile and execute untrusted code (PVFs).
|
||||
//!
|
||||
//! We currently employ the following security measures:
|
||||
//!
|
||||
//! - Restrict filesystem
|
||||
//! - Use Landlock to remove all unnecessary FS access rights.
|
||||
//! - Unshare the user and mount namespaces.
|
||||
//! - Change the root directory to a worker-specific temporary directory.
|
||||
//! - Restrict networking by blocking socket creation and io_uring.
|
||||
//! - Remove env vars
|
||||
|
||||
#[cfg(target_os = "linux")]
|
||||
pub mod change_root;
|
||||
#[cfg(target_os = "linux")]
|
||||
pub mod clone;
|
||||
#[cfg(target_os = "linux")]
|
||||
pub mod landlock;
|
||||
#[cfg(all(target_os = "linux", target_arch = "x86_64"))]
|
||||
pub mod seccomp;
|
||||
|
||||
use crate::{worker::WorkerInfo, LOG_TARGET};
|
||||
|
||||
/// Require env vars to have been removed when spawning the process, to prevent malicious code from
|
||||
/// accessing them.
|
||||
pub fn check_env_vars_were_cleared(worker_info: &WorkerInfo) -> bool {
|
||||
gum::trace!(
|
||||
target: LOG_TARGET,
|
||||
?worker_info,
|
||||
"clearing env vars in worker",
|
||||
);
|
||||
|
||||
let mut ok = true;
|
||||
|
||||
for (key, value) in std::env::vars_os() {
|
||||
// TODO: *theoretically* the value (or mere presence) of `RUST_LOG` can be a source of
|
||||
// randomness for malicious code. It should be removed in the job process, which does no
|
||||
// logging.
|
||||
if key == "RUST_LOG" {
|
||||
continue;
|
||||
}
|
||||
// An exception for MacOS. This is not a secure platform anyway, so we let it slide.
|
||||
#[cfg(target_os = "macos")]
|
||||
if key == "__CF_USER_TEXT_ENCODING" {
|
||||
continue;
|
||||
}
|
||||
|
||||
gum::error!(
|
||||
target: LOG_TARGET,
|
||||
?worker_info,
|
||||
?key,
|
||||
?value,
|
||||
"env var was present that should have been removed",
|
||||
);
|
||||
|
||||
ok = false;
|
||||
}
|
||||
|
||||
ok
|
||||
}
|
||||
@@ -0,0 +1,191 @@
|
||||
// Copyright (C) Parity Technologies (UK) Ltd.
|
||||
// This file is part of Pezkuwi.
|
||||
|
||||
// Pezkuwi is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
|
||||
// Pezkuwi is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
//! Functionality for sandboxing workers by restricting their capabilities by blocking certain
|
||||
//! syscalls with seccomp.
|
||||
//!
|
||||
//! For security we block the following:
|
||||
//!
|
||||
//! - creation of new sockets - these are unneeded in PVF jobs, and we can safely block them without
|
||||
//! affecting consensus.
|
||||
//!
|
||||
//! - `io_uring` - allows for networking and needs to be blocked. See below for a discussion on the
|
||||
//! safety of doing this.
|
||||
//!
|
||||
//! # Safety of blocking io_uring
|
||||
//!
|
||||
//! `io_uring` is just a way of issuing system calls in an async manner, and there is nothing
|
||||
//! stopping wasmtime from legitimately using it. Fortunately, at the moment it does not. Generally,
|
||||
//! not many applications use `io_uring` in production yet, because of the numerous kernel CVEs
|
||||
//! discovered. It's still under a lot of development. Android outright banned `io_uring` for these
|
||||
//! reasons.
|
||||
//!
|
||||
//! Considering `io_uring`'s status discussed above, and that it very likely would get detected
|
||||
//! either by our [static analysis](https://github.com/paritytech/polkadot-sdk/pull/1663) or by
|
||||
//! testing, we think it is safe to block it.
|
||||
//!
|
||||
//! ## Consensus analysis
|
||||
//!
|
||||
//! If execution hits an edge case code path unique to a given machine, it's already taken a
|
||||
//! non-deterministic branch anyway. After all, we just care that the majority of validators reach
|
||||
//! the same result and preserve consensus. So worst-case scenario, there's a dispute, and we can
|
||||
//! always admit fault and refund the wrong validator. On the other hand, if all validators take the
|
||||
//! code path that results in a seccomp violation, then they would all vote against the current
|
||||
//! candidate, which is also fine. The violation would get logged (in big scary letters) and
|
||||
//! hopefully some validator reports it to us.
|
||||
//!
|
||||
//! Actually, a worst-worse-case scenario is that 50% of validators vote against, so that there is
|
||||
//! no consensus. But so many things would have to go wrong for that to happen:
|
||||
//!
|
||||
//! 1. An update to `wasmtime` is introduced that uses io_uring (unlikely as io_uring is mainly for
|
||||
//! IO-heavy applications)
|
||||
//!
|
||||
//! 2. The new syscall is not detected by our static analysis
|
||||
//!
|
||||
//! 3. It is never triggered in any of our tests
|
||||
//!
|
||||
//! 4. It then gets triggered on some super edge case in production on 50% of validators causing a
|
||||
//! stall (bad but very unlikely)
|
||||
//!
|
||||
//! 5. Or, it triggers on only a few validators causing a dispute (more likely but not as bad)
|
||||
//!
|
||||
//! Considering how many things would have to go wrong here, we believe it's safe to block
|
||||
//! `io_uring`.
|
||||
//!
|
||||
//! # Action on syscall violations
|
||||
//!
|
||||
//! When a forbidden syscall is attempted we immediately kill the process in order to prevent the
|
||||
//! attacker from doing anything else. In execution, this will result in voting against the
|
||||
//! candidate.
|
||||
|
||||
use crate::{
|
||||
worker::{stringify_panic_payload, WorkerInfo},
|
||||
LOG_TARGET,
|
||||
};
|
||||
use seccompiler::*;
|
||||
use std::collections::BTreeMap;
|
||||
|
||||
/// The action to take on caught syscalls.
//
// In production we kill the whole process (not just the offending thread) to prevent the
// attacker from doing anything else, per this module's "Action on syscall violations" docs.
#[cfg(not(test))]
const CAUGHT_ACTION: SeccompAction = SeccompAction::KillProcess;
/// Don't kill the process when testing.
//
// Instead the blocked syscall fails with `EACCES`, which tests observe as
// `ErrorKind::PermissionDenied`.
#[cfg(test)]
const CAUGHT_ACTION: SeccompAction = SeccompAction::Errno(libc::EACCES as u32);
|
||||
|
||||
/// Errors that can occur while building or applying the seccomp filter.
#[derive(thiserror::Error, Debug)]
pub enum Error {
	/// Error bubbled up from `seccompiler` (e.g. while applying the BPF program).
	#[error(transparent)]
	Seccomp(#[from] seccompiler::Error),
	/// Backend error from `seccompiler` (e.g. while constructing the filter).
	#[error(transparent)]
	Backend(#[from] seccompiler::BackendError),
	/// The thread running `try_restrict` panicked; holds the stringified panic payload.
	#[error("A panic occurred in try_restrict: {0}")]
	Panic(String),
}

/// Convenience alias for results in this module.
pub type Result<T> = std::result::Result<T, Error>;
|
||||
|
||||
/// Try to enable seccomp for the given kind of worker.
///
/// The filter is applied to the calling thread (see `try_restrict`), so this should be called
/// from the worker's own thread. Returns an error if the filter could not be built or applied.
pub fn enable_for_worker(worker_info: &WorkerInfo) -> Result<()> {
	gum::trace!(
		target: LOG_TARGET,
		?worker_info,
		"enabling seccomp",
	);

	// All the actual work happens in `try_restrict`; this wrapper only adds tracing context.
	try_restrict()
}
|
||||
|
||||
/// Runs a check for seccomp in its own thread, and returns an error indicating whether seccomp with
|
||||
/// our rules is fully enabled on the current Linux environment.
|
||||
pub fn check_can_fully_enable() -> Result<()> {
|
||||
match std::thread::spawn(|| try_restrict()).join() {
|
||||
Ok(Ok(())) => Ok(()),
|
||||
Ok(Err(err)) => Err(err),
|
||||
Err(err) => Err(Error::Panic(stringify_panic_payload(err))),
|
||||
}
|
||||
}
|
||||
|
||||
/// Applies a `seccomp` filter to disable networking for the PVF threads.
|
||||
fn try_restrict() -> Result<()> {
|
||||
// Build a `seccomp` filter which by default allows all syscalls except those blocked in the
|
||||
// blacklist.
|
||||
let mut blacklisted_rules = BTreeMap::default();
|
||||
|
||||
// Restrict the creation of sockets.
|
||||
blacklisted_rules.insert(libc::SYS_socketpair, vec![]);
|
||||
blacklisted_rules.insert(libc::SYS_socket, vec![]);
|
||||
|
||||
// Prevent connecting to sockets for extra safety.
|
||||
blacklisted_rules.insert(libc::SYS_connect, vec![]);
|
||||
|
||||
// Restrict io_uring.
|
||||
blacklisted_rules.insert(libc::SYS_io_uring_setup, vec![]);
|
||||
blacklisted_rules.insert(libc::SYS_io_uring_enter, vec![]);
|
||||
blacklisted_rules.insert(libc::SYS_io_uring_register, vec![]);
|
||||
|
||||
let filter = SeccompFilter::new(
|
||||
blacklisted_rules,
|
||||
// Mismatch action: what to do if not in rule list.
|
||||
SeccompAction::Allow,
|
||||
// Match action: what to do if in rule list.
|
||||
CAUGHT_ACTION,
|
||||
TargetArch::x86_64,
|
||||
)?;
|
||||
|
||||
let bpf_prog: BpfProgram = filter.try_into()?;
|
||||
|
||||
// Applies filter (runs seccomp) to the calling thread.
|
||||
seccompiler::apply_filter(&bpf_prog)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
	use super::*;
	use std::{io::ErrorKind, net::TcpListener, thread};

	#[test]
	fn sandboxed_thread_cannot_use_sockets() {
		// TODO: This would be nice: <https://github.com/rust-lang/rust/issues/68007>.
		if check_can_fully_enable().is_err() {
			return;
		}

		// Run on a dedicated thread, since the filter applies to the calling thread.
		let sandboxed = thread::spawn(|| {
			// Opening a listener must work while the thread is still unrestricted.
			TcpListener::bind("127.0.0.1:0").unwrap();

			if try_restrict().is_err() {
				panic!("Ruleset should be enforced since we checked if seccomp is enabled");
			}

			// With the filter installed, socket creation must be rejected...
			let bind_attempt = TcpListener::bind("127.0.0.1:0");
			assert!(matches!(
				bind_attempt,
				Err(err) if err.kind() == ErrorKind::PermissionDenied
			));

			// ...while unrelated syscalls keep working.
			// SAFETY: `getppid` takes no arguments and cannot fail.
			unsafe {
				assert!(libc::getppid() > 0);
			}
		});

		assert!(sandboxed.join().is_ok());
	}
}
|
||||
@@ -0,0 +1,30 @@
|
||||
// Copyright (C) Parity Technologies (UK) Ltd.
|
||||
// This file is part of Pezkuwi.
|
||||
|
||||
// Pezkuwi is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
|
||||
// Pezkuwi is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
//! Shared functions for getting the known worker files.
|
||||
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
/// File name of the compiled artifact inside an execute worker's directory.
const EXECUTE_ARTIFACT_FILE_NAME: &str = "artifact";
/// File name of the in-progress artifact inside a prepare worker's directory.
const PREPARE_TMP_ARTIFACT_FILE_NAME: &str = "tmp-artifact";

/// Returns the well-known path of the compiled artifact within the given worker directory.
pub fn execute_artifact(worker_dir_path: &Path) -> PathBuf {
	worker_dir_path.join(EXECUTE_ARTIFACT_FILE_NAME)
}

/// Returns the well-known path of the temporary artifact within the given worker directory.
pub fn prepare_tmp_artifact(worker_dir_path: &Path) -> PathBuf {
	worker_dir_path.join(PREPARE_TMP_ARTIFACT_FILE_NAME)
}
|
||||
Reference in New Issue
Block a user