feat: initialize Kurdistan SDK - independent fork of Polkadot SDK
This commit is contained in:
@@ -0,0 +1,164 @@
|
||||
// Copyright (C) Parity Technologies (UK) Ltd.
|
||||
// This file is part of Pezkuwi.
|
||||
|
||||
// Pezkuwi is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
|
||||
// Pezkuwi is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
use crate::prepare::{PrepareSuccess, PrepareWorkerSuccess};
|
||||
use codec::{Decode, Encode};
|
||||
pub use sc_executor_common::error::Error as ExecuteError;
|
||||
|
||||
/// Result of PVF preparation from a worker, with checksum of the compiled PVF and stats of the
/// preparation if successful.
pub type PrepareWorkerResult = Result<PrepareWorkerSuccess, PrepareError>;

/// Result of PVF preparation propagated all the way back to the host, with path to the concluded
/// artifact and stats of the preparation if successful.
pub type PrepareResult = Result<PrepareSuccess, PrepareError>;

/// Result of prechecking PVF performed by the validation host. Contains stats about the preparation
/// if successful. On success carries no payload (`()`), only the absence of a [`PrepareError`].
pub type PrecheckResult = Result<(), PrepareError>;
|
||||
|
||||
/// An error that occurred during the prepare part of the PVF pipeline.
// Codec indexes are intended to stabilize pre-encoded payloads (see `OOM_PAYLOAD`), so never
// reorder, remove, or reuse an index when modifying this enum.
#[derive(thiserror::Error, Debug, Clone, Encode, Decode)]
pub enum PrepareError {
	/// During the prevalidation stage of preparation an issue was found with the PVF.
	#[codec(index = 0)]
	#[error("prepare: prevalidation error: {0}")]
	Prevalidation(String),
	/// Compilation failed for the given PVF.
	#[codec(index = 1)]
	#[error("prepare: preparation error: {0}")]
	Preparation(String),
	/// Instantiation of the WASM module instance failed.
	#[codec(index = 2)]
	#[error("prepare: runtime construction: {0}")]
	RuntimeConstruction(String),
	/// An unexpected error has occurred in the preparation job.
	#[codec(index = 3)]
	#[error("prepare: job error: {0}")]
	JobError(String),
	/// Failed to prepare the PVF due to the time limit.
	#[codec(index = 4)]
	#[error("prepare: timeout")]
	TimedOut,
	/// An IO error occurred. This state is reported by either the validation host or by the
	/// worker.
	#[codec(index = 5)]
	#[error("prepare: io error while receiving response: {0}")]
	IoErr(String),
	/// The temporary file for the artifact could not be created at the given cache path. This
	/// state is reported by the validation host (not by the worker).
	#[codec(index = 6)]
	#[error("prepare: error creating tmp file: {0}")]
	CreateTmpFile(String),
	/// The response from the worker is received, but the file cannot be renamed (moved) to the
	/// final destination location. This state is reported by the validation host (not by the
	/// worker).
	#[codec(index = 7)]
	#[error("prepare: error renaming tmp file ({src:?} -> {dest:?}): {err}")]
	RenameTmpFile {
		err: String,
		// Unfortunately `PathBuf` doesn't implement `Encode`/`Decode`, so we do a fallible
		// conversion to `Option<String>`.
		src: Option<String>,
		dest: Option<String>,
	},
	/// Memory limit reached
	#[codec(index = 8)]
	#[error("prepare: out of memory")]
	OutOfMemory,
	/// The response from the worker is received, but the worker cache could not be cleared. The
	/// worker has to be killed to avoid jobs having access to data from other jobs. This state is
	/// reported by the validation host (not by the worker).
	#[codec(index = 9)]
	#[error("prepare: error clearing worker cache: {0}")]
	ClearWorkerDir(String),
	/// The preparation job process died, due to OOM, a seccomp violation, or some other factor.
	#[codec(index = 10)]
	#[error("prepare: prepare job with pid {job_pid} died: {err}")]
	JobDied { err: String, job_pid: i32 },
	/// Some error occurred when interfacing with the kernel.
	#[codec(index = 11)]
	#[error("prepare: error interfacing with the kernel: {0}")]
	Kernel(String),
	/// Code blob failed to decompress
	#[codec(index = 12)]
	#[error("prepare: could not decompress code blob: {0}")]
	CouldNotDecompressCodeBlob(String),
}
|
||||
|
||||
impl PrepareError {
|
||||
/// Returns whether this is a deterministic error, i.e. one that should trigger reliably. Those
|
||||
/// errors depend on the PVF itself and the sc-executor/wasmtime logic.
|
||||
///
|
||||
/// Non-deterministic errors can happen spuriously. Typically, they occur due to resource
|
||||
/// starvation, e.g. under heavy load or memory pressure. Those errors are typically transient
|
||||
/// but may persist e.g. if the node is run by overwhelmingly underpowered machine.
|
||||
pub fn is_deterministic(&self) -> bool {
|
||||
use PrepareError::*;
|
||||
match self {
|
||||
Prevalidation(_) |
|
||||
Preparation(_) |
|
||||
JobError(_) |
|
||||
OutOfMemory |
|
||||
CouldNotDecompressCodeBlob(_) => true,
|
||||
IoErr(_) |
|
||||
JobDied { .. } |
|
||||
CreateTmpFile(_) |
|
||||
RenameTmpFile { .. } |
|
||||
ClearWorkerDir(_) |
|
||||
Kernel(_) => false,
|
||||
// Can occur due to issues with the PVF, but also due to factors like local load.
|
||||
TimedOut => false,
|
||||
// Can occur due to issues with the PVF, but also due to local errors.
|
||||
RuntimeConstruction(_) => false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Some internal error occurred.
///
/// Should only ever be used for validation errors independent of the candidate and PVF, or for
/// errors we ruled out during pre-checking (so preparation errors are fine).
#[derive(thiserror::Error, Debug, Clone, Encode, Decode)]
pub enum InternalValidationError {
	/// Some communication error occurred with the host.
	#[error("validation: some communication error occurred with the host: {0}")]
	HostCommunication(String),
	/// Host could not create a hard link to the artifact path.
	#[error("validation: host could not create a hard link to the artifact path: {0}")]
	CouldNotCreateLink(String),
	/// Could not find or open compiled artifact file.
	#[error("validation: could not find or open compiled artifact file: {0}")]
	CouldNotOpenFile(String),
	/// Could not create a pipe between the worker and a child process.
	#[error("validation: could not create pipe: {0}")]
	CouldNotCreatePipe(String),
	/// Host could not clear the worker cache after a job.
	#[error("validation: host could not clear the worker cache ({path:?}) after a job: {err}")]
	CouldNotClearWorkerDir {
		err: String,
		// Unfortunately `PathBuf` doesn't implement `Encode`/`Decode`, so we do a fallible
		// conversion to `Option<String>`.
		path: Option<String>,
	},
	/// Some error occurred when interfacing with the kernel.
	#[error("validation: error interfacing with the kernel: {0}")]
	Kernel(String),
	/// Some non-deterministic preparation error occurred.
	#[error("validation: prepare: {0}")]
	NonDeterministicPrepareError(PrepareError),
}
|
||||
@@ -0,0 +1,141 @@
|
||||
// Copyright (C) Parity Technologies (UK) Ltd.
|
||||
// This file is part of Pezkuwi.
|
||||
|
||||
// Pezkuwi is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
|
||||
// Pezkuwi is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
use crate::{error::InternalValidationError, ArtifactChecksum};
|
||||
use codec::{Decode, Encode};
|
||||
use pezkuwi_node_primitives::PoV;
|
||||
use pezkuwi_primitives::{ExecutorParams, PersistedValidationData};
|
||||
use pezkuwi_teyrchain_primitives::primitives::ValidationResult;
|
||||
use std::time::Duration;
|
||||
|
||||
/// The payload of the one-time handshake that is done when a worker process is created. Carries
/// data from the host to the worker.
#[derive(Encode, Decode)]
pub struct Handshake {
	/// The executor parameters.
	pub executor_params: ExecutorParams,
}
|
||||
|
||||
/// A request to execute a PVF
#[derive(Encode, Decode)]
pub struct ExecuteRequest {
	/// Persisted validation data.
	pub pvd: PersistedValidationData,
	/// Proof-of-validity.
	pub pov: PoV,
	/// Execution timeout.
	pub execution_timeout: Duration,
	/// Checksum of the artifact to execute.
	pub artifact_checksum: ArtifactChecksum,
}
|
||||
|
||||
/// The response from the execution worker.
#[derive(Debug, Encode, Decode)]
pub struct WorkerResponse {
	/// The response from the execute job process.
	pub job_response: JobResponse,
	/// The amount of CPU time taken by the job.
	pub duration: Duration,
	/// The uncompressed PoV size.
	pub pov_size: u32,
}
|
||||
|
||||
/// An error occurred in the worker process.
#[derive(thiserror::Error, Debug, Clone, Encode, Decode)]
pub enum WorkerError {
	/// The job timed out.
	#[error("The job timed out")]
	JobTimedOut,
	/// The job process has died. We must kill the worker just in case.
	///
	/// We cannot treat this as an internal error because malicious code may have killed the job.
	/// We still retry it, because in the non-malicious case it is likely spurious.
	#[error("The job process (pid {job_pid}) has died: {err}")]
	JobDied { err: String, job_pid: i32 },
	/// An unexpected error occurred in the job process, e.g. failing to spawn a thread, panic,
	/// etc.
	///
	/// Because malicious code can cause a job error, we must not treat it as an internal error. We
	/// still retry it, because in the non-malicious case it is likely spurious.
	#[error("An unexpected error occurred in the job process: {0}")]
	JobError(#[from] JobError),

	/// Some internal error occurred.
	#[error("An internal error occurred: {0}")]
	InternalError(#[from] InternalValidationError),
}
|
||||
|
||||
/// The result of a job on the execution worker: either a [`JobResponse`] or a [`JobError`].
pub type JobResult = Result<JobResponse, JobError>;
|
||||
|
||||
/// The successful response from a job on the execution worker.
#[derive(Debug, Encode, Decode)]
pub enum JobResponse {
	/// Execution completed and produced a validation result.
	Ok {
		/// The result of teyrchain validation.
		result_descriptor: ValidationResult,
	},
	/// A possibly transient runtime instantiation error happened during the execution; may be
	/// retried with re-preparation
	RuntimeConstruction(String),
	/// The candidate is invalid.
	InvalidCandidate(String),
	/// PoV decompression failed
	PoVDecompressionFailure,
	/// The artifact is corrupted, re-prepare the artifact and try again.
	CorruptedArtifact,
}
|
||||
|
||||
impl JobResponse {
|
||||
/// Creates an invalid response from a context `ctx` and a message `msg` (which can be empty).
|
||||
pub fn format_invalid(ctx: &'static str, msg: &str) -> Self {
|
||||
if msg.is_empty() {
|
||||
Self::InvalidCandidate(ctx.to_string())
|
||||
} else {
|
||||
Self::InvalidCandidate(format!("{}: {}", ctx, msg))
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a may retry response from a context `ctx` and a message `msg` (which can be empty).
|
||||
pub fn runtime_construction(ctx: &'static str, msg: &str) -> Self {
|
||||
if msg.is_empty() {
|
||||
Self::RuntimeConstruction(ctx.to_string())
|
||||
} else {
|
||||
Self::RuntimeConstruction(format!("{}: {}", ctx, msg))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// An unexpected error occurred in the execution job process. Because this comes from the job,
/// which executes untrusted code, this error must likewise be treated as untrusted. That is, we
/// cannot raise an internal error based on this.
#[derive(thiserror::Error, Clone, Debug, Encode, Decode)]
pub enum JobError {
	/// The job timed out.
	#[error("The job timed out")]
	TimedOut,
	/// An unexpected panic occurred in the execution job.
	#[error("An unexpected panic has occurred in the execution job: {0}")]
	Panic(String),
	/// Some error occurred when interfacing with the kernel.
	#[error("Error interfacing with the kernel: {0}")]
	Kernel(String),
	/// A thread required by the job could not be spawned.
	#[error("Could not spawn the requested thread: {0}")]
	CouldNotSpawnThread(String),
	/// An error occurred in the CPU time monitor thread.
	#[error("An error occurred in the CPU time monitor thread: {0}")]
	CpuTimeMonitorThread(String),
	/// Since the job can return any exit status it wants, we have to treat this as untrusted.
	#[error("Unexpected exit status: {0}")]
	UnexpectedExitStatus(i32),
}
|
||||
@@ -0,0 +1,495 @@
|
||||
// Copyright (C) Parity Technologies (UK) Ltd.
|
||||
// This file is part of Pezkuwi.
|
||||
|
||||
// Pezkuwi is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
|
||||
// Pezkuwi is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
//! Interface to the Substrate Executor
|
||||
|
||||
use crate::error::ExecuteError;
|
||||
use pezkuwi_primitives::{
|
||||
executor_params::{DEFAULT_LOGICAL_STACK_MAX, DEFAULT_NATIVE_STACK_MAX},
|
||||
ExecutorParam, ExecutorParams,
|
||||
};
|
||||
use sc_executor_common::{
|
||||
error::WasmError,
|
||||
runtime_blob::RuntimeBlob,
|
||||
wasm_runtime::{HeapAllocStrategy, WasmModule as _},
|
||||
};
|
||||
use sc_executor_wasmtime::{Config, DeterministicStackLimit, Semantics, WasmtimeRuntime};
|
||||
use sp_core::storage::{ChildInfo, TrackedStorageKey};
|
||||
use sp_externalities::MultiRemovalResults;
|
||||
use std::any::{Any, TypeId};
|
||||
|
||||
// Memory configuration
//
// When Substrate Runtime is instantiated, a number of WASM pages are allocated for the Substrate
// Runtime instance's linear memory. The exact number of pages is a sum of whatever the WASM blob
// itself requests (by default at least enough to hold the data section as well as have some space
// left for the stack; this is, of course, overridable at link time when compiling the runtime)
// plus the number of pages specified in the `extra_heap_pages` passed to the executor.
//
// By default, rustc (or `lld` specifically) should allocate 1 MiB for the shadow stack, or 16
// pages. The data section for runtimes are typically rather small and can fit in a single digit
// number of WASM pages, so let's say an extra 16 pages. Thus let's assume that 32 pages or 2 MiB
// are used for these needs by default.
const DEFAULT_HEAP_PAGES_ESTIMATE: u32 = 32;
// Extra heap pages granted on top of the estimate above; together they bound the default
// dynamic heap allocation strategy below.
const EXTRA_HEAP_PAGES: u32 = 2048;

// VALUES OF THE DEFAULT CONFIGURATION SHOULD NEVER BE CHANGED
// They are used as base values for the execution environment parametrization.
// To overwrite them, add new ones to `EXECUTOR_PARAMS` in the `session_info` pallet and perform
// a runtime upgrade to make them active.
pub const DEFAULT_CONFIG: Config = Config {
	allow_missing_func_imports: true,
	cache_path: None,
	semantics: Semantics {
		heap_alloc_strategy: sc_executor_common::wasm_runtime::HeapAllocStrategy::Dynamic {
			maximum_pages: Some(DEFAULT_HEAP_PAGES_ESTIMATE + EXTRA_HEAP_PAGES),
		},

		instantiation_strategy:
			sc_executor_wasmtime::InstantiationStrategy::RecreateInstanceCopyOnWrite,

		// Enable deterministic stack limit to pin down the exact number of items the wasmtime stack
		// can contain before it traps with stack overflow.
		//
		// Here is how the values below were chosen.
		//
		// At the moment of writing, the default native stack size limit is 1 MiB. Assuming a
		// logical item (see the docs about the field and the instrumentation algorithm) is 8 bytes,
		// 1 MiB can fit 2x 65536 logical items.
		//
		// Since reaching the native stack limit is undesirable, we halve the logical item limit and
		// also increase the native 256x. This hopefully should preclude wasm code from reaching
		// the stack limit set by the wasmtime.
		deterministic_stack_limit: Some(DeterministicStackLimit {
			logical_max: DEFAULT_LOGICAL_STACK_MAX,
			native_stack_max: DEFAULT_NATIVE_STACK_MAX,
		}),
		canonicalize_nans: true,
		// Rationale for turning the multi-threaded compilation off is to make the preparation time
		// easily reproducible and as deterministic as possible.
		//
		// Currently the prepare queue doesn't distinguish between precheck and prepare requests.
		// On the one hand, it simplifies the code, on the other, however, slows down compile times
		// for execute requests. This behavior may change in future.
		parallel_compilation: false,

		// WASM extensions. Only those that are meaningful to us may be controlled here. By default,
		// we're using WASM MVP, which means all the extensions are disabled. Nevertheless, some
		// extensions (e.g., sign extension ops) are enabled by Wasmtime and cannot be disabled.
		wasm_reference_types: false,
		wasm_simd: false,
		wasm_bulk_memory: false,
		wasm_multi_value: false,
	},
};
|
||||
|
||||
/// Executes the given PVF in the form of a compiled artifact and returns the result of
/// execution upon success.
///
/// # Safety
///
/// The caller must ensure that the compiled artifact passed here was:
/// 1) produced by `prepare`,
/// 2) was not modified,
///
/// Failure to adhere to these requirements might lead to crashes and arbitrary code execution.
pub unsafe fn execute_artifact(
	compiled_artifact_blob: &[u8],
	executor_params: &ExecutorParams,
	params: &[u8],
) -> Result<Vec<u8>, ExecuteError> {
	let mut extensions = sp_externalities::Extensions::new();

	// The only extension registered: lets the runtime query an embedded runtime version via
	// `ReadRuntimeVersion` without any storage externalities.
	extensions.register(sp_core::traits::ReadRuntimeVersionExt::new(ReadRuntimeVersion));

	// All storage accesses panic (see `ValidationExternalities`), so the call below runs with
	// extensions only.
	let mut ext = ValidationExternalities(extensions);

	match sc_executor::with_externalities_safe(&mut ext, || {
		let runtime = create_runtime_from_artifact_bytes(compiled_artifact_blob, executor_params)?;
		runtime.new_instance()?.call("validate_block", params)
	}) {
		// Flatten the nested `Result`: the outer layer comes from `with_externalities_safe`,
		// the inner one from runtime construction / the `validate_block` call itself.
		Ok(Ok(ok)) => Ok(ok),
		Ok(Err(err)) | Err(err) => Err(err),
	}
}
|
||||
|
||||
/// Constructs the runtime for the given PVF, given the artifact bytes.
|
||||
///
|
||||
/// # Safety
|
||||
///
|
||||
/// The caller must ensure that the compiled artifact passed here was:
|
||||
/// 1) produced by `prepare`,
|
||||
/// 2) was not modified,
|
||||
///
|
||||
/// Failure to adhere to these requirements might lead to crashes and arbitrary code execution.
|
||||
pub unsafe fn create_runtime_from_artifact_bytes(
|
||||
compiled_artifact_blob: &[u8],
|
||||
executor_params: &ExecutorParams,
|
||||
) -> Result<WasmtimeRuntime, WasmError> {
|
||||
let mut config = DEFAULT_CONFIG.clone();
|
||||
config.semantics = params_to_wasmtime_semantics(executor_params).0;
|
||||
|
||||
sc_executor_wasmtime::create_runtime_from_artifact_bytes::<HostFunctions>(
|
||||
compiled_artifact_blob,
|
||||
config,
|
||||
)
|
||||
}
|
||||
|
||||
/// Takes the default config and overwrites any settings with existing executor parameters.
|
||||
///
|
||||
/// Returns the semantics as well as the stack limit (since we are guaranteed to have it).
|
||||
pub fn params_to_wasmtime_semantics(par: &ExecutorParams) -> (Semantics, DeterministicStackLimit) {
|
||||
let mut sem = DEFAULT_CONFIG.semantics.clone();
|
||||
let mut stack_limit = sem
|
||||
.deterministic_stack_limit
|
||||
.expect("There is a comment to not change the default stack limit; it should always be available; qed")
|
||||
.clone();
|
||||
|
||||
for p in par.iter() {
|
||||
match p {
|
||||
ExecutorParam::MaxMemoryPages(max_pages) =>
|
||||
sem.heap_alloc_strategy = HeapAllocStrategy::Dynamic {
|
||||
maximum_pages: Some((*max_pages).saturating_add(DEFAULT_HEAP_PAGES_ESTIMATE)),
|
||||
},
|
||||
ExecutorParam::StackLogicalMax(slm) => stack_limit.logical_max = *slm,
|
||||
ExecutorParam::StackNativeMax(snm) => stack_limit.native_stack_max = *snm,
|
||||
ExecutorParam::WasmExtBulkMemory => sem.wasm_bulk_memory = true,
|
||||
ExecutorParam::PrecheckingMaxMemory(_) |
|
||||
ExecutorParam::PvfPrepTimeout(_, _) |
|
||||
ExecutorParam::PvfExecTimeout(_, _) => (), /* Not used here */
|
||||
}
|
||||
}
|
||||
sem.deterministic_stack_limit = Some(stack_limit.clone());
|
||||
(sem, stack_limit)
|
||||
}
|
||||
|
||||
/// Runs the prevalidation on the given code. Returns a [`RuntimeBlob`] if it succeeds.
|
||||
pub fn prevalidate(code: &[u8]) -> Result<RuntimeBlob, sc_executor_common::error::WasmError> {
|
||||
// Construct the runtime blob and do some basic checks for consistency.
|
||||
let blob = RuntimeBlob::new(code)?;
|
||||
// In the future this function should take care of any further prevalidation logic.
|
||||
Ok(blob)
|
||||
}
|
||||
|
||||
/// Runs preparation on the given runtime blob. If successful, it returns a serialized compiled
|
||||
/// artifact which can then be used to pass into `Executor::execute` after writing it to the disk.
|
||||
pub fn prepare(
|
||||
blob: RuntimeBlob,
|
||||
executor_params: &ExecutorParams,
|
||||
) -> Result<Vec<u8>, sc_executor_common::error::WasmError> {
|
||||
let (semantics, _) = params_to_wasmtime_semantics(executor_params);
|
||||
sc_executor_wasmtime::prepare_runtime_artifact(blob, &semantics)
|
||||
}
|
||||
|
||||
/// Available host functions. We leave out:
///
/// 1. storage related stuff (PVF doesn't have a notion of a persistent storage/trie)
/// 2. tracing
/// 3. off chain workers (PVFs do not have such a notion)
/// 4. runtime tasks
/// 5. sandbox
type HostFunctions = (
	sp_io::misc::HostFunctions,
	sp_io::crypto::HostFunctions,
	sp_io::hashing::HostFunctions,
	sp_io::allocator::HostFunctions,
	sp_io::logging::HostFunctions,
	sp_io::trie::HostFunctions,
);
|
||||
|
||||
/// The validation externalities that will panic on any storage related access. (PVFs should not
/// have a notion of a persistent storage/trie.)
struct ValidationExternalities(sp_externalities::Extensions);

// Every `Externalities` method below deliberately panics: a PVF must not access persistent
// storage, so any call landing here indicates a misbehaving PVF or a host bug. Only the
// extensions carried in the tuple field (see the `ExtensionStore` impl) are usable.
impl sp_externalities::Externalities for ValidationExternalities {
	fn storage(&mut self, _: &[u8]) -> Option<Vec<u8>> {
		panic!("storage: unsupported feature for teyrchain validation")
	}

	fn storage_hash(&mut self, _: &[u8]) -> Option<Vec<u8>> {
		panic!("storage_hash: unsupported feature for teyrchain validation")
	}

	fn child_storage_hash(&mut self, _: &ChildInfo, _: &[u8]) -> Option<Vec<u8>> {
		panic!("child_storage_hash: unsupported feature for teyrchain validation")
	}

	fn child_storage(&mut self, _: &ChildInfo, _: &[u8]) -> Option<Vec<u8>> {
		panic!("child_storage: unsupported feature for teyrchain validation")
	}

	fn kill_child_storage(
		&mut self,
		_child_info: &ChildInfo,
		_maybe_limit: Option<u32>,
		_maybe_cursor: Option<&[u8]>,
	) -> MultiRemovalResults {
		panic!("kill_child_storage: unsupported feature for teyrchain validation")
	}

	fn clear_prefix(
		&mut self,
		_prefix: &[u8],
		_maybe_limit: Option<u32>,
		_maybe_cursor: Option<&[u8]>,
	) -> MultiRemovalResults {
		panic!("clear_prefix: unsupported feature for teyrchain validation")
	}

	fn clear_child_prefix(
		&mut self,
		_child_info: &ChildInfo,
		_prefix: &[u8],
		_maybe_limit: Option<u32>,
		_maybe_cursor: Option<&[u8]>,
	) -> MultiRemovalResults {
		panic!("clear_child_prefix: unsupported feature for teyrchain validation")
	}

	fn place_storage(&mut self, _: Vec<u8>, _: Option<Vec<u8>>) {
		panic!("place_storage: unsupported feature for teyrchain validation")
	}

	fn place_child_storage(&mut self, _: &ChildInfo, _: Vec<u8>, _: Option<Vec<u8>>) {
		panic!("place_child_storage: unsupported feature for teyrchain validation")
	}

	fn storage_root(&mut self, _: sp_core::storage::StateVersion) -> Vec<u8> {
		panic!("storage_root: unsupported feature for teyrchain validation")
	}

	fn child_storage_root(&mut self, _: &ChildInfo, _: sp_core::storage::StateVersion) -> Vec<u8> {
		panic!("child_storage_root: unsupported feature for teyrchain validation")
	}

	fn next_child_storage_key(&mut self, _: &ChildInfo, _: &[u8]) -> Option<Vec<u8>> {
		panic!("next_child_storage_key: unsupported feature for teyrchain validation")
	}

	fn next_storage_key(&mut self, _: &[u8]) -> Option<Vec<u8>> {
		panic!("next_storage_key: unsupported feature for teyrchain validation")
	}

	fn storage_append(&mut self, _key: Vec<u8>, _value: Vec<u8>) {
		panic!("storage_append: unsupported feature for teyrchain validation")
	}

	fn storage_start_transaction(&mut self) {
		panic!("storage_start_transaction: unsupported feature for teyrchain validation")
	}

	fn storage_rollback_transaction(&mut self) -> Result<(), ()> {
		panic!("storage_rollback_transaction: unsupported feature for teyrchain validation")
	}

	fn storage_commit_transaction(&mut self) -> Result<(), ()> {
		panic!("storage_commit_transaction: unsupported feature for teyrchain validation")
	}

	fn wipe(&mut self) {
		panic!("wipe: unsupported feature for teyrchain validation")
	}

	fn commit(&mut self) {
		panic!("commit: unsupported feature for teyrchain validation")
	}

	fn read_write_count(&self) -> (u32, u32, u32, u32) {
		panic!("read_write_count: unsupported feature for teyrchain validation")
	}

	fn reset_read_write_count(&mut self) {
		panic!("reset_read_write_count: unsupported feature for teyrchain validation")
	}

	fn get_whitelist(&self) -> Vec<TrackedStorageKey> {
		panic!("get_whitelist: unsupported feature for teyrchain validation")
	}

	fn set_whitelist(&mut self, _: Vec<TrackedStorageKey>) {
		panic!("set_whitelist: unsupported feature for teyrchain validation")
	}

	fn set_offchain_storage(&mut self, _: &[u8], _: std::option::Option<&[u8]>) {
		panic!("set_offchain_storage: unsupported feature for teyrchain validation")
	}

	fn get_read_and_written_keys(&self) -> Vec<(Vec<u8>, u32, u32, bool)> {
		panic!("get_read_and_written_keys: unsupported feature for teyrchain validation")
	}
}
|
||||
|
||||
impl sp_externalities::ExtensionStore for ValidationExternalities {
|
||||
fn extension_by_type_id(&mut self, type_id: TypeId) -> Option<&mut dyn Any> {
|
||||
self.0.get_mut(type_id)
|
||||
}
|
||||
|
||||
fn register_extension_with_type_id(
|
||||
&mut self,
|
||||
type_id: TypeId,
|
||||
extension: Box<dyn sp_externalities::Extension>,
|
||||
) -> Result<(), sp_externalities::Error> {
|
||||
self.0.register_with_type_id(type_id, extension)
|
||||
}
|
||||
|
||||
fn deregister_extension_by_type_id(
|
||||
&mut self,
|
||||
type_id: TypeId,
|
||||
) -> Result<(), sp_externalities::Error> {
|
||||
if self.0.deregister(type_id) {
|
||||
Ok(())
|
||||
} else {
|
||||
Err(sp_externalities::Error::ExtensionIsNotRegistered(type_id))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct ReadRuntimeVersion;
|
||||
|
||||
impl sp_core::traits::ReadRuntimeVersion for ReadRuntimeVersion {
|
||||
fn read_runtime_version(
|
||||
&self,
|
||||
wasm_code: &[u8],
|
||||
_ext: &mut dyn sp_externalities::Externalities,
|
||||
) -> Result<Vec<u8>, String> {
|
||||
let blob = RuntimeBlob::uncompress_if_needed(wasm_code)
|
||||
.map_err(|e| format!("Failed to read the PVF runtime blob: {:?}", e))?;
|
||||
|
||||
match sc_executor::read_embedded_version(&blob)
|
||||
.map_err(|e| format!("Failed to read the static section from the PVF blob: {:?}", e))?
|
||||
{
|
||||
Some(version) => {
|
||||
use codec::Encode;
|
||||
Ok(version.encode())
|
||||
},
|
||||
None => Err("runtime version section is not found".to_string()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
	use super::*;

	// Invariant under test: toggling an `ExecutorParam` changes the compiled artifact if and
	// only if it also changes `ExecutorParams::prep_hash()`. Otherwise artifacts would either
	// be wrongly reused or wrongly recompiled.
	#[test]
	fn prep_hash_matches_artifact_effect_of_executor_params() {
		use ExecutorParam::*;

		// If you're adding a new ExecutorParam, please add it to the `cases` below.
		// (This closure exists only so an added variant breaks compilation here.)
		let _coverage_check = |param: &ExecutorParam| match param {
			MaxMemoryPages(_) => true,
			StackLogicalMax(_) => true,
			StackNativeMax(_) => true,
			PrecheckingMaxMemory(_) => true,
			PvfPrepTimeout(_, _) => true,
			PvfExecTimeout(_, _) => true,
			WasmExtBulkMemory => true,
		};

		// A minimal module with memory and an exported `validate_block` function.
		let wat = r#"(module
			(memory 1)
			(func (export "validate_block") (param i32 i32))
		)"#;
		let wasm = wat::parse_str(wat).expect("wat parsing failed");
		let blob = prevalidate(&wasm).expect("valid runtime blob");

		let base = ExecutorParams::default();

		let prepare_with = |params: &ExecutorParams| -> Vec<u8> {
			prepare(blob.clone(), params).expect("prepare should succeed")
		};

		// Define pairs that toggle exactly one parameter.
		let cases: Vec<(&str, ExecutorParams, ExecutorParams)> = vec![
			(
				"MaxMemoryPages",
				base.clone(),
				ExecutorParams::from(&[ExecutorParam::MaxMemoryPages(128)][..]),
			),
			(
				"StackLogicalMax",
				base.clone(),
				ExecutorParams::from(
					&[ExecutorParam::StackLogicalMax(DEFAULT_LOGICAL_STACK_MAX + 1)][..],
				),
			),
			(
				"StackNativeMax",
				base.clone(),
				ExecutorParams::from(
					&[ExecutorParam::StackNativeMax(DEFAULT_NATIVE_STACK_MAX + 1024)][..],
				),
			),
			(
				"PrecheckingMaxMemory",
				base.clone(),
				ExecutorParams::from(&[ExecutorParam::PrecheckingMaxMemory(300 * 1024 * 1024)][..]),
			),
			(
				"PvfPrepTimeout(Precheck)",
				base.clone(),
				ExecutorParams::from(
					&[ExecutorParam::PvfPrepTimeout(pezkuwi_primitives::PvfPrepKind::Precheck, 1)]
						[..],
				),
			),
			(
				"PvfPrepTimeout(Prepare)",
				base.clone(),
				ExecutorParams::from(
					&[ExecutorParam::PvfPrepTimeout(pezkuwi_primitives::PvfPrepKind::Prepare, 2)][..],
				),
			),
			(
				"PvfExecTimeout(Backing)",
				base.clone(),
				ExecutorParams::from(
					&[ExecutorParam::PvfExecTimeout(pezkuwi_primitives::PvfExecKind::Backing, 1)][..],
				),
			),
			(
				"PvfExecTimeout(Approval)",
				base.clone(),
				ExecutorParams::from(
					&[ExecutorParam::PvfExecTimeout(pezkuwi_primitives::PvfExecKind::Approval, 2)]
						[..],
				),
			),
			(
				"WasmExtBulkMemory",
				base.clone(),
				ExecutorParams::from(&[ExecutorParam::WasmExtBulkMemory][..]),
			),
		];

		// Compare the compiled artifact bytes and the prep hash for each toggled parameter;
		// the two notions of "changed" must agree.
		for (name, a, b) in cases.into_iter() {
			let art_a = prepare_with(&a);
			let art_b = prepare_with(&b);
			let artifact_changed = art_a != art_b;
			let prep_hash_changed = a.prep_hash() != b.prep_hash();
			assert_eq!(
				artifact_changed,
				prep_hash_changed,
				"ExecutorParam classification mismatch for {}: artifact_changed={}, prep_hash_changed={}",
				name,
				artifact_changed,
				prep_hash_changed,
			);
		}
	}
}
|
||||
@@ -0,0 +1,129 @@
|
||||
// Copyright (C) Parity Technologies (UK) Ltd.
|
||||
// This file is part of Pezkuwi.
|
||||
|
||||
// Pezkuwi is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
|
||||
// Pezkuwi is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
//! Contains functionality related to PVFs that is shared by the PVF host and the PVF workers.
|
||||
#![deny(unused_crate_dependencies)]
|
||||
|
||||
pub mod error;
|
||||
pub mod execute;
|
||||
pub mod executor_interface;
|
||||
pub mod prepare;
|
||||
pub mod pvf;
|
||||
pub mod worker;
|
||||
pub mod worker_dir;
|
||||
|
||||
pub use cpu_time::ProcessTime;
|
||||
|
||||
// Used by `decl_worker_main!`.
|
||||
pub use sp_tracing;
|
||||
|
||||
const LOG_TARGET: &str = "teyrchain::pvf-common";
|
||||
|
||||
use codec::{Decode, Encode};
|
||||
use sp_core::H256;
|
||||
use std::{
|
||||
io::{self, Read, Write},
|
||||
mem,
|
||||
};
|
||||
|
||||
/// Timeouts shared by PVF host/worker tests. Only compiled with the `test-utils` feature.
#[cfg(feature = "test-utils")]
pub mod tests {
	use std::time::Duration;

	/// Execution timeout used by tests.
	pub const TEST_EXECUTION_TIMEOUT: Duration = Duration::from_secs(3);
	/// Preparation timeout used by tests (generous, since compilation is slow in CI).
	pub const TEST_PREPARATION_TIMEOUT: Duration = Duration::from_secs(30);
}
|
||||
|
||||
/// Status of security features on the current system.
// NOTE: this derives SCALE `Encode`/`Decode`, so the field order is part of the wire format
// exchanged with workers — do not reorder fields.
#[derive(Debug, Clone, Default, PartialEq, Eq, Encode, Decode)]
pub struct SecurityStatus {
	/// Whether Secure Validator Mode is enabled. This mode enforces that all required security
	/// features are present. All features are enabled on a best-effort basis regardless.
	pub secure_validator_mode: bool,
	/// Whether the landlock features we use are fully available on this system.
	pub can_enable_landlock: bool,
	/// Whether the seccomp features we use are fully available on this system.
	pub can_enable_seccomp: bool,
	/// Whether we are able to unshare the user namespace and change the filesystem root.
	pub can_unshare_user_namespace_and_change_root: bool,
	/// Whether we are able to call `clone` with all sandboxing flags.
	pub can_do_secure_clone: bool,
}
|
||||
|
||||
/// A handshake with information for the worker.
#[derive(Debug, Encode, Decode)]
pub struct WorkerHandshake {
	/// Security features as determined by the host; the worker uses these to decide which
	/// sandboxing features to enable.
	pub security_status: SecurityStatus,
}
|
||||
|
||||
/// Write some data prefixed by its length into `w`. Sync version of `framed_send` to avoid
/// dependency on tokio.
///
/// The prefix is the payload length as a little-endian `usize`, so the frame format is
/// platform-width-dependent and must be decoded by the matching [`framed_recv_blocking`].
pub fn framed_send_blocking(w: &mut (impl Write + Unpin), buf: &[u8]) -> io::Result<()> {
	// Length prefix first, then the payload itself.
	w.write_all(&buf.len().to_le_bytes())?;
	w.write_all(buf)
}
|
||||
|
||||
/// Read some data prefixed by its length from `r`. Sync version of `framed_recv` to avoid
/// dependency on tokio.
///
/// Counterpart of [`framed_send_blocking`]: expects a little-endian `usize` length prefix
/// followed by exactly that many payload bytes; errors with `UnexpectedEof` on a short read.
pub fn framed_recv_blocking(r: &mut (impl Read + Unpin)) -> io::Result<Vec<u8>> {
	// Read the fixed-width length prefix.
	let mut len_buf = [0u8; mem::size_of::<usize>()];
	r.read_exact(&mut len_buf)?;
	let len = usize::from_le_bytes(len_buf);
	// NOTE(review): `len` is taken from the peer and used for allocation unchecked; the
	// host<->worker channel appears trusted, but confirm no untrusted peer reaches this.
	let mut payload = vec![0u8; len];
	r.read_exact(&mut payload)?;
	Ok(payload)
}
|
||||
|
||||
/// Checksum of a compiled PVF artifact — a 256-bit twox hash of the artifact bytes
/// (see [`compute_checksum`]). Newtype over `H256` with identical layout.
#[derive(Debug, Default, Clone, Copy, Encode, Decode, PartialEq, Eq)]
#[repr(transparent)]
pub struct ArtifactChecksum(H256);
|
||||
|
||||
/// Compute the checksum of the given artifact.
|
||||
pub fn compute_checksum(data: &[u8]) -> ArtifactChecksum {
|
||||
ArtifactChecksum(H256::from_slice(&sp_crypto_hashing::twox_256(data)))
|
||||
}
|
||||
|
||||
#[cfg(all(test, not(feature = "test-utils")))]
mod tests {
	use super::*;

	// The handshake protocol relies on `SecurityStatus::default()` meaning "no feature
	// available/enabled", so every flag must default to `false`.
	#[test]
	fn default_secure_status() {
		let status = SecurityStatus::default();
		assert!(
			!status.secure_validator_mode,
			"secure_validator_mode is false for default security status"
		);
		assert!(
			!status.can_enable_landlock,
			"can_enable_landlock is false for default security status"
		);
		assert!(
			!status.can_enable_seccomp,
			"can_enable_seccomp is false for default security status"
		);
		assert!(
			!status.can_unshare_user_namespace_and_change_root,
			"can_unshare_user_namespace_and_change_root is false for default security status"
		);
		assert!(
			!status.can_do_secure_clone,
			"can_do_secure_clone is false for default security status"
		);
	}
}
|
||||
@@ -0,0 +1,85 @@
|
||||
// Copyright (C) Parity Technologies (UK) Ltd.
|
||||
// This file is part of Pezkuwi.
|
||||
|
||||
// Pezkuwi is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
|
||||
// Pezkuwi is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
use crate::ArtifactChecksum;
|
||||
use codec::{Decode, Encode};
|
||||
use std::path::PathBuf;
|
||||
|
||||
/// Result from prepare worker if successful.
// NOTE: SCALE-encoded across the host/worker boundary; field order is part of the format.
#[derive(Debug, Clone, Default, Encode, Decode)]
pub struct PrepareWorkerSuccess {
	/// Checksum of the compiled PVF.
	pub checksum: ArtifactChecksum,
	/// Stats of the current preparation run.
	pub stats: PrepareStats,
}
|
||||
|
||||
/// Result of PVF preparation if successful.
///
/// Host-side only (no `Encode`/`Decode`): carries a local filesystem path.
#[derive(Debug, Clone, Default)]
pub struct PrepareSuccess {
	/// Checksum of the compiled PVF.
	pub checksum: ArtifactChecksum,
	/// Canonical path to the compiled artifact.
	pub path: PathBuf,
	/// Size of the compiled artifact in bytes.
	pub size: u64,
	/// Stats of the current preparation run.
	pub stats: PrepareStats,
}
|
||||
|
||||
/// Preparation statistics, including the CPU time and memory taken.
#[derive(Debug, Clone, Default, Encode, Decode)]
pub struct PrepareStats {
	/// The CPU time that elapsed for the preparation job.
	pub cpu_time_elapsed: std::time::Duration,
	/// The observed memory statistics for the preparation job.
	pub memory_stats: MemoryStats,
	/// The decompressed Wasm code length observed during the preparation.
	pub observed_wasm_code_len: u32,
}
|
||||
|
||||
/// Helper struct to contain all the memory stats, including `MemoryAllocationStats` and, if
/// supported by the OS, `ru_maxrss`.
// NOTE: the `cfg`-gated fields mean the SCALE encoding of this struct differs between target
// platforms/features; host and worker must be built with matching configuration.
#[derive(Clone, Debug, Default, Encode, Decode)]
pub struct MemoryStats {
	/// Memory stats from `tikv_jemalloc_ctl`, polling-based and not very precise.
	#[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))]
	pub memory_tracker_stats: Option<MemoryAllocationStats>,
	/// `ru_maxrss` from `getrusage`. `None` if an error occurred.
	#[cfg(target_os = "linux")]
	pub max_rss: Option<i64>,
	/// Peak allocation in bytes measured by tracking allocator
	pub peak_tracked_alloc: u64,
}
|
||||
|
||||
/// Statistics of collected memory metrics.
#[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))]
#[derive(Clone, Debug, Default, Encode, Decode)]
pub struct MemoryAllocationStats {
	/// Total resident memory, in bytes.
	pub resident: u64,
	/// Total allocated memory, in bytes.
	pub allocated: u64,
}
|
||||
|
||||
/// The kind of prepare job.
#[derive(Copy, Clone, Debug, Encode, Decode)]
pub enum PrepareJobKind {
	/// Compilation triggered by a candidate validation request.
	Compilation,
	/// A prechecking job.
	Prechecking,
}
|
||||
@@ -0,0 +1,141 @@
|
||||
// Copyright (C) Parity Technologies (UK) Ltd.
|
||||
// This file is part of Pezkuwi.
|
||||
|
||||
// Pezkuwi is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
|
||||
// Pezkuwi is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
use crate::prepare::PrepareJobKind;
|
||||
use codec::{Decode, Encode};
|
||||
use pezkuwi_primitives::ExecutorParams;
|
||||
use pezkuwi_teyrchain_primitives::primitives::ValidationCodeHash;
|
||||
use std::{fmt, sync::Arc, time::Duration};
|
||||
|
||||
/// A struct that carries the exhaustive set of data to prepare an artifact out of plain
/// Wasm binary
///
/// Should be cheap to clone.
// All heavy payloads (`code`, `executor_params`) are behind `Arc`, so cloning only bumps
// refcounts.
#[derive(Clone, Encode, Decode)]
pub struct PvfPrepData {
	/// Wasm code (maybe compressed)
	maybe_compressed_code: Arc<Vec<u8>>,
	/// Maximum uncompressed code size.
	validation_code_bomb_limit: u32,
	/// Wasm code hash.
	code_hash: ValidationCodeHash,
	/// Executor environment parameters for the session for which artifact is prepared
	executor_params: Arc<ExecutorParams>,
	/// Preparation timeout
	prep_timeout: Duration,
	/// The kind of preparation job.
	prep_kind: PrepareJobKind,
}
|
||||
|
||||
impl PvfPrepData {
	/// Returns an instance of the PVF out of the given PVF code and executor params.
	pub fn from_code(
		code: Vec<u8>,
		executor_params: ExecutorParams,
		prep_timeout: Duration,
		prep_kind: PrepareJobKind,
		validation_code_bomb_limit: u32,
	) -> Self {
		let maybe_compressed_code = Arc::new(code);
		// NOTE: the hash is computed over the code exactly as passed in, i.e. over the
		// possibly-compressed bytes, not the decompressed Wasm.
		let code_hash = sp_crypto_hashing::blake2_256(&maybe_compressed_code).into();
		let executor_params = Arc::new(executor_params);
		Self {
			maybe_compressed_code,
			code_hash,
			executor_params,
			prep_timeout,
			prep_kind,
			validation_code_bomb_limit,
		}
	}

	/// Returns validation code hash
	pub fn code_hash(&self) -> ValidationCodeHash {
		self.code_hash
	}

	/// Returns PVF code blob (cheap: clones the `Arc`, not the bytes)
	pub fn maybe_compressed_code(&self) -> Arc<Vec<u8>> {
		self.maybe_compressed_code.clone()
	}

	/// Returns executor params (cheap: clones the `Arc`)
	pub fn executor_params(&self) -> Arc<ExecutorParams> {
		self.executor_params.clone()
	}

	/// Returns preparation timeout.
	pub fn prep_timeout(&self) -> Duration {
		self.prep_timeout
	}

	/// Returns preparation kind.
	pub fn prep_kind(&self) -> PrepareJobKind {
		self.prep_kind
	}

	/// Returns validation code bomb limit.
	pub fn validation_code_bomb_limit(&self) -> u32 {
		self.validation_code_bomb_limit
	}

	/// Creates a structure for tests. The "code" is just the discriminator's LE bytes, so
	/// distinct discriminators yield distinct code hashes.
	#[cfg(feature = "test-utils")]
	pub fn from_discriminator_and_timeout(num: u32, timeout: Duration) -> Self {
		let discriminator_buf = num.to_le_bytes().to_vec();
		Self::from_code(
			discriminator_buf,
			ExecutorParams::default(),
			timeout,
			PrepareJobKind::Compilation,
			30 * 1024 * 1024,
		)
	}

	/// Creates a structure for tests.
	#[cfg(feature = "test-utils")]
	pub fn from_discriminator(num: u32) -> Self {
		Self::from_discriminator_and_timeout(num, crate::tests::TEST_PREPARATION_TIMEOUT)
	}

	/// Creates a structure for tests, with the job kind set to prechecking.
	#[cfg(feature = "test-utils")]
	pub fn from_discriminator_precheck(num: u32) -> Self {
		let mut pvf =
			Self::from_discriminator_and_timeout(num, crate::tests::TEST_PREPARATION_TIMEOUT);
		pvf.prep_kind = PrepareJobKind::Prechecking;
		pvf
	}
}
|
||||
|
||||
impl fmt::Debug for PvfPrepData {
	// Manual impl so the (potentially large) code blob is elided as `[...]` instead of being
	// dumped into logs.
	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
		write!(
			f,
			"Pvf {{ code: [...], code_hash: {:?}, executor_params: {:?}, prep_timeout: {:?} }}",
			self.code_hash, self.executor_params, self.prep_timeout
		)
	}
}
|
||||
|
||||
impl PartialEq for PvfPrepData {
	// Equality is defined by code hash + executor-params hash only; `prep_timeout`,
	// `prep_kind` and `validation_code_bomb_limit` are deliberately NOT compared.
	fn eq(&self, other: &Self) -> bool {
		self.code_hash == other.code_hash &&
			self.executor_params.hash() == other.executor_params.hash()
	}
}
|
||||
|
||||
// `PartialEq` above delegates to hash equality, which is reflexive, so `Eq` is sound.
impl Eq for PvfPrepData {}
|
||||
@@ -0,0 +1,839 @@
|
||||
// Copyright (C) Parity Technologies (UK) Ltd.
|
||||
// This file is part of Pezkuwi.
|
||||
|
||||
// Pezkuwi is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
|
||||
// Pezkuwi is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
//! Functionality common to both prepare and execute workers.
|
||||
|
||||
pub mod security;
|
||||
|
||||
use crate::{
|
||||
framed_recv_blocking, framed_send_blocking, SecurityStatus, WorkerHandshake, LOG_TARGET,
|
||||
};
|
||||
use codec::{Decode, Encode};
|
||||
use cpu_time::ProcessTime;
|
||||
use futures::never::Never;
|
||||
use nix::{errno::Errno, sys::resource::Usage};
|
||||
use std::{
|
||||
any::Any,
|
||||
fmt::{self},
|
||||
fs::File,
|
||||
io::{self, Read, Write},
|
||||
os::{
|
||||
fd::{AsRawFd, FromRawFd, RawFd},
|
||||
unix::net::UnixStream,
|
||||
},
|
||||
path::PathBuf,
|
||||
sync::mpsc::{Receiver, RecvTimeoutError},
|
||||
time::Duration,
|
||||
};
|
||||
|
||||
/// Use this macro to declare a `fn main() {}` that will create an executable that can be used for
/// spawning the desired worker.
///
/// `$expected_command` is the subcommand this binary accepts, `$entrypoint` is the worker's
/// event-loop function, and the version/version-hash pair is printed by the `--version` /
/// `--full-version` flags and passed on to the entrypoint.
#[macro_export]
macro_rules! decl_worker_main {
	($expected_command:expr, $entrypoint:expr, $worker_version:expr, $worker_version_hash:expr $(,)*) => {
		fn get_full_version() -> String {
			format!("{}-{}", $worker_version, $worker_version_hash)
		}

		fn print_help(expected_command: &str) {
			println!("{} {}", expected_command, $worker_version);
			println!("commit: {}", $worker_version_hash);
			println!();
			println!("PVF worker that is called by pezkuwi.");
		}

		fn main() {
			#[cfg(target_os = "linux")]
			use $crate::worker::security;

			$crate::sp_tracing::try_init_simple();

			let args = std::env::args().collect::<Vec<_>>();
			if args.len() == 1 {
				print_help($expected_command);
				return;
			}

			// First positional arg selects either an informational flag, a security
			// capability probe (exits 0 if supported, -1 otherwise), or the worker
			// subcommand itself.
			match args[1].as_ref() {
				"--help" | "-h" => {
					print_help($expected_command);
					return;
				},
				"--version" | "-v" => {
					println!("{}", $worker_version);
					return;
				},
				// Useful for debugging. --version is used for version checks.
				"--full-version" => {
					println!("{}", get_full_version());
					return;
				},

				"--check-can-enable-landlock" => {
					#[cfg(target_os = "linux")]
					let status = if let Err(err) = security::landlock::check_can_fully_enable() {
						// Write the error to stderr, log it on the host-side.
						eprintln!("{}", err);
						-1
					} else {
						0
					};
					#[cfg(not(target_os = "linux"))]
					let status = -1;
					std::process::exit(status)
				},
				"--check-can-enable-seccomp" => {
					#[cfg(all(target_os = "linux", target_arch = "x86_64"))]
					let status = if let Err(err) = security::seccomp::check_can_fully_enable() {
						// Write the error to stderr, log it on the host-side.
						eprintln!("{}", err);
						-1
					} else {
						0
					};
					#[cfg(not(all(target_os = "linux", target_arch = "x86_64")))]
					let status = -1;
					std::process::exit(status)
				},
				"--check-can-unshare-user-namespace-and-change-root" => {
					// NOTE(review): `args[2]` (the cache path) is indexed without a length
					// check; the host presumably always passes it — confirm against callers.
					#[cfg(target_os = "linux")]
					let cache_path_tempdir = std::path::Path::new(&args[2]);
					#[cfg(target_os = "linux")]
					let status = if let Err(err) =
						security::change_root::check_can_fully_enable(&cache_path_tempdir)
					{
						// Write the error to stderr, log it on the host-side.
						eprintln!("{}", err);
						-1
					} else {
						0
					};
					#[cfg(not(target_os = "linux"))]
					let status = -1;
					std::process::exit(status)
				},
				"--check-can-do-secure-clone" => {
					#[cfg(target_os = "linux")]
					// SAFETY: new process is spawned within a single threaded process. This
					// invariant is enforced by tests.
					let status = if let Err(err) = unsafe { security::clone::check_can_fully_clone() } {
						// Write the error to stderr, log it on the host-side.
						eprintln!("{}", err);
						-1
					} else {
						0
					};
					#[cfg(not(target_os = "linux"))]
					let status = -1;
					std::process::exit(status)
				},

				"test-sleep" => {
					std::thread::sleep(std::time::Duration::from_secs(5));
					return;
				},

				subcommand => {
					// Must be passed for compatibility with the single-binary test workers.
					if subcommand != $expected_command {
						panic!(
							"trying to run {} binary with the {} subcommand",
							$expected_command, subcommand
						)
					}
				},
			}

			// Parse the remaining `--flag value` pairs; unknown flags are fatal.
			let mut socket_path = None;
			let mut worker_dir_path = None;
			let mut node_version = None;

			let mut i = 2;
			while i < args.len() {
				match args[i].as_ref() {
					"--socket-path" => {
						socket_path = Some(args[i + 1].as_str());
						i += 1
					},
					"--worker-dir-path" => {
						worker_dir_path = Some(args[i + 1].as_str());
						i += 1
					},
					"--node-impl-version" => {
						node_version = Some(args[i + 1].as_str());
						i += 1
					},
					arg => panic!("Unexpected argument found: {}", arg),
				}
				i += 1;
			}
			let socket_path = socket_path.expect("the --socket-path argument is required");
			let worker_dir_path =
				worker_dir_path.expect("the --worker-dir-path argument is required");

			let socket_path = std::path::Path::new(socket_path).to_owned();
			let worker_dir_path = std::path::Path::new(worker_dir_path).to_owned();

			$entrypoint(socket_path, worker_dir_path, node_version, Some($worker_version));
		}
	};
}
|
||||
|
||||
// Taken from the os_pipe crate. Copied here to reduce one dependency and
// because its type-safe abstractions do not play well with nix's clone.
#[cfg(not(target_os = "macos"))]
pub fn pipe2_cloexec() -> io::Result<(libc::c_int, libc::c_int)> {
	let mut fds: [libc::c_int; 2] = [0; 2];
	// SAFETY: `fds` is valid writable storage for the two descriptors `pipe2(2)` fills in.
	// O_CLOEXEC is set atomically at creation, avoiding a leak into concurrently exec'd
	// children.
	let res = unsafe { libc::pipe2(fds.as_mut_ptr(), libc::O_CLOEXEC) };
	if res != 0 {
		return Err(io::Error::last_os_error());
	}
	// Returns (read end, write end).
	Ok((fds[0], fds[1]))
}
|
||||
|
||||
/// macOS fallback: there is no `pipe2(2)`, so create the pipe first and then set
/// `FD_CLOEXEC` on each end separately. (Unlike the `pipe2` path, this is not atomic with
/// respect to a concurrent `fork`+`exec`.)
#[cfg(target_os = "macos")]
pub fn pipe2_cloexec() -> io::Result<(libc::c_int, libc::c_int)> {
	let mut fds: [libc::c_int; 2] = [0; 2];
	// SAFETY: `fds` is valid writable storage for the two descriptors `pipe(2)` fills in.
	if unsafe { libc::pipe(fds.as_mut_ptr()) } != 0 {
		return Err(io::Error::last_os_error());
	}
	for &fd in &fds {
		// SAFETY: `fd` is a valid, just-created pipe descriptor owned by this process.
		if unsafe { libc::fcntl(fd, libc::F_SETFD, libc::FD_CLOEXEC) } != 0 {
			return Err(io::Error::last_os_error());
		}
	}
	// Returns (read end, write end).
	Ok((fds[0], fds[1]))
}
|
||||
|
||||
/// A wrapper around a file descriptor used to encapsulate and restrict
/// functionality for pipe operations.
pub struct PipeFd {
	// Owns the descriptor; closing happens when the `File` is dropped.
	file: File,
}
|
||||
|
||||
impl AsRawFd for PipeFd {
	/// Returns the raw file descriptor associated with this `PipeFd`
	/// (ownership is retained; the fd is closed when the `PipeFd` is dropped).
	fn as_raw_fd(&self) -> RawFd {
		self.file.as_raw_fd()
	}
}
|
||||
|
||||
impl FromRawFd for PipeFd {
	/// Creates a new `PipeFd` instance from a raw file descriptor.
	///
	/// # Safety
	///
	/// The fd passed in must be an owned file descriptor; in particular, it must be open.
	unsafe fn from_raw_fd(fd: RawFd) -> Self {
		// Takes ownership: the fd will be closed when the wrapped `File` drops.
		PipeFd { file: File::from_raw_fd(fd) }
	}
}
|
||||
|
||||
impl Read for PipeFd {
	// Plain forwarding to the inner `File`; `read_to_end` is forwarded too so the
	// `File`-optimized implementation is used instead of the trait default.
	fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
		self.file.read(buf)
	}

	fn read_to_end(&mut self, buf: &mut Vec<u8>) -> io::Result<usize> {
		self.file.read_to_end(buf)
	}
}
|
||||
|
||||
impl Write for PipeFd {
	// Plain forwarding to the inner `File`; `write_all` is forwarded too so the
	// `File`-optimized implementation is used instead of the trait default.
	fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
		self.file.write(buf)
	}

	fn flush(&mut self) -> io::Result<()> {
		self.file.flush()
	}

	fn write_all(&mut self, buf: &[u8]) -> io::Result<()> {
		self.file.write_all(buf)
	}
}
|
||||
|
||||
/// Some allowed overhead that we account for in the "CPU time monitor" thread's sleeps, on the
/// child process.
pub const JOB_TIMEOUT_OVERHEAD: Duration = Duration::from_millis(50);
|
||||
|
||||
/// The role a spawned worker process plays.
#[derive(Debug, Clone, Copy)]
pub enum WorkerKind {
	/// Compiles a PVF into an artifact.
	Prepare,
	/// Executes a prepared artifact.
	Execute,
	/// One-off process used only to probe the pivot-root security capability.
	CheckPivotRoot,
}
|
||||
|
||||
impl fmt::Display for WorkerKind {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match self {
|
||||
Self::Prepare => write!(f, "prepare"),
|
||||
Self::Execute => write!(f, "execute"),
|
||||
Self::CheckPivotRoot => write!(f, "check pivot root"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Identifying information about a running worker, threaded through logging and the event loop.
#[derive(Debug)]
pub struct WorkerInfo {
	/// OS process id of this worker.
	pub pid: u32,
	/// Which role this worker plays.
	pub kind: WorkerKind,
	/// The worker binary's version string, if one was provided at spawn time.
	pub version: Option<String>,
	/// Path to this worker's dedicated directory. After a successful change-root this is
	/// rewritten to `/` (see `run_worker`).
	pub worker_dir_path: PathBuf,
}
|
||||
|
||||
// NOTE: The worker version must be passed in so that we accurately get the version of the worker,
|
||||
// and not the version that this crate was compiled with.
|
||||
//
|
||||
// NOTE: This must not spawn any threads due to safety requirements in `event_loop` and to avoid
|
||||
// errors in [`security::change_root::try_restrict`].
|
||||
//
|
||||
/// Initializes the worker process, then runs the given event loop, which spawns a new job process
|
||||
/// to securely handle each incoming request.
|
||||
pub fn run_worker<F>(
	worker_kind: WorkerKind,
	socket_path: PathBuf,
	worker_dir_path: PathBuf,
	node_version: Option<&str>,
	worker_version: Option<&str>,
	mut event_loop: F,
) where
	F: FnMut(UnixStream, &WorkerInfo, SecurityStatus) -> io::Result<Never>,
{
	// `mut` is only needed on Linux, where `worker_dir_path` is rewritten after change-root.
	#[cfg_attr(not(target_os = "linux"), allow(unused_mut))]
	let mut worker_info = WorkerInfo {
		pid: std::process::id(),
		kind: worker_kind,
		version: worker_version.map(|v| v.to_string()),
		worker_dir_path,
	};
	gum::debug!(
		target: LOG_TARGET,
		?worker_info,
		?socket_path,
		"starting pvf worker ({})",
		worker_info.kind
	);

	// Check for a mismatch between the node and worker versions.
	if let (Some(node_version), Some(worker_version)) = (node_version, &worker_info.version) {
		if node_version != worker_version {
			gum::error!(
				target: LOG_TARGET,
				?worker_info,
				%node_version,
				"Node and worker version mismatch, node needs restarting, forcing shutdown",
			);
			kill_parent_node_in_emergency();
			worker_shutdown(worker_info, "Version mismatch");
		}
	}

	// Make sure that we can read the worker dir path, and log its contents.
	let entries: io::Result<Vec<_>> = std::fs::read_dir(&worker_info.worker_dir_path)
		.and_then(|d| d.map(|res| res.map(|e| e.file_name())).collect());
	match entries {
		Ok(entries) => {
			gum::trace!(target: LOG_TARGET, ?worker_info, "content of worker dir: {:?}", entries)
		},
		Err(err) => {
			let err = format!("Could not read worker dir: {}", err.to_string());
			worker_shutdown_error(worker_info, &err);
		},
	}

	// Connect to the socket. The socket file is removed right after connecting: the
	// connection stays alive, and nothing else should be able to connect afterwards.
	let stream = || -> io::Result<UnixStream> {
		let stream = UnixStream::connect(&socket_path)?;
		let _ = std::fs::remove_file(&socket_path);
		Ok(stream)
	}();
	let mut stream = match stream {
		Ok(ok) => ok,
		Err(err) => worker_shutdown_error(worker_info, &err.to_string()),
	};

	// The host tells us which security features it determined are available.
	let WorkerHandshake { security_status } = match recv_worker_handshake(&mut stream) {
		Ok(ok) => ok,
		Err(err) => worker_shutdown_error(worker_info, &err.to_string()),
	};

	// Enable some security features.
	{
		gum::trace!(target: LOG_TARGET, ?security_status, "Enabling security features");

		// First, make sure env vars were cleared, to match the environment we perform the checks
		// within. (In theory, running checks with different env vars could result in different
		// outcomes of the checks.)
		if !security::check_env_vars_were_cleared(&worker_info) {
			let err = "not all env vars were cleared when spawning the process";
			gum::error!(
				target: LOG_TARGET,
				?worker_info,
				"{}",
				err
			);
			// Only fatal in secure validator mode; best-effort otherwise.
			if security_status.secure_validator_mode {
				worker_shutdown(worker_info, err);
			}
		}

		// Call based on whether we can change root. Error out if it should work but fails.
		//
		// NOTE: This should not be called in a multi-threaded context (i.e. inside the tokio
		// runtime). `unshare(2)`:
		//
		// > CLONE_NEWUSER requires that the calling process is not threaded.
		#[cfg(target_os = "linux")]
		if security_status.can_unshare_user_namespace_and_change_root {
			if let Err(err) = security::change_root::enable_for_worker(&worker_info) {
				// The filesystem may be in an inconsistent state, always bail out.
				let err = format!("Could not change root to be the worker cache path: {}", err);
				worker_shutdown_error(worker_info, &err);
			}
			// After pivoting, the worker dir IS the root of our filesystem view.
			worker_info.worker_dir_path = std::path::Path::new("/").to_owned();
		}

		#[cfg(target_os = "linux")]
		if security_status.can_enable_landlock {
			if let Err(err) = security::landlock::enable_for_worker(&worker_info) {
				// We previously were able to enable, so this should never happen. Shutdown if
				// running in secure mode.
				let err = format!("could not fully enable landlock: {:?}", err);
				gum::error!(
					target: LOG_TARGET,
					?worker_info,
					"{}. This should not happen, please report an issue",
					err
				);
				if security_status.secure_validator_mode {
					worker_shutdown(worker_info, &err);
				}
			}
		}

		// TODO: We can enable the seccomp networking blacklist on aarch64 as well, but we need a CI
		// job to catch regressions. See issue ci_cd/issues/609.
		#[cfg(all(target_os = "linux", target_arch = "x86_64"))]
		if security_status.can_enable_seccomp {
			if let Err(err) = security::seccomp::enable_for_worker(&worker_info) {
				// We previously were able to enable, so this should never happen. Shutdown if
				// running in secure mode.
				let err = format!("could not fully enable seccomp: {:?}", err);
				gum::error!(
					target: LOG_TARGET,
					?worker_info,
					"{}. This should not happen, please report an issue",
					err
				);
				if security_status.secure_validator_mode {
					worker_shutdown(worker_info, &err);
				}
			}
		}
	}

	// Run the main worker loop.
	let err = event_loop(stream, &worker_info, security_status)
		// It's never `Ok` because it's `Ok(Never)`.
		.unwrap_err();

	worker_shutdown(worker_info, &err.to_string());
}
|
||||
|
||||
/// Provide a consistent message on unexpected worker shutdown. Never returns; always exits
/// the process with status 1.
fn worker_shutdown(worker_info: WorkerInfo, err: &str) -> ! {
	gum::warn!(target: LOG_TARGET, ?worker_info, "quitting pvf worker ({}): {}", worker_info.kind, err);
	std::process::exit(1);
}
|
||||
|
||||
/// Provide a consistent error on unexpected worker shutdown.
|
||||
fn worker_shutdown_error(worker_info: WorkerInfo, err: &str) -> ! {
|
||||
gum::error!(target: LOG_TARGET, ?worker_info, "quitting pvf worker ({}): {}", worker_info.kind, err);
|
||||
std::process::exit(1);
|
||||
}
|
||||
|
||||
/// Loop that runs in the CPU time monitor thread on prepare and execute jobs. Continuously wakes up
|
||||
/// and then either blocks for the remaining CPU time, or returns if we exceed the CPU timeout.
|
||||
///
|
||||
/// Returning `Some` indicates that we should send a `TimedOut` error to the host. Will return
|
||||
/// `None` if the other thread finishes first, without us timing out.
|
||||
///
|
||||
/// NOTE: Sending a `TimedOut` error to the host will cause the worker, whether preparation or
|
||||
/// execution, to be killed by the host. We do not kill the process here because it would interfere
|
||||
/// with the proper handling of this error.
|
||||
pub fn cpu_time_monitor_loop(
|
||||
cpu_time_start: ProcessTime,
|
||||
timeout: Duration,
|
||||
finished_rx: Receiver<()>,
|
||||
) -> Option<Duration> {
|
||||
loop {
|
||||
let cpu_time_elapsed = cpu_time_start.elapsed();
|
||||
|
||||
// Treat the timeout as CPU time, which is less subject to variance due to load.
|
||||
if cpu_time_elapsed <= timeout {
|
||||
// Sleep for the remaining CPU time, plus a bit to account for overhead. (And we don't
|
||||
// want to wake up too often -- so, since we just want to halt the worker thread if it
|
||||
// stalled, we can sleep longer than necessary.) Note that the sleep is wall clock time.
|
||||
// The CPU clock may be slower than the wall clock.
|
||||
let sleep_interval = timeout.saturating_sub(cpu_time_elapsed) + JOB_TIMEOUT_OVERHEAD;
|
||||
match finished_rx.recv_timeout(sleep_interval) {
|
||||
// Received finish signal.
|
||||
Ok(()) => return None,
|
||||
// Timed out, restart loop.
|
||||
Err(RecvTimeoutError::Timeout) => continue,
|
||||
Err(RecvTimeoutError::Disconnected) => return None,
|
||||
}
|
||||
}
|
||||
|
||||
return Some(cpu_time_elapsed);
|
||||
}
|
||||
}
|
||||
|
||||
/// Attempt to convert an opaque panic payload to a string.
///
/// This is a best effort, and is not guaranteed to provide the most accurate value.
pub fn stringify_panic_payload(payload: Box<dyn Any + Send + 'static>) -> String {
	// Panic payloads are almost always either a `&'static str` (e.g. `panic!("literal")`) or a
	// `String` (e.g. `panic!("{}", x)`). Try both in turn before giving up.
	let payload = match payload.downcast::<&'static str>() {
		Ok(msg) => return msg.to_string(),
		Err(other) => other,
	};
	match payload.downcast::<String>() {
		Ok(msg) => *msg,
		// At least we tried...
		Err(_) => "unknown panic payload".to_string(),
	}
}
|
||||
|
||||
/// In case of node and worker version mismatch (as a result of in-place upgrade), send `SIGTERM`
/// to the node to tear it down and prevent it from raising disputes on valid candidates. Node
/// restart should be handled by the node owner. As node exits, Unix sockets opened to workers
/// get closed by the OS and other workers receive error on socket read and also exit. Preparation
/// jobs are written to the temporary files that are renamed to real artifacts on the node side, so
/// no leftover artifacts are possible.
fn kill_parent_node_in_emergency() {
	unsafe {
		// SAFETY: `getppid()` never fails but may return "no-parent" (0) or "parent-init" (1) in
		// some corner cases, which is why the result is checked against `> 1` below. `kill()`
		// cannot violate memory safety; its return value is deliberately ignored since this is a
		// best-effort teardown.
		let ppid = libc::getppid();
		if ppid > 1 {
			libc::kill(ppid, libc::SIGTERM);
		}
	}
}
|
||||
|
||||
/// Receives a handshake with information for the worker.
|
||||
fn recv_worker_handshake(stream: &mut UnixStream) -> io::Result<WorkerHandshake> {
|
||||
let worker_handshake = framed_recv_blocking(stream)?;
|
||||
let worker_handshake = WorkerHandshake::decode(&mut &worker_handshake[..]).map_err(|e| {
|
||||
io::Error::new(
|
||||
io::ErrorKind::Other,
|
||||
format!("recv_worker_handshake: failed to decode WorkerHandshake: {}", e),
|
||||
)
|
||||
})?;
|
||||
Ok(worker_handshake)
|
||||
}
|
||||
|
||||
/// Calculate the total CPU time from the given `usage` structure, returned from
|
||||
/// [`nix::sys::resource::getrusage`], and calculates the total CPU time spent, including both user
|
||||
/// and system time.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// - `rusage`: Contains resource usage information.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns a `Duration` representing the total CPU time.
|
||||
pub fn get_total_cpu_usage(rusage: Usage) -> Duration {
|
||||
let micros = (((rusage.user_time().tv_sec() + rusage.system_time().tv_sec()) * 1_000_000) +
|
||||
(rusage.system_time().tv_usec() + rusage.user_time().tv_usec()) as i64) as u64;
|
||||
|
||||
return Duration::from_micros(micros);
|
||||
}
|
||||
|
||||
/// Get a job response.
|
||||
pub fn recv_child_response<T>(
|
||||
received_data: &mut io::BufReader<&[u8]>,
|
||||
context: &'static str,
|
||||
) -> io::Result<T>
|
||||
where
|
||||
T: Decode,
|
||||
{
|
||||
let response_bytes = framed_recv_blocking(received_data)?;
|
||||
T::decode(&mut response_bytes.as_slice()).map_err(|e| {
|
||||
io::Error::new(
|
||||
io::ErrorKind::Other,
|
||||
format!("{} pvf recv_child_response: decode error: {}", context, e),
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
pub fn send_result<T, E>(
|
||||
stream: &mut UnixStream,
|
||||
result: Result<T, E>,
|
||||
worker_info: &WorkerInfo,
|
||||
) -> io::Result<()>
|
||||
where
|
||||
T: std::fmt::Debug,
|
||||
E: std::fmt::Debug + std::fmt::Display,
|
||||
Result<T, E>: Encode,
|
||||
{
|
||||
if let Err(ref err) = result {
|
||||
gum::warn!(
|
||||
target: LOG_TARGET,
|
||||
?worker_info,
|
||||
"worker: error occurred: {}",
|
||||
err
|
||||
);
|
||||
}
|
||||
gum::trace!(
|
||||
target: LOG_TARGET,
|
||||
?worker_info,
|
||||
"worker: sending result to host: {:?}",
|
||||
result
|
||||
);
|
||||
|
||||
framed_send_blocking(stream, &result.encode()).map_err(|err| {
|
||||
gum::warn!(
|
||||
target: LOG_TARGET,
|
||||
?worker_info,
|
||||
"worker: error occurred sending result to host: {}",
|
||||
err
|
||||
);
|
||||
err
|
||||
})
|
||||
}
|
||||
|
||||
pub fn stringify_errno(context: &'static str, errno: Errno) -> String {
|
||||
format!("{}: {}: {}", context, errno, io::Error::last_os_error())
|
||||
}
|
||||
|
||||
/// Functionality related to threads spawned by the workers.
///
/// The motivation for this module is to coordinate worker threads without using async Rust.
pub mod thread {
	use std::{
		io, panic,
		sync::{Arc, Condvar, Mutex},
		thread,
		time::Duration,
	};

	/// Contains the outcome of waiting on threads, or `Pending` if none are ready.
	#[derive(Debug, Clone, Copy)]
	pub enum WaitOutcome {
		/// A thread finished its work and triggered the condvar.
		Finished,
		/// A thread timed out and triggered the condvar.
		TimedOut,
		/// No thread has triggered the condvar yet.
		Pending,
	}

	impl WaitOutcome {
		/// Whether no outcome has been recorded yet.
		pub fn is_pending(&self) -> bool {
			matches!(self, Self::Pending)
		}
	}

	/// Helper type.
	pub type Cond = Arc<(Mutex<WaitOutcome>, Condvar)>;

	/// Gets a condvar initialized to `Pending`.
	pub fn get_condvar() -> Cond {
		Arc::new((Mutex::new(WaitOutcome::Pending), Condvar::new()))
	}

	/// Runs a worker thread. Will run the requested function, and afterwards notify the threads
	/// waiting on the condvar. Catches panics during execution and resumes the panics after
	/// triggering the condvar, so that the waiting thread is notified on panics.
	///
	/// # Returns
	///
	/// Returns the thread's join handle. Calling `.join()` on it returns the result of executing
	/// `f()`, as well as whether we were able to enable sandboxing.
	pub fn spawn_worker_thread<F, R>(
		name: &str,
		f: F,
		cond: Cond,
		outcome: WaitOutcome,
	) -> io::Result<thread::JoinHandle<R>>
	where
		F: FnOnce() -> R,
		F: Send + 'static + panic::UnwindSafe,
		R: Send + 'static,
	{
		thread::Builder::new()
			.name(name.into())
			.spawn(move || cond_notify_on_done(f, cond, outcome))
	}

	/// Runs a worker thread with the given stack size. See [`spawn_worker_thread`].
	pub fn spawn_worker_thread_with_stack_size<F, R>(
		name: &str,
		f: F,
		cond: Cond,
		outcome: WaitOutcome,
		stack_size: usize,
	) -> io::Result<thread::JoinHandle<R>>
	where
		F: FnOnce() -> R,
		F: Send + 'static + panic::UnwindSafe,
		R: Send + 'static,
	{
		thread::Builder::new()
			.name(name.into())
			.stack_size(stack_size)
			.spawn(move || cond_notify_on_done(f, cond, outcome))
	}

	/// Runs a function, afterwards notifying the threads waiting on the condvar. Catches panics and
	/// resumes them after triggering the condvar, so that the waiting thread is notified on panics.
	fn cond_notify_on_done<F, R>(f: F, cond: Cond, outcome: WaitOutcome) -> R
	where
		F: FnOnce() -> R,
		F: panic::UnwindSafe,
	{
		let result = panic::catch_unwind(|| f());
		// Notify BEFORE propagating a possible panic, so the waiting thread always wakes up.
		cond_notify_all(cond, outcome);
		match result {
			Ok(inner) => return inner,
			Err(err) => panic::resume_unwind(err),
		}
	}

	/// Helper function to notify all threads waiting on this condvar.
	fn cond_notify_all(cond: Cond, outcome: WaitOutcome) {
		let (lock, cvar) = &*cond;
		let mut flag = lock.lock().unwrap();
		// First writer wins: once an outcome is set it is never overwritten.
		if !flag.is_pending() {
			// Someone else already triggered the condvar.
			return;
		}
		*flag = outcome;
		cvar.notify_all();
	}

	/// Block the thread while it waits on the condvar.
	pub fn wait_for_threads(cond: Cond) -> WaitOutcome {
		let (lock, cvar) = &*cond;
		let guard = cvar.wait_while(lock.lock().unwrap(), |flag| flag.is_pending()).unwrap();
		*guard
	}

	/// Block the thread while it waits on the condvar or on a timeout. If the timeout is hit,
	/// returns `None`.
	#[cfg_attr(not(any(target_os = "linux", feature = "jemalloc-allocator")), allow(dead_code))]
	pub fn wait_for_threads_with_timeout(cond: &Cond, dur: Duration) -> Option<WaitOutcome> {
		let (lock, cvar) = &**cond;
		let result = cvar
			.wait_timeout_while(lock.lock().unwrap(), dur, |flag| flag.is_pending())
			.unwrap();
		if result.1.timed_out() {
			None
		} else {
			Some(*result.0)
		}
	}

	#[cfg(test)]
	mod tests {
		use super::*;
		use assert_matches::assert_matches;

		#[test]
		fn get_condvar_should_be_pending() {
			let condvar = get_condvar();
			let outcome = *condvar.0.lock().unwrap();
			assert!(outcome.is_pending());
		}

		#[test]
		fn wait_for_threads_with_timeout_return_none_on_time_out() {
			let condvar = Arc::new((Mutex::new(WaitOutcome::Pending), Condvar::new()));
			let outcome = wait_for_threads_with_timeout(&condvar, Duration::from_millis(100));
			assert!(outcome.is_none());
		}

		#[test]
		fn wait_for_threads_with_timeout_returns_outcome() {
			let condvar = Arc::new((Mutex::new(WaitOutcome::Pending), Condvar::new()));
			let condvar2 = condvar.clone();
			cond_notify_all(condvar2, WaitOutcome::Finished);
			let outcome = wait_for_threads_with_timeout(&condvar, Duration::from_secs(2));
			assert_matches!(outcome.unwrap(), WaitOutcome::Finished);
		}

		#[test]
		fn spawn_worker_thread_should_notify_on_done() {
			let condvar = Arc::new((Mutex::new(WaitOutcome::Pending), Condvar::new()));
			let response =
				spawn_worker_thread("thread", || 2, condvar.clone(), WaitOutcome::TimedOut);
			let (lock, _) = &*condvar;
			let r = response.unwrap().join().unwrap();
			assert_eq!(r, 2);
			assert_matches!(*lock.lock().unwrap(), WaitOutcome::TimedOut);
		}

		#[test]
		fn spawn_worker_should_not_change_finished_outcome() {
			let condvar = Arc::new((Mutex::new(WaitOutcome::Finished), Condvar::new()));
			let response =
				spawn_worker_thread("thread", move || 2, condvar.clone(), WaitOutcome::TimedOut);

			let r = response.unwrap().join().unwrap();
			assert_eq!(r, 2);
			assert_matches!(*condvar.0.lock().unwrap(), WaitOutcome::Finished);
		}

		#[test]
		fn cond_notify_on_done_should_update_wait_outcome_when_panic() {
			let condvar = Arc::new((Mutex::new(WaitOutcome::Pending), Condvar::new()));
			let err = panic::catch_unwind(panic::AssertUnwindSafe(|| {
				cond_notify_on_done(|| panic!("test"), condvar.clone(), WaitOutcome::Finished)
			}));

			assert_matches!(*condvar.0.lock().unwrap(), WaitOutcome::Finished);
			assert!(err.is_err());
		}
	}
}
|
||||
|
||||
#[cfg(test)]
mod tests {
	use super::*;
	use std::sync::mpsc::channel;

	#[test]
	fn cpu_time_monitor_loop_should_return_time_elapsed() {
		// A zero timeout is always exceeded, so the monitor must report the elapsed time.
		let (_tx, rx) = channel();
		let start = ProcessTime::now();
		let result = cpu_time_monitor_loop(start, Duration::from_secs(0), rx);
		assert_ne!(result, None);
	}

	#[test]
	fn cpu_time_monitor_loop_should_return_none() {
		// Signalling completion before the (generous) timeout makes the monitor return `None`.
		let (tx, rx) = channel();
		tx.send(()).unwrap();
		let start = ProcessTime::now();
		let result = cpu_time_monitor_loop(start, Duration::from_secs(10), rx);
		assert_eq!(result, None);
	}
}
|
||||
@@ -0,0 +1,168 @@
|
||||
// Copyright (C) Parity Technologies (UK) Ltd.
|
||||
// This file is part of Pezkuwi.
|
||||
|
||||
// Pezkuwi is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
|
||||
// Pezkuwi is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
//! Functionality for securing workers by unsharing some namespaces from other processes and
|
||||
//! changing the root.
|
||||
|
||||
use crate::{
|
||||
worker::{WorkerInfo, WorkerKind},
|
||||
LOG_TARGET,
|
||||
};
|
||||
use std::{env, ffi::CString, io, os::unix::ffi::OsStrExt, path::Path, ptr};
|
||||
|
||||
/// Errors that can occur while unsharing namespaces and changing the root.
#[derive(thiserror::Error, Debug)]
pub enum Error {
	/// A raw OS call failed; the message already carries the context of which call it was.
	#[error("{0}")]
	OsErrWithContext(String),
	/// A standard I/O error, e.g. from the post-pivot `env::current_dir` assertions.
	#[error(transparent)]
	Io(#[from] io::Error),
	/// A sanity check on the resulting environment failed.
	#[error("assertion failed: {0}")]
	AssertionFailed(String),
}

/// Convenience alias for results in this module.
pub type Result<T> = std::result::Result<T, Error>;
|
||||
|
||||
/// Try to enable for the given kind of worker.
|
||||
///
|
||||
/// NOTE: This should not be called in a multi-threaded context. `unshare(2)`:
|
||||
/// "CLONE_NEWUSER requires that the calling process is not threaded."
|
||||
pub fn enable_for_worker(worker_info: &WorkerInfo) -> Result<()> {
|
||||
gum::trace!(
|
||||
target: LOG_TARGET,
|
||||
?worker_info,
|
||||
"enabling change-root",
|
||||
);
|
||||
|
||||
try_restrict(worker_info)
|
||||
}
|
||||
|
||||
/// Runs a check for unshare-and-change-root and returns an error indicating whether it can be fully
|
||||
/// enabled on the current Linux environment.
|
||||
///
|
||||
/// NOTE: This should not be called in a multi-threaded context. `unshare(2)`:
|
||||
/// "CLONE_NEWUSER requires that the calling process is not threaded."
|
||||
pub fn check_can_fully_enable(tempdir: &Path) -> Result<()> {
|
||||
let worker_dir_path = tempdir.to_owned();
|
||||
try_restrict(&WorkerInfo {
|
||||
pid: std::process::id(),
|
||||
kind: WorkerKind::CheckPivotRoot,
|
||||
version: None,
|
||||
worker_dir_path,
|
||||
})
|
||||
}
|
||||
|
||||
/// Unshare the user namespace and change root to be the worker directory.
///
/// NOTE: This should not be called in a multi-threaded context. `unshare(2)`:
/// "CLONE_NEWUSER requires that the calling process is not threaded."
fn try_restrict(worker_info: &WorkerInfo) -> Result<()> {
	// TODO: Remove this once this is stable: https://github.com/rust-lang/rust/issues/105723
	macro_rules! cstr_ptr {
		($e:expr) => {
			concat!($e, "\0").as_ptr().cast::<core::ffi::c_char>()
		};
	}

	let worker_dir_path_c = CString::new(worker_info.worker_dir_path.as_os_str().as_bytes())
		.expect("on unix; the path will never contain 0 bytes; qed");

	// Wrapper around all the work to prevent repetitive error handling.
	//
	// # Errors
	//
	// It's the caller's responsibility to call `Error::last_os_error`. Note that that alone does
	// not give the context of which call failed, so we return a &str error.
	|| -> std::result::Result<(), &'static str> {
		// SAFETY: We pass null-terminated C strings and use the APIs as documented. In fact, steps
		// (2) and (3) are adapted from the example in pivot_root(2), with the additional
		// change described in the `pivot_root(".", ".")` section.
		unsafe {
			// 1. `unshare` the user and the mount namespaces.
			if libc::unshare(libc::CLONE_NEWUSER | libc::CLONE_NEWNS) < 0 {
				return Err("unshare user and mount namespaces");
			}

			// 2. Setup mounts.
			//
			// Ensure that new root and its parent mount don't have shared propagation (which would
			// cause pivot_root() to return an error), and prevent propagation of mount events to
			// the initial mount namespace.
			if libc::mount(
				ptr::null(),
				cstr_ptr!("/"),
				ptr::null(),
				libc::MS_REC | libc::MS_PRIVATE,
				ptr::null(),
			) < 0
			{
				return Err("mount MS_PRIVATE");
			}
			// Ensure that the new root is a mount point.
			// Execute and pivot-root-check workers get a read-only bind mount; other kinds (i.e.
			// prepare) keep it writable.
			let additional_flags =
				if let WorkerKind::Execute | WorkerKind::CheckPivotRoot = worker_info.kind {
					libc::MS_RDONLY
				} else {
					0
				};
			if libc::mount(
				worker_dir_path_c.as_ptr(),
				worker_dir_path_c.as_ptr(),
				ptr::null(), // ignored when MS_BIND is used
				libc::MS_BIND |
					libc::MS_REC | libc::MS_NOEXEC |
					libc::MS_NODEV | libc::MS_NOSUID |
					libc::MS_NOATIME |
					additional_flags,
				ptr::null(), // ignored when MS_BIND is used
			) < 0
			{
				return Err("mount MS_BIND");
			}

			// 3. `pivot_root` to the artifact directory.
			if libc::chdir(worker_dir_path_c.as_ptr()) < 0 {
				return Err("chdir to worker dir path");
			}
			// `pivot_root(".", ".")` stacks the old root on top of the new one; the detaching
			// umount below then removes the old root from the stack.
			if libc::syscall(libc::SYS_pivot_root, cstr_ptr!("."), cstr_ptr!(".")) < 0 {
				return Err("pivot_root");
			}
			if libc::umount2(cstr_ptr!("."), libc::MNT_DETACH) < 0 {
				return Err("umount the old root mount point");
			}
		}

		Ok(())
	}()
	.map_err(|err_ctx| {
		let err = io::Error::last_os_error();
		Error::OsErrWithContext(format!("{}: {}", err_ctx, err))
	})?;

	// Do some assertions.
	if env::current_dir()? != Path::new("/") {
		return Err(Error::AssertionFailed(
			"expected current dir after pivot_root to be `/`".into(),
		));
	}
	// After a successful pivot, `..` from the new root must resolve to the root itself.
	env::set_current_dir("..")?;
	if env::current_dir()? != Path::new("/") {
		return Err(Error::AssertionFailed(
			"expected not to be able to break out of new root by doing `..`".into(),
		));
	}

	Ok(())
}
|
||||
@@ -0,0 +1,93 @@
|
||||
// Copyright (C) Parity Technologies (UK) Ltd.
|
||||
// This file is part of Pezkuwi.
|
||||
|
||||
// Pezkuwi is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
|
||||
// Pezkuwi is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
//! Functionality for securing the job processes spawned by the workers using `clone`. If
|
||||
//! unsupported, falls back to `fork`.
|
||||
|
||||
use crate::{worker::WorkerInfo, LOG_TARGET};
|
||||
use nix::{
|
||||
errno::Errno,
|
||||
sched::{CloneCb, CloneFlags},
|
||||
unistd::Pid,
|
||||
};
|
||||
|
||||
/// Errors that can occur when spawning a job process via `clone(2)`.
#[derive(thiserror::Error, Debug)]
pub enum Error {
	/// The `clone(2)` call itself failed with the given errno.
	#[error("could not clone, errno: {0}")]
	Clone(Errno),
}

/// Convenience alias for results in this module.
pub type Result<T> = std::result::Result<T, Error>;
|
||||
|
||||
/// Try to run clone(2) on the current worker.
|
||||
///
|
||||
/// SAFETY: new process should be either spawned within a single threaded process, or use only
|
||||
/// async-signal-safe functions.
|
||||
pub unsafe fn clone_on_worker(
|
||||
worker_info: &WorkerInfo,
|
||||
have_unshare_newuser: bool,
|
||||
cb: CloneCb,
|
||||
) -> Result<Pid> {
|
||||
let flags = clone_flags(have_unshare_newuser);
|
||||
|
||||
gum::trace!(
|
||||
target: LOG_TARGET,
|
||||
?worker_info,
|
||||
"calling clone with flags: {:?}",
|
||||
flags
|
||||
);
|
||||
|
||||
try_clone(cb, flags)
|
||||
}
|
||||
|
||||
/// Runs a check for clone(2) with all sandboxing flags and returns an error indicating whether it
|
||||
/// can be fully enabled on the current Linux environment.
|
||||
///
|
||||
/// SAFETY: new process should be either spawned within a single threaded process, or use only
|
||||
/// async-signal-safe functions.
|
||||
pub unsafe fn check_can_fully_clone() -> Result<()> {
|
||||
try_clone(Box::new(|| 0), clone_flags(false)).map(|_pid| ())
|
||||
}
|
||||
|
||||
/// Runs clone(2) with all sandboxing flags.
|
||||
///
|
||||
/// SAFETY: new process should be either spawned within a single threaded process, or use only
|
||||
/// async-signal-safe functions.
|
||||
unsafe fn try_clone(cb: CloneCb, flags: CloneFlags) -> Result<Pid> {
|
||||
let mut stack = [0u8; 2 * 1024 * 1024];
|
||||
|
||||
nix::sched::clone(cb, stack.as_mut_slice(), flags, None).map_err(|errno| Error::Clone(errno))
|
||||
}
|
||||
|
||||
/// Returns flags for `clone(2)`, including all the sandbox-related ones.
|
||||
fn clone_flags(have_unshare_newuser: bool) -> CloneFlags {
|
||||
// NOTE: CLONE_NEWUSER does not work in `clone` if we previously called `unshare` with this
|
||||
// flag. On the other hand, if we did not call `unshare` we need this flag for the CAP_SYS_ADMIN
|
||||
// capability.
|
||||
let maybe_clone_newuser =
|
||||
if have_unshare_newuser { CloneFlags::empty() } else { CloneFlags::CLONE_NEWUSER };
|
||||
// SIGCHLD flag is used to inform clone that the parent process is
|
||||
// expecting a child termination signal, without this flag `waitpid` function
|
||||
// return `ECHILD` error.
|
||||
maybe_clone_newuser |
|
||||
CloneFlags::CLONE_NEWCGROUP |
|
||||
CloneFlags::CLONE_NEWIPC |
|
||||
CloneFlags::CLONE_NEWNET |
|
||||
CloneFlags::CLONE_NEWNS |
|
||||
CloneFlags::CLONE_NEWPID |
|
||||
CloneFlags::CLONE_NEWUTS |
|
||||
CloneFlags::from_bits_retain(libc::SIGCHLD)
|
||||
}
|
||||
@@ -0,0 +1,323 @@
|
||||
// Copyright (C) Parity Technologies (UK) Ltd.
|
||||
// This file is part of Pezkuwi.
|
||||
|
||||
// Pezkuwi is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
|
||||
// Pezkuwi is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
//! The [landlock] docs say it best:
|
||||
//!
|
||||
//! > "Landlock is a security feature available since Linux 5.13. The goal is to enable to restrict
|
||||
//! ambient rights (e.g., global filesystem access) for a set of processes by creating safe security
|
||||
//! sandboxes as new security layers in addition to the existing system-wide access-controls. This
|
||||
//! kind of sandbox is expected to help mitigate the security impact of bugs, unexpected or
|
||||
//! malicious behaviors in applications. Landlock empowers any process, including unprivileged ones,
|
||||
//! to securely restrict themselves."
|
||||
//!
|
||||
//! [landlock]: https://docs.rs/landlock/latest/landlock/index.html
|
||||
|
||||
pub use landlock::RulesetStatus;
|
||||
|
||||
use crate::{
|
||||
worker::{stringify_panic_payload, WorkerInfo, WorkerKind},
|
||||
LOG_TARGET,
|
||||
};
|
||||
use landlock::*;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
/// Landlock ABI version. We use ABI V1 because:
|
||||
///
|
||||
/// 1. It is supported by our reference kernel version.
|
||||
/// 2. Later versions do not (yet) provide additional security that would benefit us.
|
||||
///
|
||||
/// # Versions (as of October 2023)
|
||||
///
|
||||
/// - Pezkuwi reference kernel version: 5.16+
|
||||
///
|
||||
/// - ABI V1: kernel 5.13 - Introduces landlock, including full restrictions on file reads.
|
||||
///
|
||||
/// - ABI V2: kernel 5.19 - Adds ability to prevent file renaming. Does not help us. During
|
||||
/// execution an attacker can only affect the name of a symlinked artifact and not the original
|
||||
/// one.
|
||||
///
|
||||
/// - ABI V3: kernel 6.2 - Adds ability to prevent file truncation. During execution, can
|
||||
/// prevent attackers from affecting a symlinked artifact. We don't strictly need this as we
|
||||
/// plan to check for file integrity anyway; see
|
||||
/// <https://github.com/pezkuwichain/pezkuwi-sdk/issues/107>.
|
||||
///
|
||||
/// # Determinism
|
||||
///
|
||||
/// You may wonder whether we could always use the latest ABI instead of only the ABI supported
|
||||
/// by the reference kernel version. It seems plausible, since landlock provides a best-effort
|
||||
/// approach to enabling sandboxing. For example, if the reference version only supported V1 and
|
||||
/// we were on V2, then landlock would use V2 if it was supported on the current machine, and
|
||||
/// just fall back to V1 if not.
|
||||
///
|
||||
/// The issue with this is indeterminacy. If half of validators were on V2 and half were on V1,
|
||||
/// they may have different semantics on some PVFs. So a malicious PVF now has a new attack
|
||||
/// vector: they can exploit this indeterminism between landlock ABIs!
|
||||
///
|
||||
/// On the other hand we do want validators to be as secure as possible and protect their keys
|
||||
/// from attackers. And, the risk with indeterminacy is low and there are other indeterminacy
|
||||
/// vectors anyway. So we will only upgrade to a new ABI if either the reference kernel version
|
||||
/// supports it or if it introduces some new feature that is beneficial to security.
|
||||
pub const LANDLOCK_ABI: ABI = ABI::V1;
|
||||
|
||||
/// Errors that can occur while enabling the landlock sandbox.
#[derive(thiserror::Error, Debug)]
pub enum Error {
	/// The ruleset was only partially enforced (or not at all).
	#[error("Could not fully enable: {0:?}")]
	NotFullyEnabled(RulesetStatus),
	/// An exception path did not yield any rule (e.g. the path does not exist).
	#[error("Invalid exception path: {0:?}")]
	InvalidExceptionPath(PathBuf),
	/// An error from the `landlock` crate while building or applying the ruleset.
	#[error(transparent)]
	RulesetError(#[from] RulesetError),
	/// The thread running the landlock check panicked.
	#[error("A panic occurred in try_restrict: {0}")]
	Panic(String),
}

/// Convenience alias for results in this module.
pub type Result<T> = std::result::Result<T, Error>;
|
||||
|
||||
/// Try to enable landlock for the given kind of worker.
|
||||
pub fn enable_for_worker(worker_info: &WorkerInfo) -> Result<()> {
|
||||
let exceptions: Vec<(PathBuf, BitFlags<AccessFs>)> = match worker_info.kind {
|
||||
WorkerKind::Prepare => {
|
||||
vec![(worker_info.worker_dir_path.to_owned(), AccessFs::WriteFile.into())]
|
||||
},
|
||||
WorkerKind::Execute => {
|
||||
vec![(worker_info.worker_dir_path.to_owned(), AccessFs::ReadFile.into())]
|
||||
},
|
||||
WorkerKind::CheckPivotRoot => {
|
||||
panic!("this should only be passed for checking pivot_root; qed")
|
||||
},
|
||||
};
|
||||
|
||||
gum::trace!(
|
||||
target: LOG_TARGET,
|
||||
?worker_info,
|
||||
"enabling landlock with exceptions: {:?}",
|
||||
exceptions,
|
||||
);
|
||||
|
||||
try_restrict(exceptions)
|
||||
}
|
||||
|
||||
// TODO: <https://github.com/landlock-lsm/rust-landlock/issues/36>
|
||||
/// Runs a check for landlock in its own thread, and returns an error indicating whether the given
|
||||
/// landlock ABI is fully enabled on the current Linux environment.
|
||||
pub fn check_can_fully_enable() -> Result<()> {
|
||||
match std::thread::spawn(|| try_restrict(std::iter::empty::<(PathBuf, AccessFs)>())).join() {
|
||||
Ok(Ok(())) => Ok(()),
|
||||
Ok(Err(err)) => Err(err),
|
||||
Err(err) => Err(Error::Panic(stringify_panic_payload(err))),
|
||||
}
|
||||
}
|
||||
|
||||
/// Tries to restrict the current thread (should only be called in a process' main thread) with
|
||||
/// the following landlock access controls:
|
||||
///
|
||||
/// 1. all global filesystem access restricted, with optional exceptions
|
||||
/// 2. ... more sandbox types (e.g. networking) may be supported in the future.
|
||||
///
|
||||
/// If landlock is not supported in the current environment this is simply a noop.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// The status of the restriction (whether it was fully, partially, or not-at-all enforced).
|
||||
fn try_restrict<I, P, A>(fs_exceptions: I) -> Result<()>
|
||||
where
|
||||
I: IntoIterator<Item = (P, A)>,
|
||||
P: AsRef<Path>,
|
||||
A: Into<BitFlags<AccessFs>>,
|
||||
{
|
||||
let mut ruleset =
|
||||
Ruleset::default().handle_access(AccessFs::from_all(LANDLOCK_ABI))?.create()?;
|
||||
for (fs_path, access_bits) in fs_exceptions {
|
||||
let paths = &[fs_path.as_ref().to_owned()];
|
||||
let mut rules = path_beneath_rules(paths, access_bits).peekable();
|
||||
if rules.peek().is_none() {
|
||||
// `path_beneath_rules` silently ignores missing paths, so check for it manually.
|
||||
return Err(Error::InvalidExceptionPath(fs_path.as_ref().to_owned()));
|
||||
}
|
||||
ruleset = ruleset.add_rules(rules)?;
|
||||
}
|
||||
|
||||
let status = ruleset.restrict_self()?;
|
||||
if !matches!(status.ruleset, RulesetStatus::FullyEnforced) {
|
||||
return Err(Error::NotFullyEnabled(status.ruleset));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use std::{fs, io::ErrorKind, thread};
|
||||
|
||||
	#[test]
	fn restricted_thread_cannot_read_file() {
		// TODO: This would be nice: <https://github.com/rust-lang/rust/issues/68007>.
		if check_can_fully_enable().is_err() {
			return;
		}

		// Restricted thread cannot read from FS. Run the restricted part on its own thread so
		// the test harness thread itself is not sandboxed.
		let handle = thread::spawn(|| {
			// Create, write, and read two tmp files. This should succeed before any
			// landlock restrictions are applied.
			const TEXT: &str = "foo";
			let tmpfile1 = tempfile::NamedTempFile::new().unwrap();
			let path1 = tmpfile1.path();
			let tmpfile2 = tempfile::NamedTempFile::new().unwrap();
			let path2 = tmpfile2.path();

			fs::write(path1, TEXT).unwrap();
			let s = fs::read_to_string(path1).unwrap();
			assert_eq!(s, TEXT);
			fs::write(path2, TEXT).unwrap();
			let s = fs::read_to_string(path2).unwrap();
			assert_eq!(s, TEXT);

			// Apply Landlock with a read exception for only one of the files.
			let status = try_restrict(vec![(path1, AccessFs::ReadFile)]);
			if !matches!(status, Ok(())) {
				panic!(
					"Ruleset should be enforced since we checked if landlock is enabled: {:?}",
					status
				);
			}

			// Try to read from both files, only tmpfile1 should succeed.
			let result = fs::read_to_string(path1);
			assert!(matches!(
				result,
				Ok(s) if s == TEXT
			));
			let result = fs::read_to_string(path2);
			assert!(matches!(
				result,
				Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied)
			));

			// Apply Landlock for all files. Rulesets stack: this second, stricter ruleset is
			// layered on top of the first one, so even path1 loses its read exception below.
			let status = try_restrict(std::iter::empty::<(PathBuf, AccessFs)>());
			if !matches!(status, Ok(())) {
				panic!(
					"Ruleset should be enforced since we checked if landlock is enabled: {:?}",
					status
				);
			}

			// Try to read from tmpfile1 after landlock, it should fail.
			let result = fs::read_to_string(path1);
			assert!(matches!(
				result,
				Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied)
			));
		});

		assert!(handle.join().is_ok());
	}
|
||||
|
||||
#[test]
|
||||
fn restricted_thread_cannot_write_file() {
|
||||
// TODO: This would be nice: <https://github.com/rust-lang/rust/issues/68007>.
|
||||
if check_can_fully_enable().is_err() {
|
||||
return;
|
||||
}
|
||||
|
||||
// Restricted thread cannot write to FS.
|
||||
let handle = thread::spawn(|| {
|
||||
// Create and write two tmp files. This should succeed before any landlock
|
||||
// restrictions are applied.
|
||||
const TEXT: &str = "foo";
|
||||
let tmpfile1 = tempfile::NamedTempFile::new().unwrap();
|
||||
let path1 = tmpfile1.path();
|
||||
let tmpfile2 = tempfile::NamedTempFile::new().unwrap();
|
||||
let path2 = tmpfile2.path();
|
||||
|
||||
fs::write(path1, TEXT).unwrap();
|
||||
fs::write(path2, TEXT).unwrap();
|
||||
|
||||
// Apply Landlock with a write exception for only one of the files.
|
||||
let status = try_restrict(vec![(path1, AccessFs::WriteFile)]);
|
||||
if !matches!(status, Ok(())) {
|
||||
panic!(
|
||||
"Ruleset should be enforced since we checked if landlock is enabled: {:?}",
|
||||
status
|
||||
);
|
||||
}
|
||||
|
||||
// Try to write to both files, only tmpfile1 should succeed.
|
||||
let result = fs::write(path1, TEXT);
|
||||
assert!(matches!(result, Ok(_)));
|
||||
let result = fs::write(path2, TEXT);
|
||||
assert!(matches!(
|
||||
result,
|
||||
Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied)
|
||||
));
|
||||
|
||||
// Apply Landlock for all files.
|
||||
let status = try_restrict(std::iter::empty::<(PathBuf, AccessFs)>());
|
||||
if !matches!(status, Ok(())) {
|
||||
panic!(
|
||||
"Ruleset should be enforced since we checked if landlock is enabled: {:?}",
|
||||
status
|
||||
);
|
||||
}
|
||||
|
||||
// Try to write to tmpfile1 after landlock, it should fail.
|
||||
let result = fs::write(path1, TEXT);
|
||||
assert!(matches!(
|
||||
result,
|
||||
Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied)
|
||||
));
|
||||
});
|
||||
|
||||
assert!(handle.join().is_ok());
|
||||
}
|
||||
|
||||
// Test that checks whether landlock under our ABI version is able to truncate files.
|
||||
#[test]
|
||||
fn restricted_thread_can_truncate_file() {
|
||||
// TODO: This would be nice: <https://github.com/rust-lang/rust/issues/68007>.
|
||||
if check_can_fully_enable().is_err() {
|
||||
return;
|
||||
}
|
||||
|
||||
// Restricted thread can truncate file.
|
||||
let handle = thread::spawn(|| {
|
||||
// Create and write a file. This should succeed before any landlock
|
||||
// restrictions are applied.
|
||||
const TEXT: &str = "foo";
|
||||
let tmpfile = tempfile::NamedTempFile::new().unwrap();
|
||||
let path = tmpfile.path();
|
||||
|
||||
fs::write(path, TEXT).unwrap();
|
||||
|
||||
// Apply Landlock with all exceptions under the current ABI.
|
||||
let status = try_restrict(vec![(path, AccessFs::from_all(LANDLOCK_ABI))]);
|
||||
if !matches!(status, Ok(())) {
|
||||
panic!(
|
||||
"Ruleset should be enforced since we checked if landlock is enabled: {:?}",
|
||||
status
|
||||
);
|
||||
}
|
||||
|
||||
// Try to truncate the file.
|
||||
let result = tmpfile.as_file().set_len(0);
|
||||
assert!(result.is_ok());
|
||||
});
|
||||
|
||||
assert!(handle.join().is_ok());
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,77 @@
|
||||
// Copyright (C) Parity Technologies (UK) Ltd.
|
||||
// This file is part of Pezkuwi.
|
||||
|
||||
// Pezkuwi is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
|
||||
// Pezkuwi is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
//! Functionality for securing workers.
|
||||
//!
|
||||
//! This is needed because workers are used to compile and execute untrusted code (PVFs).
|
||||
//!
|
||||
//! We currently employ the following security measures:
|
||||
//!
|
||||
//! - Restrict filesystem
|
||||
//! - Use Landlock to remove all unnecessary FS access rights.
|
||||
//! - Unshare the user and mount namespaces.
|
||||
//! - Change the root directory to a worker-specific temporary directory.
|
||||
//! - Restrict networking by blocking socket creation and io_uring.
|
||||
//! - Remove env vars
|
||||
|
||||
#[cfg(target_os = "linux")]
|
||||
pub mod change_root;
|
||||
#[cfg(target_os = "linux")]
|
||||
pub mod clone;
|
||||
#[cfg(target_os = "linux")]
|
||||
pub mod landlock;
|
||||
#[cfg(all(target_os = "linux", target_arch = "x86_64"))]
|
||||
pub mod seccomp;
|
||||
|
||||
use crate::{worker::WorkerInfo, LOG_TARGET};
|
||||
|
||||
/// Require env vars to have been removed when spawning the process, to prevent malicious code from
|
||||
/// accessing them.
|
||||
pub fn check_env_vars_were_cleared(worker_info: &WorkerInfo) -> bool {
|
||||
gum::trace!(
|
||||
target: LOG_TARGET,
|
||||
?worker_info,
|
||||
"clearing env vars in worker",
|
||||
);
|
||||
|
||||
let mut ok = true;
|
||||
|
||||
for (key, value) in std::env::vars_os() {
|
||||
// TODO: *theoretically* the value (or mere presence) of `RUST_LOG` can be a source of
|
||||
// randomness for malicious code. It should be removed in the job process, which does no
|
||||
// logging.
|
||||
if key == "RUST_LOG" {
|
||||
continue;
|
||||
}
|
||||
// An exception for MacOS. This is not a secure platform anyway, so we let it slide.
|
||||
#[cfg(target_os = "macos")]
|
||||
if key == "__CF_USER_TEXT_ENCODING" {
|
||||
continue;
|
||||
}
|
||||
|
||||
gum::error!(
|
||||
target: LOG_TARGET,
|
||||
?worker_info,
|
||||
?key,
|
||||
?value,
|
||||
"env var was present that should have been removed",
|
||||
);
|
||||
|
||||
ok = false;
|
||||
}
|
||||
|
||||
ok
|
||||
}
|
||||
@@ -0,0 +1,191 @@
|
||||
// Copyright (C) Parity Technologies (UK) Ltd.
|
||||
// This file is part of Pezkuwi.
|
||||
|
||||
// Pezkuwi is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
|
||||
// Pezkuwi is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
//! Functionality for sandboxing workers by restricting their capabilities by blocking certain
|
||||
//! syscalls with seccomp.
|
||||
//!
|
||||
//! For security we block the following:
|
||||
//!
|
||||
//! - creation of new sockets - these are unneeded in PVF jobs, and we can safely block them without
|
||||
//! affecting consensus.
|
||||
//!
|
||||
//! - `io_uring` - allows for networking and needs to be blocked. See below for a discussion on the
|
||||
//! safety of doing this.
|
||||
//!
|
||||
//! # Safety of blocking io_uring
|
||||
//!
|
||||
//! `io_uring` is just a way of issuing system calls in an async manner, and there is nothing
|
||||
//! stopping wasmtime from legitimately using it. Fortunately, at the moment it does not. Generally,
|
||||
//! not many applications use `io_uring` in production yet, because of the numerous kernel CVEs
|
||||
//! discovered. It's still under a lot of development. Android outright banned `io_uring` for these
|
||||
//! reasons.
|
||||
//!
|
||||
//! Considering `io_uring`'s status discussed above, and that it very likely would get detected
|
||||
//! either by our [static analysis](https://github.com/paritytech/polkadot-sdk/pull/1663) or by
|
||||
//! testing, we think it is safe to block it.
|
||||
//!
|
||||
//! ## Consensus analysis
|
||||
//!
|
||||
//! If execution hits an edge case code path unique to a given machine, it's already taken a
|
||||
//! non-deterministic branch anyway. After all, we just care that the majority of validators reach
|
||||
//! the same result and preserve consensus. So worst-case scenario, there's a dispute, and we can
|
||||
//! always admit fault and refund the wrong validator. On the other hand, if all validators take the
|
||||
//! code path that results in a seccomp violation, then they would all vote against the current
|
||||
//! candidate, which is also fine. The violation would get logged (in big scary letters) and
|
||||
//! hopefully some validator reports it to us.
|
||||
//!
|
||||
//! Actually, a worst-worse-case scenario is that 50% of validators vote against, so that there is
|
||||
//! no consensus. But so many things would have to go wrong for that to happen:
|
||||
//!
|
||||
//! 1. An update to `wasmtime` is introduced that uses io_uring (unlikely as io_uring is mainly for
|
||||
//! IO-heavy applications)
|
||||
//!
|
||||
//! 2. The new syscall is not detected by our static analysis
|
||||
//!
|
||||
//! 3. It is never triggered in any of our tests
|
||||
//!
|
||||
//! 4. It then gets triggered on some super edge case in production on 50% of validators causing a
|
||||
//! stall (bad but very unlikely)
|
||||
//!
|
||||
//! 5. Or, it triggers on only a few validators causing a dispute (more likely but not as bad)
|
||||
//!
|
||||
//! Considering how many things would have to go wrong here, we believe it's safe to block
|
||||
//! `io_uring`.
|
||||
//!
|
||||
//! # Action on syscall violations
|
||||
//!
|
||||
//! When a forbidden syscall is attempted we immediately kill the process in order to prevent the
|
||||
//! attacker from doing anything else. In execution, this will result in voting against the
|
||||
//! candidate.
|
||||
|
||||
use crate::{
|
||||
worker::{stringify_panic_payload, WorkerInfo},
|
||||
LOG_TARGET,
|
||||
};
|
||||
use seccompiler::*;
|
||||
use std::collections::BTreeMap;
|
||||
|
||||
/// The action to take on caught syscalls.
//
// In production we kill the whole process (not just the offending thread) to prevent the
// attacker from doing anything else, per this module's "Action on syscall violations" docs.
#[cfg(not(test))]
const CAUGHT_ACTION: SeccompAction = SeccompAction::KillProcess;
/// Don't kill the process when testing.
//
// Instead the blocked syscall fails with `EACCES`, which tests observe as
// `ErrorKind::PermissionDenied`.
#[cfg(test)]
const CAUGHT_ACTION: SeccompAction = SeccompAction::Errno(libc::EACCES as u32);
|
||||
|
||||
/// Errors that can occur while building or applying the seccomp filter.
#[derive(thiserror::Error, Debug)]
pub enum Error {
	/// Error bubbled up from `seccompiler` (e.g. while applying the BPF program).
	#[error(transparent)]
	Seccomp(#[from] seccompiler::Error),
	/// Backend error from `seccompiler` (e.g. while constructing the filter).
	#[error(transparent)]
	Backend(#[from] seccompiler::BackendError),
	/// The thread running `try_restrict` panicked; holds the stringified panic payload.
	#[error("A panic occurred in try_restrict: {0}")]
	Panic(String),
}

/// Convenience alias for results in this module.
pub type Result<T> = std::result::Result<T, Error>;
|
||||
|
||||
/// Try to enable seccomp for the given kind of worker.
///
/// The filter is applied to the calling thread (see `try_restrict`), so this should be called
/// from the worker's own thread. Returns an error if the filter could not be built or applied.
pub fn enable_for_worker(worker_info: &WorkerInfo) -> Result<()> {
	gum::trace!(
		target: LOG_TARGET,
		?worker_info,
		"enabling seccomp",
	);

	// All the actual work happens in `try_restrict`; this wrapper only adds tracing context.
	try_restrict()
}
|
||||
|
||||
/// Runs a check for seccomp in its own thread, and returns an error indicating whether seccomp with
|
||||
/// our rules is fully enabled on the current Linux environment.
|
||||
pub fn check_can_fully_enable() -> Result<()> {
|
||||
match std::thread::spawn(|| try_restrict()).join() {
|
||||
Ok(Ok(())) => Ok(()),
|
||||
Ok(Err(err)) => Err(err),
|
||||
Err(err) => Err(Error::Panic(stringify_panic_payload(err))),
|
||||
}
|
||||
}
|
||||
|
||||
/// Applies a `seccomp` filter to disable networking for the PVF threads.
|
||||
fn try_restrict() -> Result<()> {
|
||||
// Build a `seccomp` filter which by default allows all syscalls except those blocked in the
|
||||
// blacklist.
|
||||
let mut blacklisted_rules = BTreeMap::default();
|
||||
|
||||
// Restrict the creation of sockets.
|
||||
blacklisted_rules.insert(libc::SYS_socketpair, vec![]);
|
||||
blacklisted_rules.insert(libc::SYS_socket, vec![]);
|
||||
|
||||
// Prevent connecting to sockets for extra safety.
|
||||
blacklisted_rules.insert(libc::SYS_connect, vec![]);
|
||||
|
||||
// Restrict io_uring.
|
||||
blacklisted_rules.insert(libc::SYS_io_uring_setup, vec![]);
|
||||
blacklisted_rules.insert(libc::SYS_io_uring_enter, vec![]);
|
||||
blacklisted_rules.insert(libc::SYS_io_uring_register, vec![]);
|
||||
|
||||
let filter = SeccompFilter::new(
|
||||
blacklisted_rules,
|
||||
// Mismatch action: what to do if not in rule list.
|
||||
SeccompAction::Allow,
|
||||
// Match action: what to do if in rule list.
|
||||
CAUGHT_ACTION,
|
||||
TargetArch::x86_64,
|
||||
)?;
|
||||
|
||||
let bpf_prog: BpfProgram = filter.try_into()?;
|
||||
|
||||
// Applies filter (runs seccomp) to the calling thread.
|
||||
seccompiler::apply_filter(&bpf_prog)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
	use super::*;
	use std::{io::ErrorKind, net::TcpListener, thread};

	#[test]
	fn sandboxed_thread_cannot_use_sockets() {
		// TODO: This would be nice: <https://github.com/rust-lang/rust/issues/68007>.
		if check_can_fully_enable().is_err() {
			return;
		}

		// Run on a dedicated thread, since the filter applies to the calling thread.
		let sandboxed = thread::spawn(|| {
			// Opening a listener must work while the thread is still unrestricted.
			TcpListener::bind("127.0.0.1:0").unwrap();

			if try_restrict().is_err() {
				panic!("Ruleset should be enforced since we checked if seccomp is enabled");
			}

			// With the filter installed, socket creation must be rejected...
			let bind_attempt = TcpListener::bind("127.0.0.1:0");
			assert!(matches!(
				bind_attempt,
				Err(err) if err.kind() == ErrorKind::PermissionDenied
			));

			// ...while unrelated syscalls keep working.
			// SAFETY: `getppid` takes no arguments and cannot fail.
			unsafe {
				assert!(libc::getppid() > 0);
			}
		});

		assert!(sandboxed.join().is_ok());
	}
}
|
||||
@@ -0,0 +1,30 @@
|
||||
// Copyright (C) Parity Technologies (UK) Ltd.
|
||||
// This file is part of Pezkuwi.
|
||||
|
||||
// Pezkuwi is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
|
||||
// Pezkuwi is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
//! Shared functions for getting the known worker files.
|
||||
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
/// File name of the compiled artifact inside an execute worker's directory.
const EXECUTE_ARTIFACT_FILE_NAME: &str = "artifact";
/// File name of the in-progress artifact inside a prepare worker's directory.
const PREPARE_TMP_ARTIFACT_FILE_NAME: &str = "tmp-artifact";

/// Returns the well-known path of the compiled artifact within the given worker directory.
pub fn execute_artifact(worker_dir_path: &Path) -> PathBuf {
	worker_dir_path.join(EXECUTE_ARTIFACT_FILE_NAME)
}

/// Returns the well-known path of the temporary artifact within the given worker directory.
pub fn prepare_tmp_artifact(worker_dir_path: &Path) -> PathBuf {
	worker_dir_path.join(PREPARE_TMP_ARTIFACT_FILE_NAME)
}
|
||||
Reference in New Issue
Block a user