feat: initialize Kurdistan SDK - independent fork of Polkadot SDK

2025-12-13 15:44:15 +03:00
commit e4778b4576
6838 changed files with 1847450 additions and 0 deletions
@@ -0,0 +1,54 @@
+[package]
+name = "pezkuwi-node-core-pvf-prepare-worker"
+description = "Pezkuwi crate that contains the logic for preparing PVFs. Used by the pezkuwi-prepare-worker binary."
+version = "7.0.0"
+authors.workspace = true
+edition.workspace = true
+license.workspace = true
+homepage.workspace = true
+repository.workspace = true
+
+[lints]
+workspace = true
+
+[[bench]]
+name = "prepare_pezkuwichain_runtime"
+harness = false
+
+[dependencies]
+cfg-if = { workspace = true }
+gum = { workspace = true, default-features = true }
+libc = { workspace = true }
+nix = { features = ["process", "resource", "sched"], workspace = true }
+tikv-jemalloc-ctl = { optional = true, workspace = true }
+tikv-jemallocator = { optional = true, workspace = true }
+tracking-allocator = { workspace = true, default-features = true }
+
+codec = { features = ["derive"], workspace = true }
+
+pezkuwi-node-core-pvf-common = { workspace = true, default-features = true }
+pezkuwi-primitives = { workspace = true, default-features = true }
+
+sp-maybe-compressed-blob = { workspace = true, default-features = true }
+
+[target.'cfg(target_os = "linux")'.dependencies]
+tikv-jemallocator = { workspace = true }
+tikv-jemalloc-ctl = { workspace = true }
+
+[dev-dependencies]
+criterion = { features = ["cargo_bench_support"], workspace = true }
+pezkuwichain-runtime = { workspace = true }
+
+[features]
+builder = []
+jemalloc-allocator = [
+	"dep:tikv-jemalloc-ctl",
+	"dep:tikv-jemallocator",
+	"pezkuwi-node-core-pvf-common/jemalloc-allocator",
+]
+runtime-benchmarks = [
+	"gum/runtime-benchmarks",
+	"pezkuwi-node-core-pvf-common/runtime-benchmarks",
+	"pezkuwi-primitives/runtime-benchmarks",
+	"pezkuwichain-runtime/runtime-benchmarks",
+]
@@ -0,0 +1,70 @@
+// Copyright (C) Parity Technologies (UK) Ltd.
+// This file is part of Pezkuwi.
+
+// Pezkuwi is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// Pezkuwi is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with Pezkuwi.  If not, see <http://www.gnu.org/licenses/>.
+
+use criterion::{criterion_group, criterion_main, Criterion, SamplingMode};
+use pezkuwi_node_core_pvf_common::{
+	executor_interface::{prepare, prevalidate},
+	prepare::PrepareJobKind,
+	pvf::PvfPrepData,
+};
+use pezkuwi_primitives::ExecutorParams;
+use std::time::Duration;
+
+fn do_prepare_runtime(pvf: PvfPrepData) {
+	let maybe_compressed_code = pvf.maybe_compressed_code();
+	let raw_validation_code =
+		sp_maybe_compressed_blob::decompress(&maybe_compressed_code, usize::MAX).unwrap();
+
+	let blob = match prevalidate(&raw_validation_code) {
+		Err(err) => panic!("{:?}", err),
+		Ok(b) => b,
+	};
+
+	match prepare(blob, &pvf.executor_params()) {
+		Ok(_) => (),
+		Err(err) => panic!("{:?}", err),
+	}
+}
+
+fn prepare_pezkuwichain_runtime(c: &mut Criterion) {
+	let blob = pezkuwichain_runtime::WASM_BINARY.unwrap();
+	let pvf = match sp_maybe_compressed_blob::decompress(&blob, 64 * 1024 * 1024) {
+		Ok(code) => PvfPrepData::from_code(
+			code.into_owned(),
+			ExecutorParams::default(),
+			Duration::from_secs(360),
+			PrepareJobKind::Compilation,
+			64 * 1024 * 1024,
+		),
+		Err(e) => {
+			panic!("Cannot decompress blob: {:?}", e);
+		},
+	};
+
+	let mut group = c.benchmark_group("pezkuwichain");
+	group.sampling_mode(SamplingMode::Flat);
+	group.sample_size(20);
+	group.measurement_time(Duration::from_secs(240));
+	group.bench_function("prepare Pezkuwichain runtime", |b| {
+		// `PvfPrepData` is designed to be cheap to clone, so cloning shouldn't affect the
+		// benchmark accuracy
+		b.iter(|| do_prepare_runtime(pvf.clone()))
+	});
+	group.finish();
+}
+
+criterion_group!(preparation, prepare_pezkuwichain_runtime);
+criterion_main!(preparation);
@@ -0,0 +1,785 @@
+// Copyright (C) Parity Technologies (UK) Ltd.
+// This file is part of Pezkuwi.
+
+// Pezkuwi is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// Pezkuwi is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with Pezkuwi.  If not, see <http://www.gnu.org/licenses/>.
+
+//! Contains the logic for preparing PVFs. Used by the pezkuwi-prepare-worker binary.
+
+mod memory_stats;
+
+// NOTE: Initializing logging in e.g. tests will not have an effect in the workers, as they are
+//       separate spawned processes. Run with e.g. `RUST_LOG=teyrchain::pvf-prepare-worker=trace`.
+const LOG_TARGET: &str = "teyrchain::pvf-prepare-worker";
+
+#[cfg(target_os = "linux")]
+use crate::memory_stats::max_rss_stat::{extract_max_rss_stat, get_max_rss_thread};
+#[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))]
+use crate::memory_stats::memory_tracker::{get_memory_tracker_loop_stats, memory_tracker_loop};
+use codec::{Decode, Encode};
+use nix::{
+	errno::Errno,
+	sys::{
+		resource::{Usage, UsageWho},
+		wait::WaitStatus,
+	},
+	unistd::{ForkResult, Pid},
+};
+use pezkuwi_node_core_pvf_common::{
+	compute_checksum,
+	error::{PrepareError, PrepareWorkerResult},
+	executor_interface::{create_runtime_from_artifact_bytes, prepare, prevalidate},
+	framed_recv_blocking, framed_send_blocking,
+	prepare::{MemoryStats, PrepareJobKind, PrepareStats, PrepareWorkerSuccess},
+	pvf::PvfPrepData,
+	worker::{
+		cpu_time_monitor_loop, get_total_cpu_usage, pipe2_cloexec, recv_child_response, run_worker,
+		send_result, stringify_errno, stringify_panic_payload,
+		thread::{self, spawn_worker_thread, WaitOutcome},
+		PipeFd, WorkerInfo, WorkerKind,
+	},
+	worker_dir, ProcessTime,
+};
+use pezkuwi_primitives::ExecutorParams;
+use std::{
+	fs,
+	io::{self, Read},
+	os::{
+		fd::{AsRawFd, FromRawFd, RawFd},
+		unix::net::UnixStream,
+	},
+	path::{Path, PathBuf},
+	process,
+	sync::{mpsc::channel, Arc},
+	time::Duration,
+};
+use tracking_allocator::TrackingAllocator;
+
+#[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))]
+#[global_allocator]
+static ALLOC: TrackingAllocator<tikv_jemallocator::Jemalloc> =
+	TrackingAllocator(tikv_jemallocator::Jemalloc);
+
+#[cfg(not(any(target_os = "linux", feature = "jemalloc-allocator")))]
+#[global_allocator]
+static ALLOC: TrackingAllocator<std::alloc::System> = TrackingAllocator(std::alloc::System);
+
+/// The number of threads for the child process:
+/// 1 - Main thread
+/// 2 - Cpu monitor thread
+/// 3 - Memory tracker thread
+/// 4 - Prepare thread
+///
+/// NOTE: The correctness of this value is enforced by a test. If the number of threads inside
+/// the child process changes in the future, this value must be changed as well.
+pub const PREPARE_WORKER_THREAD_NUMBER: u32 = 4;
+
+/// Contains the bytes for a successfully compiled artifact.
+#[derive(Encode, Decode)]
+pub struct CompiledArtifact(Vec<u8>);
+
+impl CompiledArtifact {
+	/// Creates a `CompiledArtifact`.
+	pub fn new(code: Vec<u8>) -> Self {
+		Self(code)
+	}
+}
+
+impl AsRef<[u8]> for CompiledArtifact {
+	fn as_ref(&self) -> &[u8] {
+		self.0.as_slice()
+	}
+}
+
+#[derive(Encode, Decode)]
+pub struct PrepareOutcome {
+	pub compiled_artifact: CompiledArtifact,
+	pub observed_wasm_code_len: u32,
+}
+
+/// Get a worker request.
+fn recv_request(stream: &mut UnixStream) -> io::Result<PvfPrepData> {
+	let pvf = framed_recv_blocking(stream)?;
+	let pvf = PvfPrepData::decode(&mut &pvf[..]).map_err(|e| {
+		io::Error::new(
+			io::ErrorKind::Other,
+			format!("prepare pvf recv_request: failed to decode PvfPrepData: {}", e),
+		)
+	})?;
+	Ok(pvf)
+}
+
+fn start_memory_tracking(fd: RawFd, limit: Option<isize>) {
+	unsafe {
+		// SAFETY: Inside the failure handler, the allocator is locked and no allocations or
+		// deallocations are possible. For Linux, that always holds for the code below, so it's
+		// safe. For MacOS, that technically holds at the time of writing, but there are no future
+		// guarantees.
+		// The arguments of unsafe `libc` calls are valid, the payload validity is covered with
+		// a test.
+		ALLOC.start_tracking(
+			limit,
+			Some(Box::new(move || {
+				#[cfg(target_os = "linux")]
+				{
+					// Syscalls never allocate or deallocate, so this is safe.
+					libc::syscall(libc::SYS_write, fd, OOM_PAYLOAD.as_ptr(), OOM_PAYLOAD.len());
+					libc::syscall(libc::SYS_close, fd);
+					// Make sure we exit from all threads. Copied from glibc.
+					libc::syscall(libc::SYS_exit_group, 1);
+					loop {
+						libc::syscall(libc::SYS_exit, 1);
+					}
+				}
+				#[cfg(not(target_os = "linux"))]
+				{
+					// Syscalls are not available on MacOS, so we have to use `libc` wrappers.
+					// Technically, there may be allocations inside, although they shouldn't be
+					// there. In that case, we'll see deadlocks on MacOS after the OOM condition
+					// triggered. As we consider running a validator on MacOS unsafe, and this
+					// code is only run by a validator, it's a lesser evil.
+					libc::write(fd, OOM_PAYLOAD.as_ptr().cast(), OOM_PAYLOAD.len());
+					libc::close(fd);
+					libc::_exit(1);
+				}
+			})),
+		);
+	}
+}
+
+fn end_memory_tracking() -> isize {
+	ALLOC.end_tracking()
+}
+
+/// The entrypoint that the spawned prepare worker should start with.
+///
+/// # Parameters
+///
+/// - `socket_path`: specifies the path to the socket used to communicate with the host.
+///
+/// - `worker_dir_path`: specifies the path to the worker-specific temporary directory.
+///
+/// - `node_version`: if `Some`, is checked against the `worker_version`. A mismatch results in
+///   immediate worker termination. `None` is used for tests and in other situations when version
+///   check is not necessary.
+///
+/// - `worker_version`: see above
+///
+/// # Flow
+///
+/// This runs the following in a loop:
+///
+/// 1. Get the code and parameters for preparation from the host.
+///
+/// 2. Start a new child process
+///
+/// 3. Start the memory tracker and the actual preparation in two separate threads.
+///
+/// 4. Wait on the two threads created in step 3.
+///
+/// 5. Stop the memory tracker and get the stats.
+///
+/// 6. Pipe the result back to the parent process and exit from child process.
+///
+/// 7. If compilation succeeded, write the compiled artifact into a temporary file.
+///
+/// 8. Send the result of preparation back to the host, including the checksum of the artifact. If
+///    any error occurred in the above steps, we send that in the `PrepareWorkerResult`.
+pub fn worker_entrypoint(
+	socket_path: PathBuf,
+	worker_dir_path: PathBuf,
+	node_version: Option<&str>,
+	worker_version: Option<&str>,
+) {
+	run_worker(
+		WorkerKind::Prepare,
+		socket_path,
+		worker_dir_path,
+		node_version,
+		worker_version,
+		|mut stream, worker_info, security_status| {
+			let temp_artifact_dest = worker_dir::prepare_tmp_artifact(&worker_info.worker_dir_path);
+
+			loop {
+				let pvf = recv_request(&mut stream)?;
+				gum::debug!(
+					target: LOG_TARGET,
+					?worker_info,
+					?security_status,
+					"worker: preparing artifact",
+				);
+
+				let preparation_timeout = pvf.prep_timeout();
+				let prepare_job_kind = pvf.prep_kind();
+				let executor_params = pvf.executor_params();
+
+				let (pipe_read_fd, pipe_write_fd) = pipe2_cloexec()?;
+
+				let usage_before = match nix::sys::resource::getrusage(UsageWho::RUSAGE_CHILDREN) {
+					Ok(usage) => usage,
+					Err(errno) => {
+						let result: PrepareWorkerResult =
+							Err(error_from_errno("getrusage before", errno));
+						send_result(&mut stream, result, worker_info)?;
+						continue;
+					},
+				};
+
+				let stream_fd = stream.as_raw_fd();
+
+				cfg_if::cfg_if! {
+					if #[cfg(target_os = "linux")] {
+						let result = if security_status.can_do_secure_clone {
+							handle_clone(
+								&pvf,
+								pipe_write_fd,
+								pipe_read_fd,
+								stream_fd,
+								preparation_timeout,
+								prepare_job_kind,
+								&executor_params,
+								worker_info,
+								security_status.can_unshare_user_namespace_and_change_root,
+								&temp_artifact_dest,
+								usage_before,
+							)
+						} else {
+							// Fall back to using fork.
+							handle_fork(
+								&pvf,
+								pipe_write_fd,
+								pipe_read_fd,
+								stream_fd,
+								preparation_timeout,
+								prepare_job_kind,
+								&executor_params,
+								worker_info,
+								&temp_artifact_dest,
+								usage_before,
+							)
+						};
+					} else {
+						let result = handle_fork(
+							&pvf,
+							pipe_write_fd,
+							pipe_read_fd,
+							stream_fd,
+							preparation_timeout,
+							prepare_job_kind,
+							&executor_params,
+							worker_info,
+							&temp_artifact_dest,
+							usage_before,
+						);
+					}
+				}
+
+				gum::trace!(
+					target: LOG_TARGET,
+					?worker_info,
+					"worker: sending result to host: {:?}",
+					result
+				);
+				send_result(&mut stream, result, worker_info)?;
+			}
+		},
+	);
+}
+
+fn prepare_artifact(pvf: PvfPrepData) -> Result<PrepareOutcome, PrepareError> {
+	let maybe_compressed_code = pvf.maybe_compressed_code();
+	let raw_validation_code = sp_maybe_compressed_blob::decompress(
+		&maybe_compressed_code,
+		pvf.validation_code_bomb_limit() as usize,
+	)
+	.map_err(|e| PrepareError::CouldNotDecompressCodeBlob(e.to_string()))?;
+	let observed_wasm_code_len = raw_validation_code.len() as u32;
+
+	let blob = match prevalidate(&raw_validation_code) {
+		Err(err) => return Err(PrepareError::Prevalidation(format!("{:?}", err))),
+		Ok(b) => b,
+	};
+
+	match prepare(blob, &pvf.executor_params()) {
+		Ok(compiled_artifact) => Ok(PrepareOutcome {
+			compiled_artifact: CompiledArtifact::new(compiled_artifact),
+			observed_wasm_code_len,
+		}),
+		Err(err) => Err(PrepareError::Preparation(format!("{:?}", err))),
+	}
+}
+
+/// Try constructing the runtime to catch any instantiation errors during pre-checking.
+fn runtime_construction_check(
+	artifact_bytes: &[u8],
+	executor_params: &ExecutorParams,
+) -> Result<(), PrepareError> {
+	// SAFETY: We just compiled this artifact.
+	let result = unsafe { create_runtime_from_artifact_bytes(artifact_bytes, executor_params) };
+	result
+		.map(|_runtime| ())
+		.map_err(|err| PrepareError::RuntimeConstruction(format!("{:?}", err)))
+}
+
+#[derive(Encode, Decode)]
+struct JobResponse {
+	artifact: CompiledArtifact,
+	memory_stats: MemoryStats,
+	observed_wasm_code_len: u32,
+}
+
+#[cfg(target_os = "linux")]
+fn handle_clone(
+	pvf: &PvfPrepData,
+	pipe_write_fd: i32,
+	pipe_read_fd: i32,
+	stream_fd: i32,
+	preparation_timeout: Duration,
+	prepare_job_kind: PrepareJobKind,
+	executor_params: &Arc<ExecutorParams>,
+	worker_info: &WorkerInfo,
+	have_unshare_newuser: bool,
+	temp_artifact_dest: &Path,
+	usage_before: Usage,
+) -> Result<PrepareWorkerSuccess, PrepareError> {
+	use pezkuwi_node_core_pvf_common::worker::security;
+
+	// SAFETY: new process is spawned within a single threaded process. This invariant
+	// is enforced by tests. Stack size being specified to ensure child doesn't overflow
+	match unsafe {
+		security::clone::clone_on_worker(
+			worker_info,
+			have_unshare_newuser,
+			Box::new(|| {
+				handle_child_process(
+					pvf.clone(),
+					pipe_write_fd,
+					pipe_read_fd,
+					stream_fd,
+					preparation_timeout,
+					prepare_job_kind,
+					Arc::clone(&executor_params),
+				)
+			}),
+		)
+	} {
+		Ok(child) => handle_parent_process(
+			pipe_read_fd,
+			pipe_write_fd,
+			worker_info,
+			child,
+			temp_artifact_dest,
+			usage_before,
+			preparation_timeout,
+		),
+		Err(security::clone::Error::Clone(errno)) => Err(error_from_errno("clone", errno)),
+	}
+}
+
+fn handle_fork(
+	pvf: &PvfPrepData,
+	pipe_write_fd: i32,
+	pipe_read_fd: i32,
+	stream_fd: i32,
+	preparation_timeout: Duration,
+	prepare_job_kind: PrepareJobKind,
+	executor_params: &Arc<ExecutorParams>,
+	worker_info: &WorkerInfo,
+	temp_artifact_dest: &Path,
+	usage_before: Usage,
+) -> Result<PrepareWorkerSuccess, PrepareError> {
+	// SAFETY: new process is spawned within a single threaded process. This invariant
+	// is enforced by tests.
+	match unsafe { nix::unistd::fork() } {
+		Ok(ForkResult::Child) => handle_child_process(
+			pvf.clone(),
+			pipe_write_fd,
+			pipe_read_fd,
+			stream_fd,
+			preparation_timeout,
+			prepare_job_kind,
+			Arc::clone(executor_params),
+		),
+		Ok(ForkResult::Parent { child }) => handle_parent_process(
+			pipe_read_fd,
+			pipe_write_fd,
+			worker_info,
+			child,
+			temp_artifact_dest,
+			usage_before,
+			preparation_timeout,
+		),
+		Err(errno) => Err(error_from_errno("fork", errno)),
+	}
+}
+
+/// This is used to handle child process during pvf prepare worker.
+/// It prepares the artifact and tracks memory stats during preparation
+/// and pipes back the response to the parent process.
+///
+/// # Returns
+///
+/// - If any error occur, pipe response back with `PrepareError`.
+///
+/// - If success, pipe back `JobResponse`.
+fn handle_child_process(
+	pvf: PvfPrepData,
+	pipe_write_fd: i32,
+	pipe_read_fd: i32,
+	stream_fd: i32,
+	preparation_timeout: Duration,
+	prepare_job_kind: PrepareJobKind,
+	executor_params: Arc<ExecutorParams>,
+) -> ! {
+	// SAFETY: pipe_writer is an open and owned file descriptor at this point.
+	let mut pipe_write = unsafe { PipeFd::from_raw_fd(pipe_write_fd) };
+
+	// Drop the read end so we don't have too many FDs open.
+	if let Err(errno) = nix::unistd::close(pipe_read_fd) {
+		send_child_response(
+			&mut pipe_write,
+			JobResult::Err(error_from_errno("closing pipe", errno)),
+		);
+	}
+
+	// Dropping the stream closes the underlying socket. We want to make sure
+	// that the sandboxed child can't get any kind of information from the
+	// outside world. The only IPC it should be able to do is sending its
+	// response over the pipe.
+	if let Err(errno) = nix::unistd::close(stream_fd) {
+		send_child_response(
+			&mut pipe_write,
+			JobResult::Err(error_from_errno("error closing stream", errno)),
+		);
+	}
+
+	let worker_job_pid = process::id();
+	gum::debug!(
+		target: LOG_TARGET,
+		%worker_job_pid,
+		?prepare_job_kind,
+		?preparation_timeout,
+		"worker job: preparing artifact",
+	);
+
+	// Conditional variable to notify us when a thread is done.
+	let condvar = thread::get_condvar();
+
+	// Run the memory tracker in a regular, non-worker thread.
+	#[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))]
+	let condvar_memory = Arc::clone(&condvar);
+	#[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))]
+	let memory_tracker_thread = std::thread::spawn(|| memory_tracker_loop(condvar_memory));
+
+	start_memory_tracking(
+		pipe_write.as_raw_fd(),
+		executor_params.prechecking_max_memory().map(|v| {
+			v.try_into().unwrap_or_else(|_| {
+				gum::warn!(
+					LOG_TARGET,
+					%worker_job_pid,
+					"Illegal pre-checking max memory value {} discarded",
+					v,
+				);
+				0
+			})
+		}),
+	);
+
+	let cpu_time_start = ProcessTime::now();
+
+	// Spawn a new thread that runs the CPU time monitor.
+	let (cpu_time_monitor_tx, cpu_time_monitor_rx) = channel::<()>();
+	let cpu_time_monitor_thread = thread::spawn_worker_thread(
+		"cpu time monitor thread",
+		move || cpu_time_monitor_loop(cpu_time_start, preparation_timeout, cpu_time_monitor_rx),
+		Arc::clone(&condvar),
+		WaitOutcome::TimedOut,
+	)
+	.unwrap_or_else(|err| {
+		send_child_response(&mut pipe_write, Err(PrepareError::IoErr(err.to_string())))
+	});
+
+	let prepare_thread = spawn_worker_thread(
+		"prepare worker",
+		move || {
+			#[allow(unused_mut)]
+			let mut result = prepare_artifact(pvf).map(|o| (o,));
+
+			// Get the `ru_maxrss` stat. If supported, call getrusage for the thread.
+			#[cfg(target_os = "linux")]
+			let mut result = result.map(|outcome| (outcome.0, get_max_rss_thread()));
+
+			// If we are pre-checking, check for runtime construction errors.
+			//
+			// As pre-checking is more strict than just preparation in terms of memory
+			// and time, it is okay to do extra checks here. This takes negligible time
+			// anyway.
+			if let PrepareJobKind::Prechecking = prepare_job_kind {
+				result = result.and_then(|output| {
+					runtime_construction_check(
+						output.0.compiled_artifact.as_ref(),
+						&executor_params,
+					)?;
+					Ok(output)
+				});
+			}
+			result
+		},
+		Arc::clone(&condvar),
+		WaitOutcome::Finished,
+	)
+	.unwrap_or_else(|err| {
+		send_child_response(&mut pipe_write, Err(PrepareError::IoErr(err.to_string())))
+	});
+
+	let outcome = thread::wait_for_threads(condvar);
+
+	let peak_alloc = {
+		let peak = end_memory_tracking();
+		gum::debug!(
+			target: LOG_TARGET,
+			%worker_job_pid,
+			"prepare job peak allocation is {} bytes",
+			peak,
+		);
+		peak
+	};
+
+	let result = match outcome {
+		WaitOutcome::Finished => {
+			let _ = cpu_time_monitor_tx.send(());
+
+			match prepare_thread.join().unwrap_or_else(|err| {
+				send_child_response(
+					&mut pipe_write,
+					Err(PrepareError::JobError(stringify_panic_payload(err))),
+				)
+			}) {
+				Err(err) => Err(err),
+				Ok(ok) => {
+					cfg_if::cfg_if! {
+						if #[cfg(target_os = "linux")] {
+							let (PrepareOutcome { compiled_artifact, observed_wasm_code_len }, max_rss) = ok;
+						} else {
+							let (PrepareOutcome { compiled_artifact, observed_wasm_code_len },) = ok;
+						}
+					}
+
+					// Stop the memory stats worker and get its observed memory stats.
+					#[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))]
+					let memory_tracker_stats = get_memory_tracker_loop_stats(memory_tracker_thread, process::id());
+
+					let memory_stats = MemoryStats {
+						#[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))]
+						memory_tracker_stats,
+						#[cfg(target_os = "linux")]
+						max_rss: extract_max_rss_stat(max_rss, process::id()),
+						// Negative peak allocation values are legit; they are narrow
+						// corner cases and shouldn't affect overall statistics
+						// significantly
+						peak_tracked_alloc: if peak_alloc > 0 { peak_alloc as u64 } else { 0u64 },
+					};
+
+					Ok(JobResponse {
+						artifact: compiled_artifact,
+						observed_wasm_code_len,
+						memory_stats,
+					})
+				},
+			}
+		},
+
+		// If the CPU thread is not selected, we signal it to end, the join handle is
+		// dropped and the thread will finish in the background.
+		WaitOutcome::TimedOut => match cpu_time_monitor_thread.join() {
+			Ok(Some(_cpu_time_elapsed)) => Err(PrepareError::TimedOut),
+			Ok(None) => Err(PrepareError::IoErr("error communicating over closed channel".into())),
+			Err(err) => Err(PrepareError::IoErr(stringify_panic_payload(err))),
+		},
+		WaitOutcome::Pending => {
+			unreachable!("we run wait_while until the outcome is no longer pending; qed")
+		},
+	};
+
+	send_child_response(&mut pipe_write, result);
+}
+
+/// Waits for child process to finish and handle child response from pipe.
+///
+/// # Returns
+///
+/// - If the child send response without an error, this function returns `Ok(PrepareStats)`
+///   containing memory and CPU usage statistics.
+///
+/// - If the child send response with an error, it returns a `PrepareError` with that error.
+///
+/// - If the child process timeout, it returns `PrepareError::TimedOut`.
+fn handle_parent_process(
+	pipe_read_fd: i32,
+	pipe_write_fd: i32,
+	worker_info: &WorkerInfo,
+	job_pid: Pid,
+	temp_artifact_dest: &Path,
+	usage_before: Usage,
+	timeout: Duration,
+) -> Result<PrepareWorkerSuccess, PrepareError> {
+	// the read end will wait until all write ends have been closed,
+	// this drop is necessary to avoid deadlock
+	if let Err(errno) = nix::unistd::close(pipe_write_fd) {
+		return Err(error_from_errno("closing pipe write fd", errno));
+	};
+
+	// SAFETY: this is an open and owned file descriptor at this point.
+	let mut pipe_read = unsafe { PipeFd::from_raw_fd(pipe_read_fd) };
+
+	// Read from the child. Don't decode unless the process exited normally, which we check later.
+	let mut received_data = Vec::new();
+	pipe_read
+		.read_to_end(&mut received_data)
+		.map_err(|err| PrepareError::IoErr(err.to_string()))?;
+
+	let status = nix::sys::wait::waitpid(job_pid, None);
+	gum::trace!(
+		target: LOG_TARGET,
+		?worker_info,
+		%job_pid,
+		"prepare worker received wait status from job: {:?}",
+		status,
+	);
+
+	let usage_after = nix::sys::resource::getrusage(UsageWho::RUSAGE_CHILDREN)
+		.map_err(|errno| error_from_errno("getrusage after", errno))?;
+
+	// Using `getrusage` is needed to check whether child has timedout since we cannot rely on
+	// child to report its own time.
+	// As `getrusage` returns resource usage from all terminated child processes,
+	// it is necessary to subtract the usage before the current child process to isolate its cpu
+	// time
+	let cpu_tv = get_total_cpu_usage(usage_after) - get_total_cpu_usage(usage_before);
+	if cpu_tv >= timeout {
+		gum::warn!(
+			target: LOG_TARGET,
+			?worker_info,
+			%job_pid,
+			"prepare job took {}ms cpu time, exceeded prepare timeout {}ms",
+			cpu_tv.as_millis(),
+			timeout.as_millis(),
+		);
+		return Err(PrepareError::TimedOut);
+	}
+
+	match status {
+		Ok(WaitStatus::Exited(_pid, exit_status)) => {
+			let mut reader = io::BufReader::new(received_data.as_slice());
+			let result = recv_child_response(&mut reader, "prepare")
+				.map_err(|err| PrepareError::JobError(err.to_string()))?;
+
+			match result {
+				Err(err) => Err(err),
+				Ok(JobResponse { artifact, memory_stats, observed_wasm_code_len }) => {
+					// The exit status should have been zero if no error occurred.
+					if exit_status != 0 {
+						return Err(PrepareError::JobError(format!(
+							"unexpected exit status: {}",
+							exit_status
+						)));
+					}
+
+					// Write the serialized artifact into a temp file.
+					//
+					// PVF host only keeps artifacts statuses in its memory,
+					// successfully compiled code gets stored on the disk (and
+					// consequently deserialized by execute-workers). The prepare worker
+					// is only required to send `Ok` to the pool to indicate the
+					// success.
+					gum::debug!(
+						target: LOG_TARGET,
+						?worker_info,
+						%job_pid,
+						"worker: writing artifact to {}",
+						temp_artifact_dest.display(),
+					);
+					// Write to the temp file created by the host.
+					if let Err(err) = fs::write(temp_artifact_dest, &artifact) {
+						return Err(PrepareError::IoErr(err.to_string()));
+					};
+
+					let checksum = compute_checksum(&artifact.as_ref());
+					Ok(PrepareWorkerSuccess {
+						checksum,
+						stats: PrepareStats {
+							memory_stats,
+							cpu_time_elapsed: cpu_tv,
+							observed_wasm_code_len,
+						},
+					})
+				},
+			}
+		},
+		// The job was killed by the given signal.
+		//
+		// The job gets SIGSYS on seccomp violations, but this signal may have been sent for some
+		// other reason, so we still need to check for seccomp violations elsewhere.
+		Ok(WaitStatus::Signaled(_pid, signal, _core_dump)) => Err(PrepareError::JobDied {
+			err: format!("received signal: {signal:?}"),
+			job_pid: job_pid.as_raw(),
+		}),
+		Err(errno) => Err(error_from_errno("waitpid", errno)),
+
+		// An attacker can make the child process return any exit status it wants. So we can treat
+		// all unexpected cases the same way.
+		Ok(unexpected_wait_status) => Err(PrepareError::JobDied {
+			err: format!("unexpected status from wait: {unexpected_wait_status:?}"),
+			job_pid: job_pid.as_raw(),
+		}),
+	}
+}
+
+/// Write a job response to the pipe and exit process after.
+///
+/// # Arguments
+///
+/// - `pipe_write`: A `PipeFd` structure, the writing end of a pipe.
+///
+/// - `response`: Child process response
+fn send_child_response(pipe_write: &mut PipeFd, response: JobResult) -> ! {
+	framed_send_blocking(pipe_write, response.encode().as_slice())
+		.unwrap_or_else(|_| process::exit(libc::EXIT_FAILURE));
+
+	if response.is_ok() {
+		process::exit(libc::EXIT_SUCCESS)
+	} else {
+		process::exit(libc::EXIT_FAILURE)
+	}
+}
+
+fn error_from_errno(context: &'static str, errno: Errno) -> PrepareError {
+	PrepareError::Kernel(stringify_errno(context, errno))
+}
+
+type JobResult = Result<JobResponse, PrepareError>;
+
+/// Pre-encoded length-prefixed `JobResult::Err(PrepareError::OutOfMemory)`
+const OOM_PAYLOAD: &[u8] = b"\x02\x00\x00\x00\x00\x00\x00\x00\x01\x08";
+
+#[test]
+fn pre_encoded_payloads() {
+	// NOTE: This must match the type of `response` in `send_child_response`.
+	let oom_unencoded: JobResult = JobResult::Err(PrepareError::OutOfMemory);
+	let oom_encoded = oom_unencoded.encode();
+	// The payload is prefixed with	its length in `framed_send`.
+	let mut oom_payload = oom_encoded.len().to_le_bytes().to_vec();
+	oom_payload.extend(oom_encoded);
+	assert_eq!(oom_payload, OOM_PAYLOAD);
+}
@@ -0,0 +1,196 @@
+// Copyright (C) Parity Technologies (UK) Ltd.
+// This file is part of Pezkuwi.
+
+// Pezkuwi is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// Pezkuwi is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with Pezkuwi.  If not, see <http://www.gnu.org/licenses/>.
+
+//! Memory stats for preparation.
+//!
+//! Right now we gather three measurements:
+//!
+//! - `ru_maxrss` (resident set size) from `getrusage`.
+//! - `resident` memory stat provided by `tikv-malloc-ctl`.
+//! - `allocated` memory stat also from `tikv-malloc-ctl`.
+//!
+//! Currently we are only logging these for the purposes of gathering data. In the future, we may
+//! use these stats to reject PVFs during pre-checking. See
+//! <https://github.com/paritytech/polkadot/issues/6472#issuecomment-1381941762> for more
+//! background.
+
+/// Module for the memory tracker. The memory tracker runs in its own thread, where it polls memory
+/// usage at an interval.
+///
+/// NOTE: Requires jemalloc enabled.
+#[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))]
+pub mod memory_tracker {
+	use crate::LOG_TARGET;
+	use pezkuwi_node_core_pvf_common::{
+		prepare::MemoryAllocationStats,
+		worker::{stringify_panic_payload, thread},
+	};
+	use std::{thread::JoinHandle, time::Duration};
+	use tikv_jemalloc_ctl::{epoch, stats, Error};
+
+	#[derive(Clone)]
+	struct MemoryAllocationTracker {
+		epoch: tikv_jemalloc_ctl::epoch_mib,
+		allocated: stats::allocated_mib,
+		resident: stats::resident_mib,
+	}
+
+	impl MemoryAllocationTracker {
+		pub fn new() -> Result<Self, Error> {
+			Ok(Self {
+				epoch: epoch::mib()?,
+				allocated: stats::allocated::mib()?,
+				resident: stats::resident::mib()?,
+			})
+		}
+
+		pub fn snapshot(&self) -> Result<MemoryAllocationStats, Error> {
+			// update stats by advancing the allocation epoch
+			self.epoch.advance()?;
+
+			// Convert to `u64`, as `usize` is not `Encode`able.
+			let allocated = self.allocated.read()? as u64;
+			let resident = self.resident.read()? as u64;
+			Ok(MemoryAllocationStats { allocated, resident })
+		}
+	}
+
+	/// Runs a thread in the background that observes memory statistics. The goal is to try to get
+	/// accurate stats during preparation.
+	///
+	/// # Algorithm
+	///
+	/// 1. Create the memory tracker.
+	///
+	/// 2. Sleep for some short interval. Whenever we wake up, take a snapshot by updating the
+	///    allocation epoch.
+	///
+	/// 3. When we are notified that preparation has completed, take one last snapshot and return
+	///    the maximum observed values.
+	///
+	/// # Errors
+	///
+	/// For simplicity, any errors are returned as a string. As this is not a critical component,
+	/// errors are used for informational purposes (logging) only.
+	pub fn memory_tracker_loop(condvar: thread::Cond) -> Result<MemoryAllocationStats, String> {
+		// NOTE: This doesn't need to be too fine-grained since preparation currently takes 3-10s or
+		// more. Apart from that, there is not really a science to this number.
+		const POLL_INTERVAL: Duration = Duration::from_millis(100);
+
+		let tracker = MemoryAllocationTracker::new().map_err(|err| err.to_string())?;
+		let mut max_stats = MemoryAllocationStats::default();
+
+		let mut update_stats = || -> Result<(), String> {
+			let current_stats = tracker.snapshot().map_err(|err| err.to_string())?;
+			if current_stats.resident > max_stats.resident {
+				max_stats.resident = current_stats.resident;
+			}
+			if current_stats.allocated > max_stats.allocated {
+				max_stats.allocated = current_stats.allocated;
+			}
+			Ok(())
+		};
+
+		loop {
+			// Take a snapshot and update the max stats.
+			update_stats()?;
+
+			// Sleep for the poll interval, or wake up if the condvar is triggered. Note that
+			// `wait_timeout_while` is documented as not being very precise or reliable, which is
+			// fine here -- see note above.
+			match thread::wait_for_threads_with_timeout(&condvar, POLL_INTERVAL) {
+				Some(_outcome) => {
+					update_stats()?;
+					return Ok(max_stats);
+				},
+				None => continue,
+			}
+		}
+	}
+
+	/// Helper function to get the stats from the memory tracker. Helps isolate this error handling.
+	pub fn get_memory_tracker_loop_stats(
+		thread: JoinHandle<Result<MemoryAllocationStats, String>>,
+		worker_pid: u32,
+	) -> Option<MemoryAllocationStats> {
+		match thread.join() {
+			Ok(Ok(stats)) => Some(stats),
+			Ok(Err(err)) => {
+				gum::warn!(
+					target: LOG_TARGET,
+					%worker_pid,
+					"worker: error occurred in the memory tracker thread: {}", err
+				);
+				None
+			},
+			Err(err) => {
+				gum::warn!(
+					target: LOG_TARGET,
+					%worker_pid,
+					"worker: error joining on memory tracker thread: {}", stringify_panic_payload(err)
+				);
+				None
+			},
+		}
+	}
+}
+
+/// Module for dealing with the `ru_maxrss` (peak resident memory) stat from `getrusage`.
+///
+/// NOTE: `getrusage` with the `RUSAGE_THREAD` parameter is only supported on Linux. `RUSAGE_SELF`
+/// works on MacOS, but we need to get the max rss only for the preparation thread. Getting it for
+/// the current process would conflate the stats of previous jobs run by the process.
+#[cfg(target_os = "linux")]
+pub mod max_rss_stat {
+	use crate::LOG_TARGET;
+	use core::mem::MaybeUninit;
+	use libc::{getrusage, rusage, RUSAGE_THREAD};
+	use std::io;
+
+	/// Get the rusage stats for the current thread.
+	fn getrusage_thread() -> io::Result<rusage> {
+		let mut result: MaybeUninit<rusage> = MaybeUninit::zeroed();
+
+		// SAFETY: `result` is a valid pointer, so calling this is safe.
+		if unsafe { getrusage(RUSAGE_THREAD, result.as_mut_ptr()) } == -1 {
+			return Err(io::Error::last_os_error());
+		}
+
+		// SAFETY: `result` was successfully initialized by `getrusage`.
+		unsafe { Ok(result.assume_init()) }
+	}
+
+	/// Gets the `ru_maxrss` for the current thread.
+	pub fn get_max_rss_thread() -> io::Result<i64> {
+		// `c_long` is either `i32` or `i64` depending on architecture. `i64::from` always works.
+		getrusage_thread().map(|rusage| i64::from(rusage.ru_maxrss))
+	}
+
+	/// Extracts the max_rss stat and logs any error.
+	pub fn extract_max_rss_stat(max_rss: io::Result<i64>, worker_pid: u32) -> Option<i64> {
+		max_rss
+			.map_err(|err| {
+				gum::warn!(
+					target: LOG_TARGET,
+					%worker_pid,
+					"error getting `ru_maxrss` in preparation thread: {}",
+					err
+				);
+				err
+			})
+			.ok()
+	}
+}