Use clone instead of fork on pvf (#2477)

@mrcnski Done the change on the prepare worker, once the prepare worker part is good I'll do the same for the execute worker. This is based on https://github.com/koute/polkavm/blob/11beebd06276ce9b84f335350138479e714f6caf/crates/polkavm/src/sandbox/linux.rs#L711. ## TODO - [x] Add a check for this capability at startup - [x] Add prdoc mentioning the new Secure Validator Mode (optional) requirement. ## Related Closes #2162 --------- Co-authored-by: Marcin S <marcin@realemail.net>
2026-06-21 05:11:02 +00:00 · 2024-01-21 11:15:36 +00:00
parent caa987d26d
commit 21ef949b6e
20 changed files with 861 additions and 395 deletions
@@ -92,10 +92,11 @@ pub enum JobError {
 	TimedOut,
 	#[error("An unexpected panic has occurred in the execution job: {0}")]
 	Panic(String),
+	/// Some error occurred when interfacing with the kernel.
+	#[error("Error interfacing with the kernel: {0}")]
+	Kernel(String),
 	#[error("Could not spawn the requested thread: {0}")]
 	CouldNotSpawnThread(String),
 	#[error("An error occurred in the CPU time monitor thread: {0}")]
 	CpuTimeMonitorThread(String),
-	#[error("Could not set pdeathsig: {0}")]
-	CouldNotSetPdeathsig(String),
 }
@@ -140,7 +140,7 @@ pub unsafe fn create_runtime_from_artifact_bytes(
 	executor_params: &ExecutorParams,
 ) -> Result<WasmtimeRuntime, WasmError> {
 	let mut config = DEFAULT_CONFIG.clone();
-	config.semantics = params_to_wasmtime_semantics(executor_params);
+	config.semantics = params_to_wasmtime_semantics(executor_params).0;

 	sc_executor_wasmtime::create_runtime_from_artifact_bytes::<HostFunctions>(
 		compiled_artifact_blob,
@@ -148,7 +148,10 @@ pub unsafe fn create_runtime_from_artifact_bytes(
 	)
 }

-pub fn params_to_wasmtime_semantics(par: &ExecutorParams) -> Semantics {
+/// Takes the default config and overwrites any settings with existing executor parameters.
+///
+/// Returns the semantics as well as the stack limit (since we are guaranteed to have it).
+pub fn params_to_wasmtime_semantics(par: &ExecutorParams) -> (Semantics, DeterministicStackLimit) {
 	let mut sem = DEFAULT_CONFIG.semantics.clone();
 	let mut stack_limit = sem
 		.deterministic_stack_limit
@@ -169,8 +172,8 @@ pub fn params_to_wasmtime_semantics(par: &ExecutorParams) -> Semantics {
 			ExecutorParam::PvfExecTimeout(_, _) => (), /* Not used here */
 		}
 	}
-	sem.deterministic_stack_limit = Some(stack_limit);
-	sem
+	sem.deterministic_stack_limit = Some(stack_limit.clone());
+	(sem, stack_limit)
 }

 /// Runs the prevalidation on the given code. Returns a [`RuntimeBlob`] if it succeeds.
@@ -187,7 +190,7 @@ pub fn prepare(
 	blob: RuntimeBlob,
 	executor_params: &ExecutorParams,
 ) -> Result<Vec<u8>, sc_executor_common::error::WasmError> {
-	let semantics = params_to_wasmtime_semantics(executor_params);
+	let (semantics, _) = params_to_wasmtime_semantics(executor_params);
 	sc_executor_wasmtime::prepare_runtime_artifact(blob, &semantics)
 }

@@ -57,6 +57,8 @@ pub struct SecurityStatus {
 	pub can_enable_seccomp: bool,
 	/// Whether we are able to unshare the user namespace and change the filesystem root.
 	pub can_unshare_user_namespace_and_change_root: bool,
+	/// Whether we are able to call `clone` with all sandboxing flags.
+	pub can_do_secure_clone: bool,
 }

 /// A handshake with information for the worker.
@@ -18,14 +18,19 @@

 pub mod security;

-use crate::{framed_recv_blocking, WorkerHandshake, LOG_TARGET};
+use crate::{framed_recv_blocking, SecurityStatus, WorkerHandshake, LOG_TARGET};
 use cpu_time::ProcessTime;
 use futures::never::Never;
 use parity_scale_codec::Decode;
 use std::{
 	any::Any,
-	fmt, io,
-	os::unix::net::UnixStream,
+	fmt::{self},
+	fs::File,
+	io::{self, Read, Write},
+	os::{
+		fd::{AsRawFd, FromRawFd, RawFd},
+		unix::net::UnixStream,
+	},
 	path::PathBuf,
 	sync::mpsc::{Receiver, RecvTimeoutError},
 	time::Duration,
@@ -78,7 +83,7 @@ macro_rules! decl_worker_main {

 				"--check-can-enable-landlock" => {
 					#[cfg(target_os = "linux")]
-					let status = if let Err(err) = security::landlock::check_is_fully_enabled() {
+					let status = if let Err(err) = security::landlock::check_can_fully_enable() {
 						// Write the error to stderr, log it on the host-side.
 						eprintln!("{}", err);
 						-1
@@ -91,7 +96,7 @@ macro_rules! decl_worker_main {
 				},
 				"--check-can-enable-seccomp" => {
 					#[cfg(all(target_os = "linux", target_arch = "x86_64"))]
-					let status = if let Err(err) = security::seccomp::check_is_fully_enabled() {
+					let status = if let Err(err) = security::seccomp::check_can_fully_enable() {
 						// Write the error to stderr, log it on the host-side.
 						eprintln!("{}", err);
 						-1
@@ -107,7 +112,7 @@ macro_rules! decl_worker_main {
 					let cache_path_tempdir = std::path::Path::new(&args[2]);
 					#[cfg(target_os = "linux")]
 					let status = if let Err(err) =
-						security::change_root::check_is_fully_enabled(&cache_path_tempdir)
+						security::change_root::check_can_fully_enable(&cache_path_tempdir)
 					{
 						// Write the error to stderr, log it on the host-side.
 						eprintln!("{}", err);
@@ -119,6 +124,21 @@ macro_rules! decl_worker_main {
 					let status = -1;
 					std::process::exit(status)
 				},
+				"--check-can-do-secure-clone" => {
+					#[cfg(target_os = "linux")]
+					// SAFETY: new process is spawned within a single threaded process. This
+					// invariant is enforced by tests.
+					let status = if let Err(err) = unsafe { security::clone::check_can_fully_clone() } {
+						// Write the error to stderr, log it on the host-side.
+						eprintln!("{}", err);
+						-1
+					} else {
+						0
+					};
+					#[cfg(not(target_os = "linux"))]
+					let status = -1;
+					std::process::exit(status)
+				},

 				"test-sleep" => {
 					std::thread::sleep(std::time::Duration::from_secs(5));
@@ -171,6 +191,84 @@ macro_rules! decl_worker_main {
 	};
 }

+//taken from the os_pipe crate. Copied here to reduce one dependency and
+// because its type-safe abstractions do not play well with nix's clone
+#[cfg(not(target_os = "macos"))]
+pub fn pipe2_cloexec() -> io::Result<(libc::c_int, libc::c_int)> {
+	let mut fds: [libc::c_int; 2] = [0; 2];
+	let res = unsafe { libc::pipe2(fds.as_mut_ptr(), libc::O_CLOEXEC) };
+	if res != 0 {
+		return Err(io::Error::last_os_error())
+	}
+	Ok((fds[0], fds[1]))
+}
+
+#[cfg(target_os = "macos")]
+pub fn pipe2_cloexec() -> io::Result<(libc::c_int, libc::c_int)> {
+	let mut fds: [libc::c_int; 2] = [0; 2];
+	let res = unsafe { libc::pipe(fds.as_mut_ptr()) };
+	if res != 0 {
+		return Err(io::Error::last_os_error())
+	}
+	let res = unsafe { libc::fcntl(fds[0], libc::F_SETFD, libc::FD_CLOEXEC) };
+	if res != 0 {
+		return Err(io::Error::last_os_error())
+	}
+	let res = unsafe { libc::fcntl(fds[1], libc::F_SETFD, libc::FD_CLOEXEC) };
+	if res != 0 {
+		return Err(io::Error::last_os_error())
+	}
+	Ok((fds[0], fds[1]))
+}
+
+/// A wrapper around a file descriptor used to encapsulate and restrict
+/// functionality for pipe operations.
+pub struct PipeFd {
+	file: File,
+}
+
+impl AsRawFd for PipeFd {
+	/// Returns the raw file descriptor associated with this `PipeFd`
+	fn as_raw_fd(&self) -> RawFd {
+		self.file.as_raw_fd()
+	}
+}
+
+impl FromRawFd for PipeFd {
+	/// Creates a new `PipeFd` instance from a raw file descriptor.
+	///
+	/// # Safety
+	///
+	/// The fd passed in must be an owned file descriptor; in particular, it must be open.
+	unsafe fn from_raw_fd(fd: RawFd) -> Self {
+		PipeFd { file: File::from_raw_fd(fd) }
+	}
+}
+
+impl Read for PipeFd {
+	fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
+		self.file.read(buf)
+	}
+
+	fn read_to_end(&mut self, buf: &mut Vec<u8>) -> io::Result<usize> {
+		self.file.read_to_end(buf)
+	}
+}
+
+impl Write for PipeFd {
+	fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
+		self.file.write(buf)
+	}
+
+	fn flush(&mut self) -> io::Result<()> {
+		self.file.flush()
+	}
+
+	fn write_all(&mut self, buf: &[u8]) -> io::Result<()> {
+		self.file.write_all(buf)
+	}
+}
+
 /// Some allowed overhead that we account for in the "CPU time monitor" thread's sleeps, on the
 /// child process.
 pub const JOB_TIMEOUT_OVERHEAD: Duration = Duration::from_millis(50);
@@ -192,14 +290,12 @@ impl fmt::Display for WorkerKind {
 	}
 }

-// Some fields are only used for logging, and dead-code analysis ignores Debug.
-#[allow(dead_code)]
 #[derive(Debug)]
 pub struct WorkerInfo {
-	pid: u32,
-	kind: WorkerKind,
-	version: Option<String>,
-	worker_dir_path: PathBuf,
+	pub pid: u32,
+	pub kind: WorkerKind,
+	pub version: Option<String>,
+	pub worker_dir_path: PathBuf,
 }

 // NOTE: The worker version must be passed in so that we accurately get the version of the worker,
@@ -218,7 +314,7 @@ pub fn run_worker<F>(
 	worker_version: Option<&str>,
 	mut event_loop: F,
 ) where
-	F: FnMut(UnixStream, PathBuf) -> io::Result<Never>,
+	F: FnMut(UnixStream, &WorkerInfo, SecurityStatus) -> io::Result<Never>,
 {
 	#[cfg_attr(not(target_os = "linux"), allow(unused_mut))]
 	let mut worker_info = WorkerInfo {
@@ -250,11 +346,8 @@ pub fn run_worker<F>(
 	}

 	// Make sure that we can read the worker dir path, and log its contents.
-	let entries = || -> Result<Vec<_>, io::Error> {
-		std::fs::read_dir(&worker_info.worker_dir_path)?
-			.map(|res| res.map(|e| e.file_name()))
-			.collect()
-	}();
+	let entries: io::Result<Vec<_>> = std::fs::read_dir(&worker_info.worker_dir_path)
+		.and_then(|d| d.map(|res| res.map(|e| e.file_name())).collect());
 	match entries {
 		Ok(entries) =>
 			gum::trace!(target: LOG_TARGET, ?worker_info, "content of worker dir: {:?}", entries),
@@ -284,6 +377,22 @@ pub fn run_worker<F>(
 	{
 		gum::trace!(target: LOG_TARGET, ?security_status, "Enabling security features");

+		// First, make sure env vars were cleared, to match the environment we perform the checks
+		// within. (In theory, running checks with different env vars could result in different
+		// outcomes of the checks.)
+		if !security::check_env_vars_were_cleared(&worker_info) {
+			let err = "not all env vars were cleared when spawning the process";
+			gum::error!(
+				target: LOG_TARGET,
+				?worker_info,
+				"{}",
+				err
+			);
+			if security_status.secure_validator_mode {
+				worker_shutdown(worker_info, err);
+			}
+		}
+
 		// Call based on whether we can change root. Error out if it should work but fails.
 		//
 		// NOTE: This should not be called in a multi-threaded context (i.e. inside the tokio
@@ -337,23 +446,10 @@ pub fn run_worker<F>(
 				}
 			}
 		}
-
-		if !security::check_env_vars_were_cleared(&worker_info) {
-			let err = "not all env vars were cleared when spawning the process";
-			gum::error!(
-				target: LOG_TARGET,
-				?worker_info,
-				"{}",
-				err
-			);
-			if security_status.secure_validator_mode {
-				worker_shutdown(worker_info, err);
-			}
-		}
 	}

 	// Run the main worker loop.
-	let err = event_loop(stream, worker_info.worker_dir_path.clone())
+	let err = event_loop(stream, &worker_info, security_status)
 		// It's never `Ok` because it's `Ok(Never)`.
 		.unwrap_err();

@@ -54,8 +54,7 @@ pub fn enable_for_worker(worker_info: &WorkerInfo) -> Result<()> {
 ///
 /// NOTE: This should not be called in a multi-threaded context. `unshare(2)`:
 ///       "CLONE_NEWUSER requires that the calling process is not threaded."
-#[cfg(target_os = "linux")]
-pub fn check_is_fully_enabled(tempdir: &Path) -> Result<()> {
+pub fn check_can_fully_enable(tempdir: &Path) -> Result<()> {
 	let worker_dir_path = tempdir.to_owned();
 	try_restrict(&WorkerInfo {
 		pid: std::process::id(),
@@ -69,7 +68,6 @@ pub fn check_is_fully_enabled(tempdir: &Path) -> Result<()> {
 ///
 /// NOTE: This should not be called in a multi-threaded context. `unshare(2)`:
 ///       "CLONE_NEWUSER requires that the calling process is not threaded."
-#[cfg(target_os = "linux")]
 fn try_restrict(worker_info: &WorkerInfo) -> Result<()> {
 	// TODO: Remove this once this is stable: https://github.com/rust-lang/rust/issues/105723
 	macro_rules! cstr_ptr {
@@ -78,12 +76,6 @@ fn try_restrict(worker_info: &WorkerInfo) -> Result<()> {
 		};
 	}

-	gum::trace!(
-		target: LOG_TARGET,
-		?worker_info,
-		"unsharing the user namespace and calling pivot_root",
-	);
-
 	let worker_dir_path_c = CString::new(worker_info.worker_dir_path.as_os_str().as_bytes())
 		.expect("on unix; the path will never contain 0 bytes; qed");

@@ -0,0 +1,93 @@
+// Copyright (C) Parity Technologies (UK) Ltd.
+// This file is part of Polkadot.
+
+// Polkadot is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// Polkadot is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with Polkadot.  If not, see <http://www.gnu.org/licenses/>.
+
+//! Functionality for securing the job processes spawned by the workers using `clone`. If
+//! unsupported, falls back to `fork`.
+
+use crate::{worker::WorkerInfo, LOG_TARGET};
+use nix::{
+	errno::Errno,
+	sched::{CloneCb, CloneFlags},
+	unistd::Pid,
+};
+
+#[derive(thiserror::Error, Debug)]
+pub enum Error {
+	#[error("could not clone, errno: {0}")]
+	Clone(Errno),
+}
+
+pub type Result<T> = std::result::Result<T, Error>;
+
+/// Try to run clone(2) on the current worker.
+///
+/// SAFETY: new process should be either spawned within a single threaded process, or use only
+/// async-signal-safe functions.
+pub unsafe fn clone_on_worker(
+	worker_info: &WorkerInfo,
+	have_unshare_newuser: bool,
+	cb: CloneCb,
+) -> Result<Pid> {
+	let flags = clone_flags(have_unshare_newuser);
+
+	gum::trace!(
+		target: LOG_TARGET,
+		?worker_info,
+		"calling clone with flags: {:?}",
+		flags
+	);
+
+	try_clone(cb, flags)
+}
+
+/// Runs a check for clone(2) with all sandboxing flags and returns an error indicating whether it
+/// can be fully enabled on the current Linux environment.
+///
+/// SAFETY: new process should be either spawned within a single threaded process, or use only
+/// async-signal-safe functions.
+pub unsafe fn check_can_fully_clone() -> Result<()> {
+	try_clone(Box::new(|| 0), clone_flags(false)).map(|_pid| ())
+}
+
+/// Runs clone(2) with all sandboxing flags.
+///
+/// SAFETY: new process should be either spawned within a single threaded process, or use only
+/// async-signal-safe functions.
+unsafe fn try_clone(cb: CloneCb, flags: CloneFlags) -> Result<Pid> {
+	let mut stack = [0u8; 2 * 1024 * 1024];
+
+	nix::sched::clone(cb, stack.as_mut_slice(), flags, None).map_err(|errno| Error::Clone(errno))
+}
+
+/// Returns flags for `clone(2)`, including all the sandbox-related ones.
+fn clone_flags(have_unshare_newuser: bool) -> CloneFlags {
+	// NOTE: CLONE_NEWUSER does not work in `clone` if we previously called `unshare` with this
+	// flag. On the other hand, if we did not call `unshare` we need this flag for the CAP_SYS_ADMIN
+	// capability.
+	let maybe_clone_newuser =
+		if have_unshare_newuser { CloneFlags::empty() } else { CloneFlags::CLONE_NEWUSER };
+	// SIGCHLD flag is used to inform clone that the parent process is
+	// expecting a child termination signal, without this flag `waitpid` function
+	// return `ECHILD` error.
+	maybe_clone_newuser |
+		CloneFlags::CLONE_NEWCGROUP |
+		CloneFlags::CLONE_NEWIPC |
+		CloneFlags::CLONE_NEWNET |
+		CloneFlags::CLONE_NEWNS |
+		CloneFlags::CLONE_NEWPID |
+		CloneFlags::CLONE_NEWUTS |
+		CloneFlags::from_bits_retain(libc::SIGCHLD)
+}
@@ -112,7 +112,7 @@ pub fn enable_for_worker(worker_info: &WorkerInfo) -> Result<()> {
 // TODO: <https://github.com/landlock-lsm/rust-landlock/issues/36>
 /// Runs a check for landlock in its own thread, and returns an error indicating whether the given
 /// landlock ABI is fully enabled on the current Linux environment.
-pub fn check_is_fully_enabled() -> Result<()> {
+pub fn check_can_fully_enable() -> Result<()> {
 	match std::thread::spawn(|| try_restrict(std::iter::empty::<(PathBuf, AccessFs)>())).join() {
 		Ok(Ok(())) => Ok(()),
 		Ok(Err(err)) => Err(err),
@@ -165,7 +165,7 @@ mod tests {
 	#[test]
 	fn restricted_thread_cannot_read_file() {
 		// TODO: This would be nice: <https://github.com/rust-lang/rust/issues/68007>.
-		if check_is_fully_enabled().is_err() {
+		if check_can_fully_enable().is_err() {
 			return
 		}

@@ -230,7 +230,7 @@ mod tests {
 	#[test]
 	fn restricted_thread_cannot_write_file() {
 		// TODO: This would be nice: <https://github.com/rust-lang/rust/issues/68007>.
-		if check_is_fully_enabled().is_err() {
+		if check_can_fully_enable().is_err() {
 			return
 		}

@@ -289,7 +289,7 @@ mod tests {
 	#[test]
 	fn restricted_thread_can_truncate_file() {
 		// TODO: This would be nice: <https://github.com/rust-lang/rust/issues/68007>.
-		if check_is_fully_enabled().is_err() {
+		if check_can_fully_enable().is_err() {
 			return
 		}

@@ -27,15 +27,17 @@
 //! - Restrict networking by blocking socket creation and io_uring.
 //! - Remove env vars

-use crate::{worker::WorkerInfo, LOG_TARGET};
-
 #[cfg(target_os = "linux")]
 pub mod change_root;
 #[cfg(target_os = "linux")]
+pub mod clone;
+#[cfg(target_os = "linux")]
 pub mod landlock;
 #[cfg(all(target_os = "linux", target_arch = "x86_64"))]
 pub mod seccomp;

+use crate::{worker::WorkerInfo, LOG_TARGET};
+
 /// Require env vars to have been removed when spawning the process, to prevent malicious code from
 /// accessing them.
 pub fn check_env_vars_were_cleared(worker_info: &WorkerInfo) -> bool {
@@ -110,7 +110,7 @@ pub fn enable_for_worker(worker_info: &WorkerInfo) -> Result<()> {

 /// Runs a check for seccomp in its own thread, and returns an error indicating whether seccomp with
 /// our rules is fully enabled on the current Linux environment.
-pub fn check_is_fully_enabled() -> Result<()> {
+pub fn check_can_fully_enable() -> Result<()> {
 	match std::thread::spawn(|| try_restrict()).join() {
 		Ok(Ok(())) => Ok(()),
 		Ok(Err(err)) => Err(err),
@@ -161,7 +161,7 @@ mod tests {
 	#[test]
 	fn sandboxed_thread_cannot_use_sockets() {
 		// TODO: This would be nice: <https://github.com/rust-lang/rust/issues/68007>.
-		if check_is_fully_enabled().is_err() {
+		if check_can_fully_enable().is_err() {
 			return
 		}