PVF: more filesystem sandboxing (#1373)

2026-08-03 03:25:40 +00:00 · 2023-09-28 18:24:29 +02:00
parent de71fecc4e
commit c1eb342b14
24 changed files with 1528 additions and 612 deletions
@@ -44,7 +44,17 @@ pub enum PrepareError {
 	/// The response from the worker is received, but the file cannot be renamed (moved) to the
 	/// final destination location. This state is reported by the validation host (not by the
 	/// worker).
-	RenameTmpFileErr(String),
+	RenameTmpFileErr {
+		err: String,
+		// Unfortunately `PathBuf` doesn't implement `Encode`/`Decode`, so we do a fallible
+		// conversion to `Option<String>`.
+		src: Option<String>,
+		dest: Option<String>,
+	},
+	/// The response from the worker is received, but the worker cache could not be cleared. The
+	/// worker has to be killed to avoid jobs having access to data from other jobs. This state is
+	/// reported by the validation host (not by the worker).
+	ClearWorkerDir(String),
 }

 impl PrepareError {
@@ -58,7 +68,11 @@ impl PrepareError {
 		use PrepareError::*;
 		match self {
 			Prevalidation(_) | Preparation(_) | Panic(_) => true,
-			TimedOut | IoErr(_) | CreateTmpFileErr(_) | RenameTmpFileErr(_) => false,
+			TimedOut |
+			IoErr(_) |
+			CreateTmpFileErr(_) |
+			RenameTmpFileErr { .. } |
+			ClearWorkerDir(_) => false,
 			// Can occur due to issues with the PVF, but also due to local errors.
 			RuntimeConstruction(_) => false,
 		}
@@ -76,7 +90,9 @@ impl fmt::Display for PrepareError {
 			TimedOut => write!(f, "prepare: timeout"),
 			IoErr(err) => write!(f, "prepare: io error while receiving response: {}", err),
 			CreateTmpFileErr(err) => write!(f, "prepare: error creating tmp file: {}", err),
-			RenameTmpFileErr(err) => write!(f, "prepare: error renaming tmp file: {}", err),
+			RenameTmpFileErr { err, src, dest } =>
+				write!(f, "prepare: error renaming tmp file ({:?} -> {:?}): {}", src, dest, err),
+			ClearWorkerDir(err) => write!(f, "prepare: error clearing worker cache: {}", err),
 		}
 	}
 }
@@ -89,8 +105,17 @@ impl fmt::Display for PrepareError {
 pub enum InternalValidationError {
 	/// Some communication error occurred with the host.
 	HostCommunication(String),
+	/// Host could not create a hard link to the artifact path.
+	CouldNotCreateLink(String),
 	/// Could not find or open compiled artifact file.
 	CouldNotOpenFile(String),
+	/// Host could not clear the worker cache after a job.
+	CouldNotClearWorkerDir {
+		err: String,
+		// Unfortunately `PathBuf` doesn't implement `Encode`/`Decode`, so we do a fallible
+		// conversion to `Option<String>`.
+		path: Option<String>,
+	},
 	/// An error occurred in the CPU time monitor thread. Should be totally unrelated to
 	/// validation.
 	CpuTimeMonitorThread(String),
@@ -104,8 +129,18 @@ impl fmt::Display for InternalValidationError {
 		match self {
 			HostCommunication(err) =>
 				write!(f, "validation: some communication error occurred with the host: {}", err),
+			CouldNotCreateLink(err) => write!(
+				f,
+				"validation: host could not create a hard link to the artifact path: {}",
+				err
+			),
 			CouldNotOpenFile(err) =>
 				write!(f, "validation: could not find or open compiled artifact file: {}", err),
+			CouldNotClearWorkerDir { err, path } => write!(
+				f,
+				"validation: host could not clear the worker cache ({:?}) after a job: {}",
+				path, err
+			),
 			CpuTimeMonitorThread(err) =>
 				write!(f, "validation: an error occurred in the CPU time monitor thread: {}", err),
 			NonDeterministicPrepareError(err) => write!(f, "validation: prepare: {}", err),
@@ -29,7 +29,7 @@ pub struct Handshake {
 }

 /// The response from an execution job on the worker.
-#[derive(Encode, Decode)]
+#[derive(Debug, Encode, Decode)]
 pub enum Response {
 	/// The job completed successfully.
 	Ok {
@@ -22,6 +22,7 @@ pub mod executor_intf;
 pub mod prepare;
 pub mod pvf;
 pub mod worker;
+pub mod worker_dir;

 pub use cpu_time::ProcessTime;

@@ -30,8 +31,11 @@ pub use sp_tracing;

 const LOG_TARGET: &str = "parachain::pvf-common";

-use std::mem;
-use tokio::io::{self, AsyncRead, AsyncReadExt as _, AsyncWrite, AsyncWriteExt as _};
+use std::{
+	io::{Read, Write},
+	mem,
+};
+use tokio::io;

 #[cfg(feature = "test-utils")]
 pub mod tests {
@@ -41,20 +45,31 @@ pub mod tests {
 	pub const TEST_PREPARATION_TIMEOUT: Duration = Duration::from_secs(30);
 }

-/// Write some data prefixed by its length into `w`.
-pub async fn framed_send(w: &mut (impl AsyncWrite + Unpin), buf: &[u8]) -> io::Result<()> {
+/// Status of security features on the current system.
+#[derive(Debug, Clone, Default)]
+pub struct SecurityStatus {
+	/// Whether the landlock features we use are fully available on this system.
+	pub can_enable_landlock: bool,
+	/// Whether we are able to unshare the user namespace and change the filesystem root.
+	pub can_unshare_user_namespace_and_change_root: bool,
+}
+
+/// Write some data prefixed by its length into `w`. Sync version of `framed_send` to avoid
+/// dependency on tokio.
+pub fn framed_send_blocking(w: &mut (impl Write + Unpin), buf: &[u8]) -> io::Result<()> {
 	let len_buf = buf.len().to_le_bytes();
-	w.write_all(&len_buf).await?;
-	w.write_all(buf).await?;
+	w.write_all(&len_buf)?;
+	w.write_all(buf)?;
 	Ok(())
 }

-/// Read some data prefixed by its length from `r`.
-pub async fn framed_recv(r: &mut (impl AsyncRead + Unpin)) -> io::Result<Vec<u8>> {
+/// Read some data prefixed by its length from `r`. Sync version of `framed_recv` to avoid
+/// dependency on tokio.
+pub fn framed_recv_blocking(r: &mut (impl Read + Unpin)) -> io::Result<Vec<u8>> {
 	let mut len_buf = [0u8; mem::size_of::<usize>()];
-	r.read_exact(&mut len_buf).await?;
+	r.read_exact(&mut len_buf)?;
 	let len = usize::from_le_bytes(len_buf);
 	let mut buf = vec![0; len];
-	r.read_exact(&mut buf).await?;
+	r.read_exact(&mut buf)?;
 	Ok(buf)
 }
@@ -18,16 +18,18 @@

 pub mod security;

-use crate::LOG_TARGET;
+use crate::{worker_dir, SecurityStatus, LOG_TARGET};
 use cpu_time::ProcessTime;
 use futures::never::Never;
 use std::{
 	any::Any,
+	fmt,
+	os::unix::net::UnixStream,
 	path::PathBuf,
 	sync::mpsc::{Receiver, RecvTimeoutError},
 	time::Duration,
 };
-use tokio::{io, net::UnixStream, runtime::Runtime};
+use tokio::{io, runtime::Runtime};

 /// Use this macro to declare a `fn main() {}` that will create an executable that can be used for
 /// spawning the desired worker.
@@ -41,10 +43,15 @@ macro_rules! decl_worker_main {
 		}

 		fn main() {
+			#[cfg(target_os = "linux")]
+			use $crate::worker::security;
+
 			// TODO: Remove this dependency, and `pub use sp_tracing` in `lib.rs`.
 			// See <https://github.com/paritytech/polkadot/issues/7117>.
 			$crate::sp_tracing::try_init_simple();

+			let worker_pid = std::process::id();
+
 			let args = std::env::args().collect::<Vec<_>>();
 			if args.len() == 1 {
 				print_help($expected_command);
@@ -60,10 +67,43 @@ macro_rules! decl_worker_main {
 					println!("{}", $worker_version);
 					return
 				},
+
+				"--check-can-enable-landlock" => {
+					#[cfg(target_os = "linux")]
+					let status = if security::landlock::check_is_fully_enabled() { 0 } else { -1 };
+					#[cfg(not(target_os = "linux"))]
+					let status = -1;
+					std::process::exit(status)
+				},
+				"--check-can-unshare-user-namespace-and-change-root" => {
+					#[cfg(target_os = "linux")]
+					let status = if let Err(err) = security::unshare_user_namespace_and_change_root(
+						$crate::worker::WorkerKind::CheckPivotRoot,
+						worker_pid,
+						// We're not accessing any files, so we can try to pivot_root in the temp
+						// dir without conflicts with other processes.
+						&std::env::temp_dir(),
+					) {
+						// Write the error to stderr, log it on the host-side.
+						eprintln!("{}", err);
+						-1
+					} else {
+						0
+					};
+					#[cfg(not(target_os = "linux"))]
+					let status = {
+						// Write the error to stderr, log it on the host-side.
+						eprintln!("not available on macos");
+						-1
+					};
+					std::process::exit(status)
+				},
+
 				"test-sleep" => {
 					std::thread::sleep(std::time::Duration::from_secs(5));
 					return
 				},
+
 				subcommand => {
 					// Must be passed for compatibility with the single-binary test workers.
 					if subcommand != $expected_command {
@@ -75,18 +115,39 @@ macro_rules! decl_worker_main {
 				},
 			}

+			let mut worker_dir_path = None;
 			let mut node_version = None;
-			let mut socket_path: &str = "";
+			let mut can_enable_landlock = false;
+			let mut can_unshare_user_namespace_and_change_root = false;

-			for i in (2..args.len()).step_by(2) {
+			let mut i = 2;
+			while i < args.len() {
 				match args[i].as_ref() {
-					"--socket-path" => socket_path = args[i + 1].as_str(),
-					"--node-impl-version" => node_version = Some(args[i + 1].as_str()),
+					"--worker-dir-path" => {
+						worker_dir_path = Some(args[i + 1].as_str());
+						i += 1
+					},
+					"--node-impl-version" => {
+						node_version = Some(args[i + 1].as_str());
+						i += 1
+					},
+					"--can-enable-landlock" => can_enable_landlock = true,
+					"--can-unshare-user-namespace-and-change-root" =>
+						can_unshare_user_namespace_and_change_root = true,
 					arg => panic!("Unexpected argument found: {}", arg),
 				}
+				i += 1;
 			}
+			let worker_dir_path =
+				worker_dir_path.expect("the --worker-dir-path argument is required");

-			$entrypoint(&socket_path, node_version, Some($worker_version));
+			let worker_dir_path = std::path::Path::new(worker_dir_path).to_owned();
+			let security_status = $crate::SecurityStatus {
+				can_enable_landlock,
+				can_unshare_user_namespace_and_change_root,
+			};
+
+			$entrypoint(worker_dir_path, node_version, Some($worker_version), security_status);
 		}
 	};
 }
@@ -95,61 +156,181 @@ macro_rules! decl_worker_main {
 /// child process.
 pub const JOB_TIMEOUT_OVERHEAD: Duration = Duration::from_millis(50);

-/// Interprets the given bytes as a path. Returns `None` if the given bytes do not constitute a
-/// a proper utf-8 string.
-pub fn bytes_to_path(bytes: &[u8]) -> Option<PathBuf> {
-	std::str::from_utf8(bytes).ok().map(PathBuf::from)
+#[derive(Debug, Clone, Copy)]
+pub enum WorkerKind {
+	Prepare,
+	Execute,
+	CheckPivotRoot,
+}
+
+impl fmt::Display for WorkerKind {
+	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+		match self {
+			Self::Prepare => write!(f, "prepare"),
+			Self::Execute => write!(f, "execute"),
+			Self::CheckPivotRoot => write!(f, "check pivot root"),
+		}
+	}
 }

 // The worker version must be passed in so that we accurately get the version of the worker, and not
 // the version that this crate was compiled with.
 pub fn worker_event_loop<F, Fut>(
-	debug_id: &'static str,
-	socket_path: &str,
+	worker_kind: WorkerKind,
+	#[cfg_attr(not(target_os = "linux"), allow(unused_mut))] mut worker_dir_path: PathBuf,
 	node_version: Option<&str>,
 	worker_version: Option<&str>,
+	#[cfg_attr(not(target_os = "linux"), allow(unused_variables))] security_status: &SecurityStatus,
 	mut event_loop: F,
 ) where
-	F: FnMut(UnixStream) -> Fut,
+	F: FnMut(UnixStream, PathBuf) -> Fut,
 	Fut: futures::Future<Output = io::Result<Never>>,
 {
 	let worker_pid = std::process::id();
-	gum::debug!(target: LOG_TARGET, %worker_pid, "starting pvf worker ({})", debug_id);
+	gum::debug!(
+		target: LOG_TARGET,
+		%worker_pid,
+		?worker_dir_path,
+		?security_status,
+		"starting pvf worker ({})",
+		worker_kind
+	);

 	// Check for a mismatch between the node and worker versions.
 	if let (Some(node_version), Some(worker_version)) = (node_version, worker_version) {
 		if node_version != worker_version {
 			gum::error!(
 				target: LOG_TARGET,
+				%worker_kind,
 				%worker_pid,
 				%node_version,
 				%worker_version,
 				"Node and worker version mismatch, node needs restarting, forcing shutdown",
 			);
 			kill_parent_node_in_emergency();
-			let err = io::Error::new(io::ErrorKind::Unsupported, "Version mismatch");
-			worker_shutdown_message(debug_id, worker_pid, err);
+			worker_shutdown_message(worker_kind, worker_pid, "Version mismatch");
 			return
 		}
 	}

-	remove_env_vars(debug_id);
+	// Make sure that we can read the worker dir path, and log its contents.
+	let entries = || -> Result<Vec<_>, io::Error> {
+		std::fs::read_dir(&worker_dir_path)?
+			.map(|res| res.map(|e| e.file_name()))
+			.collect()
+	}();
+	match entries {
+		Ok(entries) =>
+			gum::trace!(target: LOG_TARGET, %worker_pid, ?worker_dir_path, "content of worker dir: {:?}", entries),
+		Err(err) => {
+			gum::error!(
+				target: LOG_TARGET,
+				%worker_kind,
+				%worker_pid,
+				?worker_dir_path,
+				"Could not read worker dir: {}",
+				err.to_string()
+			);
+			worker_shutdown_message(worker_kind, worker_pid, &err.to_string());
+			return
+		},
+	}
+
+	// Connect to the socket.
+	let socket_path = worker_dir::socket(&worker_dir_path);
+	let stream = || -> std::io::Result<UnixStream> {
+		let stream = UnixStream::connect(&socket_path)?;
+		// Remove the socket here. We don't also need to do this on the host-side; on failed
+		// rendezvous, the host will delete the whole worker dir.
+		std::fs::remove_file(&socket_path)?;
+		Ok(stream)
+	}();
+	let stream = match stream {
+		Ok(s) => s,
+		Err(err) => {
+			gum::error!(
+				target: LOG_TARGET,
+				%worker_kind,
+				%worker_pid,
+				"{}",
+				err
+			);
+			worker_shutdown_message(worker_kind, worker_pid, &err.to_string());
+			return
+		},
+	};
+
+	// Enable some security features.
+	{
+		// Call based on whether we can change root. Error out if it should work but fails.
+		//
+		// NOTE: This should not be called in a multi-threaded context (i.e. inside the tokio
+		// runtime). `unshare(2)`:
+		//
+		//       > CLONE_NEWUSER requires that the calling process is not threaded.
+		#[cfg(target_os = "linux")]
+		if security_status.can_unshare_user_namespace_and_change_root {
+			if let Err(err) = security::unshare_user_namespace_and_change_root(
+				worker_kind,
+				worker_pid,
+				&worker_dir_path,
+			) {
+				// The filesystem may be in an inconsistent state, bail out.
+				gum::error!(
+					target: LOG_TARGET,
+					%worker_kind,
+					%worker_pid,
+					?worker_dir_path,
+					"Could not change root to be the worker cache path: {}",
+					err
+				);
+				worker_shutdown_message(worker_kind, worker_pid, &err);
+				return
+			}
+			worker_dir_path = std::path::Path::new("/").to_owned();
+		}
+
+		#[cfg(target_os = "linux")]
+		if security_status.can_enable_landlock {
+			let landlock_status =
+				security::landlock::enable_for_worker(worker_kind, worker_pid, &worker_dir_path);
+			if !matches!(landlock_status, Ok(landlock::RulesetStatus::FullyEnforced)) {
+				// We previously were able to enable, so this should never happen.
+				//
+				// TODO: Make this a real error in secure-mode. See:
+				// <https://github.com/paritytech/polkadot-sdk/issues/1444>
+				gum::error!(
+					target: LOG_TARGET,
+					%worker_kind,
+					%worker_pid,
+					"could not fully enable landlock: {:?}. This should not happen, please report to the Polkadot devs",
+					landlock_status
+				);
+			}
+		}
+
+		if !security::check_env_vars_were_cleared(worker_kind, worker_pid) {
+			let err = "not all env vars were cleared when spawning the process";
+			gum::error!(
+				target: LOG_TARGET,
+				%worker_kind,
+				%worker_pid,
+				"{}",
+				err
+			);
+			worker_shutdown_message(worker_kind, worker_pid, err);
+			return
+		}
+	}

 	// Run the main worker loop.
 	let rt = Runtime::new().expect("Creates tokio runtime. If this panics the worker will die and the host will detect that and deal with it.");
 	let err = rt
-		.block_on(async move {
-			let stream = UnixStream::connect(socket_path).await?;
-			let _ = tokio::fs::remove_file(socket_path).await;
-
-			let result = event_loop(stream).await;
-
-			result
-		})
+		.block_on(event_loop(stream, worker_dir_path))
 		// It's never `Ok` because it's `Ok(Never)`.
 		.unwrap_err();

-	worker_shutdown_message(debug_id, worker_pid, err);
+	worker_shutdown_message(worker_kind, worker_pid, &err.to_string());

 	// We don't want tokio to wait for the tasks to finish. We want to bring down the worker as fast
 	// as possible and not wait for stalled validation to finish. This isn't strictly necessary now,
@@ -157,51 +338,9 @@ pub fn worker_event_loop<F, Fut>(
 	rt.shutdown_background();
 }

-/// Delete all env vars to prevent malicious code from accessing them.
-fn remove_env_vars(debug_id: &'static str) {
-	for (key, value) in std::env::vars_os() {
-		// TODO: *theoretically* the value (or mere presence) of `RUST_LOG` can be a source of
-		// randomness for malicious code. In the future we can remove it also and log in the host;
-		// see <https://github.com/paritytech/polkadot/issues/7117>.
-		if key == "RUST_LOG" {
-			continue
-		}
-
-		// In case of a key or value that would cause [`env::remove_var` to
-		// panic](https://doc.rust-lang.org/std/env/fn.remove_var.html#panics), we first log a
-		// warning and then proceed to attempt to remove the env var.
-		let mut err_reasons = vec![];
-		let (key_str, value_str) = (key.to_str(), value.to_str());
-		if key.is_empty() {
-			err_reasons.push("key is empty");
-		}
-		if key_str.is_some_and(|s| s.contains('=')) {
-			err_reasons.push("key contains '='");
-		}
-		if key_str.is_some_and(|s| s.contains('\0')) {
-			err_reasons.push("key contains null character");
-		}
-		if value_str.is_some_and(|s| s.contains('\0')) {
-			err_reasons.push("value contains null character");
-		}
-		if !err_reasons.is_empty() {
-			gum::warn!(
-				target: LOG_TARGET,
-				%debug_id,
-				?key,
-				?value,
-				"Attempting to remove badly-formatted env var, this may cause the PVF worker to crash. Please remove it yourself. Reasons: {:?}",
-				err_reasons
-			);
-		}
-
-		std::env::remove_var(key);
-	}
-}
-
 /// Provide a consistent message on worker shutdown.
-fn worker_shutdown_message(debug_id: &'static str, worker_pid: u32, err: io::Error) {
-	gum::debug!(target: LOG_TARGET, %worker_pid, "quitting pvf worker ({}): {:?}", debug_id, err);
+fn worker_shutdown_message(worker_kind: WorkerKind, worker_pid: u32, err: &str) {
+	gum::debug!(target: LOG_TARGET, %worker_pid, "quitting pvf worker ({}): {}", worker_kind, err);
 }

 /// Loop that runs in the CPU time monitor thread on prepare and execute jobs. Continuously wakes up
@@ -305,7 +444,7 @@ pub mod thread {
 		Arc::new((Mutex::new(WaitOutcome::Pending), Condvar::new()))
 	}

-	/// Runs a worker thread. Will first enable security features, and afterwards notify the threads
+	/// Runs a worker thread. Will run the requested function, and afterwards notify the threads
 	/// waiting on the condvar. Catches panics during execution and resumes the panics after
 	/// triggering the condvar, so that the waiting thread is notified on panics.
 	///
@@ -17,30 +17,189 @@
 //! Functionality for securing workers.
 //!
 //! This is needed because workers are used to compile and execute untrusted code (PVFs).
+//!
+//! We currently employ the following security measures:
+//!
+//! - Restrict filesystem
+//!   - Use Landlock to remove all unnecessary FS access rights.
+//!   - Unshare the user and mount namespaces.
+//!   - Change the root directory to a worker-specific temporary directory.
+//! - Remove env vars

-/// To what degree landlock is enabled. It's a separate struct from `RulesetStatus` because that is
-/// only available on Linux, plus this has a nicer name.
-pub enum LandlockStatus {
-	FullyEnforced,
-	PartiallyEnforced,
-	NotEnforced,
-	/// Thread panicked, we don't know what the status is.
-	Unavailable,
-}
+use crate::{worker::WorkerKind, LOG_TARGET};

-impl LandlockStatus {
-	#[cfg(target_os = "linux")]
-	pub fn from_ruleset_status(ruleset_status: ::landlock::RulesetStatus) -> Self {
-		use ::landlock::RulesetStatus::*;
-		match ruleset_status {
-			FullyEnforced => LandlockStatus::FullyEnforced,
-			PartiallyEnforced => LandlockStatus::PartiallyEnforced,
-			NotEnforced => LandlockStatus::NotEnforced,
+/// Unshare the user and mount namespaces and change root to be the worker directory.
+///
+/// NOTE: This should not be called in a multi-threaded context. `unshare(2)`:
+///       "CLONE_NEWUSER requires that the calling process is not threaded."
+#[cfg(target_os = "linux")]
+pub fn unshare_user_namespace_and_change_root(
+	worker_kind: WorkerKind,
+	worker_pid: u32,
+	worker_dir_path: &std::path::Path,
+) -> Result<(), String> {
+	use std::{env, ffi::CString, os::unix::ffi::OsStrExt, path::Path, ptr};
+
+	// The following was copied from the `cstr_core` crate.
+	//
+	// TODO: Remove this once this is stable: https://github.com/rust-lang/rust/issues/105723
+	#[inline]
+	#[doc(hidden)]
+	const fn cstr_is_valid(bytes: &[u8]) -> bool {
+		if bytes.is_empty() || bytes[bytes.len() - 1] != 0 {
+			return false
 		}
+
+		let mut index = 0;
+		while index < bytes.len() - 1 {
+			if bytes[index] == 0 {
+				return false
+			}
+			index += 1;
+		}
+		true
 	}
+
+	macro_rules! cstr {
+		($e:expr) => {{
+			const STR: &[u8] = concat!($e, "\0").as_bytes();
+			const STR_VALID: bool = cstr_is_valid(STR);
+			let _ = [(); 0 - (!(STR_VALID) as usize)];
+			#[allow(unused_unsafe)]
+			unsafe {
+				core::ffi::CStr::from_bytes_with_nul_unchecked(STR)
+			}
+		}}
+	}
+
+	gum::debug!(
+		target: LOG_TARGET,
+		%worker_kind,
+		%worker_pid,
+		?worker_dir_path,
+		"unsharing the user namespace and calling pivot_root",
+	);
+
+	let worker_dir_path_c = CString::new(worker_dir_path.as_os_str().as_bytes())
+		.expect("on unix; the path will never contain 0 bytes; qed");
+
+	// Wrapper around all the work to prevent repetitive error handling.
+	//
+	// # Errors
+	//
+	// It's the caller's responsibility to call `Error::last_os_error`. Note that that alone does
+	// not give the context of which call failed, so we return a &str error.
+	|| -> Result<(), &'static str> {
+		// SAFETY: We pass null-terminated C strings and use the APIs as documented. In fact, steps
+		//         (2) and (3) are adapted from the example in pivot_root(2), with the additional
+		//         change described in the `pivot_root(".", ".")` section.
+		unsafe {
+			// 1. `unshare` the user and the mount namespaces.
+			if libc::unshare(libc::CLONE_NEWUSER | libc::CLONE_NEWNS) < 0 {
+				return Err("unshare user and mount namespaces")
+			}
+
+			// 2. Setup mounts.
+			//
+			// Ensure that new root and its parent mount don't have shared propagation (which would
+			// cause pivot_root() to return an error), and prevent propagation of mount events to
+			// the initial mount namespace.
+			if libc::mount(
+				ptr::null(),
+				cstr!("/").as_ptr(),
+				ptr::null(),
+				libc::MS_REC | libc::MS_PRIVATE,
+				ptr::null(),
+			) < 0
+			{
+				return Err("mount MS_PRIVATE")
+			}
+			// Ensure that the new root is a mount point.
+			let additional_flags =
+				if let WorkerKind::Execute | WorkerKind::CheckPivotRoot = worker_kind {
+					libc::MS_RDONLY
+				} else {
+					0
+				};
+			if libc::mount(
+				worker_dir_path_c.as_ptr(),
+				worker_dir_path_c.as_ptr(),
+				ptr::null(), // ignored when MS_BIND is used
+				libc::MS_BIND |
+					libc::MS_REC | libc::MS_NOEXEC |
+					libc::MS_NODEV | libc::MS_NOSUID |
+					libc::MS_NOATIME | additional_flags,
+				ptr::null(), // ignored when MS_BIND is used
+			) < 0
+			{
+				return Err("mount MS_BIND")
+			}
+
+			// 3. `pivot_root` to the worker directory.
+			if libc::chdir(worker_dir_path_c.as_ptr()) < 0 {
+				return Err("chdir to worker dir path")
+			}
+			if libc::syscall(libc::SYS_pivot_root, cstr!(".").as_ptr(), cstr!(".").as_ptr()) < 0 {
+				return Err("pivot_root")
+			}
+			if libc::umount2(cstr!(".").as_ptr(), libc::MNT_DETACH) < 0 {
+				return Err("umount the old root mount point")
+			}
+		}
+
+		Ok(())
+	}()
+	.map_err(|err_ctx| {
+		let err = std::io::Error::last_os_error();
+		format!("{}: {}", err_ctx, err)
+	})?;
+
+	// Do some assertions.
+	if env::current_dir().map_err(|err| err.to_string())? != Path::new("/") {
+		return Err("expected current dir after pivot_root to be `/`".into())
+	}
+	env::set_current_dir("..").map_err(|err| err.to_string())?;
+	if env::current_dir().map_err(|err| err.to_string())? != Path::new("/") {
+		return Err("expected not to be able to break out of new root by doing `..`".into())
+	}
+
+	Ok(())
 }

-/// The	[landlock] docs say it best:
+/// Require env vars to have been removed when spawning the process, to prevent malicious code from
+/// accessing them.
+pub fn check_env_vars_were_cleared(worker_kind: WorkerKind, worker_pid: u32) -> bool {
+	let mut ok = true;
+
+	for (key, value) in std::env::vars_os() {
+		// TODO: *theoretically* the value (or mere presence) of `RUST_LOG` can be a source of
+		// randomness for malicious code. In the future we can remove it also and log in the host;
+		// see <https://github.com/paritytech/polkadot/issues/7117>.
+		if key == "RUST_LOG" {
+			continue
+		}
+		// An exception for MacOS. This is not a secure platform anyway, so we let it slide.
+		#[cfg(target_os = "macos")]
+		if key == "__CF_USER_TEXT_ENCODING" {
+			continue
+		}
+
+		gum::error!(
+			target: LOG_TARGET,
+			%worker_kind,
+			%worker_pid,
+			?key,
+			?value,
+			"env var was present that should have been removed",
+		);
+
+		ok = false;
+	}
+
+	ok
+}
+
+/// The [landlock] docs say it best:
 ///
 /// > "Landlock is a security feature available since Linux 5.13. The goal is to enable to restrict
 /// ambient rights (e.g., global filesystem access) for a set of processes by creating safe security
@@ -52,14 +211,21 @@ impl LandlockStatus {
 /// [landlock]: https://docs.rs/landlock/latest/landlock/index.html
 #[cfg(target_os = "linux")]
 pub mod landlock {
-	use landlock::{Access, AccessFs, Ruleset, RulesetAttr, RulesetError, RulesetStatus, ABI};
+	pub use landlock::RulesetStatus;
+
+	use crate::{worker::WorkerKind, LOG_TARGET};
+	use landlock::*;
+	use std::{
+		fmt,
+		path::{Path, PathBuf},
+	};

 	/// Landlock ABI version. We use ABI V1 because:
 	///
 	/// 1. It is supported by our reference kernel version.
 	/// 2. Later versions do not (yet) provide additional security.
 	///
-	/// # Versions (June 2023)
+	/// # Versions (as of June 2023)
 	///
 	/// - Polkadot reference kernel version: 5.16+
 	/// - ABI V1: 5.13 - introduces landlock, including full restrictions on file reads
@@ -83,46 +249,103 @@ pub mod landlock {
 	/// supports it or if it introduces some new feature that is beneficial to security.
 	pub const LANDLOCK_ABI: ABI = ABI::V1;

-	// TODO: <https://github.com/landlock-lsm/rust-landlock/issues/36>
-	/// Returns to what degree landlock is enabled with the given ABI on the current Linux
-	/// environment.
-	pub fn get_status() -> Result<RulesetStatus, Box<dyn std::error::Error>> {
-		match std::thread::spawn(|| try_restrict_thread()).join() {
-			Ok(Ok(status)) => Ok(status),
-			Ok(Err(ruleset_err)) => Err(ruleset_err.into()),
-			Err(_err) => Err("a panic occurred in try_restrict_thread".into()),
+	#[derive(Debug)]
+	pub enum TryRestrictError {
+		InvalidExceptionPath(PathBuf),
+		RulesetError(RulesetError),
+	}
+
+	impl From<RulesetError> for TryRestrictError {
+		fn from(err: RulesetError) -> Self {
+			Self::RulesetError(err)
 		}
 	}

-	/// Based on the given `status`, returns a single bool indicating whether the given landlock
-	/// ABI is fully enabled on the current Linux environment.
-	pub fn status_is_fully_enabled(
-		status: &Result<RulesetStatus, Box<dyn std::error::Error>>,
-	) -> bool {
-		matches!(status, Ok(RulesetStatus::FullyEnforced))
+	impl fmt::Display for TryRestrictError {
+		fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+			match self {
+				Self::InvalidExceptionPath(path) => write!(f, "invalid exception path: {:?}", path),
+				Self::RulesetError(err) => write!(f, "ruleset error: {}", err.to_string()),
+			}
+		}
 	}

+	impl std::error::Error for TryRestrictError {}
+
+	/// Try to enable landlock for the given kind of worker.
+	pub fn enable_for_worker(
+		worker_kind: WorkerKind,
+		worker_pid: u32,
+		worker_dir_path: &Path,
+	) -> Result<RulesetStatus, Box<dyn std::error::Error>> {
+		let exceptions: Vec<(PathBuf, BitFlags<AccessFs>)> = match worker_kind {
+			WorkerKind::Prepare => {
+				vec![(worker_dir_path.to_owned(), AccessFs::WriteFile.into())]
+			},
+			WorkerKind::Execute => {
+				vec![(worker_dir_path.to_owned(), AccessFs::ReadFile.into())]
+			},
+			WorkerKind::CheckPivotRoot =>
+				panic!("this should only be passed for checking pivot_root; qed"),
+		};
+
+		gum::debug!(
+			target: LOG_TARGET,
+			%worker_kind,
+			%worker_pid,
+			?worker_dir_path,
+			"enabling landlock with exceptions: {:?}",
+			exceptions,
+		);
+
+		Ok(try_restrict(exceptions)?)
+	}
+
+	// TODO: <https://github.com/landlock-lsm/rust-landlock/issues/36>
 	/// Runs a check for landlock and returns a single bool indicating whether the given landlock
 	/// ABI is fully enabled on the current Linux environment.
 	pub fn check_is_fully_enabled() -> bool {
-		status_is_fully_enabled(&get_status())
+		let status_from_thread: Result<RulesetStatus, Box<dyn std::error::Error>> =
+			match std::thread::spawn(|| try_restrict(std::iter::empty::<(PathBuf, AccessFs)>()))
+				.join()
+			{
+				Ok(Ok(status)) => Ok(status),
+				Ok(Err(ruleset_err)) => Err(ruleset_err.into()),
+				Err(_err) => Err("a panic occurred in try_restrict".into()),
+			};
+
+		matches!(status_from_thread, Ok(RulesetStatus::FullyEnforced))
 	}

-	/// Tries to restrict the current thread with the following landlock access controls:
+	/// Tries to restrict the current thread (should only be called in a process' main thread) with
+	/// the following landlock access controls:
 	///
-	/// 1. all global filesystem access
-	/// 2. ... more may be supported in the future.
+	/// 1. all global filesystem access restricted, with optional exceptions
+	/// 2. ... more sandbox types (e.g. networking) may be supported in the future.
 	///
 	/// If landlock is not supported in the current environment this is simply a noop.
 	///
 	/// # Returns
 	///
 	/// The status of the restriction (whether it was fully, partially, or not-at-all enforced).
-	pub fn try_restrict_thread() -> Result<RulesetStatus, RulesetError> {
-		let status = Ruleset::new()
-			.handle_access(AccessFs::from_all(LANDLOCK_ABI))?
-			.create()?
-			.restrict_self()?;
+	fn try_restrict<I, P, A>(fs_exceptions: I) -> Result<RulesetStatus, TryRestrictError>
+	where
+		I: IntoIterator<Item = (P, A)>,
+		P: AsRef<Path>,
+		A: Into<BitFlags<AccessFs>>,
+	{
+		let mut ruleset =
+			Ruleset::new().handle_access(AccessFs::from_all(LANDLOCK_ABI))?.create()?;
+		for (fs_path, access_bits) in fs_exceptions {
+			let paths = &[fs_path.as_ref().to_owned()];
+			let mut rules = path_beneath_rules(paths, access_bits).peekable();
+			if rules.peek().is_none() {
+				// `path_beneath_rules` silently ignores missing paths, so check for it manually.
+				return Err(TryRestrictError::InvalidExceptionPath(fs_path.as_ref().to_owned()))
+			}
+			ruleset = ruleset.add_rules(rules)?;
+		}
+		let status = ruleset.restrict_self()?;
 		Ok(status.ruleset)
 	}

@@ -132,55 +355,114 @@ pub mod landlock {
 		use std::{fs, io::ErrorKind, thread};

 		#[test]
-		fn restricted_thread_cannot_access_fs() {
+		fn restricted_thread_cannot_read_file() {
 			// TODO: This would be nice: <https://github.com/rust-lang/rust/issues/68007>.
 			if !check_is_fully_enabled() {
 				return
 			}

 			// Restricted thread cannot read from FS.
-			let handle = thread::spawn(|| {
-				// Write to a tmp file, this should succeed before landlock is applied.
-				let text = "foo";
-				let tmpfile = tempfile::NamedTempFile::new().unwrap();
-				let path = tmpfile.path();
-				fs::write(path, text).unwrap();
-				let s = fs::read_to_string(path).unwrap();
-				assert_eq!(s, text);
+			let handle =
+				thread::spawn(|| {
+					// Create, write, and read two tmp files; must succeed before any restriction.
+					// NOTE(review): tmpfiles drop in the restricted thread; confirm cleanup works.
+					const TEXT: &str = "foo";
+					let tmpfile1 = tempfile::NamedTempFile::new().unwrap();
+					let path1 = tmpfile1.path();
+					let tmpfile2 = tempfile::NamedTempFile::new().unwrap();
+					let path2 = tmpfile2.path();

-				let status = try_restrict_thread().unwrap();
-				if !matches!(status, RulesetStatus::FullyEnforced) {
-					panic!("Ruleset should be enforced since we checked if landlock is enabled");
-				}
+					fs::write(path1, TEXT).unwrap();
+					let s = fs::read_to_string(path1).unwrap();
+					assert_eq!(s, TEXT);
+					fs::write(path2, TEXT).unwrap();
+					let s = fs::read_to_string(path2).unwrap();
+					assert_eq!(s, TEXT);

-				// Try to read from the tmp file after landlock.
-				let result = fs::read_to_string(path);
-				assert!(matches!(
-					result,
-					Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied)
-				));
-			});
+					// Apply Landlock, granting `ReadFile` on tmpfile1 only.
+					let status = try_restrict(vec![(path1, AccessFs::ReadFile)]);
+					if !matches!(status, Ok(RulesetStatus::FullyEnforced)) {
+						panic!("Ruleset should be enforced since we checked if landlock is enabled: {:?}", status);
+					}
+
+					// Read both files: tmpfile1 must succeed, tmpfile2 must be denied.
+					let result = fs::read_to_string(path1);
+					assert!(matches!(
+						result,
+						Ok(s) if s == TEXT
+					));
+					let result = fs::read_to_string(path2);
+					assert!(matches!(
+						result,
+						Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied)
+					));
+
+					// Apply Landlock again, this time with no exceptions at all.
+					let status = try_restrict(std::iter::empty::<(PathBuf, AccessFs)>());
+					if !matches!(status, Ok(RulesetStatus::FullyEnforced)) {
+						panic!("Ruleset should be enforced since we checked if landlock is enabled: {:?}", status);
+					}
+
+					// The earlier exception no longer helps: reading tmpfile1 must now be denied.
+					let result = fs::read_to_string(path1);
+					assert!(matches!(
+						result,
+						Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied)
+					));
+				});

 			assert!(handle.join().is_ok());
+		}
+
+		#[test]
+		fn restricted_thread_cannot_write_file() {
+			// TODO: This would be nice: <https://github.com/rust-lang/rust/issues/68007>.
+			if !check_is_fully_enabled() {
+				return
+			}

 			// Restricted thread cannot write to FS.
-			let handle = thread::spawn(|| {
-				let text = "foo";
-				let tmpfile = tempfile::NamedTempFile::new().unwrap();
-				let path = tmpfile.path();
+			let handle =
+				thread::spawn(|| {
+					// Create and write two tmp files; must succeed before any restriction.
+					// NOTE(review): tmpfiles drop in the restricted thread; confirm cleanup works.
+					const TEXT: &str = "foo";
+					let tmpfile1 = tempfile::NamedTempFile::new().unwrap();
+					let path1 = tmpfile1.path();
+					let tmpfile2 = tempfile::NamedTempFile::new().unwrap();
+					let path2 = tmpfile2.path();

-				let status = try_restrict_thread().unwrap();
-				if !matches!(status, RulesetStatus::FullyEnforced) {
-					panic!("Ruleset should be enforced since we checked if landlock is enabled");
-				}
+					fs::write(path1, TEXT).unwrap();
+					fs::write(path2, TEXT).unwrap();

-				// Try to write to the tmp file after landlock.
-				let result = fs::write(path, text);
-				assert!(matches!(
-					result,
-					Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied)
-				));
-			});
+					// Apply Landlock, granting `WriteFile` on tmpfile1 only.
+					let status = try_restrict(vec![(path1, AccessFs::WriteFile)]);
+					if !matches!(status, Ok(RulesetStatus::FullyEnforced)) {
+						panic!("Ruleset should be enforced since we checked if landlock is enabled: {:?}", status);
+					}
+
+					// Write both files: tmpfile1 must succeed, tmpfile2 must be denied.
+					let result = fs::write(path1, TEXT);
+					assert!(matches!(result, Ok(_)));
+					let result = fs::write(path2, TEXT);
+					assert!(matches!(
+						result,
+						Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied)
+					));
+
+					// Apply Landlock again, this time with no exceptions at all.
+					let status = try_restrict(std::iter::empty::<(PathBuf, AccessFs)>());
+					if !matches!(status, Ok(RulesetStatus::FullyEnforced)) {
+						panic!("Ruleset should be enforced since we checked if landlock is enabled: {:?}", status);
+					}
+
+					// The earlier exception no longer helps: writing tmpfile1 must now be denied.
+					let result = fs::write(path1, TEXT);
+					assert!(matches!(
+						result,
+						Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied)
+					));
+				});

 			assert!(handle.join().is_ok());
 		}
@@ -0,0 +1,35 @@
+// Copyright (C) Parity Technologies (UK) Ltd.
+// This file is part of Polkadot.
+
+// Polkadot is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// Polkadot is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with Polkadot.  If not, see <http://www.gnu.org/licenses/>.
+
+//! Shared functions for building the paths of the known files inside a worker directory.
+
+use std::path::{Path, PathBuf};
+
+const WORKER_EXECUTE_ARTIFACT_NAME: &str = "artifact"; // file name for `execute_artifact`
+const WORKER_PREPARE_TMP_ARTIFACT_NAME: &str = "tmp-artifact"; // for `prepare_tmp_artifact`
+const WORKER_SOCKET_NAME: &str = "socket"; // file name for `socket`
+
+pub fn execute_artifact(worker_dir_path: &Path) -> PathBuf {
+	worker_dir_path.join(WORKER_EXECUTE_ARTIFACT_NAME) // `<worker_dir_path>/artifact`
+}
+
+pub fn prepare_tmp_artifact(worker_dir_path: &Path) -> PathBuf {
+	worker_dir_path.join(WORKER_PREPARE_TMP_ARTIFACT_NAME) // `<worker_dir_path>/tmp-artifact`
+}
+
+pub fn socket(worker_dir_path: &Path) -> PathBuf {
+	worker_dir_path.join(WORKER_SOCKET_NAME) // `<worker_dir_path>/socket`
+}