PVF worker: Add seccomp restrictions (restrict networking) (#2009)

2026-06-14 19:11:04 +00:00 · 2023-10-31 11:08:08 +01:00
parent 2d9426f1cc
commit 9faea380dc
27 changed files with 1376 additions and 714 deletions
@@ -32,10 +32,9 @@ pub use sp_tracing;
 const LOG_TARGET: &str = "parachain::pvf-common";

 use std::{
-	io::{Read, Write},
+	io::{self, Read, Write},
 	mem,
 };
-use tokio::io;

 #[cfg(feature = "test-utils")]
 pub mod tests {
@@ -50,6 +49,8 @@ pub mod tests {
 pub struct SecurityStatus {
 	/// Whether the landlock features we use are fully available on this system.
 	pub can_enable_landlock: bool,
+	/// Whether the seccomp features we use are fully available on this system.
+	pub can_enable_seccomp: bool,
 	/// Whether we are able to unshare the user namespace and change the filesystem root.
 	pub can_unshare_user_namespace_and_change_root: bool,
 }
@@ -23,13 +23,12 @@ use cpu_time::ProcessTime;
 use futures::never::Never;
 use std::{
 	any::Any,
-	fmt,
+	fmt, io,
 	os::unix::net::UnixStream,
 	path::PathBuf,
 	sync::mpsc::{Receiver, RecvTimeoutError},
 	time::Duration,
 };
-use tokio::{io, runtime::Runtime};

 /// Use this macro to declare a `fn main() {}` that will create an executable that can be used for
 /// spawning the desired worker.
@@ -85,6 +84,13 @@ macro_rules! decl_worker_main {
 					let status = -1;
 					std::process::exit(status)
 				},
+				"--check-can-enable-seccomp" => {
+					#[cfg(all(target_os = "linux", target_arch = "x86_64"))]
+					let status = if security::seccomp::check_is_fully_enabled() { 0 } else { -1 };
+					#[cfg(not(all(target_os = "linux", target_arch = "x86_64")))]
+					let status = -1;
+					std::process::exit(status)
+				},
 				"--check-can-unshare-user-namespace-and-change-root" => {
 					#[cfg(target_os = "linux")]
 					let status = if let Err(err) = security::unshare_user_namespace_and_change_root(
@@ -129,6 +135,7 @@ macro_rules! decl_worker_main {
 			let mut worker_dir_path = None;
 			let mut node_version = None;
 			let mut can_enable_landlock = false;
+			let mut can_enable_seccomp = false;
 			let mut can_unshare_user_namespace_and_change_root = false;

 			let mut i = 2;
@@ -147,6 +154,7 @@ macro_rules! decl_worker_main {
 						i += 1
 					},
 					"--can-enable-landlock" => can_enable_landlock = true,
+					"--can-enable-seccomp" => can_enable_seccomp = true,
 					"--can-unshare-user-namespace-and-change-root" =>
 						can_unshare_user_namespace_and_change_root = true,
 					arg => panic!("Unexpected argument found: {}", arg),
@@ -161,6 +169,7 @@ macro_rules! decl_worker_main {
 			let worker_dir_path = std::path::Path::new(worker_dir_path).to_owned();
 			let security_status = $crate::SecurityStatus {
 				can_enable_landlock,
+				can_enable_seccomp,
 				can_unshare_user_namespace_and_change_root,
 			};

@@ -198,7 +207,7 @@ impl fmt::Display for WorkerKind {

 // The worker version must be passed in so that we accurately get the version of the worker, and not
 // the version that this crate was compiled with.
-pub fn worker_event_loop<F, Fut>(
+pub fn worker_event_loop<F>(
 	worker_kind: WorkerKind,
 	socket_path: PathBuf,
 	#[cfg_attr(not(target_os = "linux"), allow(unused_mut))] mut worker_dir_path: PathBuf,
@@ -207,8 +216,7 @@ pub fn worker_event_loop<F, Fut>(
 	#[cfg_attr(not(target_os = "linux"), allow(unused_variables))] security_status: &SecurityStatus,
 	mut event_loop: F,
 ) where
-	F: FnMut(UnixStream, PathBuf) -> Fut,
-	Fut: futures::Future<Output = io::Result<Never>>,
+	F: FnMut(UnixStream, PathBuf) -> io::Result<Never>,
 {
 	let worker_pid = std::process::id();
 	gum::debug!(
@@ -262,7 +270,7 @@ pub fn worker_event_loop<F, Fut>(
 	}

 	// Connect to the socket.
-	let stream = || -> std::io::Result<UnixStream> {
+	let stream = || -> io::Result<UnixStream> {
 		let stream = UnixStream::connect(&socket_path)?;
 		let _ = std::fs::remove_file(&socket_path);
 		Ok(stream)
@@ -317,6 +325,24 @@ pub fn worker_event_loop<F, Fut>(
 			let landlock_status =
 				security::landlock::enable_for_worker(worker_kind, worker_pid, &worker_dir_path);
 			if !matches!(landlock_status, Ok(landlock::RulesetStatus::FullyEnforced)) {
+				// We previously were able to enable, so this should never happen.
+				gum::error!(
+					target: LOG_TARGET,
+					%worker_kind,
+					%worker_pid,
+					"could not fully enable landlock: {:?}. This should not happen, please report an issue",
+					landlock_status
+				);
+			}
+		}
+
+		// TODO: We can enable the seccomp networking blacklist on aarch64 as well, but we need a CI
+		//       job to catch regressions. See <https://github.com/paritytech/ci_cd/issues/609>.
+		#[cfg(all(target_os = "linux", target_arch = "x86_64"))]
+		if security_status.can_enable_seccomp {
+			let seccomp_status =
+				security::seccomp::enable_for_worker(worker_kind, worker_pid, &worker_dir_path);
+			if !matches!(seccomp_status, Ok(())) {
 				// We previously were able to enable, so this should never happen.
 				//
 				// TODO: Make this a real error in secure-mode. See:
@@ -325,8 +351,8 @@ pub fn worker_event_loop<F, Fut>(
 					target: LOG_TARGET,
 					%worker_kind,
 					%worker_pid,
-					"could not fully enable landlock: {:?}. This should not happen, please report to the Polkadot devs",
-					landlock_status
+					"could not fully enable seccomp: {:?}. This should not happen, please report an issue",
+					seccomp_status
 				);
 			}
 		}
@@ -346,18 +372,11 @@ pub fn worker_event_loop<F, Fut>(
 	}

 	// Run the main worker loop.
-	let rt = Runtime::new().expect("Creates tokio runtime. If this panics the worker will die and the host will detect that and deal with it.");
-	let err = rt
-		.block_on(event_loop(stream, worker_dir_path))
+	let err = event_loop(stream, worker_dir_path)
 		// It's never `Ok` because it's `Ok(Never)`.
 		.unwrap_err();

 	worker_shutdown_message(worker_kind, worker_pid, &err.to_string());
-
-	// We don't want tokio to wait for the tasks to finish. We want to bring down the worker as fast
-	// as possible and not wait for stalled validation to finish. This isn't strictly necessary now,
-	// but may be in the future.
-	rt.shutdown_background();
 }

 /// Provide a consistent message on worker shutdown.
@@ -438,7 +457,7 @@ fn kill_parent_node_in_emergency() {
 /// The motivation for this module is to coordinate worker threads without using async Rust.
 pub mod thread {
 	use std::{
-		panic,
+		io, panic,
 		sync::{Arc, Condvar, Mutex},
 		thread,
 		time::Duration,
@@ -479,7 +498,7 @@ pub mod thread {
 		f: F,
 		cond: Cond,
 		outcome: WaitOutcome,
-	) -> std::io::Result<thread::JoinHandle<R>>
+	) -> io::Result<thread::JoinHandle<R>>
 	where
 		F: FnOnce() -> R,
 		F: Send + 'static + panic::UnwindSafe,
@@ -497,7 +516,7 @@ pub mod thread {
 		cond: Cond,
 		outcome: WaitOutcome,
 		stack_size: usize,
-	) -> std::io::Result<thread::JoinHandle<R>>
+	) -> io::Result<thread::JoinHandle<R>>
 	where
 		F: FnOnce() -> R,
 		F: Send + 'static + panic::UnwindSafe,
@@ -1,512 +0,0 @@
-// Copyright (C) Parity Technologies (UK) Ltd.
-// This file is part of Polkadot.
-
-// Polkadot is free software: you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation, either version 3 of the License, or
-// (at your option) any later version.
-
-// Polkadot is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-// GNU General Public License for more details.
-
-// You should have received a copy of the GNU General Public License
-// along with Polkadot.  If not, see <http://www.gnu.org/licenses/>.
-
-//! Functionality for securing workers.
-//!
-//! This is needed because workers are used to compile and execute untrusted code (PVFs).
-//!
-//! We currently employ the following security measures:
-//!
-//! - Restrict filesystem
-//!   - Use Landlock to remove all unnecessary FS access rights.
-//!   - Unshare the user and mount namespaces.
-//!   - Change the root directory to a worker-specific temporary directory.
-//! - Remove env vars
-
-use crate::{worker::WorkerKind, LOG_TARGET};
-
-/// Unshare the user namespace and change root to be the artifact directory.
-///
-/// NOTE: This should not be called in a multi-threaded context. `unshare(2)`:
-///       "CLONE_NEWUSER requires that the calling process is not threaded."
-#[cfg(target_os = "linux")]
-pub fn unshare_user_namespace_and_change_root(
-	worker_kind: WorkerKind,
-	worker_pid: u32,
-	worker_dir_path: &std::path::Path,
-) -> Result<(), String> {
-	use std::{env, ffi::CString, os::unix::ffi::OsStrExt, path::Path, ptr};
-
-	// The following was copied from the `cstr_core` crate.
-	//
-	// TODO: Remove this once this is stable: https://github.com/rust-lang/rust/issues/105723
-	#[inline]
-	#[doc(hidden)]
-	const fn cstr_is_valid(bytes: &[u8]) -> bool {
-		if bytes.is_empty() || bytes[bytes.len() - 1] != 0 {
-			return false
-		}
-
-		let mut index = 0;
-		while index < bytes.len() - 1 {
-			if bytes[index] == 0 {
-				return false
-			}
-			index += 1;
-		}
-		true
-	}
-
-	macro_rules! cstr {
-		($e:expr) => {{
-			const STR: &[u8] = concat!($e, "\0").as_bytes();
-			const STR_VALID: bool = cstr_is_valid(STR);
-			let _ = [(); 0 - (!(STR_VALID) as usize)];
-			#[allow(unused_unsafe)]
-			unsafe {
-				core::ffi::CStr::from_bytes_with_nul_unchecked(STR)
-			}
-		}}
-	}
-
-	gum::debug!(
-		target: LOG_TARGET,
-		%worker_kind,
-		%worker_pid,
-		?worker_dir_path,
-		"unsharing the user namespace and calling pivot_root",
-	);
-
-	let worker_dir_path_c = CString::new(worker_dir_path.as_os_str().as_bytes())
-		.expect("on unix; the path will never contain 0 bytes; qed");
-
-	// Wrapper around all the work to prevent repetitive error handling.
-	//
-	// # Errors
-	//
-	// It's the caller's responsibility to call `Error::last_os_error`. Note that that alone does
-	// not give the context of which call failed, so we return a &str error.
-	|| -> Result<(), &'static str> {
-		// SAFETY: We pass null-terminated C strings and use the APIs as documented. In fact, steps
-		//         (2) and (3) are adapted from the example in pivot_root(2), with the additional
-		//         change described in the `pivot_root(".", ".")` section.
-		unsafe {
-			// 1. `unshare` the user and the mount namespaces.
-			if libc::unshare(libc::CLONE_NEWUSER | libc::CLONE_NEWNS) < 0 {
-				return Err("unshare user and mount namespaces")
-			}
-
-			// 2. Setup mounts.
-			//
-			// Ensure that new root and its parent mount don't have shared propagation (which would
-			// cause pivot_root() to return an error), and prevent propagation of mount events to
-			// the initial mount namespace.
-			if libc::mount(
-				ptr::null(),
-				cstr!("/").as_ptr(),
-				ptr::null(),
-				libc::MS_REC | libc::MS_PRIVATE,
-				ptr::null(),
-			) < 0
-			{
-				return Err("mount MS_PRIVATE")
-			}
-			// Ensure that the new root is a mount point.
-			let additional_flags =
-				if let WorkerKind::Execute | WorkerKind::CheckPivotRoot = worker_kind {
-					libc::MS_RDONLY
-				} else {
-					0
-				};
-			if libc::mount(
-				worker_dir_path_c.as_ptr(),
-				worker_dir_path_c.as_ptr(),
-				ptr::null(), // ignored when MS_BIND is used
-				libc::MS_BIND |
-					libc::MS_REC | libc::MS_NOEXEC |
-					libc::MS_NODEV | libc::MS_NOSUID |
-					libc::MS_NOATIME | additional_flags,
-				ptr::null(), // ignored when MS_BIND is used
-			) < 0
-			{
-				return Err("mount MS_BIND")
-			}
-
-			// 3. `pivot_root` to the artifact directory.
-			if libc::chdir(worker_dir_path_c.as_ptr()) < 0 {
-				return Err("chdir to worker dir path")
-			}
-			if libc::syscall(libc::SYS_pivot_root, cstr!(".").as_ptr(), cstr!(".").as_ptr()) < 0 {
-				return Err("pivot_root")
-			}
-			if libc::umount2(cstr!(".").as_ptr(), libc::MNT_DETACH) < 0 {
-				return Err("umount the old root mount point")
-			}
-		}
-
-		Ok(())
-	}()
-	.map_err(|err_ctx| {
-		let err = std::io::Error::last_os_error();
-		format!("{}: {}", err_ctx, err)
-	})?;
-
-	// Do some assertions.
-	if env::current_dir().map_err(|err| err.to_string())? != Path::new("/") {
-		return Err("expected current dir after pivot_root to be `/`".into())
-	}
-	env::set_current_dir("..").map_err(|err| err.to_string())?;
-	if env::current_dir().map_err(|err| err.to_string())? != Path::new("/") {
-		return Err("expected not to be able to break out of new root by doing `..`".into())
-	}
-
-	Ok(())
-}
-
-/// Require env vars to have been removed when spawning the process, to prevent malicious code from
-/// accessing them.
-pub fn check_env_vars_were_cleared(worker_kind: WorkerKind, worker_pid: u32) -> bool {
-	let mut ok = true;
-
-	for (key, value) in std::env::vars_os() {
-		// TODO: *theoretically* the value (or mere presence) of `RUST_LOG` can be a source of
-		// randomness for malicious code. In the future we can remove it also and log in the host;
-		// see <https://github.com/paritytech/polkadot/issues/7117>.
-		if key == "RUST_LOG" {
-			continue
-		}
-		// An exception for MacOS. This is not a secure platform anyway, so we let it slide.
-		#[cfg(target_os = "macos")]
-		if key == "__CF_USER_TEXT_ENCODING" {
-			continue
-		}
-
-		gum::error!(
-			target: LOG_TARGET,
-			%worker_kind,
-			%worker_pid,
-			?key,
-			?value,
-			"env var was present that should have been removed",
-		);
-
-		ok = false;
-	}
-
-	ok
-}
-
-/// The [landlock] docs say it best:
-///
-/// > "Landlock is a security feature available since Linux 5.13. The goal is to enable to restrict
-/// ambient rights (e.g., global filesystem access) for a set of processes by creating safe security
-/// sandboxes as new security layers in addition to the existing system-wide access-controls. This
-/// kind of sandbox is expected to help mitigate the security impact of bugs, unexpected or
-/// malicious behaviors in applications. Landlock empowers any process, including unprivileged ones,
-/// to securely restrict themselves."
-///
-/// [landlock]: https://docs.rs/landlock/latest/landlock/index.html
-#[cfg(target_os = "linux")]
-pub mod landlock {
-	pub use landlock::RulesetStatus;
-
-	use crate::{worker::WorkerKind, LOG_TARGET};
-	use landlock::*;
-	use std::{
-		fmt,
-		path::{Path, PathBuf},
-	};
-
-	/// Landlock ABI version. We use ABI V1 because:
-	///
-	/// 1. It is supported by our reference kernel version.
-	/// 2. Later versions do not (yet) provide additional security that would benefit us.
-	///
-	/// # Versions (as of October 2023)
-	///
-	/// - Polkadot reference kernel version: 5.16+
-	///
-	/// - ABI V1: kernel 5.13 - Introduces landlock, including full restrictions on file reads.
-	///
-	/// - ABI V2: kernel 5.19 - Adds ability to prevent file renaming. Does not help us. During
-	///   execution an attacker can only affect the name of a symlinked artifact and not the
-	///   original one.
-	///
-	/// - ABI V3: kernel 6.2 - Adds ability to prevent file truncation. During execution, can
-	///   prevent attackers from affecting a symlinked artifact. We don't strictly need this as we
-	///   plan to check for file integrity anyway; see
-	///   <https://github.com/paritytech/polkadot-sdk/issues/677>.
-	///
-	/// # Determinism
-	///
-	/// You may wonder whether we could always use the latest ABI instead of only the ABI supported
-	/// by the reference kernel version. It seems plausible, since landlock provides a best-effort
-	/// approach to enabling sandboxing. For example, if the reference version only supported V1 and
-	/// we were on V2, then landlock would use V2 if it was supported on the current machine, and
-	/// just fall back to V1 if not.
-	///
-	/// The issue with this is indeterminacy. If half of validators were on V2 and half were on V1,
-	/// they may have different semantics on some PVFs. So a malicious PVF now has a new attack
-	/// vector: they can exploit this indeterminism between landlock ABIs!
-	///
-	/// On the other hand we do want validators to be as secure as possible and protect their keys
-	/// from attackers. And, the risk with indeterminacy is low and there are other indeterminacy
-	/// vectors anyway. So we will only upgrade to a new ABI if either the reference kernel version
-	/// supports it or if it introduces some new feature that is beneficial to security.
-	pub const LANDLOCK_ABI: ABI = ABI::V1;
-
-	#[derive(Debug)]
-	pub enum TryRestrictError {
-		InvalidExceptionPath(PathBuf),
-		RulesetError(RulesetError),
-	}
-
-	impl From<RulesetError> for TryRestrictError {
-		fn from(err: RulesetError) -> Self {
-			Self::RulesetError(err)
-		}
-	}
-
-	impl fmt::Display for TryRestrictError {
-		fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-			match self {
-				Self::InvalidExceptionPath(path) => write!(f, "invalid exception path: {:?}", path),
-				Self::RulesetError(err) => write!(f, "ruleset error: {}", err.to_string()),
-			}
-		}
-	}
-
-	impl std::error::Error for TryRestrictError {}
-
-	/// Try to enable landlock for the given kind of worker.
-	pub fn enable_for_worker(
-		worker_kind: WorkerKind,
-		worker_pid: u32,
-		worker_dir_path: &Path,
-	) -> Result<RulesetStatus, Box<dyn std::error::Error>> {
-		let exceptions: Vec<(PathBuf, BitFlags<AccessFs>)> = match worker_kind {
-			WorkerKind::Prepare => {
-				vec![(worker_dir_path.to_owned(), AccessFs::WriteFile.into())]
-			},
-			WorkerKind::Execute => {
-				vec![(worker_dir_path.to_owned(), AccessFs::ReadFile.into())]
-			},
-			WorkerKind::CheckPivotRoot =>
-				panic!("this should only be passed for checking pivot_root; qed"),
-		};
-
-		gum::debug!(
-			target: LOG_TARGET,
-			%worker_kind,
-			%worker_pid,
-			?worker_dir_path,
-			"enabling landlock with exceptions: {:?}",
-			exceptions,
-		);
-
-		Ok(try_restrict(exceptions)?)
-	}
-
-	// TODO: <https://github.com/landlock-lsm/rust-landlock/issues/36>
-	/// Runs a check for landlock and returns a single bool indicating whether the given landlock
-	/// ABI is fully enabled on the current Linux environment.
-	pub fn check_is_fully_enabled() -> bool {
-		let status_from_thread: Result<RulesetStatus, Box<dyn std::error::Error>> =
-			match std::thread::spawn(|| try_restrict(std::iter::empty::<(PathBuf, AccessFs)>()))
-				.join()
-			{
-				Ok(Ok(status)) => Ok(status),
-				Ok(Err(ruleset_err)) => Err(ruleset_err.into()),
-				Err(_err) => Err("a panic occurred in try_restrict".into()),
-			};
-
-		matches!(status_from_thread, Ok(RulesetStatus::FullyEnforced))
-	}
-
-	/// Tries to restrict the current thread (should only be called in a process' main thread) with
-	/// the following landlock access controls:
-	///
-	/// 1. all global filesystem access restricted, with optional exceptions
-	/// 2. ... more sandbox types (e.g. networking) may be supported in the future.
-	///
-	/// If landlock is not supported in the current environment this is simply a noop.
-	///
-	/// # Returns
-	///
-	/// The status of the restriction (whether it was fully, partially, or not-at-all enforced).
-	fn try_restrict<I, P, A>(fs_exceptions: I) -> Result<RulesetStatus, TryRestrictError>
-	where
-		I: IntoIterator<Item = (P, A)>,
-		P: AsRef<Path>,
-		A: Into<BitFlags<AccessFs>>,
-	{
-		let mut ruleset =
-			Ruleset::default().handle_access(AccessFs::from_all(LANDLOCK_ABI))?.create()?;
-		for (fs_path, access_bits) in fs_exceptions {
-			let paths = &[fs_path.as_ref().to_owned()];
-			let mut rules = path_beneath_rules(paths, access_bits).peekable();
-			if rules.peek().is_none() {
-				// `path_beneath_rules` silently ignores missing paths, so check for it manually.
-				return Err(TryRestrictError::InvalidExceptionPath(fs_path.as_ref().to_owned()))
-			}
-			ruleset = ruleset.add_rules(rules)?;
-		}
-		let status = ruleset.restrict_self()?;
-		Ok(status.ruleset)
-	}
-
-	#[cfg(test)]
-	mod tests {
-		use super::*;
-		use std::{fs, io::ErrorKind, thread};
-
-		#[test]
-		fn restricted_thread_cannot_read_file() {
-			// TODO: This would be nice: <https://github.com/rust-lang/rust/issues/68007>.
-			if !check_is_fully_enabled() {
-				return
-			}
-
-			// Restricted thread cannot read from FS.
-			let handle =
-				thread::spawn(|| {
-					// Create, write, and read two tmp files. This should succeed before any
-					// landlock restrictions are applied.
-					const TEXT: &str = "foo";
-					let tmpfile1 = tempfile::NamedTempFile::new().unwrap();
-					let path1 = tmpfile1.path();
-					let tmpfile2 = tempfile::NamedTempFile::new().unwrap();
-					let path2 = tmpfile2.path();
-
-					fs::write(path1, TEXT).unwrap();
-					let s = fs::read_to_string(path1).unwrap();
-					assert_eq!(s, TEXT);
-					fs::write(path2, TEXT).unwrap();
-					let s = fs::read_to_string(path2).unwrap();
-					assert_eq!(s, TEXT);
-
-					// Apply Landlock with a read exception for only one of the files.
-					let status = try_restrict(vec![(path1, AccessFs::ReadFile)]);
-					if !matches!(status, Ok(RulesetStatus::FullyEnforced)) {
-						panic!("Ruleset should be enforced since we checked if landlock is enabled: {:?}", status);
-					}
-
-					// Try to read from both files, only tmpfile1 should succeed.
-					let result = fs::read_to_string(path1);
-					assert!(matches!(
-						result,
-						Ok(s) if s == TEXT
-					));
-					let result = fs::read_to_string(path2);
-					assert!(matches!(
-						result,
-						Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied)
-					));
-
-					// Apply Landlock for all files.
-					let status = try_restrict(std::iter::empty::<(PathBuf, AccessFs)>());
-					if !matches!(status, Ok(RulesetStatus::FullyEnforced)) {
-						panic!("Ruleset should be enforced since we checked if landlock is enabled: {:?}", status);
-					}
-
-					// Try to read from tmpfile1 after landlock, it should fail.
-					let result = fs::read_to_string(path1);
-					assert!(matches!(
-						result,
-						Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied)
-					));
-				});
-
-			assert!(handle.join().is_ok());
-		}
-
-		#[test]
-		fn restricted_thread_cannot_write_file() {
-			// TODO: This would be nice: <https://github.com/rust-lang/rust/issues/68007>.
-			if !check_is_fully_enabled() {
-				return
-			}
-
-			// Restricted thread cannot write to FS.
-			let handle =
-				thread::spawn(|| {
-					// Create and write two tmp files. This should succeed before any landlock
-					// restrictions are applied.
-					const TEXT: &str = "foo";
-					let tmpfile1 = tempfile::NamedTempFile::new().unwrap();
-					let path1 = tmpfile1.path();
-					let tmpfile2 = tempfile::NamedTempFile::new().unwrap();
-					let path2 = tmpfile2.path();
-
-					fs::write(path1, TEXT).unwrap();
-					fs::write(path2, TEXT).unwrap();
-
-					// Apply Landlock with a write exception for only one of the files.
-					let status = try_restrict(vec![(path1, AccessFs::WriteFile)]);
-					if !matches!(status, Ok(RulesetStatus::FullyEnforced)) {
-						panic!("Ruleset should be enforced since we checked if landlock is enabled: {:?}", status);
-					}
-
-					// Try to write to both files, only tmpfile1 should succeed.
-					let result = fs::write(path1, TEXT);
-					assert!(matches!(result, Ok(_)));
-					let result = fs::write(path2, TEXT);
-					assert!(matches!(
-						result,
-						Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied)
-					));
-
-					// Apply Landlock for all files.
-					let status = try_restrict(std::iter::empty::<(PathBuf, AccessFs)>());
-					if !matches!(status, Ok(RulesetStatus::FullyEnforced)) {
-						panic!("Ruleset should be enforced since we checked if landlock is enabled: {:?}", status);
-					}
-
-					// Try to write to tmpfile1 after landlock, it should fail.
-					let result = fs::write(path1, TEXT);
-					assert!(matches!(
-						result,
-						Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied)
-					));
-				});
-
-			assert!(handle.join().is_ok());
-		}
-
-		// Test that checks whether landlock under our ABI version is able to truncate files.
-		#[test]
-		fn restricted_thread_can_truncate_file() {
-			// TODO: This would be nice: <https://github.com/rust-lang/rust/issues/68007>.
-			if !check_is_fully_enabled() {
-				return
-			}
-
-			// Restricted thread can truncate file.
-			let handle =
-				thread::spawn(|| {
-					// Create and write a file. This should succeed before any landlock
-					// restrictions are applied.
-					const TEXT: &str = "foo";
-					let tmpfile = tempfile::NamedTempFile::new().unwrap();
-					let path = tmpfile.path();
-
-					fs::write(path, TEXT).unwrap();
-
-					// Apply Landlock with all exceptions under the current ABI.
-					let status = try_restrict(vec![(path, AccessFs::from_all(LANDLOCK_ABI))]);
-					if !matches!(status, Ok(RulesetStatus::FullyEnforced)) {
-						panic!("Ruleset should be enforced since we checked if landlock is enabled: {:?}", status);
-					}
-
-					// Try to truncate the file.
-					let result = tmpfile.as_file().set_len(0);
-					assert!(result.is_ok());
-				});
-
-			assert!(handle.join().is_ok());
-		}
-	}
-}
@@ -0,0 +1,325 @@
+// Copyright (C) Parity Technologies (UK) Ltd.
+// This file is part of Polkadot.
+
+// Polkadot is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// Polkadot is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with Polkadot.  If not, see <http://www.gnu.org/licenses/>.
+
+//! The [landlock] docs say it best:
+//!
+//! > "Landlock is a security feature available since Linux 5.13. The goal is to enable to restrict
+//! ambient rights (e.g., global filesystem access) for a set of processes by creating safe security
+//! sandboxes as new security layers in addition to the existing system-wide access-controls. This
+//! kind of sandbox is expected to help mitigate the security impact of bugs, unexpected or
+//! malicious behaviors in applications. Landlock empowers any process, including unprivileged ones,
+//! to securely restrict themselves."
+//!
+//! [landlock]: https://docs.rs/landlock/latest/landlock/index.html
+
+pub use landlock::RulesetStatus;
+
+use crate::{
+	worker::{stringify_panic_payload, WorkerKind},
+	LOG_TARGET,
+};
+use landlock::*;
+use std::path::{Path, PathBuf};
+
+/// Landlock ABI version. We use ABI V1 because:
+///
+/// 1. It is supported by our reference kernel version.
+/// 2. Later versions do not (yet) provide additional security that would benefit us.
+///
+/// # Versions (as of October 2023)
+///
+/// - Polkadot reference kernel version: 5.16+
+///
+/// - ABI V1: kernel 5.13 - Introduces landlock, including full restrictions on file reads.
+///
+/// - ABI V2: kernel 5.19 - Adds ability to prevent file renaming. Does not help us. During
+///   execution an attacker can only affect the name of a symlinked artifact and not the original
+///   one.
+///
+/// - ABI V3: kernel 6.2 - Adds ability to prevent file truncation. During execution, can
+///   prevent attackers from affecting a symlinked artifact. We don't strictly need this as we
+///   plan to check for file integrity anyway; see
+///   <https://github.com/paritytech/polkadot-sdk/issues/677>.
+///
+/// # Determinism
+///
+/// You may wonder whether we could always use the latest ABI instead of only the ABI supported
+/// by the reference kernel version. It seems plausible, since landlock provides a best-effort
+/// approach to enabling sandboxing. For example, if the reference version only supported V1 and
+/// we were on V2, then landlock would use V2 if it was supported on the current machine, and
+/// just fall back to V1 if not.
+///
+/// The issue with this is indeterminacy. If half of the validators were on V2 and half were on
+/// V1, they may have different semantics on some PVFs. So a malicious PVF now has a new attack
+/// vector: it can exploit this indeterminacy between landlock ABIs!
+///
+/// On the other hand we do want validators to be as secure as possible and protect their keys
+/// from attackers. And, the risk with indeterminacy is low and there are other indeterminacy
+/// vectors anyway. So we will only upgrade to a new ABI if either the reference kernel version
+/// supports it or if it introduces some new feature that is beneficial to security.
+pub const LANDLOCK_ABI: ABI = ABI::V1;
+
+/// Errors that can occur while applying landlock restrictions.
+#[derive(thiserror::Error, Debug)]
+pub enum Error {
+	/// No rule could be built for the given exception path. `try_restrict` reports this because
+	/// `path_beneath_rules` silently ignores missing paths.
+	#[error("Invalid exception path: {0:?}")]
+	InvalidExceptionPath(PathBuf),
+	/// An error returned by the landlock ruleset API, displayed transparently.
+	#[error(transparent)]
+	RulesetError(#[from] RulesetError),
+	/// The thread spawned by `check_is_fully_enabled` panicked; holds the stringified payload.
+	#[error("A panic occurred in try_restrict: {0}")]
+	Panic(String),
+}
+
+/// Result type whose error is this module's [`Error`].
+pub type Result<T> = std::result::Result<T, Error>;
+
+/// Apply the landlock sandbox that matches `worker_kind` to the current worker.
+///
+/// The only filesystem exception granted is on `worker_dir_path`: write access for prepare
+/// workers and read access for execute workers.
+///
+/// # Panics
+///
+/// Panics when called with `WorkerKind::CheckPivotRoot`.
+pub fn enable_for_worker(
+	worker_kind: WorkerKind,
+	worker_pid: u32,
+	worker_dir_path: &Path,
+) -> Result<RulesetStatus> {
+	// Pick the single access right this kind of worker keeps on its own directory.
+	let allowed_access: BitFlags<AccessFs> = match worker_kind {
+		WorkerKind::Prepare => AccessFs::WriteFile.into(),
+		WorkerKind::Execute => AccessFs::ReadFile.into(),
+		WorkerKind::CheckPivotRoot =>
+			panic!("this should only be passed for checking pivot_root; qed"),
+	};
+	let exceptions: Vec<(PathBuf, BitFlags<AccessFs>)> =
+		vec![(worker_dir_path.to_owned(), allowed_access)];
+
+	gum::trace!(
+		target: LOG_TARGET,
+		%worker_kind,
+		%worker_pid,
+		?worker_dir_path,
+		"enabling landlock with exceptions: {:?}",
+		exceptions,
+	);
+
+	try_restrict(exceptions)
+}
+
+// TODO: <https://github.com/landlock-lsm/rust-landlock/issues/36>
+/// Runs a check for landlock and returns a single bool indicating whether the given landlock
+/// ABI is fully enabled on the current Linux environment.
+///
+/// The probe calls `try_restrict` with no filesystem exceptions on a freshly spawned thread and
+/// joins it. Any outcome other than `Ok(RulesetStatus::FullyEnforced)` (partial enforcement, an
+/// error, or a panic in the probe thread) is reported as `false`.
+pub fn check_is_fully_enabled() -> bool {
+	// NOTE(review): `try_restrict` restricts the thread it runs on, so the probe presumably uses
+	// a throwaway thread to leave the caller unrestricted -- confirm.
+	let probe = std::thread::spawn(|| try_restrict(std::iter::empty::<(PathBuf, AccessFs)>()));
+
+	let status_from_thread: Result<RulesetStatus> = match probe.join() {
+		// The thread already yields this module's `Result`, so hand it back untouched instead of
+		// re-wrapping the error through a no-op `.into()`.
+		Ok(restrict_result) => restrict_result,
+		Err(payload) => Err(Error::Panic(stringify_panic_payload(payload))),
+	};
+
+	matches!(status_from_thread, Ok(RulesetStatus::FullyEnforced))
+}
+
+/// Restricts the current thread with landlock (this should only be called in a process' main
+/// thread). The following access controls are applied:
+///
+/// 1. all global filesystem access restricted, with optional exceptions
+/// 2. ... more sandbox types (e.g. networking) may be supported in the future.
+///
+/// If landlock is not supported in the current environment this is simply a noop.
+///
+/// # Errors
+///
+/// Returns `Error::InvalidExceptionPath` when an exception path yields no rule, and
+/// `Error::RulesetError` when a ruleset operation fails.
+///
+/// # Returns
+///
+/// The status of the restriction (whether it was fully, partially, or not-at-all enforced).
+fn try_restrict<I, P, A>(fs_exceptions: I) -> Result<RulesetStatus>
+where
+	I: IntoIterator<Item = (P, A)>,
+	P: AsRef<Path>,
+	A: Into<BitFlags<AccessFs>>,
+{
+	let handled_access = AccessFs::from_all(LANDLOCK_ABI);
+	let mut ruleset = Ruleset::default().handle_access(handled_access)?.create()?;
+
+	for (exception, allowed_access) in fs_exceptions {
+		let exception_path: &Path = exception.as_ref();
+		let candidates = &[exception_path.to_owned()];
+		let mut rules = path_beneath_rules(candidates, allowed_access).peekable();
+		// `path_beneath_rules` silently ignores missing paths, so check for it manually.
+		if rules.peek().is_none() {
+			return Err(Error::InvalidExceptionPath(exception_path.to_owned()))
+		}
+		ruleset = ruleset.add_rules(rules)?;
+	}
+
+	ruleset.restrict_self().map(|restriction| restriction.ruleset).map_err(Error::from)
+}
+
+#[cfg(test)]
+mod tests {
+	use super::*;
+	use std::{fs, io::ErrorKind, thread};
+
+	/// Contents written to every temporary file used by these tests.
+	const TEXT: &str = "foo";
+
+	/// Panics unless `status` reports that the ruleset was fully enforced.
+	fn assert_fully_enforced(status: Result<RulesetStatus>) {
+		if !matches!(status, Ok(RulesetStatus::FullyEnforced)) {
+			panic!(
+				"Ruleset should be enforced since we checked if landlock is enabled: {:?}",
+				status
+			);
+		}
+	}
+
+	/// Asserts that a filesystem operation was rejected with `ErrorKind::PermissionDenied`.
+	fn assert_permission_denied<T>(result: std::io::Result<T>) {
+		assert!(matches!(
+			result,
+			Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied)
+		));
+	}
+
+	#[test]
+	fn restricted_thread_cannot_read_file() {
+		// TODO: This would be nice: <https://github.com/rust-lang/rust/issues/68007>.
+		if !check_is_fully_enabled() {
+			return
+		}
+
+		// Restricted thread cannot read from FS.
+		let handle = thread::spawn(|| {
+			// Create, write, and read two tmp files. This should succeed before any
+			// landlock restrictions are applied.
+			let allowed_file = tempfile::NamedTempFile::new().unwrap();
+			let denied_file = tempfile::NamedTempFile::new().unwrap();
+			let (allowed, denied) = (allowed_file.path(), denied_file.path());
+
+			for path in [allowed, denied] {
+				fs::write(path, TEXT).unwrap();
+				assert_eq!(fs::read_to_string(path).unwrap(), TEXT);
+			}
+
+			// Apply Landlock with a read exception for only one of the files.
+			assert_fully_enforced(try_restrict(vec![(allowed, AccessFs::ReadFile)]));
+
+			// Only the file covered by the exception may still be read.
+			assert_eq!(fs::read_to_string(allowed).unwrap(), TEXT);
+			assert_permission_denied(fs::read_to_string(denied));
+
+			// Apply Landlock for all files.
+			assert_fully_enforced(try_restrict(std::iter::empty::<(PathBuf, AccessFs)>()));
+
+			// With no exceptions left, reading the first file must fail as well.
+			assert_permission_denied(fs::read_to_string(allowed));
+		});
+
+		assert!(handle.join().is_ok());
+	}
+
+	#[test]
+	fn restricted_thread_cannot_write_file() {
+		// TODO: This would be nice: <https://github.com/rust-lang/rust/issues/68007>.
+		if !check_is_fully_enabled() {
+			return
+		}
+
+		// Restricted thread cannot write to FS.
+		let handle = thread::spawn(|| {
+			// Create and write two tmp files. This should succeed before any landlock
+			// restrictions are applied.
+			let allowed_file = tempfile::NamedTempFile::new().unwrap();
+			let denied_file = tempfile::NamedTempFile::new().unwrap();
+			let (allowed, denied) = (allowed_file.path(), denied_file.path());
+
+			fs::write(allowed, TEXT).unwrap();
+			fs::write(denied, TEXT).unwrap();
+
+			// Apply Landlock with a write exception for only one of the files.
+			assert_fully_enforced(try_restrict(vec![(allowed, AccessFs::WriteFile)]));
+
+			// Only the file covered by the exception may still be written.
+			assert!(fs::write(allowed, TEXT).is_ok());
+			assert_permission_denied(fs::write(denied, TEXT));
+
+			// Apply Landlock for all files.
+			assert_fully_enforced(try_restrict(std::iter::empty::<(PathBuf, AccessFs)>()));
+
+			// With no exceptions left, writing the first file must fail as well.
+			assert_permission_denied(fs::write(allowed, TEXT));
+		});
+
+		assert!(handle.join().is_ok());
+	}
+
+	// Test that checks whether landlock under our ABI version is able to truncate files.
+	#[test]
+	fn restricted_thread_can_truncate_file() {
+		// TODO: This would be nice: <https://github.com/rust-lang/rust/issues/68007>.
+		if !check_is_fully_enabled() {
+			return
+		}
+
+		// Restricted thread can truncate file.
+		let handle = thread::spawn(|| {
+			// Create and write a file. This should succeed before any landlock
+			// restrictions are applied.
+			let tmpfile = tempfile::NamedTempFile::new().unwrap();
+			let target = tmpfile.path();
+			fs::write(target, TEXT).unwrap();
+
+			// Apply Landlock with all exceptions under the current ABI.
+			let every_access = AccessFs::from_all(LANDLOCK_ABI);
+			assert_fully_enforced(try_restrict(vec![(target, every_access)]));
+
+			// Try to truncate the file.
+			assert!(tmpfile.as_file().set_len(0).is_ok());
+		});
+
+		assert!(handle.join().is_ok());
+	}
+}
@@ -0,0 +1,189 @@
+// Copyright (C) Parity Technologies (UK) Ltd.
+// This file is part of Polkadot.
+
+// Polkadot is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// Polkadot is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with Polkadot.  If not, see <http://www.gnu.org/licenses/>.
+
+//! Functionality for securing workers.
+//!
+//! This is needed because workers are used to compile and execute untrusted code (PVFs).
+//!
+//! We currently employ the following security measures:
+//!
+//! - Restrict filesystem
+//!   - Use Landlock to remove all unnecessary FS access rights.
+//!   - Unshare the user and mount namespaces.
+//!   - Change the root directory to a worker-specific temporary directory.
+//! - Restrict networking by blocking socket creation and io_uring.
+//! - Remove env vars
+
+use crate::{worker::WorkerKind, LOG_TARGET};
+
+#[cfg(target_os = "linux")]
+pub mod landlock;
+
+#[cfg(all(target_os = "linux", target_arch = "x86_64"))]
+pub mod seccomp;
+
+/// Unshare the user and mount namespaces and change root to be the worker directory.
+///
+/// `worker_kind` and `worker_pid` are logged; `worker_kind` additionally decides whether the new
+/// root is requested read-only (for `Execute` and `CheckPivotRoot`). `worker_dir_path` is the
+/// directory that becomes the new root.
+///
+/// NOTE: This should not be called in a multi-threaded context. `unshare(2)`:
+///       "CLONE_NEWUSER requires that the calling process is not threaded."
+///
+/// # Errors
+///
+/// Returns a string naming the step that failed together with the last OS error, or a description
+/// of the post-`pivot_root` assertion that did not hold.
+#[cfg(target_os = "linux")]
+pub fn unshare_user_namespace_and_change_root(
+	worker_kind: WorkerKind,
+	worker_pid: u32,
+	worker_dir_path: &std::path::Path,
+) -> Result<(), String> {
+	use std::{env, ffi::CString, os::unix::ffi::OsStrExt, path::Path, ptr};
+
+	// Turns a string literal into a pointer to a null-terminated C string.
+	// TODO: Remove this once this is stable: https://github.com/rust-lang/rust/issues/105723
+	macro_rules! cstr_ptr {
+		($e:expr) => {
+			concat!($e, "\0").as_ptr().cast::<core::ffi::c_char>()
+		};
+	}
+
+	gum::trace!(
+		target: LOG_TARGET,
+		%worker_kind,
+		%worker_pid,
+		?worker_dir_path,
+		"unsharing the user namespace and calling pivot_root",
+	);
+
+	// Null-terminated copy of the worker dir path for the libc calls below.
+	let worker_dir_path_c = CString::new(worker_dir_path.as_os_str().as_bytes())
+		.expect("on unix; the path will never contain 0 bytes; qed");
+
+	// Wrapper around all the work to prevent repetitive error handling.
+	//
+	// # Errors
+	//
+	// It's the caller's responsibility to call `Error::last_os_error`. Note that that alone does
+	// not give the context of which call failed, so we return a &str error.
+	|| -> Result<(), &'static str> {
+		// SAFETY: We pass null-terminated C strings and use the APIs as documented. In fact, steps
+		//         (2) and (3) are adapted from the example in pivot_root(2), with the additional
+		//         change described in the `pivot_root(".", ".")` section.
+		unsafe {
+			// 1. `unshare` the user and the mount namespaces.
+			if libc::unshare(libc::CLONE_NEWUSER | libc::CLONE_NEWNS) < 0 {
+				return Err("unshare user and mount namespaces")
+			}
+
+			// 2. Setup mounts.
+			//
+			// Ensure that new root and its parent mount don't have shared propagation (which would
+			// cause pivot_root() to return an error), and prevent propagation of mount events to
+			// the initial mount namespace.
+			if libc::mount(
+				ptr::null(),
+				cstr_ptr!("/"),
+				ptr::null(),
+				libc::MS_REC | libc::MS_PRIVATE,
+				ptr::null(),
+			) < 0
+			{
+				return Err("mount MS_PRIVATE")
+			}
+			// Ensure that the new root is a mount point.
+			//
+			// The mount is additionally requested read-only for the `Execute` and
+			// `CheckPivotRoot` worker kinds.
+			let additional_flags =
+				if let WorkerKind::Execute | WorkerKind::CheckPivotRoot = worker_kind {
+					libc::MS_RDONLY
+				} else {
+					0
+				};
+			// Bind-mount the worker dir onto itself.
+			//
+			// NOTE(review): mount(2) documents that for MS_BIND the remaining mountflags (other
+			// than MS_REC) are ignored and that making a bind mount read-only needs a separate
+			// MS_REMOUNT call -- confirm that the extra flags below actually take effect.
+			if libc::mount(
+				worker_dir_path_c.as_ptr(),
+				worker_dir_path_c.as_ptr(),
+				ptr::null(), // ignored when MS_BIND is used
+				libc::MS_BIND |
+					libc::MS_REC | libc::MS_NOEXEC |
+					libc::MS_NODEV | libc::MS_NOSUID |
+					libc::MS_NOATIME | additional_flags,
+				ptr::null(), // ignored when MS_BIND is used
+			) < 0
+			{
+				return Err("mount MS_BIND")
+			}
+
+			// 3. `pivot_root` to the worker directory.
+			if libc::chdir(worker_dir_path_c.as_ptr()) < 0 {
+				return Err("chdir to worker dir path")
+			}
+			if libc::syscall(libc::SYS_pivot_root, cstr_ptr!("."), cstr_ptr!(".")) < 0 {
+				return Err("pivot_root")
+			}
+			if libc::umount2(cstr_ptr!("."), libc::MNT_DETACH) < 0 {
+				return Err("umount the old root mount point")
+			}
+		}
+
+		Ok(())
+	}()
+	// Attach the OS error of the failed call to the name of the step that failed. Nothing runs
+	// between the failing call and this lookup.
+	.map_err(|err_ctx| {
+		let err = std::io::Error::last_os_error();
+		format!("{}: {}", err_ctx, err)
+	})?;
+
+	// Do some assertions.
+	//
+	// The current dir must now be `/`, and moving to `..` must still leave us at `/`.
+	if env::current_dir().map_err(|err| err.to_string())? != Path::new("/") {
+		return Err("expected current dir after pivot_root to be `/`".into())
+	}
+	env::set_current_dir("..").map_err(|err| err.to_string())?;
+	if env::current_dir().map_err(|err| err.to_string())? != Path::new("/") {
+		return Err("expected not to be able to break out of new root by doing `..`".into())
+	}
+
+	Ok(())
+}
+
+/// Require env vars to have been removed when spawning the process, to prevent malicious code from
+/// accessing them.
+///
+/// This only inspects the environment, it does not remove anything. `RUST_LOG` is tolerated, and
+/// on macOS so is `__CF_USER_TEXT_ENCODING`. Every other variable that is still present is logged
+/// as an error, using `worker_kind` and `worker_pid` as log context.
+///
+/// # Returns
+///
+/// `true` if no unexpected env var is present, `false` otherwise.
+pub fn check_env_vars_were_cleared(worker_kind: WorkerKind, worker_pid: u32) -> bool {
+	gum::trace!(
+		target: LOG_TARGET,
+		%worker_kind,
+		%worker_pid,
+		"checking that env vars were cleared in worker",
+	);
+
+	let mut ok = true;
+
+	for (key, value) in std::env::vars_os() {
+		// TODO: *theoretically* the value (or mere presence) of `RUST_LOG` can be a source of
+		// randomness for malicious code. In the future we can remove it also and log in the host;
+		// see <https://github.com/paritytech/polkadot/issues/7117>.
+		if key == "RUST_LOG" {
+			continue
+		}
+		// An exception for MacOS. This is not a secure platform anyway, so we let it slide.
+		#[cfg(target_os = "macos")]
+		if key == "__CF_USER_TEXT_ENCODING" {
+			continue
+		}
+
+		gum::error!(
+			target: LOG_TARGET,
+			%worker_kind,
+			%worker_pid,
+			?key,
+			?value,
+			"env var was present that should have been removed",
+		);
+
+		// Keep going so that every offending variable gets reported, not just the first.
+		ok = false;
+	}
+
+	ok
+}
@@ -0,0 +1,201 @@
+// Copyright (C) Parity Technologies (UK) Ltd.
+// This file is part of Polkadot.
+
+// Polkadot is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// Polkadot is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with Polkadot.  If not, see <http://www.gnu.org/licenses/>.
+
+//! Functionality for sandboxing workers by restricting their capabilities by blocking certain
+//! syscalls with seccomp.
+//!
+//! For security we block the following:
+//!
+//! - creation of new sockets - these are unneeded in PVF jobs, and we can safely block them without
+//!   affecting consensus.
+//!
+//! - `io_uring` - allows for networking and needs to be blocked. See below for a discussion on the
+//!   safety of doing this.
+//!
+//! # Safety of blocking io_uring
+//!
+//! `io_uring` is just a way of issuing system calls in an async manner, and there is nothing
+//! stopping wasmtime from legitimately using it. Fortunately, at the moment it does not. Generally,
+//! not many applications use `io_uring` in production yet, because of the numerous kernel CVEs
+//! discovered. It's still under a lot of development. Android outright banned `io_uring` for these
+//! reasons.
+//!
+//! Considering `io_uring`'s status discussed above, and that it very likely would get detected
+//! either by our [static analysis](https://github.com/paritytech/polkadot-sdk/pull/1663) or by
+//! testing, we think it is safe to block it.
+//!
+//! ## Consensus analysis
+//!
+//! If execution hits an edge case code path unique to a given machine, it's already taken a
+//! non-deterministic branch anyway. After all, we just care that the majority of validators reach
+//! the same result and preserve consensus. So worst-case scenario, there's a dispute, and we can
+//! always admit fault and refund the wrong validator. On the other hand, if all validators take the
+//! code path that results in a seccomp violation, then they would all vote against the current
+//! candidate, which is also fine. The violation would get logged (in big scary letters) and
+//! hopefully some validator reports it to us.
+//!
+//! Actually, a worst-worst-case scenario is that 50% of validators vote against, so that there is
+//! no consensus. But so many things would have to go wrong for that to happen:
+//!
+//! 1. An update to `wasmtime` is introduced that uses io_uring (unlikely as io_uring is mainly for
+//!    IO-heavy applications)
+//!
+//! 2. The new syscall is not detected by our static analysis
+//!
+//! 3. It is never triggered in any of our tests
+//!
+//! 4. It then gets triggered on some super edge case in production on 50% of validators causing a
+//!    stall (bad but very unlikely)
+//!
+//! 5. Or, it triggers on only a few validators causing a dispute (more likely but not as bad)
+//!
+//! Considering how many things would have to go wrong here, we believe it's safe to block
+//! `io_uring`.
+//!
+//! # Action on syscall violations
+//!
+//! On syscall violations we currently only log, to make sure this works correctly before enforcing.
+//!
+//! In the future, when a forbidden syscall is attempted we will immediately kill the process in
+//! order to prevent the attacker from doing anything else. In execution, this will result in
+//! voting against the candidate.
+
+use crate::{
+	worker::{stringify_panic_payload, WorkerKind},
+	LOG_TARGET,
+};
+use seccompiler::*;
+use std::{collections::BTreeMap, path::Path};
+
+/// The action to take on caught syscalls.
+///
+/// Outside of tests a caught syscall is only logged.
+#[cfg(not(test))]
+const CAUGHT_ACTION: SeccompAction = SeccompAction::Log;
+/// Don't kill the process when testing.
+///
+/// In tests a caught syscall fails with `EACCES` instead, which the test in this module observes
+/// as a permission-denied error.
+#[cfg(test)]
+const CAUGHT_ACTION: SeccompAction = SeccompAction::Errno(libc::EACCES as u32);
+
+/// Errors that can occur while building or applying the seccomp filter.
+#[derive(thiserror::Error, Debug)]
+pub enum Error {
+	/// Wraps a `seccompiler::Error`; displayed transparently.
+	#[error(transparent)]
+	Seccomp(#[from] seccompiler::Error),
+	/// Wraps a `seccompiler::BackendError`; displayed transparently.
+	#[error(transparent)]
+	Backend(#[from] seccompiler::BackendError),
+	/// The thread running `try_restrict` panicked; carries the stringified panic payload.
+	#[error("A panic occurred in try_restrict: {0}")]
+	Panic(String),
+}
+
+/// Result type of this module, with the error fixed to this module's [`Error`].
+pub type Result<T> = std::result::Result<T, Error>;
+
+/// Try to enable seccomp for the given kind of worker.
+///
+/// `worker_kind`, `worker_pid` and `worker_dir_path` are only used as context for the trace log;
+/// the same filter is applied regardless of their values.
+///
+/// # Errors
+///
+/// Returns whatever error `try_restrict` reports.
+pub fn enable_for_worker(
+	worker_kind: WorkerKind,
+	worker_pid: u32,
+	worker_dir_path: &Path,
+) -> Result<()> {
+	gum::trace!(
+		target: LOG_TARGET,
+		%worker_kind,
+		%worker_pid,
+		?worker_dir_path,
+		"enabling seccomp",
+	);
+
+	// Applies the filter to the thread that called this function.
+	try_restrict()
+}
+
+/// Probes whether seccomp with our rules can be fully enabled on the current Linux environment.
+///
+/// # Returns
+///
+/// `true` if applying the filter on a separate thread succeeded, `false` if it failed or the
+/// thread panicked.
+pub fn check_is_fully_enabled() -> bool {
+	// `try_restrict` applies the filter to the calling thread, so run it on a dedicated thread.
+	let outcome: Result<()> = std::thread::spawn(try_restrict)
+		.join()
+		.map_err(|payload| Error::Panic(stringify_panic_payload(payload)))
+		.and_then(|restricted| restricted);
+
+	outcome.is_ok()
+}
+
+/// Installs a `seccomp` filter on the calling thread in order to disable networking for the PVF
+/// threads.
+///
+/// Every syscall stays allowed except the blacklisted ones below, which trigger `CAUGHT_ACTION`.
+///
+/// # Errors
+///
+/// Fails if the filter cannot be constructed, compiled, or applied.
+pub fn try_restrict() -> Result<()> {
+	// The blacklist: syscalls that are caught by the filter.
+	let blocked_syscalls = [
+		// Restrict the creation of sockets.
+		libc::SYS_socketpair,
+		libc::SYS_socket,
+		// Prevent connecting to sockets for extra safety.
+		libc::SYS_connect,
+		// Restrict io_uring.
+		libc::SYS_io_uring_setup,
+		libc::SYS_io_uring_enter,
+		libc::SYS_io_uring_register,
+	];
+
+	// Each blacklisted syscall is registered with an empty list of rules.
+	let blacklisted_rules: BTreeMap<_, _> =
+		blocked_syscalls.iter().map(|&syscall| (syscall, Vec::new())).collect();
+
+	let bpf_prog: BpfProgram = SeccompFilter::new(
+		blacklisted_rules,
+		// Mismatch action: what to do if not in rule list.
+		SeccompAction::Allow,
+		// Match action: what to do if in rule list.
+		CAUGHT_ACTION,
+		TargetArch::x86_64,
+	)?
+	.try_into()?;
+
+	// Applies filter (runs seccomp) to the calling thread.
+	seccompiler::apply_filter(&bpf_prog)?;
+
+	Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+	use super::*;
+	use std::{io::ErrorKind, net::TcpListener, thread};
+
+	#[test]
+	fn sandboxed_thread_cannot_use_sockets() {
+		// TODO: This would be nice: <https://github.com/rust-lang/rust/issues/68007>.
+		if !check_is_fully_enabled() {
+			return
+		}
+
+		let sandboxed = thread::spawn(|| {
+			// Before seccomp is applied, opening a socket must succeed.
+			TcpListener::bind("127.0.0.1:0").unwrap();
+
+			if try_restrict().is_err() {
+				panic!("Ruleset should be enforced since we checked if seccomp is enabled");
+			}
+
+			// Once the filter is in place, opening a socket must be denied.
+			let denied = match TcpListener::bind("127.0.0.1:0") {
+				Err(err) => matches!(err.kind(), ErrorKind::PermissionDenied),
+				Ok(_) => false,
+			};
+			assert!(denied);
+
+			// Other syscalls should still work.
+			let parent_pid = unsafe { libc::getppid() };
+			assert!(parent_pid > 0);
+		});
+
+		assert!(sandboxed.join().is_ok());
+	}
+}