PVF: Add Secure Validator Mode (#2486)

Co-authored-by: Javier Viola <javier@parity.io>
2026-06-14 12:11:09 +00:00 · 2023-12-05 13:32:56 +01:00
parent f240e02557
commit c046a9d5ed
31 changed files with 690 additions and 469 deletions
@@ -66,6 +66,7 @@ use polkadot_parachain_primitives::primitives::ValidationCodeHash;
 use polkadot_primitives::ExecutorParamsHash;
 use std::{
 	collections::HashMap,
+	io,
 	path::{Path, PathBuf},
 	str::FromStr as _,
 	time::{Duration, SystemTime},
@@ -290,7 +291,17 @@ impl Artifacts {
 		}

 		// Make sure that the cache path directory and all its parents are created.
-		let _ = tokio::fs::create_dir_all(cache_path).await;
+		if let Err(err) = tokio::fs::create_dir_all(cache_path).await {
+			if err.kind() != io::ErrorKind::AlreadyExists {
+				gum::error!(
+					target: LOG_TARGET,
+					?err,
+					"failed to create dir {:?}",
+					cache_path,
+				);
+				return
+			}
+		}

 		let mut dir = match tokio::fs::read_dir(cache_path).await {
 			Ok(dir) => dir,
@@ -62,16 +62,16 @@ pub async fn spawn(
 		security_status,
 	)
 	.await?;
-	send_handshake(&mut idle_worker.stream, Handshake { executor_params })
+	send_execute_handshake(&mut idle_worker.stream, Handshake { executor_params })
 		.await
 		.map_err(|error| {
+			let err = SpawnErr::Handshake { err: error.to_string() };
 			gum::warn!(
 				target: LOG_TARGET,
 				worker_pid = %idle_worker.pid,
-				?error,
-				"failed to send a handshake to the spawned worker",
+				%err
 			);
-			SpawnErr::Handshake
+			err
 		})?;
 	Ok((idle_worker, worker_handle))
 }
@@ -286,7 +286,8 @@ where
 	outcome
 }

-async fn send_handshake(stream: &mut UnixStream, handshake: Handshake) -> io::Result<()> {
+/// Sends a handshake with information specific to the execute worker, encoded and written via `framed_send`.
+async fn send_execute_handshake(stream: &mut UnixStream, handshake: Handshake) -> io::Result<()> {
 	framed_send(stream, &handshake.encode()).await // The `io::Result` of `framed_send` is returned as-is.
 }

@@ -36,7 +36,7 @@ use polkadot_node_core_pvf_common::{
 	prepare::PrepareSuccess,
 	pvf::PvfPrepData,
 };
-use polkadot_node_subsystem::SubsystemResult;
+use polkadot_node_subsystem::{SubsystemError, SubsystemResult};
 use polkadot_parachain_primitives::primitives::ValidationResult;
 use std::{
 	collections::HashMap,
@@ -156,6 +156,8 @@ pub struct Config {
 	pub cache_path: PathBuf,
 	/// The version of the node. `None` can be passed to skip the version check (only for tests).
 	pub node_version: Option<String>,
+	/// Whether the node is attempting to run as a secure validator.
+	pub secure_validator_mode: bool,

 	/// The path to the program that can be used to spawn the prepare workers.
 	pub prepare_worker_program_path: PathBuf,
@@ -180,12 +182,14 @@ impl Config {
 	pub fn new(
 		cache_path: PathBuf,
 		node_version: Option<String>,
+		secure_validator_mode: bool,
 		prepare_worker_program_path: PathBuf,
 		execute_worker_program_path: PathBuf,
 	) -> Self {
 		Self {
 			cache_path,
 			node_version,
+			secure_validator_mode,

 			prepare_worker_program_path,
 			prepare_worker_spawn_timeout: Duration::from_secs(3),
@@ -213,8 +217,12 @@ pub async fn start(
 ) -> SubsystemResult<(ValidationHost, impl Future<Output = ()>)> {
 	gum::debug!(target: LOG_TARGET, ?config, "starting PVF validation host");

-	// Run checks for supported security features once per host startup. Warn here if not enabled.
-	let security_status = security::check_security_status(&config).await;
+	// Run checks for supported security features once per host startup. If some checks fail, only
+	// warn if Secure Validator Mode is disabled or every failure is optional; otherwise error out.
+	let security_status = match security::check_security_status(&config).await {
+		Ok(ok) => ok,
+		Err(err) => return Err(SubsystemError::Context(err)),
+	};

 	let (to_host_tx, to_host_rx) = mpsc::channel(10);

@@ -18,18 +18,19 @@ use crate::{Config, SecurityStatus, LOG_TARGET};
 use futures::join;
 use std::{fmt, path::Path};

-const SECURE_MODE_ANNOUNCEMENT: &'static str =
-	"In the next release this will be a hard error by default.
-     \nMore information: https://wiki.polkadot.network/docs/maintain-guides-secure-validator#secure-validator-mode";
-
 /// Run checks for supported security features.
 ///
 /// # Returns
 ///
 /// Returns the set of security features that we were able to enable. If an error occurs while
 /// enabling a security feature we set the corresponding status to `false`.
-pub async fn check_security_status(config: &Config) -> SecurityStatus {
-	let Config { prepare_worker_program_path, cache_path, .. } = config;
+///
+/// # Errors
+///
+/// Returns an error only if we could not fully enforce the security level required by the current
+/// configuration.
+pub async fn check_security_status(config: &Config) -> Result<SecurityStatus, String> {
+	let Config { prepare_worker_program_path, secure_validator_mode, cache_path, .. } = config;

 	let (landlock, seccomp, change_root) = join!(
 		check_landlock(prepare_worker_program_path),
@@ -37,26 +38,81 @@ pub async fn check_security_status(config: &Config) -> SecurityStatus {
 		check_can_unshare_user_namespace_and_change_root(prepare_worker_program_path, cache_path)
 	);

-	let security_status = SecurityStatus {
-		can_enable_landlock: landlock.is_ok(),
-		can_enable_seccomp: seccomp.is_ok(),
-		can_unshare_user_namespace_and_change_root: change_root.is_ok(),
-	};
+	let full_security_status =
+		FullSecurityStatus::new(*secure_validator_mode, landlock, seccomp, change_root);
+	let security_status = full_security_status.as_partial();

-	let errs: Vec<SecureModeError> = [landlock, seccomp, change_root]
-		.into_iter()
-		.filter_map(|result| result.err())
-		.collect();
-	let err_occurred = print_secure_mode_message(errs);
-	if err_occurred {
-		gum::error!(
+	if full_security_status.err_occurred() {
+		print_secure_mode_error_or_warning(&full_security_status);
+		if !full_security_status.all_errs_allowed() {
+			return Err("could not enable Secure Validator Mode; check logs".into())
+		}
+	}
+
+	if security_status.secure_validator_mode {
+		gum::info!(
 			target: LOG_TARGET,
-			"{}",
-			SECURE_MODE_ANNOUNCEMENT,
+			"👮‍♀️ Running in Secure Validator Mode. \
+			 It is highly recommended that you operate according to our security guidelines. \
+			 \nMore information: https://wiki.polkadot.network/docs/maintain-guides-secure-validator#secure-validator-mode"
 		);
 	}

-	security_status
+	Ok(security_status)
+}
+
+/// Contains the full security status including error states.
+struct FullSecurityStatus {
+	partial: SecurityStatus, // Per-feature availability flags plus the secure-mode setting.
+	errs: Vec<SecureModeError>, // Errors of the checks that failed; empty if all of them passed.
+}
+
+impl FullSecurityStatus {
+	fn new( // Builds the status from the mode flag and the results of the three checks.
+		secure_validator_mode: bool,
+		landlock: SecureModeResult,
+		seccomp: SecureModeResult,
+		change_root: SecureModeResult,
+	) -> Self {
+		Self {
+			partial: SecurityStatus {
+				secure_validator_mode,
+				can_enable_landlock: landlock.is_ok(), // A feature is flagged available iff its check passed.
+				can_enable_seccomp: seccomp.is_ok(),
+				can_unshare_user_namespace_and_change_root: change_root.is_ok(),
+			},
+			errs: [landlock, seccomp, change_root] // Keep only the failures, in this fixed order.
+				.into_iter()
+				.filter_map(|result| result.err())
+				.collect(),
+		}
+	}
+
+	fn as_partial(&self) -> SecurityStatus { // Returns a copy of the status without the error details.
+		self.partial.clone()
+	}
+
+	fn err_occurred(&self) -> bool { // `true` if at least one check failed.
+		!self.errs.is_empty()
+	}
+
+	fn all_errs_allowed(&self) -> bool { // `true` if none of the recorded errors is mandatory.
+		!self.partial.secure_validator_mode || // Outside Secure Validator Mode every error is allowed.
+			self.errs.iter().all(|err| err.is_allowed_in_secure_mode(&self.partial))
+	}
+
+	fn errs_string(&self) -> String { // Renders the errors as a bulleted list, one error per line.
+		self.errs
+			.iter()
+			.map(|err| {
+				format!(
+					"\n  - {}{}",
+					if err.is_allowed_in_secure_mode(&self.partial) { "Optional: " } else { "" }, // Tag allowed errors.
+					err
+				)
+			})
+			.collect()
+	}
+}

 type SecureModeResult = std::result::Result<(), SecureModeError>;
@@ -71,12 +127,17 @@ enum SecureModeError {

 impl SecureModeError {
 	/// Whether this error is allowed with Secure Validator Mode enabled.
-	fn is_allowed_in_secure_mode(&self) -> bool {
+	fn is_allowed_in_secure_mode(&self, security_status: &SecurityStatus) -> bool {
 		use SecureModeError::*;
 		match self {
-			CannotEnableLandlock(_) => true,
+			// Landlock is only present on relatively recent Linuxes. Its absence is tolerated when
+			// the unshare capability is available, as that provides FS sandboxing a different way.
+			CannotEnableLandlock(_) => security_status.can_unshare_user_namespace_and_change_root,
+			// seccomp should be present on all modern Linuxes unless it's been disabled; never allowed.
 			CannotEnableSeccomp(_) => false,
-			CannotUnshareUserNamespaceAndChangeRoot(_) => false,
+			// Should always be present on modern Linuxes. If not, this is tolerated only when Landlock
+			// can be enabled, since Landlock also provides FS sandboxing.
+			CannotUnshareUserNamespaceAndChangeRoot(_) => security_status.can_enable_landlock,
 		}
 	}
 }
@@ -92,12 +153,8 @@ impl fmt::Display for SecureModeError {
 	}
 }

-/// Errors if Secure Validator Mode and some mandatory errors occurred, warn otherwise.
-///
-/// # Returns
-///
-/// `true` if an error was printed, `false` otherwise.
-fn print_secure_mode_message(errs: Vec<SecureModeError>) -> bool {
+/// Print an error if Secure Validator Mode is enabled and some mandatory errors occurred, warn otherwise.
+fn print_secure_mode_error_or_warning(security_status: &FullSecurityStatus) {
 	// Trying to run securely and some mandatory errors occurred.
 	const SECURE_MODE_ERROR: &'static str = "🚨 Your system cannot securely run a validator. \
 		 \nRunning validation of malicious PVF code has a higher risk of compromising this machine.";
@@ -105,39 +162,31 @@ fn print_secure_mode_message(errs: Vec<SecureModeError>) -> bool {
 	// securely.
 	const SECURE_MODE_WARNING: &'static str = "🚨 Some security issues have been detected. \
 		 \nRunning validation of malicious PVF code has a higher risk of compromising this machine.";
+	// Message to be printed only when running securely and mandatory errors occurred.
+	const IGNORE_SECURE_MODE_TIP: &'static str =
+		"\nYou can ignore this error with the `--insecure-validator-i-know-what-i-do` \
+		 command line argument if you understand and accept the risks of running insecurely. \
+		 With this flag, security features are enabled on a best-effort basis, but not mandatory. \
+		 \nMore information: https://wiki.polkadot.network/docs/maintain-guides-secure-validator#secure-validator-mode";

-	if errs.is_empty() {
-		return false
-	}
+	let all_errs_allowed = security_status.all_errs_allowed();
+	let errs_string = security_status.errs_string();

-	let errs_allowed = errs.iter().all(|err| err.is_allowed_in_secure_mode());
-	let errs_string: String = errs
-		.iter()
-		.map(|err| {
-			format!(
-				"\n  - {}{}",
-				if err.is_allowed_in_secure_mode() { "Optional: " } else { "" },
-				err
-			)
-		})
-		.collect();
-
-	if errs_allowed {
+	if all_errs_allowed {
 		gum::warn!(
 			target: LOG_TARGET,
 			"{}{}",
 			SECURE_MODE_WARNING,
 			errs_string,
 		);
-		false
 	} else {
 		gum::error!(
 			target: LOG_TARGET,
-			"{}{}",
+			"{}{}{}",
 			SECURE_MODE_ERROR,
 			errs_string,
+			IGNORE_SECURE_MODE_TIP
 		);
-		true
 	}
 }

@@ -298,3 +347,53 @@ async fn check_seccomp(
 		}
 	}
 }
+
+#[cfg(test)]
+mod tests {
+	use super::*;
+
+	#[test]
+	fn test_secure_mode_error_optionality() { // Checks which errors are tolerated for a given status.
+		let err = SecureModeError::CannotEnableLandlock(String::new()); // Allowed iff unshare/change-root is available.
+		assert!(err.is_allowed_in_secure_mode(&SecurityStatus {
+			secure_validator_mode: true,
+			can_enable_landlock: false,
+			can_enable_seccomp: false,
+			can_unshare_user_namespace_and_change_root: true
+		}));
+		assert!(!err.is_allowed_in_secure_mode(&SecurityStatus {
+			secure_validator_mode: true,
+			can_enable_landlock: false,
+			can_enable_seccomp: true,
+			can_unshare_user_namespace_and_change_root: false
+		}));
+
+		let err = SecureModeError::CannotEnableSeccomp(String::new()); // Never allowed, whatever else is available.
+		assert!(!err.is_allowed_in_secure_mode(&SecurityStatus {
+			secure_validator_mode: true,
+			can_enable_landlock: false,
+			can_enable_seccomp: false,
+			can_unshare_user_namespace_and_change_root: true
+		}));
+		assert!(!err.is_allowed_in_secure_mode(&SecurityStatus {
+			secure_validator_mode: true,
+			can_enable_landlock: false,
+			can_enable_seccomp: true,
+			can_unshare_user_namespace_and_change_root: false
+		}));
+
+		let err = SecureModeError::CannotUnshareUserNamespaceAndChangeRoot(String::new()); // Allowed iff Landlock is available.
+		assert!(err.is_allowed_in_secure_mode(&SecurityStatus {
+			secure_validator_mode: true,
+			can_enable_landlock: true,
+			can_enable_seccomp: false,
+			can_unshare_user_namespace_and_change_root: false
+		}));
+		assert!(!err.is_allowed_in_secure_mode(&SecurityStatus {
+			secure_validator_mode: true,
+			can_enable_landlock: false,
+			can_enable_seccomp: true,
+			can_unshare_user_namespace_and_change_root: false
+		}));
+	}
+}
@@ -19,8 +19,9 @@
 use crate::LOG_TARGET;
 use futures::FutureExt as _;
 use futures_timer::Delay;
+use parity_scale_codec::Encode;
 use pin_project::pin_project;
-use polkadot_node_core_pvf_common::SecurityStatus;
+use polkadot_node_core_pvf_common::{SecurityStatus, WorkerHandshake};
 use rand::Rng;
 use std::{
 	fmt, mem,
@@ -68,83 +69,54 @@ pub async fn spawn_with_program_path(
 	let program_path = program_path.into();
 	let worker_dir = WorkerDir::new(debug_id, cache_path).await?;
 	let extra_args: Vec<String> = extra_args.iter().map(|arg| arg.to_string()).collect();
+	// Copy what the error log below needs; the originals are moved into the future below.
+	let program_path_clone = program_path.clone();
+	let worker_dir_clone = worker_dir.path().to_owned();
+	let extra_args_clone = extra_args.clone();

 	with_transient_socket_path(debug_id, |socket_path| {
 		let socket_path = socket_path.to_owned();
-		let worker_dir_path = worker_dir.path().to_owned();

 		async move {
-			let listener = UnixListener::bind(&socket_path).map_err(|err| {
-				gum::warn!(
-					target: LOG_TARGET,
-					%debug_id,
-					?program_path,
-					?extra_args,
-					?worker_dir,
-					?socket_path,
-					"cannot bind unix socket: {:?}",
-					err,
-				);
-				SpawnErr::Bind
-			})?;
+			let listener = match UnixListener::bind(&socket_path) {
+				Ok(ok) => ok,
+				Err(err) => return Err(SpawnErr::Bind { socket_path, err: err.to_string() }),
+			};

-			let handle = WorkerHandle::spawn(
-				&program_path,
-				&extra_args,
-				&socket_path,
-				&worker_dir_path,
-				security_status,
-			)
-			.map_err(|err| {
-				gum::warn!(
-					target: LOG_TARGET,
-					%debug_id,
-					?program_path,
-					?extra_args,
-					?worker_dir_path,
-					?socket_path,
-					"cannot spawn a worker: {:?}",
-					err,
-				);
-				SpawnErr::ProcessSpawn
-			})?;
+			let handle =
+				WorkerHandle::spawn(&program_path, &extra_args, &socket_path, &worker_dir.path())
+					.map_err(|err| SpawnErr::ProcessSpawn { program_path, err: err.to_string() })?;

 			futures::select! {
 				accept_result = listener.accept().fuse() => {
-					let (stream, _) = accept_result.map_err(|err| {
-						gum::warn!(
-							target: LOG_TARGET,
-							%debug_id,
-							?program_path,
-							?extra_args,
-							?worker_dir_path,
-							?socket_path,
-							"cannot accept a worker: {:?}",
-							err,
-						);
-						SpawnErr::Accept
-					})?;
+					let (mut stream, _) = accept_result
+						.map_err(|err| SpawnErr::Accept { socket_path, err: err.to_string() })?;
+					send_worker_handshake(&mut stream, WorkerHandshake { security_status })
+						.await
+						.map_err(|err| SpawnErr::Handshake { err: err.to_string() })?;
 					Ok((IdleWorker { stream, pid: handle.id(), worker_dir }, handle))
 				}
-				_ = Delay::new(spawn_timeout).fuse() => {
-					gum::warn!(
-						target: LOG_TARGET,
-						%debug_id,
-						?program_path,
-						?extra_args,
-						?worker_dir_path,
-						?socket_path,
-						?spawn_timeout,
-						"spawning and connecting to socket timed out",
-					);
-					Err(SpawnErr::AcceptTimeout)
-				}
+				_ = Delay::new(spawn_timeout).fuse() => Err(SpawnErr::AcceptTimeout{spawn_timeout}),
 			}
 		}
 	})
 	.await
+	.map_err(|err| {
+		gum::warn!(
+			target: LOG_TARGET,
+			%debug_id,
+			?program_path_clone,
+			?extra_args_clone,
+			?worker_dir_clone,
+			"error spawning worker: {}",
+			err,
+		);
+		err
+	})
 }

+/// A temporary, random, free path that is necessary only to establish socket communications. If a
+/// directory exists at the path at the end of this function, it is removed then.
 async fn with_transient_socket_path<T, F, Fut>(debug_id: &'static str, f: F) -> Result<T, SpawnErr>
 where
 	F: FnOnce(&Path) -> Fut,
@@ -214,21 +186,26 @@ pub struct IdleWorker {
 	pub worker_dir: WorkerDir,
 }

+/// This is publicly exposed only for integration tests.
+///
 /// An error that happened while spawning a worker process.
-#[derive(Clone, Debug)]
+#[derive(thiserror::Error, Clone, Debug)]
+#[doc(hidden)]
 pub enum SpawnErr {
-	/// Cannot obtain a temporary path location.
+	#[error("cannot obtain a temporary path location")]
 	TmpPath,
-	/// Cannot bind the socket to the given path.
-	Bind,
-	/// An error happened during accepting a connection to the socket.
-	Accept,
-	/// An error happened during spawning the process.
-	ProcessSpawn,
-	/// The deadline allotted for the worker spawning and connecting to the socket has elapsed.
-	AcceptTimeout,
-	/// Failed to send handshake after successful spawning was signaled
-	Handshake,
+	#[error("cannot bind the socket to the given path {socket_path:?}: {err}")]
+	Bind { socket_path: PathBuf, err: String }, // In every variant, `err` is the underlying error rendered with `to_string()`.
+	#[error(
+		"an error happened during accepting a connection to the socket {socket_path:?}: {err}"
+	)]
+	Accept { socket_path: PathBuf, err: String },
+	#[error("an error happened during spawning the process at path {program_path:?}: {err}")]
+	ProcessSpawn { program_path: PathBuf, err: String },
+	#[error("the deadline {}ms allotted for the worker spawning and connecting to the socket has elapsed", .spawn_timeout.as_millis())]
+	AcceptTimeout { spawn_timeout: Duration },
+	#[error("failed to send handshake after successful spawning was signaled: {err}")]
+	Handshake { err: String },
 }

 /// This is a representation of a potentially running worker. Drop it and the process will be
@@ -256,22 +233,7 @@ impl WorkerHandle {
 		extra_args: &[String],
 		socket_path: impl AsRef<Path>,
 		worker_dir_path: impl AsRef<Path>,
-		security_status: SecurityStatus,
 	) -> io::Result<Self> {
-		let security_args = {
-			let mut args = vec![];
-			if security_status.can_enable_landlock {
-				args.push("--can-enable-landlock".to_string());
-			}
-			if security_status.can_enable_seccomp {
-				args.push("--can-enable-seccomp".to_string());
-			}
-			if security_status.can_unshare_user_namespace_and_change_root {
-				args.push("--can-unshare-user-namespace-and-change-root".to_string());
-			}
-			args
-		};
-
 		// Clear all env vars from the spawned process.
 		let mut command = process::Command::new(program.as_ref());
 		command.env_clear();
@@ -286,7 +248,6 @@ impl WorkerHandle {
 			.arg(socket_path.as_ref().as_os_str())
 			.arg("--worker-dir-path")
 			.arg(worker_dir_path.as_ref().as_os_str())
-			.args(&security_args)
 			.stdout(std::process::Stdio::piped())
 			.kill_on_drop(true)
 			.spawn()?;
@@ -386,6 +347,14 @@ pub async fn framed_recv(r: &mut (impl AsyncRead + Unpin)) -> io::Result<Vec<u8>
 	Ok(buf)
 }

+/// Sends a handshake with information for the worker, encoded and written via `framed_send`.
+async fn send_worker_handshake(
+	stream: &mut UnixStream,
+	handshake: WorkerHandshake, // The caller builds this from the host's `SecurityStatus`.
+) -> io::Result<()> {
+	framed_send(stream, &handshake.encode()).await
+}
+
 /// A temporary worker dir that contains only files needed by the worker. The worker will change its
 /// root (the `/` directory) to this directory; it should have access to no other paths on its
 /// filesystem.
@@ -433,8 +402,6 @@ impl WorkerDir {

 // Not async since Rust has trouble with async recursion. There should be few files here anyway.
 //
-// TODO: A lingering malicious job can still access future files in this dir. See
-// <https://github.com/paritytech/polkadot-sdk/issues/574> for how to fully secure this.
 /// Clear the temporary worker dir without deleting it. Not deleting is important because the worker
 /// has mounted its own separate filesystem here.
 ///