PVF worker: switch on seccomp networking restrictions (#2221)

2026-06-23 22:31:06 +00:00 · 2023-11-21 12:52:46 +01:00
parent 40afc77c4e
commit 552be4800d
13 changed files with 202 additions and 146 deletions
@@ -77,7 +77,7 @@ pub enum PrepareError {
 	#[codec(index = 9)]
 	ClearWorkerDir(String),
 	/// The preparation job process died, due to OOM, a seccomp violation, or some other factor.
-	JobDied(String),
+	JobDied { err: String, job_pid: i32 },
 	#[codec(index = 10)]
 	/// Some error occurred when interfacing with the kernel.
 	#[codec(index = 11)]
@@ -96,7 +96,7 @@ impl PrepareError {
 		match self {
 			Prevalidation(_) | Preparation(_) | JobError(_) | OutOfMemory => true,
 			IoErr(_) |
-			JobDied(_) |
+			JobDied { .. } |
 			CreateTmpFile(_) |
 			RenameTmpFile { .. } |
 			ClearWorkerDir(_) |
@@ -119,7 +119,8 @@ impl fmt::Display for PrepareError {
 			JobError(err) => write!(f, "panic: {}", err),
 			TimedOut => write!(f, "prepare: timeout"),
 			IoErr(err) => write!(f, "prepare: io error while receiving response: {}", err),
-			JobDied(err) => write!(f, "prepare: prepare job died: {}", err),
+			JobDied { err, job_pid } =>
+				write!(f, "prepare: prepare job with pid {job_pid} died: {err}"),
 			CreateTmpFile(err) => write!(f, "prepare: error creating tmp file: {}", err),
 			RenameTmpFile { err, src, dest } =>
 				write!(f, "prepare: error renaming tmp file ({:?} -> {:?}): {}", src, dest, err),
@@ -46,7 +46,7 @@ pub enum WorkerResponse {
 	///
 	/// We cannot treat this as an internal error because malicious code may have killed the job.
 	/// We still retry it, because in the non-malicious case it is likely spurious.
-	JobDied(String),
+	JobDied { err: String, job_pid: i32 },
 	/// An unexpected error occurred in the job process, e.g. failing to spawn a thread, panic,
 	/// etc.
 	///
@@ -53,7 +53,7 @@ pub struct SecurityStatus {
 	pub can_enable_landlock: bool,
 	/// Whether the seccomp features we use are fully available on this system.
 	pub can_enable_seccomp: bool,
-	// Whether we are able to unshare the user namespace and change the filesystem root.
+	/// Whether we are able to unshare the user namespace and change the filesystem root.
 	pub can_unshare_user_namespace_and_change_root: bool,
 }

@@ -219,7 +219,7 @@ pub fn run_worker<F>(
 	#[cfg_attr(not(target_os = "linux"), allow(unused_mut))] mut worker_dir_path: PathBuf,
 	node_version: Option<&str>,
 	worker_version: Option<&str>,
-	#[cfg_attr(not(target_os = "linux"), allow(unused_variables))] security_status: &SecurityStatus,
+	security_status: &SecurityStatus,
 	mut event_loop: F,
 ) where
 	F: FnMut(UnixStream, PathBuf) -> io::Result<Never>,
@@ -67,11 +67,9 @@
 //!
 //! # Action on syscall violations
 //!
-//! On syscall violations we currently only log, to make sure this works correctly before enforcing.
-//!
-//! In the future, when a forbidden syscall is attempted we immediately kill the process in order to
-//! prevent the attacker from doing anything else. In execution, this will result in voting against
-//! the candidate.
+//! When a forbidden syscall is attempted we immediately kill the process in order to prevent the
+//! attacker from doing anything else. In execution, this will result in voting against the
+//! candidate.

 use crate::{
 	worker::{stringify_panic_payload, WorkerKind},
@@ -82,7 +80,7 @@ use std::{collections::BTreeMap, path::Path};

 /// The action to take on caught syscalls.
 #[cfg(not(test))]
-const CAUGHT_ACTION: SeccompAction = SeccompAction::Log;
+const CAUGHT_ACTION: SeccompAction = SeccompAction::KillProcess;
 /// Don't kill the process when testing.
 #[cfg(test)]
 const CAUGHT_ACTION: SeccompAction = SeccompAction::Errno(libc::EACCES as u32);
@@ -222,12 +222,6 @@ pub fn worker_entrypoint(
 					},
 				};

-				gum::trace!(
-					target: LOG_TARGET,
-					%worker_pid,
-					"worker: sending response to host: {:?}",
-					response
-				);
 				send_response(&mut stream, response)?;
 			}
 		},
@@ -360,7 +354,7 @@ fn handle_child_process(
 /// - The response, either `Ok` or some error state.
 fn handle_parent_process(
 	mut pipe_read: PipeReader,
-	child: Pid,
+	job_pid: Pid,
 	worker_pid: u32,
 	usage_before: Usage,
 	timeout: Duration,
@@ -373,10 +367,11 @@ fn handle_parent_process(
 		// Should retry at any rate.
 		.map_err(|err| io::Error::new(io::ErrorKind::Other, err.to_string()))?;

-	let status = nix::sys::wait::waitpid(child, None);
+	let status = nix::sys::wait::waitpid(job_pid, None);
 	gum::trace!(
 		target: LOG_TARGET,
 		%worker_pid,
+		%job_pid,
 		"execute worker received wait status from job: {:?}",
 		status,
 	);
@@ -396,6 +391,7 @@ fn handle_parent_process(
 		gum::warn!(
 			target: LOG_TARGET,
 			%worker_pid,
+			%job_pid,
 			"execute job took {}ms cpu time, exceeded execute timeout {}ms",
 			cpu_tv.as_millis(),
 			timeout.as_millis(),
@@ -428,6 +424,7 @@ fn handle_parent_process(
 					gum::warn!(
 						target: LOG_TARGET,
 						%worker_pid,
+						%job_pid,
 						"execute job error: {}",
 						job_error,
 					);
@@ -443,15 +440,18 @@ fn handle_parent_process(
 		//
 		// The job gets SIGSYS on seccomp violations, but this signal may have been sent for some
 		// other reason, so we still need to check for seccomp violations elsewhere.
-		Ok(WaitStatus::Signaled(_pid, signal, _core_dump)) =>
-			Ok(WorkerResponse::JobDied(format!("received signal: {signal:?}"))),
+		Ok(WaitStatus::Signaled(_pid, signal, _core_dump)) => Ok(WorkerResponse::JobDied {
+			err: format!("received signal: {signal:?}"),
+			job_pid: job_pid.as_raw(),
+		}),
 		Err(errno) => Ok(internal_error_from_errno("waitpid", errno)),

 		// It is within an attacker's power to send an unexpected exit status. So we cannot treat
 		// this as an internal error (which would make us abstain), but must vote against.
-		Ok(unexpected_wait_status) => Ok(WorkerResponse::JobDied(format!(
-			"unexpected status from wait: {unexpected_wait_status:?}"
-		))),
+		Ok(unexpected_wait_status) => Ok(WorkerResponse::JobDied {
+			err: format!("unexpected status from wait: {unexpected_wait_status:?}"),
+			job_pid: job_pid.as_raw(),
+		}),
 	}
 }

@@ -257,9 +257,9 @@ pub fn worker_entrypoint(

 						handle_parent_process(
 							pipe_reader,
+							worker_pid,
 							child,
 							temp_artifact_dest.clone(),
-							worker_pid,
 							usage_before,
 							preparation_timeout,
 						)
@@ -506,9 +506,9 @@ fn handle_child_process(
 /// - If the child process timeout, it returns `PrepareError::TimedOut`.
 fn handle_parent_process(
 	mut pipe_read: PipeReader,
-	child: Pid,
-	temp_artifact_dest: PathBuf,
 	worker_pid: u32,
+	job_pid: Pid,
+	temp_artifact_dest: PathBuf,
 	usage_before: Usage,
 	timeout: Duration,
 ) -> Result<PrepareWorkerSuccess, PrepareError> {
@@ -518,10 +518,11 @@ fn handle_parent_process(
 		.read_to_end(&mut received_data)
 		.map_err(|err| PrepareError::IoErr(err.to_string()))?;

-	let status = nix::sys::wait::waitpid(child, None);
+	let status = nix::sys::wait::waitpid(job_pid, None);
 	gum::trace!(
 		target: LOG_TARGET,
 		%worker_pid,
+		%job_pid,
 		"prepare worker received wait status from job: {:?}",
 		status,
 	);
@@ -539,6 +540,7 @@ fn handle_parent_process(
 		gum::warn!(
 			target: LOG_TARGET,
 			%worker_pid,
+			%job_pid,
 			"prepare job took {}ms cpu time, exceeded prepare timeout {}ms",
 			cpu_tv.as_millis(),
 			timeout.as_millis(),
@@ -573,6 +575,7 @@ fn handle_parent_process(
 					gum::debug!(
 						target: LOG_TARGET,
 						%worker_pid,
+						%job_pid,
 						"worker: writing artifact to {}",
 						temp_artifact_dest.display(),
 					);
@@ -593,15 +596,18 @@ fn handle_parent_process(
 		//
 		// The job gets SIGSYS on seccomp violations, but this signal may have been sent for some
 		// other reason, so we still need to check for seccomp violations elsewhere.
-		Ok(WaitStatus::Signaled(_pid, signal, _core_dump)) =>
-			Err(PrepareError::JobDied(format!("received signal: {signal:?}"))),
+		Ok(WaitStatus::Signaled(_pid, signal, _core_dump)) => Err(PrepareError::JobDied {
+			err: format!("received signal: {signal:?}"),
+			job_pid: job_pid.as_raw(),
+		}),
 		Err(errno) => Err(error_from_errno("waitpid", errno)),

 		// An attacker can make the child process return any exit status it wants. So we can treat
 		// all unexpected cases the same way.
-		Ok(unexpected_wait_status) => Err(PrepareError::JobDied(format!(
-			"unexpected status from wait: {unexpected_wait_status:?}"
-		))),
+		Ok(unexpected_wait_status) => Err(PrepareError::JobDied {
+			err: format!("unexpected status from wait: {unexpected_wait_status:?}"),
+			job_pid: job_pid.as_raw(),
+		}),
 	}
 }

@@ -33,7 +33,7 @@ use polkadot_node_core_pvf_common::{
 	execute::{Handshake, WorkerResponse},
 	worker_dir, SecurityStatus,
 };
-use polkadot_parachain_primitives::primitives::ValidationResult;
+use polkadot_parachain_primitives::primitives::{ValidationCodeHash, ValidationResult};
 use polkadot_primitives::ExecutorParams;
 use std::{path::Path, time::Duration};
 use tokio::{io, net::UnixStream};
@@ -156,6 +156,16 @@ pub async fn start_work(
 		let response = futures::select! {
 			response = recv_response(&mut stream).fuse() => {
 				match response {
+					Ok(response) =>
+						handle_response(
+							response,
+							pid,
+							&artifact.id.code_hash,
+							&artifact_path,
+							execution_timeout,
+							audit_log_file
+						)
+							.await,
 					Err(error) => {
 						gum::warn!(
 							target: LOG_TARGET,
@@ -164,56 +174,9 @@ pub async fn start_work(
 							?error,
 							"failed to recv an execute response",
 						);
-						// The worker died. Check if it was due to a seccomp violation.
-						//
-						// NOTE: Log, but don't change the outcome. Not all validators may have
-						// auditing enabled, so we don't want attackers to abuse a non-deterministic
-						// outcome.
-						for syscall in security::check_seccomp_violations_for_worker(audit_log_file, pid).await {
-							gum::error!(
-								target: LOG_TARGET,
-								worker_pid = %pid,
-								%syscall,
-								validation_code_hash = ?artifact.id.code_hash,
-								?artifact_path,
-								"A forbidden syscall was attempted! This is a violation of our seccomp security policy. Report an issue ASAP!"
-							);
-						}

 						return Outcome::WorkerIntfErr
 					},
-					Ok(response) => {
-						// Check if any syscall violations occurred during the job. For now this is
-						// only informative, as we are not enforcing the seccomp policy yet.
-						for syscall in security::check_seccomp_violations_for_worker(audit_log_file, pid).await {
-							gum::error!(
-								target: LOG_TARGET,
-								worker_pid = %pid,
-								%syscall,
-								validation_code_hash = ?artifact.id.code_hash,
-								?artifact_path,
-								"A forbidden syscall was attempted! This is a violation of our seccomp security policy. Report an issue ASAP!"
-							);
-						}
-
-						if let WorkerResponse::Ok{duration, ..} = response {
-							if duration > execution_timeout {
-								// The job didn't complete within the timeout.
-								gum::warn!(
-									target: LOG_TARGET,
-									worker_pid = %pid,
-									"execute job took {}ms cpu time, exceeded execution timeout {}ms.",
-									duration.as_millis(),
-									execution_timeout.as_millis(),
-								);
-
-								// Return a timeout error.
-								return Outcome::HardTimeout
-							}
-						}
-
-						response
-					},
 				}
 			},
 			_ = Delay::new(timeout).fuse() => {
@@ -238,7 +201,7 @@ pub async fn start_work(
 				idle_worker: IdleWorker { stream, pid, worker_dir },
 			},
 			WorkerResponse::JobTimedOut => Outcome::HardTimeout,
-			WorkerResponse::JobDied(err) => Outcome::JobDied { err },
+			WorkerResponse::JobDied { err, job_pid: _ } => Outcome::JobDied { err },
 			WorkerResponse::JobError(err) => Outcome::JobError { err },

 			WorkerResponse::InternalError(err) => Outcome::InternalError { err },
@@ -247,6 +210,56 @@ pub async fn start_work(
 	.await
 }

+/// Handles the case where we successfully received response bytes on the host from the child.
+///
+/// Here we know the artifact exists, but is still located in a temporary file which will be cleared
+/// by [`with_worker_dir_setup`].
+async fn handle_response(
+	response: WorkerResponse,
+	worker_pid: u32,
+	validation_code_hash: &ValidationCodeHash,
+	artifact_path: &Path,
+	execution_timeout: Duration,
+	audit_log_file: Option<security::AuditLogFile>,
+) -> WorkerResponse {
+	if let WorkerResponse::Ok { duration, .. } = response {
+		if duration > execution_timeout {
+			// The job didn't complete within the timeout.
+			gum::warn!(
+				target: LOG_TARGET,
+				worker_pid,
+				"execute job took {}ms cpu time, exceeded execution timeout {}ms.",
+				duration.as_millis(),
+				execution_timeout.as_millis(),
+			);
+
+			// Return a timeout error.
+			return WorkerResponse::JobTimedOut
+		}
+	}
+
+	if let WorkerResponse::JobDied { err: _, job_pid } = response {
+		// The job died. Check if it was due to a seccomp violation.
+		//
+		// NOTE: Log, but don't change the outcome. Not all validators may have
+		// auditing enabled, so we don't want attackers to abuse a non-deterministic
+		// outcome.
+		for syscall in security::check_seccomp_violations_for_job(audit_log_file, job_pid).await {
+			gum::error!(
+				target: LOG_TARGET,
+				%worker_pid,
+				%job_pid,
+				%syscall,
+				?validation_code_hash,
+				?artifact_path,
+				"A forbidden syscall was attempted! This is a violation of our seccomp security policy. Report an issue ASAP!"
+			);
+		}
+	}
+
+	response
+}
+
 /// Create a temporary file for an artifact in the worker cache, execute the given future/closure
 /// passing the file path in, and clean up the worker cache.
 ///
@@ -388,14 +388,14 @@ fn handle_mux(
 					Ok(())
 				},
 				// The worker might still be usable, but we kill it just in case.
-				Outcome::JobDied(err) => {
+				Outcome::JobDied { err, job_pid } => {
 					if attempt_retire(metrics, spawned, worker) {
 						reply(
 							from_pool,
 							FromPool::Concluded {
 								worker,
 								rip: true,
-								result: Err(PrepareError::JobDied(err)),
+								result: Err(PrepareError::JobDied { err, job_pid }),
 							},
 						)?;
 					}
@@ -104,7 +104,7 @@ pub enum Outcome {
 	/// The preparation job process died, due to OOM, a seccomp violation, or some other factor.
 	///
 	/// The worker might still be usable, but we kill it just in case.
-	JobDied(String),
+	JobDied { err: String, job_pid: i32 },
 }

 /// Given the idle token of a worker and parameters of work, communicates with the worker and
@@ -160,19 +160,7 @@ pub async fn start_work(

 			match result {
 				// Received bytes from worker within the time limit.
-				Ok(Ok(prepare_worker_result)) => {
-					// Check if any syscall violations occurred during the job. For now this is only
-					// informative, as we are not enforcing the seccomp policy yet.
-					for syscall in security::check_seccomp_violations_for_worker(audit_log_file, pid).await {
-						gum::error!(
-							target: LOG_TARGET,
-							worker_pid = %pid,
-							%syscall,
-							?pvf,
-							"A forbidden syscall was attempted! This is a violation of our seccomp security policy. Report an issue ASAP!"
-						);
-					}
-
+				Ok(Ok(prepare_worker_result)) =>
 					handle_response(
 						metrics,
 						IdleWorker { stream, pid, worker_dir },
@@ -182,9 +170,9 @@ pub async fn start_work(
 						&pvf,
 						&cache_path,
 						preparation_timeout,
+						audit_log_file,
 					)
-					.await
-				},
+					.await,
 				Ok(Err(err)) => {
 					// Communication error within the time limit.
 					gum::warn!(
@@ -221,15 +209,36 @@ async fn handle_response(
 	worker_pid: u32,
 	tmp_file: PathBuf,
 	pvf: &PvfPrepData,
-	cache_path: &PathBuf,
+	cache_path: &Path,
 	preparation_timeout: Duration,
+	audit_log_file: Option<security::AuditLogFile>,
 ) -> Outcome {
 	let PrepareWorkerSuccess { checksum, stats: PrepareStats { cpu_time_elapsed, memory_stats } } =
 		match result.clone() {
 			Ok(result) => result,
 			// Timed out on the child. This should already be logged by the child.
 			Err(PrepareError::TimedOut) => return Outcome::TimedOut,
-			Err(PrepareError::JobDied(err)) => return Outcome::JobDied(err),
+			Err(PrepareError::JobDied { err, job_pid }) => {
+				// The job died. Check if it was due to a seccomp violation.
+				//
+				// NOTE: Log, but don't change the outcome. Not all validators may have
+				// auditing enabled, so we don't want attackers to abuse a non-deterministic
+				// outcome.
+				for syscall in
+					security::check_seccomp_violations_for_job(audit_log_file, job_pid).await
+				{
+					gum::error!(
+						target: LOG_TARGET,
+						%worker_pid,
+						%job_pid,
+						%syscall,
+						?pvf,
+						"A forbidden syscall was attempted! This is a violation of our seccomp security policy. Report an issue ASAP!"
+					);
+				}
+
+				return Outcome::JobDied { err, job_pid }
+			},
 			Err(PrepareError::OutOfMemory) => return Outcome::OutOfMemory,
 			Err(err) => return Outcome::Concluded { worker, result: Err(err) },
 		};
@@ -178,9 +178,15 @@ async fn check_can_unshare_user_namespace_and_change_root(
 					let stderr = std::str::from_utf8(&output.stderr)
 						.expect("child process writes a UTF-8 string to stderr; qed")
 						.trim();
-					Err(SecureModeError::CannotUnshareUserNamespaceAndChangeRoot(
-						format!("not available: {}", stderr)
-					))
+					if stderr.is_empty() {
+						Err(SecureModeError::CannotUnshareUserNamespaceAndChangeRoot(
+							"not available".into()
+						))
+					} else {
+						Err(SecureModeError::CannotUnshareUserNamespaceAndChangeRoot(
+							format!("not available: {}", stderr)
+						))
+					}
 				},
 				Err(err) =>
 					Err(SecureModeError::CannotUnshareUserNamespaceAndChangeRoot(
@@ -208,16 +214,25 @@ async fn check_landlock(
 		if #[cfg(target_os = "linux")] {
 			match tokio::process::Command::new(prepare_worker_program_path)
 				.arg("--check-can-enable-landlock")
-				.status()
+				.output()
 				.await
 			{
-				Ok(status) if status.success() => Ok(()),
-				Ok(_status) => {
+				Ok(output) if output.status.success() => Ok(()),
+				Ok(output) => {
 					let abi =
 						polkadot_node_core_pvf_common::worker::security::landlock::LANDLOCK_ABI as u8;
-					Err(SecureModeError::CannotEnableLandlock(
-						format!("landlock ABI {} not available", abi)
-					))
+					let stderr = std::str::from_utf8(&output.stderr)
+						.expect("child process writes a UTF-8 string to stderr; qed")
+						.trim();
+					if stderr.is_empty() {
+						Err(SecureModeError::CannotEnableLandlock(
+							format!("landlock ABI {} not available", abi)
+						))
+					} else {
+						Err(SecureModeError::CannotEnableLandlock(
+							format!("not available: {}", stderr)
+						))
+					}
 				},
 				Err(err) =>
 					Err(SecureModeError::CannotEnableLandlock(
@@ -238,7 +253,7 @@ async fn check_landlock(
 /// to running the check in a worker, we try it... in a worker. The expected return status is 0 on
 /// success and -1 on failure.
 async fn check_seccomp(
-	#[cfg_attr(not(target_os = "linux"), allow(unused_variables))]
+	#[cfg_attr(not(all(target_os = "linux", target_arch = "x86_64")), allow(unused_variables))]
 	prepare_worker_program_path: &Path,
 ) -> SecureModeResult {
 	cfg_if::cfg_if! {
@@ -247,14 +262,24 @@ async fn check_seccomp(
 				if #[cfg(target_arch = "x86_64")] {
 					match tokio::process::Command::new(prepare_worker_program_path)
 						.arg("--check-can-enable-seccomp")
-						.status()
+						.output()
 						.await
 					{
-						Ok(status) if status.success() => Ok(()),
-						Ok(_status) =>
-							Err(SecureModeError::CannotEnableSeccomp(
-								"not available".into()
-							)),
+						Ok(output) if output.status.success() => Ok(()),
+						Ok(output) => {
+							let stderr = std::str::from_utf8(&output.stderr)
+								.expect("child process writes a UTF-8 string to stderr; qed")
+								.trim();
+							if stderr.is_empty() {
+								Err(SecureModeError::CannotEnableSeccomp(
+									"not available".into()
+								))
+							} else {
+								Err(SecureModeError::CannotEnableSeccomp(
+									format!("not available: {}", stderr)
+								))
+							}
+						},
 						Err(err) =>
 							Err(SecureModeError::CannotEnableSeccomp(
 								format!("could not start child process: {}", err)
@@ -320,25 +345,25 @@ impl AuditLogFile {
 	}
 }

-/// Check if a seccomp violation occurred for the given worker. As the syslog may be in a different
-/// location, or seccomp auditing may be disabled, this function provides a best-effort attempt
-/// only.
+/// Check if a seccomp violation occurred for the given job process. As the syslog may be in a
+/// different location, or seccomp auditing may be disabled, this function provides a best-effort
+/// attempt only.
 ///
 /// The `audit_log_file` must have been obtained before the job started. It only allows reading
 /// entries that were written since it was obtained, so that we do not consider events from previous
 /// processes with the same pid. This can still be racy, but it's unlikely and fine for a
 /// best-effort attempt.
-pub async fn check_seccomp_violations_for_worker(
+pub async fn check_seccomp_violations_for_job(
 	audit_log_file: Option<AuditLogFile>,
-	worker_pid: u32,
+	job_pid: i32,
 ) -> Vec<u32> {
-	let audit_event_pid_field = format!("pid={worker_pid}");
+	let audit_event_pid_field = format!("pid={job_pid}");

 	let audit_log_file = match audit_log_file {
 		Some(file) => {
-			gum::debug!(
+			gum::trace!(
 				target: LOG_TARGET,
-				%worker_pid,
+				%job_pid,
 				audit_log_path = ?file.path,
 				"checking audit log for seccomp violations",
 			);
@@ -347,7 +372,7 @@ pub async fn check_seccomp_violations_for_worker(
 		None => {
 			gum::warn!(
 				target: LOG_TARGET,
-				%worker_pid,
+				%job_pid,
 				"could not open either {AUDIT_LOG_PATH} or {SYSLOG_PATH} for reading audit logs"
 			);
 			return vec![]
@@ -248,7 +248,7 @@ rusty_fork_test! {
 			// Note that we get a more specific error if the job died than if the whole worker died.
 			assert_matches!(
 				result,
-				Err(PrepareError::JobDied(err)) if err == "received signal: SIGKILL"
+				Err(PrepareError::JobDied{ err, job_pid: _ }) if err == "received signal: SIGKILL"
 			);
 		})
 	}