PVF: more filesystem sandboxing (#1373)

2026-07-29 23:05:43 +00:00 · 2023-09-28 18:24:29 +02:00
parent de71fecc4e
commit c1eb342b14
24 changed files with 1528 additions and 612 deletions
@@ -30,6 +30,7 @@ use futures::{
 	stream::{FuturesUnordered, StreamExt as _},
 	Future, FutureExt,
 };
+use polkadot_node_core_pvf_common::SecurityStatus;
 use polkadot_primitives::{ExecutorParams, ExecutorParamsHash};
 use slotmap::HopSlotMap;
 use std::{
@@ -139,8 +140,10 @@ struct Queue {

 	// Some variables related to the current session.
 	program_path: PathBuf,
+	cache_path: PathBuf,
 	spawn_timeout: Duration,
 	node_version: Option<String>,
+	security_status: SecurityStatus,

 	/// The queue of jobs that are waiting for a worker to pick up.
 	queue: VecDeque<ExecuteJob>,
@@ -152,16 +155,20 @@ impl Queue {
 	fn new(
 		metrics: Metrics,
 		program_path: PathBuf,
+		cache_path: PathBuf,
 		worker_capacity: usize,
 		spawn_timeout: Duration,
 		node_version: Option<String>,
+		security_status: SecurityStatus,
 		to_queue_rx: mpsc::Receiver<ToQueue>,
 	) -> Self {
 		Self {
 			metrics,
 			program_path,
+			cache_path,
 			spawn_timeout,
 			node_version,
+			security_status,
 			to_queue_rx,
 			queue: VecDeque::new(),
 			mux: Mux::new(),
@@ -405,9 +412,11 @@ fn spawn_extra_worker(queue: &mut Queue, job: ExecuteJob) {
 	queue.mux.push(
 		spawn_worker_task(
 			queue.program_path.clone(),
+			queue.cache_path.clone(),
 			job,
 			queue.spawn_timeout,
 			queue.node_version.clone(),
+			queue.security_status.clone(),
 		)
 		.boxed(),
 	);
@@ -423,18 +432,22 @@ fn spawn_extra_worker(queue: &mut Queue, job: ExecuteJob) {
 /// execute other jobs with a compatible execution environment.
 async fn spawn_worker_task(
 	program_path: PathBuf,
+	cache_path: PathBuf,
 	job: ExecuteJob,
 	spawn_timeout: Duration,
 	node_version: Option<String>,
+	security_status: SecurityStatus,
 ) -> QueueEvent {
 	use futures_timer::Delay;

 	loop {
 		match super::worker_intf::spawn(
 			&program_path,
+			&cache_path,
 			job.executor_params.clone(),
 			spawn_timeout,
 			node_version.as_deref(),
+			security_status.clone(),
 		)
 		.await
 		{
@@ -496,17 +509,21 @@ fn assign(queue: &mut Queue, worker: Worker, job: ExecuteJob) {
 pub fn start(
 	metrics: Metrics,
 	program_path: PathBuf,
+	cache_path: PathBuf,
 	worker_capacity: usize,
 	spawn_timeout: Duration,
 	node_version: Option<String>,
+	security_status: SecurityStatus,
 ) -> (mpsc::Sender<ToQueue>, impl Future<Output = ()>) {
 	let (to_queue_tx, to_queue_rx) = mpsc::channel(20);
 	let run = Queue::new(
 		metrics,
 		program_path,
+		cache_path,
 		worker_capacity,
 		spawn_timeout,
 		node_version,
+		security_status,
 		to_queue_rx,
 	)
 	.run();
@@ -19,8 +19,8 @@
 use crate::{
 	artifacts::ArtifactPathId,
 	worker_intf::{
-		path_to_bytes, spawn_with_program_path, IdleWorker, SpawnErr, WorkerHandle,
-		JOB_TIMEOUT_WALL_CLOCK_FACTOR,
+		clear_worker_dir_path, framed_recv, framed_send, spawn_with_program_path, IdleWorker,
+		SpawnErr, WorkerDir, WorkerHandle, JOB_TIMEOUT_WALL_CLOCK_FACTOR,
 	},
 	LOG_TARGET,
 };
@@ -30,7 +30,7 @@ use parity_scale_codec::{Decode, Encode};
 use polkadot_node_core_pvf_common::{
 	error::InternalValidationError,
 	execute::{Handshake, Response},
-	framed_recv, framed_send,
+	worker_dir, SecurityStatus,
 };
 use polkadot_parachain_primitives::primitives::ValidationResult;
 use polkadot_primitives::ExecutorParams;
@@ -38,21 +38,30 @@ use std::{path::Path, time::Duration};
 use tokio::{io, net::UnixStream};

 /// Spawns a new worker with the given program path that acts as the worker and the spawn timeout.
-/// Sends a handshake message to the worker as soon as it is spawned.
 ///
-/// The program should be able to handle `<program-path> execute-worker <socket-path>` invocation.
+/// Sends a handshake message to the worker as soon as it is spawned.
 pub async fn spawn(
 	program_path: &Path,
+	cache_path: &Path,
 	executor_params: ExecutorParams,
 	spawn_timeout: Duration,
 	node_version: Option<&str>,
+	security_status: SecurityStatus,
 ) -> Result<(IdleWorker, WorkerHandle), SpawnErr> {
 	let mut extra_args = vec!["execute-worker"];
 	if let Some(node_version) = node_version {
 		extra_args.extend_from_slice(&["--node-impl-version", node_version]);
 	}
-	let (mut idle_worker, worker_handle) =
-		spawn_with_program_path("execute", program_path, &extra_args, spawn_timeout).await?;
+
+	let (mut idle_worker, worker_handle) = spawn_with_program_path(
+		"execute",
+		program_path,
+		cache_path,
+		&extra_args,
+		spawn_timeout,
+		security_status,
+	)
+	.await?;
 	send_handshake(&mut idle_worker.stream, Handshake { executor_params })
 		.await
 		.map_err(|error| {
@@ -104,89 +113,151 @@ pub async fn start_work(
 	execution_timeout: Duration,
 	validation_params: Vec<u8>,
 ) -> Outcome {
-	let IdleWorker { mut stream, pid } = worker;
+	let IdleWorker { mut stream, pid, worker_dir } = worker;

 	gum::debug!(
 		target: LOG_TARGET,
 		worker_pid = %pid,
+		?worker_dir,
 		validation_code_hash = ?artifact.id.code_hash,
 		"starting execute for {}",
 		artifact.path.display(),
 	);

-	if let Err(error) =
-		send_request(&mut stream, &artifact.path, &validation_params, execution_timeout).await
-	{
-		gum::warn!(
-			target: LOG_TARGET,
-			worker_pid = %pid,
-			validation_code_hash = ?artifact.id.code_hash,
-			?error,
-			"failed to send an execute request",
-		);
-		return Outcome::IoErr
-	}
-
-	// We use a generous timeout here. This is in addition to the one in the child process, in
-	// case the child stalls. We have a wall clock timeout here in the host, but a CPU timeout
-	// in the child. We want to use CPU time because it varies less than wall clock time under
-	// load, but the CPU resources of the child can only be measured from the parent after the
-	// child process terminates.
-	let timeout = execution_timeout * JOB_TIMEOUT_WALL_CLOCK_FACTOR;
-	let response = futures::select! {
-		response = recv_response(&mut stream).fuse() => {
-			match response {
-				Err(error) => {
-					gum::warn!(
-						target: LOG_TARGET,
-						worker_pid = %pid,
-						validation_code_hash = ?artifact.id.code_hash,
-						?error,
-						"failed to recv an execute response",
-					);
-					return Outcome::IoErr
-				},
-				Ok(response) => {
-					if let Response::Ok{duration, ..} = response {
-						if duration > execution_timeout {
-							// The job didn't complete within the timeout.
-							gum::warn!(
-								target: LOG_TARGET,
-								worker_pid = %pid,
-								"execute job took {}ms cpu time, exceeded execution timeout {}ms.",
-								duration.as_millis(),
-								execution_timeout.as_millis(),
-							);
-
-							// Return a timeout error.
-							return Outcome::HardTimeout;
-						}
-					}
-
-					response
-				},
-			}
-		},
-		_ = Delay::new(timeout).fuse() => {
+	with_worker_dir_setup(worker_dir, pid, &artifact.path, |worker_dir| async move {
+		if let Err(error) = send_request(&mut stream, &validation_params, execution_timeout).await {
 			gum::warn!(
 				target: LOG_TARGET,
 				worker_pid = %pid,
 				validation_code_hash = ?artifact.id.code_hash,
-				"execution worker exceeded lenient timeout for execution, child worker likely stalled",
+				?error,
+				"failed to send an execute request",
 			);
-			Response::TimedOut
-		},
-	};
+			return Outcome::IoErr
+		}

-	match response {
-		Response::Ok { result_descriptor, duration } =>
-			Outcome::Ok { result_descriptor, duration, idle_worker: IdleWorker { stream, pid } },
-		Response::InvalidCandidate(err) =>
-			Outcome::InvalidCandidate { err, idle_worker: IdleWorker { stream, pid } },
-		Response::TimedOut => Outcome::HardTimeout,
-		Response::Panic(err) => Outcome::Panic { err },
-		Response::InternalError(err) => Outcome::InternalError { err },
+		// We use a generous timeout here. This is in addition to the one in the child process, in
+		// case the child stalls. We have a wall clock timeout here in the host, but a CPU timeout
+		// in the child. We want to use CPU time because it varies less than wall clock time under
+		// load, but the CPU resources of the child can only be measured from the parent after the
+		// child process terminates.
+		let timeout = execution_timeout * JOB_TIMEOUT_WALL_CLOCK_FACTOR;
+		let response = futures::select! {
+			response = recv_response(&mut stream).fuse() => {
+				match response {
+					Err(error) => {
+						gum::warn!(
+							target: LOG_TARGET,
+							worker_pid = %pid,
+							validation_code_hash = ?artifact.id.code_hash,
+							?error,
+							"failed to recv an execute response",
+						);
+						return Outcome::IoErr
+					},
+					Ok(response) => {
+						if let Response::Ok{duration, ..} = response {
+							if duration > execution_timeout {
+								// The job didn't complete within the timeout.
+								gum::warn!(
+									target: LOG_TARGET,
+									worker_pid = %pid,
+									"execute job took {}ms cpu time, exceeded execution timeout {}ms.",
+									duration.as_millis(),
+									execution_timeout.as_millis(),
+								);
+
+								// Return a timeout error.
+								return Outcome::HardTimeout;
+							}
+						}
+
+						response
+					},
+				}
+			},
+			_ = Delay::new(timeout).fuse() => {
+				gum::warn!(
+					target: LOG_TARGET,
+					worker_pid = %pid,
+					validation_code_hash = ?artifact.id.code_hash,
+					"execution worker exceeded lenient timeout for execution, child worker likely stalled",
+				);
+				Response::TimedOut
+			},
+		};
+
+		match response {
+			Response::Ok { result_descriptor, duration } => Outcome::Ok {
+				result_descriptor,
+				duration,
+				idle_worker: IdleWorker { stream, pid, worker_dir },
+			},
+			Response::InvalidCandidate(err) => Outcome::InvalidCandidate {
+				err,
+				idle_worker: IdleWorker { stream, pid, worker_dir },
+			},
+			Response::TimedOut => Outcome::HardTimeout,
+			Response::Panic(err) => Outcome::Panic { err },
+			Response::InternalError(err) => Outcome::InternalError { err },
+		}
+	})
+	.await
+}
+
+/// Hard-link the artifact into the worker dir, run the given closure (which is handed the
+/// `WorkerDir`), and then clear the worker dir. Returns the closure's `Outcome` on success.
+///
+/// Failure to create the link or to clear the worker dir yields `Outcome::InternalError` - leaving
+/// any files here could be a security issue, and we should shut down the worker. Should be rare.
+async fn with_worker_dir_setup<F, Fut>(
+	worker_dir: WorkerDir,
+	pid: u32,
+	artifact_path: &Path,
+	f: F,
+) -> Outcome
+where
+	Fut: futures::Future<Output = Outcome>,
+	F: FnOnce(WorkerDir) -> Fut,
+{
+	// Cheaply create a hard link to the artifact. The artifact is always at a known location in the
+	// worker cache, and the child can't access any other artifacts or gain any information from the
+	// original filename.
+	let link_path = worker_dir::execute_artifact(&worker_dir.path);
+	if let Err(err) = tokio::fs::hard_link(artifact_path, link_path).await {
+		gum::warn!(
+			target: LOG_TARGET,
+			worker_pid = %pid,
+			?worker_dir,
+			"failed to hard-link the artifact into the worker dir before the job: {:?}",
+			err,
+		);
+		return Outcome::InternalError {
+			err: InternalValidationError::CouldNotCreateLink(format!("{:?}", err)),
+		}
+	}
+
+	let worker_dir_path = worker_dir.path.clone();
+	let outcome = f(worker_dir).await;
+
+	// Try to clear the worker dir, regardless of the outcome the closure produced.
+	if let Err(err) = clear_worker_dir_path(&worker_dir_path) {
+		gum::warn!(
+			target: LOG_TARGET,
+			worker_pid = %pid,
+			?worker_dir_path,
+			"failed to clear worker cache after the job: {:?}",
+			err,
+		);
+		return Outcome::InternalError {
+			err: InternalValidationError::CouldNotClearWorkerDir {
+				err: format!("{:?}", err),
+				path: worker_dir_path.to_str().map(String::from),
+			},
+		}
+	}
+
+	outcome
+}

 async fn send_handshake(stream: &mut UnixStream, handshake: Handshake) -> io::Result<()> {
@@ -195,11 +266,9 @@ async fn send_handshake(stream: &mut UnixStream, handshake: Handshake) -> io::Re

 async fn send_request(
 	stream: &mut UnixStream,
-	artifact_path: &Path,
 	validation_params: &[u8],
 	execution_timeout: Duration,
 ) -> io::Result<()> {
-	framed_send(stream, path_to_bytes(artifact_path)).await?;
 	framed_send(stream, validation_params).await?;
 	framed_send(stream, &execution_timeout.encode()).await
 }