change prepare worker to use fork instead of threads (#1685)

Co-authored-by: Marcin S <marcin@realemail.net>
This commit is contained in:
jserrat
2023-11-14 14:50:18 -03:00
committed by GitHub
parent 3a87390b30
commit 54f84285bf
24 changed files with 1468 additions and 534 deletions
+26 -109
View File
@@ -19,23 +19,23 @@
use assert_matches::assert_matches;
use parity_scale_codec::Encode as _;
use polkadot_node_core_pvf::{
start, testing::get_and_check_worker_paths, Config, InvalidCandidate, Metrics, PrepareError,
start, testing::build_workers_and_get_paths, Config, InvalidCandidate, Metrics, PrepareError,
PrepareJobKind, PrepareStats, PvfPrepData, ValidationError, ValidationHost,
JOB_TIMEOUT_WALL_CLOCK_FACTOR,
};
use polkadot_parachain_primitives::primitives::{BlockData, ValidationParams, ValidationResult};
use polkadot_primitives::{ExecutorParam, ExecutorParams};
#[cfg(target_os = "linux")]
use rusty_fork::rusty_fork_test;
use std::time::Duration;
use tokio::sync::Mutex;
mod adder;
#[cfg(target_os = "linux")]
mod process;
mod worker_common;
const TEST_EXECUTION_TIMEOUT: Duration = Duration::from_secs(3);
const TEST_PREPARATION_TIMEOUT: Duration = Duration::from_secs(3);
const TEST_EXECUTION_TIMEOUT: Duration = Duration::from_secs(6);
const TEST_PREPARATION_TIMEOUT: Duration = Duration::from_secs(6);
struct TestHost {
cache_dir: tempfile::TempDir,
@@ -51,7 +51,7 @@ impl TestHost {
where
F: FnOnce(&mut Config),
{
let (prepare_worker_path, execute_worker_path) = get_and_check_worker_paths();
let (prepare_worker_path, execute_worker_path) = build_workers_and_get_paths(false);
let cache_dir = tempfile::tempdir().unwrap();
let mut config = Config::new(
@@ -126,7 +126,26 @@ impl TestHost {
}
#[tokio::test]
async fn terminates_on_timeout() {
async fn prepare_job_terminates_on_timeout() {
let host = TestHost::new().await;
let start = std::time::Instant::now();
let result = host
.precheck_pvf(rococo_runtime::WASM_BINARY.unwrap(), Default::default())
.await;
match result {
Err(PrepareError::TimedOut) => {},
r => panic!("{:?}", r),
}
let duration = std::time::Instant::now().duration_since(start);
assert!(duration >= TEST_PREPARATION_TIMEOUT);
assert!(duration < TEST_PREPARATION_TIMEOUT * JOB_TIMEOUT_WALL_CLOCK_FACTOR);
}
#[tokio::test]
async fn execute_job_terminates_on_timeout() {
let host = TestHost::new().await;
let start = std::time::Instant::now();
@@ -153,108 +172,6 @@ async fn terminates_on_timeout() {
assert!(duration < TEST_EXECUTION_TIMEOUT * JOB_TIMEOUT_WALL_CLOCK_FACTOR);
}
#[cfg(target_os = "linux")]
fn kill_by_sid_and_name(sid: i32, exe_name: &'static str) {
use procfs::process;
let all_processes: Vec<process::Process> = process::all_processes()
.expect("Can't read /proc")
.filter_map(|p| match p {
Ok(p) => Some(p), // happy path
Err(e) => match e {
// process vanished during iteration, ignore it
procfs::ProcError::NotFound(_) => None,
x => {
panic!("some unknown error: {}", x);
},
},
})
.collect();
for process in all_processes {
if process.stat().unwrap().session == sid &&
process.exe().unwrap().to_str().unwrap().contains(exe_name)
{
assert_eq!(unsafe { libc::kill(process.pid(), 9) }, 0);
}
}
}
// Run these tests in their own processes with rusty-fork. They work by each creating a new session,
// then killing the worker process that matches the session ID and expected worker name.
#[cfg(target_os = "linux")]
rusty_fork_test! {
// What happens when the prepare worker dies in the middle of a job?
#[test]
fn prepare_worker_killed_during_job() {
const PROCESS_NAME: &'static str = "polkadot-prepare-worker";
let rt = tokio::runtime::Runtime::new().unwrap();
rt.block_on(async {
let host = TestHost::new().await;
// Create a new session and get the session ID.
let sid = unsafe { libc::setsid() };
assert!(sid > 0);
let (result, _) = futures::join!(
// Choose a job that would normally take the entire timeout.
host.precheck_pvf(rococo_runtime::WASM_BINARY.unwrap(), Default::default()),
// Run a future that kills the job in the middle of the timeout.
async {
tokio::time::sleep(TEST_PREPARATION_TIMEOUT / 2).await;
kill_by_sid_and_name(sid, PROCESS_NAME);
}
);
assert_matches!(result, Err(PrepareError::IoErr(_)));
})
}
// What happens when the execute worker dies in the middle of a job?
#[test]
fn execute_worker_killed_during_job() {
const PROCESS_NAME: &'static str = "polkadot-execute-worker";
let rt = tokio::runtime::Runtime::new().unwrap();
rt.block_on(async {
let host = TestHost::new().await;
// Create a new session and get the session ID.
let sid = unsafe { libc::setsid() };
assert!(sid > 0);
// Prepare the artifact ahead of time.
let binary = halt::wasm_binary_unwrap();
host.precheck_pvf(binary, Default::default()).await.unwrap();
let (result, _) = futures::join!(
// Choose an job that would normally take the entire timeout.
host.validate_candidate(
binary,
ValidationParams {
block_data: BlockData(Vec::new()),
parent_head: Default::default(),
relay_parent_number: 1,
relay_parent_storage_root: Default::default(),
},
Default::default(),
),
// Run a future that kills the job in the middle of the timeout.
async {
tokio::time::sleep(TEST_EXECUTION_TIMEOUT / 2).await;
kill_by_sid_and_name(sid, PROCESS_NAME);
}
);
assert_matches!(
result,
Err(ValidationError::InvalidCandidate(InvalidCandidate::AmbiguousWorkerDeath))
);
})
}
}
#[cfg(feature = "ci-only-tests")]
#[tokio::test]
async fn ensure_parallel_execution() {
+383
View File
@@ -0,0 +1,383 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Polkadot.
// Polkadot is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Polkadot is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Polkadot. If not, see <http://www.gnu.org/licenses/>.
//! Test unexpected behaviors of the spawned processes. We test both worker processes (directly
//! spawned by the host) and job processes (spawned by the workers to securely perform PVF jobs).
use super::TestHost;
use assert_matches::assert_matches;
use polkadot_node_core_pvf::{InvalidCandidate, PrepareError, ValidationError};
use polkadot_parachain_primitives::primitives::{BlockData, ValidationParams};
use procfs::process;
use rusty_fork::rusty_fork_test;
use std::time::Duration;
// Executable names the host spawns; used to find the worker processes under /proc.
const PREPARE_PROCESS_NAME: &'static str = "polkadot-prepare-worker";
const EXECUTE_PROCESS_NAME: &'static str = "polkadot-execute-worker";

// Standard Linux signal numbers (these tests are Linux-only).
const SIGNAL_KILL: i32 = 9; // SIGKILL: cannot be caught or ignored; the process dies immediately.
const SIGNAL_STOP: i32 = 19; // SIGSTOP: pauses the process until SIGCONT; cannot be caught.
/// Locates the unique process matching `sid`/`exe_name`/`is_direct_child` and delivers
/// `signal` to it.
///
/// Panics if no unique matching process exists or if `kill(2)` reports failure.
fn send_signal_by_sid_and_name(
	sid: i32,
	exe_name: &'static str,
	is_direct_child: bool,
	signal: i32,
) {
	let target = find_process_by_sid_and_name(sid, exe_name, is_direct_child);
	// SAFETY: `kill` takes no pointers; an invalid pid can only produce an error return,
	// which the assertion below catches.
	let ret = unsafe { libc::kill(target.pid(), signal) };
	assert_eq!(ret, 0);
}
/// Returns the thread count (as reported by the process's /proc stat entry) of the unique
/// process matching `sid`/`exe_name`/`is_direct_child`.
fn get_num_threads_by_sid_and_name(sid: i32, exe_name: &'static str, is_direct_child: bool) -> i64 {
	find_process_by_sid_and_name(sid, exe_name, is_direct_child)
		.stat()
		.unwrap()
		.num_threads
}
/// Finds the single process whose session ID equals `sid`, whose executable path contains
/// `exe_name`, and whose parent relationship matches `is_direct_child`.
///
/// The workers are direct children of the current (test) process, while the worker job
/// processes are not — they are children of the workers.
///
/// Panics if zero or more than one process matches.
fn find_process_by_sid_and_name(
	sid: i32,
	exe_name: &'static str,
	is_direct_child: bool,
) -> process::Process {
	let mut candidates = process::all_processes()
		.expect("Can't read /proc")
		.filter_map(|p| match p {
			Ok(p) => Some(p), // happy path
			// A process can vanish while we iterate /proc; that is expected, skip it.
			Err(procfs::ProcError::NotFound(_)) => None,
			Err(x) => panic!("some unknown error: {}", x),
		})
		.filter(|process| {
			let stat = process.stat().unwrap();
			// `&&` short-circuits, so `exe()` is only queried for processes in our session.
			stat.session == sid &&
				process.exe().unwrap().to_str().unwrap().contains(exe_name) &&
				(stat.ppid as u32 == std::process::id()) == is_direct_child
		});

	let found = candidates.next().expect("Should have found the expected process");
	if candidates.next().is_some() {
		panic!("Found more than one process")
	}
	found
}
// Run these tests in their own processes with rusty-fork. They work by each creating a new session,
// then doing something with the child process that matches the session ID and expected process
// name.
rusty_fork_test! {
	// What happens when the prepare worker (not the job) times out?
	#[test]
	fn prepare_worker_timeout() {
		let rt = tokio::runtime::Runtime::new().unwrap();
		rt.block_on(async {
			let host = TestHost::new().await;

			// Create a new session and get the session ID.
			let sid = unsafe { libc::setsid() };
			assert!(sid > 0);

			let (result, _) = futures::join!(
				// Choose a job that would normally take the entire timeout.
				host.precheck_pvf(rococo_runtime::WASM_BINARY.unwrap(), Default::default()),
				// Send a stop signal to pause the worker.
				async {
					tokio::time::sleep(Duration::from_secs(1)).await;
					send_signal_by_sid_and_name(sid, PREPARE_PROCESS_NAME, true, SIGNAL_STOP);
				}
			);

			assert_matches!(result, Err(PrepareError::TimedOut));
		})
	}

	// What happens when the execute worker (not the job) times out?
	#[test]
	fn execute_worker_timeout() {
		let rt = tokio::runtime::Runtime::new().unwrap();
		rt.block_on(async {
			let host = TestHost::new().await;

			// Create a new session and get the session ID.
			let sid = unsafe { libc::setsid() };
			assert!(sid > 0);

			// Prepare the artifact ahead of time.
			let binary = halt::wasm_binary_unwrap();
			host.precheck_pvf(binary, Default::default()).await.unwrap();

			let (result, _) = futures::join!(
				// Choose a job that would normally take the entire timeout.
				host.validate_candidate(
					binary,
					ValidationParams {
						block_data: BlockData(Vec::new()),
						parent_head: Default::default(),
						relay_parent_number: 1,
						relay_parent_storage_root: Default::default(),
					},
					Default::default(),
				),
				// Send a stop signal to pause the worker.
				async {
					tokio::time::sleep(Duration::from_secs(1)).await;
					send_signal_by_sid_and_name(sid, EXECUTE_PROCESS_NAME, true, SIGNAL_STOP);
				}
			);

			assert_matches!(
				result,
				Err(ValidationError::InvalidCandidate(InvalidCandidate::HardTimeout))
			);
		})
	}

	// What happens when the prepare worker dies in the middle of a job?
	#[test]
	fn prepare_worker_killed_during_job() {
		let rt = tokio::runtime::Runtime::new().unwrap();
		rt.block_on(async {
			let host = TestHost::new().await;

			// Create a new session and get the session ID.
			let sid = unsafe { libc::setsid() };
			assert!(sid > 0);

			let (result, _) = futures::join!(
				// Choose a job that would normally take the entire timeout.
				host.precheck_pvf(rococo_runtime::WASM_BINARY.unwrap(), Default::default()),
				// Run a future that kills the job while it's running.
				async {
					tokio::time::sleep(Duration::from_secs(1)).await;
					send_signal_by_sid_and_name(sid, PREPARE_PROCESS_NAME, true, SIGNAL_KILL);
				}
			);

			assert_matches!(result, Err(PrepareError::IoErr(_)));
		})
	}

	// What happens when the execute worker dies in the middle of a job?
	#[test]
	fn execute_worker_killed_during_job() {
		let rt = tokio::runtime::Runtime::new().unwrap();
		rt.block_on(async {
			let host = TestHost::new().await;

			// Create a new session and get the session ID.
			let sid = unsafe { libc::setsid() };
			assert!(sid > 0);

			// Prepare the artifact ahead of time.
			let binary = halt::wasm_binary_unwrap();
			host.precheck_pvf(binary, Default::default()).await.unwrap();

			let (result, _) = futures::join!(
				// Choose a job that would normally take the entire timeout.
				host.validate_candidate(
					binary,
					ValidationParams {
						block_data: BlockData(Vec::new()),
						parent_head: Default::default(),
						relay_parent_number: 1,
						relay_parent_storage_root: Default::default(),
					},
					Default::default(),
				),
				// Run a future that kills the job while it's running.
				async {
					tokio::time::sleep(Duration::from_secs(1)).await;
					send_signal_by_sid_and_name(sid, EXECUTE_PROCESS_NAME, true, SIGNAL_KILL);
				}
			);

			assert_matches!(
				result,
				Err(ValidationError::InvalidCandidate(InvalidCandidate::AmbiguousWorkerDeath))
			);
		})
	}

	// What happens when the forked prepare job dies in the middle of its job?
	#[test]
	fn forked_prepare_job_killed_during_job() {
		let rt = tokio::runtime::Runtime::new().unwrap();
		rt.block_on(async {
			let host = TestHost::new().await;

			// Create a new session and get the session ID.
			let sid = unsafe { libc::setsid() };
			assert!(sid > 0);

			let (result, _) = futures::join!(
				// Choose a job that would normally take the entire timeout.
				host.precheck_pvf(rococo_runtime::WASM_BINARY.unwrap(), Default::default()),
				// Run a future that kills the job while it's running.
				async {
					tokio::time::sleep(Duration::from_secs(1)).await;
					// `is_direct_child: false` targets the forked job process, not the worker.
					send_signal_by_sid_and_name(sid, PREPARE_PROCESS_NAME, false, SIGNAL_KILL);
				}
			);

			// Note that we get a more specific error if the job died than if the whole worker died.
			assert_matches!(
				result,
				Err(PrepareError::JobDied(err)) if err == "received signal: SIGKILL"
			);
		})
	}

	// What happens when the forked execute job dies in the middle of its job?
	#[test]
	fn forked_execute_job_killed_during_job() {
		let rt = tokio::runtime::Runtime::new().unwrap();
		rt.block_on(async {
			let host = TestHost::new().await;

			// Create a new session and get the session ID.
			let sid = unsafe { libc::setsid() };
			assert!(sid > 0);

			// Prepare the artifact ahead of time.
			let binary = halt::wasm_binary_unwrap();
			host.precheck_pvf(binary, Default::default()).await.unwrap();

			let (result, _) = futures::join!(
				// Choose a job that would normally take the entire timeout.
				host.validate_candidate(
					binary,
					ValidationParams {
						block_data: BlockData(Vec::new()),
						parent_head: Default::default(),
						relay_parent_number: 1,
						relay_parent_storage_root: Default::default(),
					},
					Default::default(),
				),
				// Run a future that kills the job while it's running.
				async {
					tokio::time::sleep(Duration::from_secs(1)).await;
					// `is_direct_child: false` targets the forked job process, not the worker.
					send_signal_by_sid_and_name(sid, EXECUTE_PROCESS_NAME, false, SIGNAL_KILL);
				}
			);

			// Note that we get a more specific error if the job died than if the whole worker died.
			assert_matches!(
				result,
				Err(ValidationError::InvalidCandidate(InvalidCandidate::AmbiguousJobDeath(err)))
				if err == "received signal: SIGKILL"
			);
		})
	}

	// Ensure that the spawned prepare worker is single-threaded.
	//
	// See `run_worker` for why we need this invariant.
	#[test]
	fn ensure_prepare_processes_have_correct_num_threads() {
		let rt = tokio::runtime::Runtime::new().unwrap();
		rt.block_on(async {
			let host = TestHost::new().await;

			// Create a new session and get the session ID.
			let sid = unsafe { libc::setsid() };
			assert!(sid > 0);

			let _ = futures::join!(
				// Choose a job that would normally take the entire timeout.
				host.precheck_pvf(rococo_runtime::WASM_BINARY.unwrap(), Default::default()),
				// Run a future that checks the thread counts while the job is running, then ends
				// the test by killing the worker.
				async {
					tokio::time::sleep(Duration::from_secs(1)).await;
					// The worker itself must be single-threaded (see invariant above).
					assert_eq!(
						get_num_threads_by_sid_and_name(sid, PREPARE_PROCESS_NAME, true),
						1
					);
					// Child job should have four threads: main thread, the thread doing the
					// prepare work, CPU time monitor, and memory tracking.
					assert_eq!(
						get_num_threads_by_sid_and_name(sid, PREPARE_PROCESS_NAME, false),
						4
					);

					// End the test.
					send_signal_by_sid_and_name(sid, PREPARE_PROCESS_NAME, true, SIGNAL_KILL);
				}
			);
		})
	}

	// Ensure that the spawned execute worker is single-threaded.
	//
	// See `run_worker` for why we need this invariant.
	#[test]
	fn ensure_execute_processes_have_correct_num_threads() {
		let rt = tokio::runtime::Runtime::new().unwrap();
		rt.block_on(async {
			let host = TestHost::new().await;

			// Create a new session and get the session ID.
			let sid = unsafe { libc::setsid() };
			assert!(sid > 0);

			// Prepare the artifact ahead of time.
			let binary = halt::wasm_binary_unwrap();
			host.precheck_pvf(binary, Default::default()).await.unwrap();

			let _ = futures::join!(
				// Choose a job that would normally take the entire timeout.
				host.validate_candidate(
					binary,
					ValidationParams {
						block_data: BlockData(Vec::new()),
						parent_head: Default::default(),
						relay_parent_number: 1,
						relay_parent_storage_root: Default::default(),
					},
					Default::default(),
				),
				// Run a future that tests the thread count while the worker is running.
				async {
					tokio::time::sleep(Duration::from_secs(1)).await;
					// The worker itself must be single-threaded (see invariant above).
					assert_eq!(
						get_num_threads_by_sid_and_name(sid, EXECUTE_PROCESS_NAME, true),
						1
					);
					// Child job should have three threads: main thread, execute thread, and CPU
					// time monitor.
					assert_eq!(
						get_num_threads_by_sid_and_name(sid, EXECUTE_PROCESS_NAME, false),
						3
					);

					// End the test.
					send_signal_by_sid_and_name(sid, EXECUTE_PROCESS_NAME, true, SIGNAL_KILL);
				}
			);
		})
	}
}
@@ -15,7 +15,7 @@
// along with Polkadot. If not, see <http://www.gnu.org/licenses/>.
use polkadot_node_core_pvf::{
testing::{get_and_check_worker_paths, spawn_with_program_path, SpawnErr},
testing::{build_workers_and_get_paths, spawn_with_program_path, SpawnErr},
SecurityStatus,
};
use std::{env, time::Duration};
@@ -23,7 +23,7 @@ use std::{env, time::Duration};
// Test spawning a program that immediately exits with a failure code.
#[tokio::test]
async fn spawn_immediate_exit() {
let (prepare_worker_path, _) = get_and_check_worker_paths();
let (prepare_worker_path, _) = build_workers_and_get_paths(false);
// There's no explicit `exit` subcommand in the worker; it will panic on an unknown
// subcommand anyway
@@ -41,7 +41,7 @@ async fn spawn_immediate_exit() {
#[tokio::test]
async fn spawn_timeout() {
let (_, execute_worker_path) = get_and_check_worker_paths();
let (_, execute_worker_path) = build_workers_and_get_paths(false);
let result = spawn_with_program_path(
"integration-test",
@@ -57,7 +57,7 @@ async fn spawn_timeout() {
#[tokio::test]
async fn should_connect() {
let (prepare_worker_path, _) = get_and_check_worker_paths();
let (prepare_worker_path, _) = build_workers_and_get_paths(false);
let _ = spawn_with_program_path(
"integration-test",