PVF: Move PVF workers into separate crate (#7101)

* Move PVF workers into separate crate
* Fix indentation
* Fix compilation errors
* Fix more compilation errors
* Rename `worker.rs` files, make host interface to worker more clear
* Fix more compilation errors
* Fix more compilation errors
* Add link to issue
* Address review comments
* Update comment
2026-07-17 06:35:42 +00:00 · 2023-04-21 12:40:09 +02:00
parent ac09a84115
commit e277f95b3b
42 changed files with 878 additions and 627 deletions
@@ -1,237 +0,0 @@
-// Copyright (C) Parity Technologies (UK) Ltd.
-// This file is part of Polkadot.
-
-// Polkadot is free software: you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation, either version 3 of the License, or
-// (at your option) any later version.
-
-// Polkadot is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-// GNU General Public License for more details.
-
-// You should have received a copy of the GNU General Public License
-// along with Polkadot.  If not, see <http://www.gnu.org/licenses/>.
-
-//! Memory stats for preparation.
-//!
-//! Right now we gather three measurements:
-//!
-//! - `ru_maxrss` (resident set size) from `getrusage`.
-//! - `resident` memory stat provided by `tikv-jemalloc-ctl`.
-//! - `allocated` memory stat also from `tikv-jemalloc-ctl`.
-//!
-//! Currently we are only logging these for the purposes of gathering data. In the future, we may
-//! use these stats to reject PVFs during pre-checking. See
-//! <https://github.com/paritytech/polkadot/issues/6472#issuecomment-1381941762> for more
-//! background.
-
-use parity_scale_codec::{Decode, Encode};
-
-/// Helper struct to contain all the memory stats, including [`MemoryAllocationStats`] and, if
-/// supported by the OS, `ru_maxrss`.
-#[derive(Clone, Debug, Default, Encode, Decode)]
-pub struct MemoryStats {
-	/// Memory stats from `tikv_jemalloc_ctl`.
-	#[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))]
-	pub memory_tracker_stats: Option<MemoryAllocationStats>,
-	/// `ru_maxrss` from `getrusage`. A string error since `io::Error` is not `Encode`able.
-	#[cfg(target_os = "linux")]
-	pub max_rss: Option<i64>,
-}
-
-/// Statistics of collected memory metrics.
-#[non_exhaustive]
-#[derive(Clone, Debug, Default, Encode, Decode)]
-pub struct MemoryAllocationStats {
-	/// Total resident memory, in bytes.
-	pub resident: u64,
-	/// Total allocated memory, in bytes.
-	pub allocated: u64,
-}
-
-/// Module for the memory tracker. The memory tracker runs in its own thread, where it polls memory
-/// usage at an interval.
-///
-/// NOTE: Requires jemalloc enabled.
-#[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))]
-pub mod memory_tracker {
-	use super::*;
-	use crate::LOG_TARGET;
-	use std::{
-		sync::mpsc::{Receiver, RecvTimeoutError, Sender},
-		time::Duration,
-	};
-	use tikv_jemalloc_ctl::{epoch, stats, Error};
-	use tokio::task::JoinHandle;
-
-	#[derive(Clone)]
-	struct MemoryAllocationTracker {
-		epoch: tikv_jemalloc_ctl::epoch_mib,
-		allocated: stats::allocated_mib,
-		resident: stats::resident_mib,
-	}
-
-	impl MemoryAllocationTracker {
-		pub fn new() -> Result<Self, Error> {
-			Ok(Self {
-				epoch: epoch::mib()?,
-				allocated: stats::allocated::mib()?,
-				resident: stats::resident::mib()?,
-			})
-		}
-
-		pub fn snapshot(&self) -> Result<MemoryAllocationStats, Error> {
-			// update stats by advancing the allocation epoch
-			self.epoch.advance()?;
-
-			// Convert to `u64`, as `usize` is not `Encode`able.
-			let allocated = self.allocated.read()? as u64;
-			let resident = self.resident.read()? as u64;
-			Ok(MemoryAllocationStats { allocated, resident })
-		}
-	}
-
-	/// Runs a thread in the background that observes memory statistics. The goal is to try to get
-	/// accurate stats during preparation.
-	///
-	/// # Algorithm
-	///
-	/// 1. Create the memory tracker.
-	///
-	/// 2. Sleep for some short interval. Whenever we wake up, take a snapshot by updating the
-	///    allocation epoch.
-	///
-	/// 3. When we receive a signal that preparation has completed, take one last snapshot and return
-	///    the maximum observed values.
-	///
-	/// # Errors
-	///
-	/// For simplicity, any errors are returned as a string. As this is not a critical component, errors
-	/// are used for informational purposes (logging) only.
-	pub fn memory_tracker_loop(finished_rx: Receiver<()>) -> Result<MemoryAllocationStats, String> {
-		// This doesn't need to be too fine-grained since preparation currently takes 3-10s or more.
-		// Apart from that, there is not really a science to this number.
-		const POLL_INTERVAL: Duration = Duration::from_millis(100);
-
-		let tracker = MemoryAllocationTracker::new().map_err(|err| err.to_string())?;
-		let mut max_stats = MemoryAllocationStats::default();
-
-		let mut update_stats = || -> Result<(), String> {
-			let current_stats = tracker.snapshot().map_err(|err| err.to_string())?;
-			if current_stats.resident > max_stats.resident {
-				max_stats.resident = current_stats.resident;
-			}
-			if current_stats.allocated > max_stats.allocated {
-				max_stats.allocated = current_stats.allocated;
-			}
-			Ok(())
-		};
-
-		loop {
-			// Take a snapshot and update the max stats.
-			update_stats()?;
-
-			// Sleep.
-			match finished_rx.recv_timeout(POLL_INTERVAL) {
-				// Received finish signal.
-				Ok(()) => {
-					update_stats()?;
-					return Ok(max_stats)
-				},
-				// Timed out, restart loop.
-				Err(RecvTimeoutError::Timeout) => continue,
-				Err(RecvTimeoutError::Disconnected) =>
-					return Err("memory_tracker_loop: finished_rx disconnected".into()),
-			}
-		}
-	}
-
-	/// Helper function to terminate the memory tracker thread and get the stats. Helps isolate all this
-	/// error handling.
-	pub async fn get_memory_tracker_loop_stats(
-		fut: JoinHandle<Result<MemoryAllocationStats, String>>,
-		tx: Sender<()>,
-		worker_pid: u32,
-	) -> Option<MemoryAllocationStats> {
-		// Signal to the memory tracker thread to terminate.
-		if let Err(err) = tx.send(()) {
-			gum::warn!(
-				target: LOG_TARGET,
-				%worker_pid,
-				"worker: error sending signal to memory tracker_thread: {}",
-				err
-			);
-			None
-		} else {
-			// Join on the thread handle.
-			match fut.await {
-				Ok(Ok(stats)) => Some(stats),
-				Ok(Err(err)) => {
-					gum::warn!(
-						target: LOG_TARGET,
-						%worker_pid,
-						"worker: error occurred in the memory tracker thread: {}", err
-					);
-					None
-				},
-				Err(err) => {
-					gum::warn!(
-						target: LOG_TARGET,
-						%worker_pid,
-						"worker: error joining on memory tracker thread: {}", err
-					);
-					None
-				},
-			}
-		}
-	}
-}
-
-/// Module for dealing with the `ru_maxrss` (peak resident memory) stat from `getrusage`.
-///
-/// NOTE: `getrusage` with the `RUSAGE_THREAD` parameter is only supported on Linux. `RUSAGE_SELF`
-/// works on MacOS, but we need to get the max rss only for the preparation thread. Getting it for
-/// the current process would conflate the stats of previous jobs run by the process.
-#[cfg(target_os = "linux")]
-pub mod max_rss_stat {
-	use crate::LOG_TARGET;
-	use core::mem::MaybeUninit;
-	use libc::{getrusage, rusage, RUSAGE_THREAD};
-	use std::io;
-
-	/// Get the rusage stats for the current thread.
-	fn getrusage_thread() -> io::Result<rusage> {
-		let mut result: MaybeUninit<rusage> = MaybeUninit::zeroed();
-
-		// SAFETY: `result` is a valid pointer, so calling this is safe.
-		if unsafe { getrusage(RUSAGE_THREAD, result.as_mut_ptr()) } == -1 {
-			return Err(io::Error::last_os_error())
-		}
-
-		// SAFETY: `result` was successfully initialized by `getrusage`.
-		unsafe { Ok(result.assume_init()) }
-	}
-
-	/// Gets the `ru_maxrss` for the current thread.
-	pub fn get_max_rss_thread() -> io::Result<i64> {
-		// `c_long` is either `i32` or `i64` depending on architecture. `i64::from` always works.
-		getrusage_thread().map(|rusage| i64::from(rusage.ru_maxrss))
-	}
-
-	/// Extracts the max_rss stat and logs any error.
-	pub fn extract_max_rss_stat(max_rss: io::Result<i64>, worker_pid: u32) -> Option<i64> {
-		max_rss
-			.map_err(|err| {
-				gum::warn!(
-					target: LOG_TARGET,
-					%worker_pid,
-					"error getting `ru_maxrss` in preparation thread: {}",
-					err
-				);
-				err
-			})
-			.ok()
-	}
-}
@@ -20,23 +20,44 @@
 //! (by running [`start_pool`]).
 //!
 //! The pool will spawn workers in new processes and those should pass control to
-//! [`worker_entrypoint`].
+//! `polkadot_node_core_pvf_worker::prepare_worker_entrypoint`.

-mod memory_stats;
 mod pool;
 mod queue;
-mod worker;
+mod worker_intf;

-pub use memory_stats::MemoryStats;
 pub use pool::start as start_pool;
 pub use queue::{start as start_queue, FromQueue, ToQueue};
-pub use worker::worker_entrypoint;

 use parity_scale_codec::{Decode, Encode};

 /// Preparation statistics, including the CPU time and memory taken.
 #[derive(Debug, Clone, Default, Encode, Decode)]
 pub struct PrepareStats {
-	cpu_time_elapsed: std::time::Duration,
-	memory_stats: MemoryStats,
+	/// The CPU time that elapsed for the preparation job.
+	pub cpu_time_elapsed: std::time::Duration,
+	/// The observed memory statistics for the preparation job.
+	pub memory_stats: MemoryStats,
+}
+
+/// Helper struct to contain all the memory stats, including `MemoryAllocationStats` and, if
+/// supported by the OS, `ru_maxrss`.
+#[derive(Clone, Debug, Default, Encode, Decode)]
+pub struct MemoryStats {
+	/// Memory stats from `tikv_jemalloc_ctl`.
+	#[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))]
+	pub memory_tracker_stats: Option<MemoryAllocationStats>,
+	/// `ru_maxrss` from `getrusage`. `None` if an error occurred.
+	#[cfg(target_os = "linux")]
+	pub max_rss: Option<i64>,
+}
+
+/// Statistics of collected memory metrics.
+#[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))]
+#[derive(Clone, Debug, Default, Encode, Decode)]
+pub struct MemoryAllocationStats {
+	/// Total resident memory, in bytes.
+	pub resident: u64,
+	/// Total allocated memory, in bytes.
+	pub allocated: u64,
 }
@@ -14,7 +14,7 @@
 // You should have received a copy of the GNU General Public License
 // along with Polkadot.  If not, see <http://www.gnu.org/licenses/>.

-use super::worker::{self, Outcome};
+use super::worker_intf::{self, Outcome};
 use crate::{
 	error::{PrepareError, PrepareResult},
 	metrics::Metrics,
@@ -250,7 +250,7 @@ async fn spawn_worker_task(program_path: PathBuf, spawn_timeout: Duration) -> Po
 	use futures_timer::Delay;

 	loop {
-		match worker::spawn(&program_path, spawn_timeout).await {
+		match worker_intf::spawn(&program_path, spawn_timeout).await {
 			Ok((idle, handle)) => break PoolEvent::Spawn(idle, handle),
 			Err(err) => {
 				gum::warn!(target: LOG_TARGET, "failed to spawn a prepare worker: {:?}", err);
@@ -271,7 +271,7 @@ async fn start_work_task<Timer>(
 	artifact_path: PathBuf,
 	_preparation_timer: Option<Timer>,
 ) -> PoolEvent {
-	let outcome = worker::start_work(&metrics, idle, pvf, &cache_path, artifact_path).await;
+	let outcome = worker_intf::start_work(&metrics, idle, pvf, &cache_path, artifact_path).await;
 	PoolEvent::StartWork(worker, outcome)
 }

@@ -226,7 +226,7 @@ async fn handle_enqueue(
 		target: LOG_TARGET,
 		validation_code_hash = ?pvf.code_hash(),
 		?priority,
-		preparation_timeout = ?pvf.prep_timeout,
+		preparation_timeout = ?pvf.prep_timeout(),
 		"PVF is enqueued for preparation.",
 	);
 	queue.metrics.prepare_enqueued();
@@ -14,33 +14,24 @@
 // You should have received a copy of the GNU General Public License
 // along with Polkadot.  If not, see <http://www.gnu.org/licenses/>.

-#[cfg(target_os = "linux")]
-use super::memory_stats::max_rss_stat::{extract_max_rss_stat, get_max_rss_thread};
-#[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))]
-use super::memory_stats::memory_tracker::{get_memory_tracker_loop_stats, memory_tracker_loop};
-use super::memory_stats::MemoryStats;
+//! Host interface to the prepare worker.
+
 use crate::{
-	artifacts::CompiledArtifact,
 	error::{PrepareError, PrepareResult},
 	metrics::Metrics,
 	prepare::PrepareStats,
 	pvf::PvfPrepData,
 	worker_common::{
-		bytes_to_path, cpu_time_monitor_loop, framed_recv, framed_send, path_to_bytes,
-		spawn_with_program_path, tmpfile_in, worker_event_loop, IdleWorker, SpawnErr, WorkerHandle,
-		JOB_TIMEOUT_WALL_CLOCK_FACTOR,
+		framed_recv, framed_send, path_to_bytes, spawn_with_program_path, tmpfile_in, IdleWorker,
+		SpawnErr, WorkerHandle, JOB_TIMEOUT_WALL_CLOCK_FACTOR,
 	},
 	LOG_TARGET,
 };
-use cpu_time::ProcessTime;
-use futures::{pin_mut, select_biased, FutureExt};
 use parity_scale_codec::{Decode, Encode};

 use sp_core::hexdisplay::HexDisplay;
 use std::{
-	panic,
 	path::{Path, PathBuf},
-	sync::mpsc::channel,
 	time::Duration,
 };
 use tokio::{io, net::UnixStream};
@@ -104,7 +95,7 @@ pub async fn start_work(
 	);

 	with_tmp_file(stream, pid, cache_path, |tmp_file, mut stream| async move {
-		let preparation_timeout = pvf.prep_timeout;
+		let preparation_timeout = pvf.prep_timeout();
 		if let Err(err) = send_request(&mut stream, pvf, &tmp_file).await {
 			gum::warn!(
 				target: LOG_TARGET,
@@ -285,28 +276,6 @@ async fn send_request(
 	Ok(())
 }

-async fn recv_request(stream: &mut UnixStream) -> io::Result<(PvfPrepData, PathBuf)> {
-	let pvf = framed_recv(stream).await?;
-	let pvf = PvfPrepData::decode(&mut &pvf[..]).map_err(|e| {
-		io::Error::new(
-			io::ErrorKind::Other,
-			format!("prepare pvf recv_request: failed to decode PvfPrepData: {}", e),
-		)
-	})?;
-	let tmp_file = framed_recv(stream).await?;
-	let tmp_file = bytes_to_path(&tmp_file).ok_or_else(|| {
-		io::Error::new(
-			io::ErrorKind::Other,
-			"prepare pvf recv_request: non utf-8 artifact path".to_string(),
-		)
-	})?;
-	Ok((pvf, tmp_file))
-}
-
-async fn send_response(stream: &mut UnixStream, result: PrepareResult) -> io::Result<()> {
-	framed_send(stream, &result.encode()).await
-}
-
 async fn recv_response(stream: &mut UnixStream, pid: u32) -> io::Result<PrepareResult> {
 	let result = framed_recv(stream).await?;
 	let result = PrepareResult::decode(&mut &result[..]).map_err(|e| {
@@ -325,158 +294,3 @@ async fn recv_response(stream: &mut UnixStream, pid: u32) -> io::Result<PrepareR
 	})?;
 	Ok(result)
 }
-
-/// The entrypoint that the spawned prepare worker should start with. The `socket_path` specifies
-/// the path to the socket used to communicate with the host. The `node_version`, if `Some`,
-/// is checked against the worker version. A mismatch results in immediate worker termination.
-/// `None` is used for tests and in other situations when version check is not necessary.
-///
-/// # Flow
-///
-///	This runs the following in a loop:
-///
-///	1. Get the code and parameters for preparation from the host.
-///
-///	2. Start a memory tracker in a separate thread.
-///
-///	3. Start the CPU time monitor loop and the actual preparation in two separate threads.
-///
-///	4. Select on the two threads created in step 3. If the CPU timeout was hit, the CPU time monitor
-///	   thread will trigger first.
-///
-///	5. Stop the memory tracker and get the stats.
-///
-/// 6. If compilation succeeded, write the compiled artifact into a temporary file.
-///
-///	7. Send the result of preparation back to the host. If any error occurred in the above steps, we
-///	   send that in the `PrepareResult`.
-pub fn worker_entrypoint(socket_path: &str, node_version: Option<&str>) {
-	worker_event_loop("prepare", socket_path, node_version, |rt_handle, mut stream| async move {
-		let worker_pid = std::process::id();
-
-		loop {
-			let (pvf, dest) = recv_request(&mut stream).await?;
-			gum::debug!(
-				target: LOG_TARGET,
-				%worker_pid,
-				"worker: preparing artifact",
-			);
-
-			let cpu_time_start = ProcessTime::now();
-			let preparation_timeout = pvf.prep_timeout;
-
-			// Run the memory tracker.
-			#[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))]
-			let (memory_tracker_tx, memory_tracker_rx) = channel::<()>();
-			#[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))]
-			let memory_tracker_fut = rt_handle.spawn_blocking(move || memory_tracker_loop(memory_tracker_rx));
-
-			// Spawn a new thread that runs the CPU time monitor.
-			let (cpu_time_monitor_tx, cpu_time_monitor_rx) = channel::<()>();
-			let cpu_time_monitor_fut = rt_handle
-				.spawn_blocking(move || {
-					cpu_time_monitor_loop(cpu_time_start, preparation_timeout, cpu_time_monitor_rx)
-				})
-				.fuse();
-			// Spawn another thread for preparation.
-			let prepare_fut = rt_handle
-				.spawn_blocking(move || {
-					let result = prepare_artifact(pvf);
-
-					// Get the `ru_maxrss` stat. If supported, call getrusage for the thread.
-					#[cfg(target_os = "linux")]
-					let result = result.map(|artifact| (artifact, get_max_rss_thread()));
-
-					result
-				})
-				.fuse();
-
-			pin_mut!(cpu_time_monitor_fut);
-			pin_mut!(prepare_fut);
-
-			let result = select_biased! {
-				// If this future is not selected, the join handle is dropped and the thread will
-				// finish in the background.
-				join_res = cpu_time_monitor_fut => {
-					match join_res {
-						Ok(Some(cpu_time_elapsed)) => {
-							// Log if we exceed the timeout and the other thread hasn't finished.
-							gum::warn!(
-								target: LOG_TARGET,
-								%worker_pid,
-								"prepare job took {}ms cpu time, exceeded prepare timeout {}ms",
-								cpu_time_elapsed.as_millis(),
-								preparation_timeout.as_millis(),
-							);
-							Err(PrepareError::TimedOut)
-						},
-						Ok(None) => Err(PrepareError::IoErr("error communicating over finished channel".into())),
-						Err(err) => Err(PrepareError::IoErr(err.to_string())),
-					}
-				},
-				prepare_res = prepare_fut => {
-					let cpu_time_elapsed = cpu_time_start.elapsed();
-					let _ = cpu_time_monitor_tx.send(());
-
-					match prepare_res.unwrap_or_else(|err| Err(PrepareError::IoErr(err.to_string()))) {
-						Err(err) => {
-							// Serialized error will be written into the socket.
-							Err(err)
-						},
-						Ok(ok) => {
-							// Stop the memory stats worker and get its observed memory stats.
-							#[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))]
-							let memory_tracker_stats =
-								get_memory_tracker_loop_stats(memory_tracker_fut, memory_tracker_tx, worker_pid).await;
-							#[cfg(target_os = "linux")]
-							let (ok, max_rss) = ok;
-							let memory_stats = MemoryStats {
-								#[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))]
-								memory_tracker_stats,
-								#[cfg(target_os = "linux")]
-								max_rss: extract_max_rss_stat(max_rss, worker_pid),
-							};
-
-							// Write the serialized artifact into a temp file.
-							//
-							// PVF host only keeps artifacts statuses in its memory, successfully
-							// compiled code gets stored on the disk (and consequently deserialized
-							// by execute-workers). The prepare worker is only required to send `Ok`
-							// to the pool to indicate the success.
-
-							gum::debug!(
-								target: LOG_TARGET,
-								%worker_pid,
-								"worker: writing artifact to {}",
-								dest.display(),
-							);
-							tokio::fs::write(&dest, &ok).await?;
-
-							Ok(PrepareStats{cpu_time_elapsed, memory_stats})
-						},
-					}
-				},
-			};
-
-			send_response(&mut stream, result).await?;
-		}
-	});
-}
-
-fn prepare_artifact(pvf: PvfPrepData) -> Result<CompiledArtifact, PrepareError> {
-	panic::catch_unwind(|| {
-		let blob = match crate::executor_intf::prevalidate(&pvf.code()) {
-			Err(err) => return Err(PrepareError::Prevalidation(format!("{:?}", err))),
-			Ok(b) => b,
-		};
-
-		match crate::executor_intf::prepare(blob, &pvf.executor_params()) {
-			Ok(compiled_artifact) => Ok(CompiledArtifact::new(compiled_artifact)),
-			Err(err) => Err(PrepareError::Preparation(format!("{:?}", err))),
-		}
-	})
-	.map_err(|panic_payload| {
-		PrepareError::Panic(crate::error::stringify_panic_payload(panic_payload))
-	})
-	.and_then(|inner_result| inner_result)
-}