mirror of
https://github.com/pezkuwichain/pezkuwi-subxt.git
synced 2026-06-09 21:21:11 +00:00
Use CPU clock timeout for PVF jobs (#6282)
* Put in skeleton logic for CPU-time-preparation

  Still needed:
  - Flesh out logic
  - Refactor some spots
  - Tests

* Continue filling in logic for prepare worker CPU time changes
* Fix compiler errors
* Update lenience factor
* Fix some clippy lints for PVF module
* Fix compilation errors
* Address some review comments
* Add logging
* Add another log
* Address some review comments; change Mutex to AtomicBool
* Refactor handling response bytes
* Add CPU clock timeout logic for execute jobs
* Properly handle AtomicBool flag
* Use `Ordering::Relaxed`
* Refactor thread coordination logic
* Fix bug
* Add some timing information to execute tests
* Add section about the mitigation to the IG
* minor: Change more `Ordering`s to `Relaxed`
* candidate-validation: Fix build errors
This commit is contained in:
@@ -24,4 +24,4 @@ mod queue;
|
||||
mod worker;
|
||||
|
||||
pub use queue::{start, ToQueue};
|
||||
pub use worker::worker_entrypoint;
|
||||
pub use worker::{worker_entrypoint, Response as ExecuteResponse};
|
||||
|
||||
@@ -225,8 +225,9 @@ fn handle_job_finish(
|
||||
result_tx: ResultSender,
|
||||
) {
|
||||
let (idle_worker, result) = match outcome {
|
||||
Outcome::Ok { result_descriptor, duration_ms: _, idle_worker } => {
|
||||
Outcome::Ok { result_descriptor, duration: _, idle_worker } => {
|
||||
// TODO: propagate the soft timeout
|
||||
|
||||
(Some(idle_worker), Ok(result_descriptor))
|
||||
},
|
||||
Outcome::InvalidCandidate { err, idle_worker } => (
|
||||
|
||||
@@ -18,8 +18,9 @@ use crate::{
|
||||
artifacts::ArtifactPathId,
|
||||
executor_intf::Executor,
|
||||
worker_common::{
|
||||
bytes_to_path, framed_recv, framed_send, path_to_bytes, spawn_with_program_path,
|
||||
worker_event_loop, IdleWorker, SpawnErr, WorkerHandle,
|
||||
bytes_to_path, cpu_time_monitor_loop, framed_recv, framed_send, path_to_bytes,
|
||||
spawn_with_program_path, worker_event_loop, IdleWorker, JobKind, SpawnErr, WorkerHandle,
|
||||
JOB_TIMEOUT_WALL_CLOCK_FACTOR,
|
||||
},
|
||||
LOG_TARGET,
|
||||
};
|
||||
@@ -27,12 +28,21 @@ use async_std::{
|
||||
io,
|
||||
os::unix::net::UnixStream,
|
||||
path::{Path, PathBuf},
|
||||
task,
|
||||
};
|
||||
use cpu_time::ProcessTime;
|
||||
use futures::FutureExt;
|
||||
use futures_timer::Delay;
|
||||
use parity_scale_codec::{Decode, Encode};
|
||||
use polkadot_parachain::primitives::ValidationResult;
|
||||
use std::time::{Duration, Instant};
|
||||
use std::{
|
||||
sync::{
|
||||
atomic::{AtomicBool, Ordering},
|
||||
Arc,
|
||||
},
|
||||
thread,
|
||||
time::Duration,
|
||||
};
|
||||
|
||||
/// Spawns a new worker with the given program path that acts as the worker and the spawn timeout.
|
||||
///
|
||||
@@ -48,7 +58,7 @@ pub async fn spawn(
|
||||
pub enum Outcome {
|
||||
/// PVF execution completed successfully and the result is returned. The worker is ready for
|
||||
/// another job.
|
||||
Ok { result_descriptor: ValidationResult, duration_ms: u64, idle_worker: IdleWorker },
|
||||
Ok { result_descriptor: ValidationResult, duration: Duration, idle_worker: IdleWorker },
|
||||
/// The candidate validation failed. It may be for example because the wasm execution triggered a trap.
|
||||
/// Errors related to the preparation process are not expected to be encountered by the execution workers.
|
||||
InvalidCandidate { err: String, idle_worker: IdleWorker },
|
||||
@@ -80,7 +90,9 @@ pub async fn start_work(
|
||||
artifact.path.display(),
|
||||
);
|
||||
|
||||
if let Err(error) = send_request(&mut stream, &artifact.path, &validation_params).await {
|
||||
if let Err(error) =
|
||||
send_request(&mut stream, &artifact.path, &validation_params, execution_timeout).await
|
||||
{
|
||||
gum::warn!(
|
||||
target: LOG_TARGET,
|
||||
worker_pid = %pid,
|
||||
@@ -91,6 +103,12 @@ pub async fn start_work(
|
||||
return Outcome::IoErr
|
||||
}
|
||||
|
||||
// We use a generous timeout here. This is in addition to the one in the child process, in
|
||||
// case the child stalls. We have a wall clock timeout here in the host, but a CPU timeout
|
||||
// in the child. We want to use CPU time because it varies less than wall clock time under
|
||||
// load, but the CPU resources of the child can only be measured from the parent after the
|
||||
// child process terminates.
|
||||
let timeout = execution_timeout * JOB_TIMEOUT_WALL_CLOCK_FACTOR;
|
||||
let response = futures::select! {
|
||||
response = recv_response(&mut stream).fuse() => {
|
||||
match response {
|
||||
@@ -104,25 +122,47 @@ pub async fn start_work(
|
||||
);
|
||||
return Outcome::IoErr
|
||||
},
|
||||
Ok(response) => response,
|
||||
Ok(response) => {
|
||||
if let Response::Ok{duration, ..} = response {
|
||||
if duration > execution_timeout {
|
||||
// The job didn't complete within the timeout.
|
||||
gum::warn!(
|
||||
target: LOG_TARGET,
|
||||
worker_pid = %pid,
|
||||
"execute job took {}ms cpu time, exceeded execution timeout {}ms.",
|
||||
duration.as_millis(),
|
||||
execution_timeout.as_millis(),
|
||||
);
|
||||
|
||||
// Return a timeout error.
|
||||
return Outcome::HardTimeout;
|
||||
}
|
||||
}
|
||||
|
||||
response
|
||||
},
|
||||
}
|
||||
},
|
||||
_ = Delay::new(execution_timeout).fuse() => {
|
||||
_ = Delay::new(timeout).fuse() => {
|
||||
gum::warn!(
|
||||
target: LOG_TARGET,
|
||||
worker_pid = %pid,
|
||||
validation_code_hash = ?artifact.id.code_hash,
|
||||
"execution worker exceeded alloted time for execution",
|
||||
);
|
||||
return Outcome::HardTimeout;
|
||||
// TODO: This case is not really a hard timeout as the timeout here in the host is
|
||||
// lenient. Should fix this as part of
|
||||
// https://github.com/paritytech/polkadot/issues/3754.
|
||||
Response::TimedOut
|
||||
},
|
||||
};
|
||||
|
||||
match response {
|
||||
Response::Ok { result_descriptor, duration_ms } =>
|
||||
Outcome::Ok { result_descriptor, duration_ms, idle_worker: IdleWorker { stream, pid } },
|
||||
Response::Ok { result_descriptor, duration } =>
|
||||
Outcome::Ok { result_descriptor, duration, idle_worker: IdleWorker { stream, pid } },
|
||||
Response::InvalidCandidate(err) =>
|
||||
Outcome::InvalidCandidate { err, idle_worker: IdleWorker { stream, pid } },
|
||||
Response::TimedOut => Outcome::HardTimeout,
|
||||
Response::InternalError(err) =>
|
||||
Outcome::InternalError { err, idle_worker: IdleWorker { stream, pid } },
|
||||
}
|
||||
@@ -132,12 +172,14 @@ async fn send_request(
|
||||
stream: &mut UnixStream,
|
||||
artifact_path: &Path,
|
||||
validation_params: &[u8],
|
||||
execution_timeout: Duration,
|
||||
) -> io::Result<()> {
|
||||
framed_send(stream, path_to_bytes(artifact_path)).await?;
|
||||
framed_send(stream, validation_params).await
|
||||
framed_send(stream, validation_params).await?;
|
||||
framed_send(stream, &execution_timeout.encode()).await
|
||||
}
|
||||
|
||||
async fn recv_request(stream: &mut UnixStream) -> io::Result<(PathBuf, Vec<u8>)> {
|
||||
async fn recv_request(stream: &mut UnixStream) -> io::Result<(PathBuf, Vec<u8>, Duration)> {
|
||||
let artifact_path = framed_recv(stream).await?;
|
||||
let artifact_path = bytes_to_path(&artifact_path).ok_or_else(|| {
|
||||
io::Error::new(
|
||||
@@ -146,7 +188,14 @@ async fn recv_request(stream: &mut UnixStream) -> io::Result<(PathBuf, Vec<u8>)>
|
||||
)
|
||||
})?;
|
||||
let params = framed_recv(stream).await?;
|
||||
Ok((artifact_path, params))
|
||||
let execution_timeout = framed_recv(stream).await?;
|
||||
let execution_timeout = Duration::decode(&mut &execution_timeout[..]).map_err(|_| {
|
||||
io::Error::new(
|
||||
io::ErrorKind::Other,
|
||||
"execute pvf recv_request: failed to decode duration".to_string(),
|
||||
)
|
||||
})?;
|
||||
Ok((artifact_path, params, execution_timeout))
|
||||
}
|
||||
|
||||
async fn send_response(stream: &mut UnixStream, response: Response) -> io::Result<()> {
|
||||
@@ -164,9 +213,10 @@ async fn recv_response(stream: &mut UnixStream) -> io::Result<Response> {
|
||||
}
|
||||
|
||||
#[derive(Encode, Decode)]
|
||||
enum Response {
|
||||
Ok { result_descriptor: ValidationResult, duration_ms: u64 },
|
||||
pub enum Response {
|
||||
Ok { result_descriptor: ValidationResult, duration: Duration },
|
||||
InvalidCandidate(String),
|
||||
TimedOut,
|
||||
InternalError(String),
|
||||
}
|
||||
|
||||
@@ -187,15 +237,53 @@ pub fn worker_entrypoint(socket_path: &str) {
|
||||
let executor = Executor::new().map_err(|e| {
|
||||
io::Error::new(io::ErrorKind::Other, format!("cannot create executor: {}", e))
|
||||
})?;
|
||||
|
||||
loop {
|
||||
let (artifact_path, params) = recv_request(&mut stream).await?;
|
||||
let (artifact_path, params, execution_timeout) = recv_request(&mut stream).await?;
|
||||
gum::debug!(
|
||||
target: LOG_TARGET,
|
||||
worker_pid = %std::process::id(),
|
||||
"worker: validating artifact {}",
|
||||
artifact_path.display(),
|
||||
);
|
||||
let response = validate_using_artifact(&artifact_path, ¶ms, &executor).await;
|
||||
|
||||
// Create a lock flag. We set it when either thread finishes.
|
||||
let lock = Arc::new(AtomicBool::new(false));
|
||||
let cpu_time_start = ProcessTime::now();
|
||||
|
||||
// Spawn a new thread that runs the CPU time monitor. Continuously wakes up from
|
||||
// sleeping and then either sleeps for the remaining CPU time, or kills the process if
|
||||
// we exceed the CPU timeout.
|
||||
let (stream_2, cpu_time_start_2, execution_timeout_2, lock_2) =
|
||||
(stream.clone(), cpu_time_start, execution_timeout, lock.clone());
|
||||
let handle =
|
||||
thread::Builder::new().name("CPU time monitor".into()).spawn(move || {
|
||||
task::block_on(async {
|
||||
cpu_time_monitor_loop(
|
||||
JobKind::Execute,
|
||||
stream_2,
|
||||
cpu_time_start_2,
|
||||
execution_timeout_2,
|
||||
lock_2,
|
||||
)
|
||||
.await;
|
||||
})
|
||||
})?;
|
||||
|
||||
let response =
|
||||
validate_using_artifact(&artifact_path, ¶ms, &executor, cpu_time_start).await;
|
||||
|
||||
let lock_result =
|
||||
lock.compare_exchange(false, true, Ordering::Relaxed, Ordering::Relaxed);
|
||||
if lock_result.is_err() {
|
||||
// The other thread is still sending an error response over the socket. Wait on it
|
||||
// and return.
|
||||
let _ = handle.join();
|
||||
// Monitor thread detected timeout and likely already terminated the process,
|
||||
// nothing to do.
|
||||
continue
|
||||
}
|
||||
|
||||
send_response(&mut stream, response).await?;
|
||||
}
|
||||
});
|
||||
@@ -205,19 +293,19 @@ async fn validate_using_artifact(
|
||||
artifact_path: &Path,
|
||||
params: &[u8],
|
||||
executor: &Executor,
|
||||
cpu_time_start: ProcessTime,
|
||||
) -> Response {
|
||||
let validation_started_at = Instant::now();
|
||||
let descriptor_bytes = match unsafe {
|
||||
// SAFETY: this should be safe since the compiled artifact passed here comes from the
|
||||
// file created by the prepare workers. These files are obtained by calling
|
||||
// [`executor_intf::prepare`].
|
||||
executor.execute(artifact_path.as_ref(), params)
|
||||
} {
|
||||
Err(err) => return Response::format_invalid("execute", &err.to_string()),
|
||||
Err(err) => return Response::format_invalid("execute", &err),
|
||||
Ok(d) => d,
|
||||
};
|
||||
|
||||
let duration_ms = validation_started_at.elapsed().as_millis() as u64;
|
||||
let duration = cpu_time_start.elapsed();
|
||||
|
||||
let result_descriptor = match ValidationResult::decode(&mut &descriptor_bytes[..]) {
|
||||
Err(err) =>
|
||||
@@ -225,5 +313,5 @@ async fn validate_using_artifact(
|
||||
Ok(r) => r,
|
||||
};
|
||||
|
||||
Response::Ok { result_descriptor, duration_ms }
|
||||
Response::Ok { result_descriptor, duration }
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user