PVF timeouts follow-up (#6151)

* Rename timeout consts and timeout parameter; bump leniency

* Update implementor's guide with info about PVFs

* Make glossary a bit easier to read

* Add a note to LENIENT_PREPARATION_TIMEOUT

* Remove PVF-specific section from glossary

* Fix some typos
This commit is contained in:
Marcin S
2022-11-01 10:59:53 -04:00
committed by GitHub
parent 351d37e324
commit 1f8219767e
10 changed files with 118 additions and 116 deletions
+2 -2
View File
@@ -96,7 +96,7 @@ pub enum ArtifactState {
/// That means that the artifact should be accessible through the path obtained by the artifact
/// id (unless, it was removed externally).
Prepared {
/// The time when the artifact was the last time needed.
/// The time when the artifact was last needed.
///
/// This is updated when we get the heads up for this artifact or when we just discover
/// this file.
@@ -120,7 +120,7 @@ impl Artifacts {
///
/// The recognized artifacts will be filled in the table and unrecognized will be removed.
pub async fn new(cache_path: &Path) -> Self {
// Make sure that the cache path directory and all it's parents are created.
// Make sure that the cache path directory and all its parents are created.
// First delete the entire cache. Nodes are long-running so this should populate shortly.
let _ = async_std::fs::remove_dir_all(cache_path).await;
let _ = async_std::fs::create_dir_all(cache_path).await;
+17 -14
View File
@@ -38,15 +38,16 @@ use std::{
time::{Duration, SystemTime},
};
/// The time period after which the precheck preparation worker is considered unresponsive and will
/// be killed.
/// For prechecking requests, the time period after which the preparation worker is considered
/// unresponsive and will be killed.
// NOTE: If you change this make sure to fix the buckets of `pvf_preparation_time` metric.
pub const PRECHECK_COMPILATION_TIMEOUT: Duration = Duration::from_secs(60);
pub const PRECHECK_PREPARATION_TIMEOUT: Duration = Duration::from_secs(60);
/// The time period after which the execute preparation worker is considered unresponsive and will
/// be killed.
/// For execution and heads-up requests, the time period after which the preparation worker is
/// considered unresponsive and will be killed. More lenient than the timeout for prechecking to
/// prevent honest validators from timing out on valid PVFs.
// NOTE: If you change this make sure to fix the buckets of `pvf_preparation_time` metric.
pub const EXECUTE_COMPILATION_TIMEOUT: Duration = Duration::from_secs(180);
pub const LENIENT_PREPARATION_TIMEOUT: Duration = Duration::from_secs(360);
/// An alias to not spell the type for the oneshot sender for the PVF execution result.
pub(crate) type ResultSender = oneshot::Sender<Result<ValidationResult, ValidationError>>;
@@ -429,9 +430,10 @@ async fn handle_to_host(
Ok(())
}
/// Handles PVF prechecking.
/// Handles PVF prechecking requests.
///
/// This tries to prepare the PVF by compiling the WASM blob within a given timeout ([`PRECHECK_COMPILATION_TIMEOUT`]).
/// This tries to prepare the PVF by compiling the WASM blob within a given timeout
/// ([`PRECHECK_PREPARATION_TIMEOUT`]).
async fn handle_precheck_pvf(
artifacts: &mut Artifacts,
prepare_queue: &mut mpsc::Sender<prepare::ToQueue>,
@@ -459,7 +461,7 @@ async fn handle_precheck_pvf(
prepare::ToQueue::Enqueue {
priority: Priority::Normal,
pvf,
compilation_timeout: PRECHECK_COMPILATION_TIMEOUT,
preparation_timeout: PRECHECK_PREPARATION_TIMEOUT,
},
)
.await?;
@@ -469,9 +471,10 @@ async fn handle_precheck_pvf(
/// Handles PVF execution.
///
/// This will first try to prepare the PVF, if a prepared artifact does not already exist. If there is already a
/// preparation job, we coalesce the two preparation jobs. When preparing for execution, we use a more lenient timeout
/// ([`EXECUTE_COMPILATION_TIMEOUT`]) than when prechecking.
/// This will first try to prepare the PVF, if a prepared artifact does not already exist. If there
/// is already a preparation job, we coalesce the two preparation jobs. When preparing for
/// execution, we use a more lenient timeout ([`LENIENT_PREPARATION_TIMEOUT`]) than when
/// prechecking.
async fn handle_execute_pvf(
cache_path: &Path,
artifacts: &mut Artifacts,
@@ -518,7 +521,7 @@ async fn handle_execute_pvf(
prepare::ToQueue::Enqueue {
priority,
pvf,
compilation_timeout: EXECUTE_COMPILATION_TIMEOUT,
preparation_timeout: LENIENT_PREPARATION_TIMEOUT,
},
)
.await?;
@@ -557,7 +560,7 @@ async fn handle_heads_up(
prepare::ToQueue::Enqueue {
priority: Priority::Normal,
pvf: active_pvf,
compilation_timeout: EXECUTE_COMPILATION_TIMEOUT,
preparation_timeout: LENIENT_PREPARATION_TIMEOUT,
},
)
.await?;
+6 -3
View File
@@ -155,8 +155,8 @@ impl metrics::Metrics for Metrics {
"Time spent in preparing PVF artifacts in seconds",
)
.buckets(vec![
// This is synchronized with the PRECHECK_COMPILATION_TIMEOUT=60s
// and EXECUTE_COMPILATION_TIMEOUT=180s constants found in
// This is synchronized with the PRECHECK_PREPARATION_TIMEOUT=60s
// and LENIENT_PREPARATION_TIMEOUT=360s constants found in
// src/prepare/worker.rs
0.1,
0.5,
@@ -167,7 +167,10 @@ impl metrics::Metrics for Metrics {
20.0,
30.0,
60.0,
180.0,
120.0,
240.0,
360.0,
480.0,
]),
)?,
registry,
+5 -5
View File
@@ -65,7 +65,7 @@ pub enum ToPool {
worker: Worker,
code: Arc<Vec<u8>>,
artifact_path: PathBuf,
compilation_timeout: Duration,
preparation_timeout: Duration,
},
}
@@ -210,7 +210,7 @@ fn handle_to_pool(
metrics.prepare_worker().on_begin_spawn();
mux.push(spawn_worker_task(program_path.to_owned(), spawn_timeout).boxed());
},
ToPool::StartWork { worker, code, artifact_path, compilation_timeout } => {
ToPool::StartWork { worker, code, artifact_path, preparation_timeout } => {
if let Some(data) = spawned.get_mut(worker) {
if let Some(idle) = data.idle.take() {
let preparation_timer = metrics.time_preparation();
@@ -221,7 +221,7 @@ fn handle_to_pool(
code,
cache_path.to_owned(),
artifact_path,
compilation_timeout,
preparation_timeout,
preparation_timer,
)
.boxed(),
@@ -269,11 +269,11 @@ async fn start_work_task<Timer>(
code: Arc<Vec<u8>>,
cache_path: PathBuf,
artifact_path: PathBuf,
compilation_timeout: Duration,
preparation_timeout: Duration,
_preparation_timer: Option<Timer>,
) -> PoolEvent {
let outcome =
worker::start_work(idle, code, &cache_path, artifact_path, compilation_timeout).await;
worker::start_work(idle, code, &cache_path, artifact_path, preparation_timeout).await;
PoolEvent::StartWork(worker, outcome)
}
+24 -24
View File
@@ -33,7 +33,7 @@ pub enum ToQueue {
///
/// Note that it is incorrect to enqueue the same PVF again without first receiving the
/// [`FromQueue`] response.
Enqueue { priority: Priority, pvf: Pvf, compilation_timeout: Duration },
Enqueue { priority: Priority, pvf: Pvf, preparation_timeout: Duration },
}
/// A response from queue.
@@ -80,7 +80,7 @@ struct JobData {
priority: Priority,
pvf: Pvf,
/// The timeout for the preparation job.
compilation_timeout: Duration,
preparation_timeout: Duration,
worker: Option<Worker>,
}
@@ -208,8 +208,8 @@ impl Queue {
async fn handle_to_queue(queue: &mut Queue, to_queue: ToQueue) -> Result<(), Fatal> {
match to_queue {
ToQueue::Enqueue { priority, pvf, compilation_timeout } => {
handle_enqueue(queue, priority, pvf, compilation_timeout).await?;
ToQueue::Enqueue { priority, pvf, preparation_timeout } => {
handle_enqueue(queue, priority, pvf, preparation_timeout).await?;
},
}
Ok(())
@@ -219,13 +219,13 @@ async fn handle_enqueue(
queue: &mut Queue,
priority: Priority,
pvf: Pvf,
compilation_timeout: Duration,
preparation_timeout: Duration,
) -> Result<(), Fatal> {
gum::debug!(
target: LOG_TARGET,
validation_code_hash = ?pvf.code_hash,
?priority,
?compilation_timeout,
?preparation_timeout,
"PVF is enqueued for preparation.",
);
queue.metrics.prepare_enqueued();
@@ -247,7 +247,7 @@ async fn handle_enqueue(
return Ok(())
}
let job = queue.jobs.insert(JobData { priority, pvf, compilation_timeout, worker: None });
let job = queue.jobs.insert(JobData { priority, pvf, preparation_timeout, worker: None });
queue.artifact_id_to_job.insert(artifact_id, job);
if let Some(available) = find_idle_worker(queue) {
@@ -439,7 +439,7 @@ async fn assign(queue: &mut Queue, worker: Worker, job: Job) -> Result<(), Fatal
worker,
code: job_data.pvf.code.clone(),
artifact_path,
compilation_timeout: job_data.compilation_timeout,
preparation_timeout: job_data.preparation_timeout,
},
)
.await?;
@@ -494,7 +494,7 @@ pub fn start(
#[cfg(test)]
mod tests {
use super::*;
use crate::{error::PrepareError, host::PRECHECK_COMPILATION_TIMEOUT};
use crate::{error::PrepareError, host::PRECHECK_PREPARATION_TIMEOUT};
use assert_matches::assert_matches;
use futures::{future::BoxFuture, FutureExt};
use slotmap::SlotMap;
@@ -612,7 +612,7 @@ mod tests {
test.send_queue(ToQueue::Enqueue {
priority: Priority::Normal,
pvf: pvf(1),
compilation_timeout: PRECHECK_COMPILATION_TIMEOUT,
preparation_timeout: PRECHECK_PREPARATION_TIMEOUT,
});
assert_eq!(test.poll_and_recv_to_pool().await, pool::ToPool::Spawn);
@@ -626,12 +626,12 @@ mod tests {
#[async_std::test]
async fn dont_spawn_over_soft_limit_unless_critical() {
let mut test = Test::new(2, 3);
let compilation_timeout = PRECHECK_COMPILATION_TIMEOUT;
let preparation_timeout = PRECHECK_PREPARATION_TIMEOUT;
let priority = Priority::Normal;
test.send_queue(ToQueue::Enqueue { priority, pvf: pvf(1), compilation_timeout });
test.send_queue(ToQueue::Enqueue { priority, pvf: pvf(2), compilation_timeout });
test.send_queue(ToQueue::Enqueue { priority, pvf: pvf(3), compilation_timeout });
test.send_queue(ToQueue::Enqueue { priority, pvf: pvf(1), preparation_timeout });
test.send_queue(ToQueue::Enqueue { priority, pvf: pvf(2), preparation_timeout });
test.send_queue(ToQueue::Enqueue { priority, pvf: pvf(3), preparation_timeout });
// Receive only two spawns.
assert_eq!(test.poll_and_recv_to_pool().await, pool::ToPool::Spawn);
@@ -655,7 +655,7 @@ mod tests {
test.send_queue(ToQueue::Enqueue {
priority: Priority::Critical,
pvf: pvf(4),
compilation_timeout,
preparation_timeout,
});
// 2 out of 2 are working, but there is a critical job incoming. That means that spawning
@@ -666,12 +666,12 @@ mod tests {
#[async_std::test]
async fn cull_unwanted() {
let mut test = Test::new(1, 2);
let compilation_timeout = PRECHECK_COMPILATION_TIMEOUT;
let preparation_timeout = PRECHECK_PREPARATION_TIMEOUT;
test.send_queue(ToQueue::Enqueue {
priority: Priority::Normal,
pvf: pvf(1),
compilation_timeout,
preparation_timeout,
});
assert_eq!(test.poll_and_recv_to_pool().await, pool::ToPool::Spawn);
let w1 = test.workers.insert(());
@@ -682,7 +682,7 @@ mod tests {
test.send_queue(ToQueue::Enqueue {
priority: Priority::Critical,
pvf: pvf(2),
compilation_timeout,
preparation_timeout,
});
assert_eq!(test.poll_and_recv_to_pool().await, pool::ToPool::Spawn);
@@ -701,10 +701,10 @@ mod tests {
async fn worker_mass_die_out_doesnt_stall_queue() {
let mut test = Test::new(2, 2);
let (priority, compilation_timeout) = (Priority::Normal, PRECHECK_COMPILATION_TIMEOUT);
test.send_queue(ToQueue::Enqueue { priority, pvf: pvf(1), compilation_timeout });
test.send_queue(ToQueue::Enqueue { priority, pvf: pvf(2), compilation_timeout });
test.send_queue(ToQueue::Enqueue { priority, pvf: pvf(3), compilation_timeout });
let (priority, preparation_timeout) = (Priority::Normal, PRECHECK_PREPARATION_TIMEOUT);
test.send_queue(ToQueue::Enqueue { priority, pvf: pvf(1), preparation_timeout });
test.send_queue(ToQueue::Enqueue { priority, pvf: pvf(2), preparation_timeout });
test.send_queue(ToQueue::Enqueue { priority, pvf: pvf(3), preparation_timeout });
assert_eq!(test.poll_and_recv_to_pool().await, pool::ToPool::Spawn);
assert_eq!(test.poll_and_recv_to_pool().await, pool::ToPool::Spawn);
@@ -734,7 +734,7 @@ mod tests {
test.send_queue(ToQueue::Enqueue {
priority: Priority::Normal,
pvf: pvf(1),
compilation_timeout: PRECHECK_COMPILATION_TIMEOUT,
preparation_timeout: PRECHECK_PREPARATION_TIMEOUT,
});
assert_eq!(test.poll_and_recv_to_pool().await, pool::ToPool::Spawn);
@@ -759,7 +759,7 @@ mod tests {
test.send_queue(ToQueue::Enqueue {
priority: Priority::Normal,
pvf: pvf(1),
compilation_timeout: PRECHECK_COMPILATION_TIMEOUT,
preparation_timeout: PRECHECK_PREPARATION_TIMEOUT,
});
assert_eq!(test.poll_and_recv_to_pool().await, pool::ToPool::Spawn);
+2 -2
View File
@@ -65,7 +65,7 @@ pub async fn start_work(
code: Arc<Vec<u8>>,
cache_path: &Path,
artifact_path: PathBuf,
compilation_timeout: Duration,
preparation_timeout: Duration,
) -> Outcome {
let IdleWorker { mut stream, pid } = worker;
@@ -100,7 +100,7 @@ pub async fn start_work(
}
let selected =
match async_std::future::timeout(compilation_timeout, framed_recv(&mut stream)).await {
match async_std::future::timeout(preparation_timeout, framed_recv(&mut stream)).await {
Ok(Ok(response_bytes)) => {
// Received bytes from worker within the time limit.
// By convention we expect encoded `PrepareResult`.
+1 -1
View File
@@ -19,7 +19,7 @@ use polkadot_parachain::primitives::ValidationCodeHash;
use sp_core::blake2_256;
use std::{fmt, sync::Arc};
/// A struct that carries code of a parachain validation function and it's hash.
/// A struct that carries code of a parachain validation function and its hash.
///
/// Should be cheap to clone.
#[derive(Clone)]