mirror of
https://github.com/pezkuwichain/pezkuwi-subxt.git
synced 2026-06-13 17:31:05 +00:00
PVF: re-preparing artifact on failed runtime construction (#3187)
resolve https://github.com/paritytech/polkadot-sdk/issues/3139 - [x] use a distinguishable error for `execute_artifact` - [x] remove artifact in case of a `RuntimeConstruction` error during the execution - [x] augment the `validate_candidate_with_retry` of `ValidationBackend` with the case of retriable `RuntimeConstruction` error during the execution - [x] update the book (https://paritytech.github.io/polkadot-sdk/book/node/utility/pvf-host-and-workers.html#retrying-execution-requests) - [x] add a test - [x] run zombienet tests --------- Co-authored-by: s0me0ne-unkn0wn <48632512+s0me0ne-unkn0wn@users.noreply.github.com>
This commit is contained in:
@@ -695,6 +695,8 @@ async fn validate_candidate_exhaustive(
|
||||
))),
|
||||
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::JobError(err))) =>
|
||||
Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError(err))),
|
||||
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::RuntimeConstruction(err))) =>
|
||||
Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError(err))),
|
||||
|
||||
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::AmbiguousJobDeath(err))) =>
|
||||
Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError(format!(
|
||||
@@ -780,40 +782,50 @@ trait ValidationBackend {
|
||||
return validation_result
|
||||
}
|
||||
|
||||
macro_rules! break_if_no_retries_left {
|
||||
($counter:ident) => {
|
||||
if $counter > 0 {
|
||||
$counter -= 1;
|
||||
} else {
|
||||
break
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// Allow limited retries for each kind of error.
|
||||
let mut num_death_retries_left = 1;
|
||||
let mut num_job_error_retries_left = 1;
|
||||
let mut num_internal_retries_left = 1;
|
||||
let mut num_runtime_construction_retries_left = 1;
|
||||
loop {
|
||||
// Stop retrying if we exceeded the timeout.
|
||||
if total_time_start.elapsed() + retry_delay > exec_timeout {
|
||||
break
|
||||
}
|
||||
|
||||
let mut retry_immediately = false;
|
||||
match validation_result {
|
||||
Err(ValidationError::PossiblyInvalid(
|
||||
PossiblyInvalidError::AmbiguousWorkerDeath |
|
||||
PossiblyInvalidError::AmbiguousJobDeath(_),
|
||||
)) =>
|
||||
if num_death_retries_left > 0 {
|
||||
num_death_retries_left -= 1;
|
||||
} else {
|
||||
break
|
||||
},
|
||||
)) => break_if_no_retries_left!(num_death_retries_left),
|
||||
|
||||
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::JobError(_))) =>
|
||||
if num_job_error_retries_left > 0 {
|
||||
num_job_error_retries_left -= 1;
|
||||
} else {
|
||||
break
|
||||
},
|
||||
break_if_no_retries_left!(num_job_error_retries_left),
|
||||
|
||||
Err(ValidationError::Internal(_)) =>
|
||||
if num_internal_retries_left > 0 {
|
||||
num_internal_retries_left -= 1;
|
||||
} else {
|
||||
break
|
||||
},
|
||||
break_if_no_retries_left!(num_internal_retries_left),
|
||||
|
||||
Err(ValidationError::PossiblyInvalid(
|
||||
PossiblyInvalidError::RuntimeConstruction(_),
|
||||
)) => {
|
||||
break_if_no_retries_left!(num_runtime_construction_retries_left);
|
||||
self.precheck_pvf(pvf.clone()).await?;
|
||||
// In this case the error is deterministic
|
||||
// and a retry forces the ValidationBackend
|
||||
// to re-prepare the artifact so
|
||||
// there is no need to wait before the retry
|
||||
retry_immediately = true;
|
||||
},
|
||||
|
||||
Ok(_) | Err(ValidationError::Invalid(_) | ValidationError::Preparation(_)) => break,
|
||||
}
|
||||
@@ -821,8 +833,11 @@ trait ValidationBackend {
|
||||
// If we got a possibly transient error, retry once after a brief delay, on the
|
||||
// assumption that the conditions that caused this error may have resolved on their own.
|
||||
{
|
||||
// Wait a brief delay before retrying.
|
||||
futures_timer::Delay::new(retry_delay).await;
|
||||
// In case of many transient errors it is necessary to wait a little bit
|
||||
// to give the error a chance to resolve on its own
|
||||
if !retry_immediately {
|
||||
futures_timer::Delay::new(retry_delay).await;
|
||||
}
|
||||
|
||||
let new_timeout = exec_timeout.saturating_sub(total_time_start.elapsed());
|
||||
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
|
||||
use crate::prepare::{PrepareSuccess, PrepareWorkerSuccess};
|
||||
use parity_scale_codec::{Decode, Encode};
|
||||
pub use sc_executor_common::error::Error as ExecuteError;
|
||||
|
||||
/// Result of PVF preparation from a worker, with checksum of the compiled PVF and stats of the
|
||||
/// preparation if successful.
|
||||
|
||||
@@ -40,6 +40,9 @@ pub enum WorkerResponse {
|
||||
},
|
||||
/// The candidate is invalid.
|
||||
InvalidCandidate(String),
|
||||
/// Instantiation of the WASM module instance failed during an execution.
|
||||
/// Possibly related to local issues or dirty node update. May be retried with re-preparation.
|
||||
RuntimeConstruction(String),
|
||||
/// The job timed out.
|
||||
JobTimedOut,
|
||||
/// The job process has died. We must kill the worker just in case.
|
||||
@@ -68,6 +71,9 @@ pub enum JobResponse {
|
||||
/// The result of parachain validation.
|
||||
result_descriptor: ValidationResult,
|
||||
},
|
||||
/// A possibly transient runtime instantiation error happened during the execution; may be
|
||||
/// retried with re-preparation
|
||||
RuntimeConstruction(String),
|
||||
/// The candidate is invalid.
|
||||
InvalidCandidate(String),
|
||||
}
|
||||
@@ -81,6 +87,15 @@ impl JobResponse {
|
||||
Self::InvalidCandidate(format!("{}: {}", ctx, msg))
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a retriable response from a context `ctx` and a message `msg` (which can be empty).
|
||||
pub fn runtime_construction(ctx: &'static str, msg: &str) -> Self {
|
||||
if msg.is_empty() {
|
||||
Self::RuntimeConstruction(ctx.to_string())
|
||||
} else {
|
||||
Self::RuntimeConstruction(format!("{}: {}", ctx, msg))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// An unexpected error occurred in the execution job process. Because this comes from the job,
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
|
||||
//! Interface to the Substrate Executor
|
||||
|
||||
use crate::error::ExecuteError;
|
||||
use polkadot_primitives::{
|
||||
executor_params::{DEFAULT_LOGICAL_STACK_MAX, DEFAULT_NATIVE_STACK_MAX},
|
||||
ExecutorParam, ExecutorParams,
|
||||
@@ -109,7 +110,7 @@ pub unsafe fn execute_artifact(
|
||||
compiled_artifact_blob: &[u8],
|
||||
executor_params: &ExecutorParams,
|
||||
params: &[u8],
|
||||
) -> Result<Vec<u8>, String> {
|
||||
) -> Result<Vec<u8>, ExecuteError> {
|
||||
let mut extensions = sp_externalities::Extensions::new();
|
||||
|
||||
extensions.register(sp_core::traits::ReadRuntimeVersionExt::new(ReadRuntimeVersion));
|
||||
@@ -123,7 +124,6 @@ pub unsafe fn execute_artifact(
|
||||
Ok(Ok(ok)) => Ok(ok),
|
||||
Ok(Err(err)) | Err(err) => Err(err),
|
||||
}
|
||||
.map_err(|err| format!("execute error: {:?}", err))
|
||||
}
|
||||
|
||||
/// Constructs the runtime for the given PVF, given the artifact bytes.
|
||||
|
||||
@@ -16,7 +16,9 @@
|
||||
|
||||
//! Contains the logic for executing PVFs. Used by the polkadot-execute-worker binary.
|
||||
|
||||
pub use polkadot_node_core_pvf_common::executor_interface::execute_artifact;
|
||||
pub use polkadot_node_core_pvf_common::{
|
||||
error::ExecuteError, executor_interface::execute_artifact,
|
||||
};
|
||||
|
||||
// NOTE: Initializing logging in e.g. tests will not have an effect in the workers, as they are
|
||||
// separate spawned processes. Run with e.g. `RUST_LOG=parachain::pvf-execute-worker=trace`.
|
||||
@@ -237,7 +239,9 @@ fn validate_using_artifact(
|
||||
// [`executor_interface::prepare`].
|
||||
execute_artifact(compiled_artifact_blob, executor_params, params)
|
||||
} {
|
||||
Err(err) => return JobResponse::format_invalid("execute", &err),
|
||||
Err(ExecuteError::RuntimeConstruction(wasmerr)) =>
|
||||
return JobResponse::runtime_construction("execute", &wasmerr.to_string()),
|
||||
Err(err) => return JobResponse::format_invalid("execute", &err.to_string()),
|
||||
Ok(d) => d,
|
||||
};
|
||||
|
||||
@@ -550,6 +554,8 @@ fn handle_parent_process(
|
||||
Ok(WorkerResponse::Ok { result_descriptor, duration: cpu_tv })
|
||||
},
|
||||
Ok(JobResponse::InvalidCandidate(err)) => Ok(WorkerResponse::InvalidCandidate(err)),
|
||||
Ok(JobResponse::RuntimeConstruction(err)) =>
|
||||
Ok(WorkerResponse::RuntimeConstruction(err)),
|
||||
Err(job_error) => {
|
||||
gum::warn!(
|
||||
target: LOG_TARGET,
|
||||
|
||||
@@ -238,6 +238,14 @@ impl Artifacts {
|
||||
.is_none());
|
||||
}
|
||||
|
||||
/// Remove artifact by its id.
|
||||
pub fn remove(&mut self, artifact_id: ArtifactId) -> Option<(ArtifactId, PathBuf)> {
|
||||
self.inner.remove(&artifact_id).and_then(|state| match state {
|
||||
ArtifactState::Prepared { path, .. } => Some((artifact_id, path)),
|
||||
_ => None,
|
||||
})
|
||||
}
|
||||
|
||||
/// Remove artifacts older than the given TTL and return id and path of the removed ones.
|
||||
pub fn prune(&mut self, artifact_ttl: Duration) -> Vec<(ArtifactId, PathBuf)> {
|
||||
let now = SystemTime::now();
|
||||
|
||||
@@ -86,6 +86,10 @@ pub enum PossiblyInvalidError {
|
||||
/// vote invalid.
|
||||
#[error("possibly invalid: job error: {0}")]
|
||||
JobError(String),
|
||||
/// Instantiation of the WASM module instance failed during an execution.
|
||||
/// Possibly related to local issues or dirty node update. May be retried with re-preparation.
|
||||
#[error("possibly invalid: runtime construction: {0}")]
|
||||
RuntimeConstruction(String),
|
||||
}
|
||||
|
||||
impl From<PrepareError> for ValidationError {
|
||||
|
||||
@@ -23,4 +23,4 @@
|
||||
mod queue;
|
||||
mod worker_interface;
|
||||
|
||||
pub use queue::{start, PendingExecutionRequest, ToQueue};
|
||||
pub use queue::{start, FromQueue, PendingExecutionRequest, ToQueue};
|
||||
|
||||
@@ -25,7 +25,7 @@ use crate::{
|
||||
InvalidCandidate, PossiblyInvalidError, ValidationError, LOG_TARGET,
|
||||
};
|
||||
use futures::{
|
||||
channel::mpsc,
|
||||
channel::{mpsc, oneshot},
|
||||
future::BoxFuture,
|
||||
stream::{FuturesUnordered, StreamExt as _},
|
||||
Future, FutureExt,
|
||||
@@ -54,6 +54,12 @@ pub enum ToQueue {
|
||||
Enqueue { artifact: ArtifactPathId, pending_execution_request: PendingExecutionRequest },
|
||||
}
|
||||
|
||||
/// A message from the execute queue to the validation host.
|
||||
#[derive(Debug)]
|
||||
pub enum FromQueue {
|
||||
RemoveArtifact { artifact: ArtifactId, reply_to: oneshot::Sender<()> },
|
||||
}
|
||||
|
||||
/// An execution request that should execute the PVF (known in the context) and send the results
|
||||
/// to the given result sender.
|
||||
#[derive(Debug)]
|
||||
@@ -137,6 +143,8 @@ struct Queue {
|
||||
|
||||
/// The receiver that receives messages to the pool.
|
||||
to_queue_rx: mpsc::Receiver<ToQueue>,
|
||||
/// The sender to send messages back to validation host.
|
||||
from_queue_tx: mpsc::UnboundedSender<FromQueue>,
|
||||
|
||||
// Some variables related to the current session.
|
||||
program_path: PathBuf,
|
||||
@@ -161,6 +169,7 @@ impl Queue {
|
||||
node_version: Option<String>,
|
||||
security_status: SecurityStatus,
|
||||
to_queue_rx: mpsc::Receiver<ToQueue>,
|
||||
from_queue_tx: mpsc::UnboundedSender<FromQueue>,
|
||||
) -> Self {
|
||||
Self {
|
||||
metrics,
|
||||
@@ -170,6 +179,7 @@ impl Queue {
|
||||
node_version,
|
||||
security_status,
|
||||
to_queue_rx,
|
||||
from_queue_tx,
|
||||
queue: VecDeque::new(),
|
||||
mux: Mux::new(),
|
||||
workers: Workers {
|
||||
@@ -301,7 +311,7 @@ async fn handle_mux(queue: &mut Queue, event: QueueEvent) {
|
||||
handle_worker_spawned(queue, idle, handle, job);
|
||||
},
|
||||
QueueEvent::StartWork(worker, outcome, artifact_id, result_tx) => {
|
||||
handle_job_finish(queue, worker, outcome, artifact_id, result_tx);
|
||||
handle_job_finish(queue, worker, outcome, artifact_id, result_tx).await;
|
||||
},
|
||||
}
|
||||
}
|
||||
@@ -327,42 +337,69 @@ fn handle_worker_spawned(
|
||||
|
||||
/// If there are pending jobs in the queue, schedules the next of them onto the just freed up
|
||||
/// worker. Otherwise, puts back into the available workers list.
|
||||
fn handle_job_finish(
|
||||
async fn handle_job_finish(
|
||||
queue: &mut Queue,
|
||||
worker: Worker,
|
||||
outcome: Outcome,
|
||||
artifact_id: ArtifactId,
|
||||
result_tx: ResultSender,
|
||||
) {
|
||||
let (idle_worker, result, duration) = match outcome {
|
||||
let (idle_worker, result, duration, sync_channel) = match outcome {
|
||||
Outcome::Ok { result_descriptor, duration, idle_worker } => {
|
||||
// TODO: propagate the soft timeout
|
||||
|
||||
(Some(idle_worker), Ok(result_descriptor), Some(duration))
|
||||
(Some(idle_worker), Ok(result_descriptor), Some(duration), None)
|
||||
},
|
||||
Outcome::InvalidCandidate { err, idle_worker } => (
|
||||
Some(idle_worker),
|
||||
Err(ValidationError::Invalid(InvalidCandidate::WorkerReportedInvalid(err))),
|
||||
None,
|
||||
None,
|
||||
),
|
||||
Outcome::InternalError { err } => (None, Err(ValidationError::Internal(err)), None),
|
||||
Outcome::RuntimeConstruction { err, idle_worker } => {
|
||||
// The task for artifact removal is executed concurrently with
|
||||
// the message to the host on the execution result.
|
||||
let (result_tx, result_rx) = oneshot::channel();
|
||||
queue
|
||||
.from_queue_tx
|
||||
.unbounded_send(FromQueue::RemoveArtifact {
|
||||
artifact: artifact_id.clone(),
|
||||
reply_to: result_tx,
|
||||
})
|
||||
.expect("from execute queue receiver is listened by the host; qed");
|
||||
(
|
||||
Some(idle_worker),
|
||||
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::RuntimeConstruction(
|
||||
err,
|
||||
))),
|
||||
None,
|
||||
Some(result_rx),
|
||||
)
|
||||
},
|
||||
Outcome::InternalError { err } => (None, Err(ValidationError::Internal(err)), None, None),
|
||||
// Either the worker or the job timed out. Kill the worker in either case. Treated as
|
||||
// definitely-invalid, because if we timed out, there's no time left for a retry.
|
||||
Outcome::HardTimeout =>
|
||||
(None, Err(ValidationError::Invalid(InvalidCandidate::HardTimeout)), None),
|
||||
(None, Err(ValidationError::Invalid(InvalidCandidate::HardTimeout)), None, None),
|
||||
// "Maybe invalid" errors (will retry).
|
||||
Outcome::WorkerIntfErr => (
|
||||
None,
|
||||
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::AmbiguousWorkerDeath)),
|
||||
None,
|
||||
None,
|
||||
),
|
||||
Outcome::JobDied { err } => (
|
||||
None,
|
||||
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::AmbiguousJobDeath(err))),
|
||||
None,
|
||||
None,
|
||||
),
|
||||
Outcome::JobError { err } => (
|
||||
None,
|
||||
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::JobError(err))),
|
||||
None,
|
||||
None,
|
||||
),
|
||||
Outcome::JobError { err } =>
|
||||
(None, Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::JobError(err))), None),
|
||||
};
|
||||
|
||||
queue.metrics.execute_finished();
|
||||
@@ -386,6 +423,12 @@ fn handle_job_finish(
|
||||
);
|
||||
}
|
||||
|
||||
if let Some(sync_channel) = sync_channel {
|
||||
// err means the sender is dropped (the artifact is already removed from the cache)
|
||||
// so it is legitimate to ignore the result
|
||||
let _ = sync_channel.await;
|
||||
}
|
||||
|
||||
// First we send the result. It may fail due to the other end of the channel being dropped,
|
||||
// that's legitimate and we don't treat that as an error.
|
||||
let _ = result_tx.send(result);
|
||||
@@ -521,8 +564,10 @@ pub fn start(
|
||||
spawn_timeout: Duration,
|
||||
node_version: Option<String>,
|
||||
security_status: SecurityStatus,
|
||||
) -> (mpsc::Sender<ToQueue>, impl Future<Output = ()>) {
|
||||
) -> (mpsc::Sender<ToQueue>, mpsc::UnboundedReceiver<FromQueue>, impl Future<Output = ()>) {
|
||||
let (to_queue_tx, to_queue_rx) = mpsc::channel(20);
|
||||
let (from_queue_tx, from_queue_rx) = mpsc::unbounded();
|
||||
|
||||
let run = Queue::new(
|
||||
metrics,
|
||||
program_path,
|
||||
@@ -532,7 +577,8 @@ pub fn start(
|
||||
node_version,
|
||||
security_status,
|
||||
to_queue_rx,
|
||||
from_queue_tx,
|
||||
)
|
||||
.run();
|
||||
(to_queue_tx, run)
|
||||
(to_queue_tx, from_queue_rx, run)
|
||||
}
|
||||
|
||||
@@ -87,6 +87,10 @@ pub enum Outcome {
|
||||
/// a trap. Errors related to the preparation process are not expected to be encountered by the
|
||||
/// execution workers.
|
||||
InvalidCandidate { err: String, idle_worker: IdleWorker },
|
||||
/// The error is probably transient. It may be for example
|
||||
/// because the artifact was prepared with a Wasmtime version different from the version
|
||||
/// in the current execution environment.
|
||||
RuntimeConstruction { err: String, idle_worker: IdleWorker },
|
||||
/// The execution time exceeded the hard limit. The worker is terminated.
|
||||
HardTimeout,
|
||||
/// An I/O error happened during communication with the worker. This may mean that the worker
|
||||
@@ -193,6 +197,10 @@ pub async fn start_work(
|
||||
err,
|
||||
idle_worker: IdleWorker { stream, pid, worker_dir },
|
||||
},
|
||||
WorkerResponse::RuntimeConstruction(err) => Outcome::RuntimeConstruction {
|
||||
err,
|
||||
idle_worker: IdleWorker { stream, pid, worker_dir },
|
||||
},
|
||||
WorkerResponse::JobTimedOut => Outcome::HardTimeout,
|
||||
WorkerResponse::JobDied { err, job_pid: _ } => Outcome::JobDied { err },
|
||||
WorkerResponse::JobError(err) => Outcome::JobError { err },
|
||||
|
||||
@@ -274,7 +274,7 @@ pub async fn start(
|
||||
from_prepare_pool,
|
||||
);
|
||||
|
||||
let (to_execute_queue_tx, run_execute_queue) = execute::start(
|
||||
let (to_execute_queue_tx, from_execute_queue_rx, run_execute_queue) = execute::start(
|
||||
metrics,
|
||||
config.execute_worker_program_path.to_owned(),
|
||||
config.cache_path.clone(),
|
||||
@@ -296,6 +296,7 @@ pub async fn start(
|
||||
to_prepare_queue_tx,
|
||||
from_prepare_queue_rx,
|
||||
to_execute_queue_tx,
|
||||
from_execute_queue_rx,
|
||||
to_sweeper_tx,
|
||||
awaiting_prepare: AwaitingPrepare::default(),
|
||||
})
|
||||
@@ -342,6 +343,8 @@ struct Inner {
|
||||
from_prepare_queue_rx: mpsc::UnboundedReceiver<prepare::FromQueue>,
|
||||
|
||||
to_execute_queue_tx: mpsc::Sender<execute::ToQueue>,
|
||||
from_execute_queue_rx: mpsc::UnboundedReceiver<execute::FromQueue>,
|
||||
|
||||
to_sweeper_tx: mpsc::Sender<PathBuf>,
|
||||
|
||||
awaiting_prepare: AwaitingPrepare,
|
||||
@@ -358,6 +361,7 @@ async fn run(
|
||||
to_host_rx,
|
||||
from_prepare_queue_rx,
|
||||
mut to_prepare_queue_tx,
|
||||
from_execute_queue_rx,
|
||||
mut to_execute_queue_tx,
|
||||
mut to_sweeper_tx,
|
||||
mut awaiting_prepare,
|
||||
@@ -384,10 +388,21 @@ async fn run(
|
||||
|
||||
let mut to_host_rx = to_host_rx.fuse();
|
||||
let mut from_prepare_queue_rx = from_prepare_queue_rx.fuse();
|
||||
let mut from_execute_queue_rx = from_execute_queue_rx.fuse();
|
||||
|
||||
loop {
|
||||
// biased to make it behave deterministically for tests.
|
||||
futures::select_biased! {
|
||||
from_execute_queue_rx = from_execute_queue_rx.next() => {
|
||||
let from_queue = break_if_fatal!(from_execute_queue_rx.ok_or(Fatal));
|
||||
let execute::FromQueue::RemoveArtifact { artifact, reply_to } = from_queue;
|
||||
break_if_fatal!(handle_artifact_removal(
|
||||
&mut to_sweeper_tx,
|
||||
&mut artifacts,
|
||||
artifact,
|
||||
reply_to,
|
||||
).await);
|
||||
},
|
||||
() = cleanup_pulse.select_next_some() => {
|
||||
// `select_next_some` because we don't expect this to fail, but if it does, we
|
||||
// still don't fail. The trade-off is that the compiled cache will start growing
|
||||
@@ -861,6 +876,37 @@ async fn handle_cleanup_pulse(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn handle_artifact_removal(
|
||||
sweeper_tx: &mut mpsc::Sender<PathBuf>,
|
||||
artifacts: &mut Artifacts,
|
||||
artifact_id: ArtifactId,
|
||||
reply_to: oneshot::Sender<()>,
|
||||
) -> Result<(), Fatal> {
|
||||
let (artifact_id, path) = if let Some(artifact) = artifacts.remove(artifact_id) {
|
||||
artifact
|
||||
} else {
|
||||
// if we haven't found the artifact by its id,
|
||||
// it has probably been removed already;
|
||||
// anyway with the randomness of the artifact name
|
||||
// it is safe to ignore
|
||||
return Ok(());
|
||||
};
|
||||
reply_to
|
||||
.send(())
|
||||
.expect("the execute queue waits for the artifact remove confirmation; qed");
|
||||
// Thanks to the randomness of the artifact name (see
|
||||
// `artifacts::generate_artifact_path`) there is no issue with any name conflict on
|
||||
// future repreparation.
|
||||
// So we can confirm the artifact removal already
|
||||
gum::debug!(
|
||||
target: LOG_TARGET,
|
||||
validation_code_hash = ?artifact_id.code_hash,
|
||||
"PVF pruning: pruning artifact by request from the execute queue",
|
||||
);
|
||||
sweeper_tx.send(path).await.map_err(|_| Fatal)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// A simple task which sole purpose is to delete files thrown at it.
|
||||
async fn sweeper_task(mut sweeper_rx: mpsc::Receiver<PathBuf>) {
|
||||
loop {
|
||||
@@ -968,6 +1014,8 @@ pub(crate) mod tests {
|
||||
to_prepare_queue_rx: mpsc::Receiver<prepare::ToQueue>,
|
||||
from_prepare_queue_tx: mpsc::UnboundedSender<prepare::FromQueue>,
|
||||
to_execute_queue_rx: mpsc::Receiver<execute::ToQueue>,
|
||||
#[allow(unused)]
|
||||
from_execute_queue_tx: mpsc::UnboundedSender<execute::FromQueue>,
|
||||
to_sweeper_rx: mpsc::Receiver<PathBuf>,
|
||||
|
||||
run: BoxFuture<'static, ()>,
|
||||
@@ -979,6 +1027,7 @@ pub(crate) mod tests {
|
||||
let (to_prepare_queue_tx, to_prepare_queue_rx) = mpsc::channel(10);
|
||||
let (from_prepare_queue_tx, from_prepare_queue_rx) = mpsc::unbounded();
|
||||
let (to_execute_queue_tx, to_execute_queue_rx) = mpsc::channel(10);
|
||||
let (from_execute_queue_tx, from_execute_queue_rx) = mpsc::unbounded();
|
||||
let (to_sweeper_tx, to_sweeper_rx) = mpsc::channel(10);
|
||||
|
||||
let run = run(Inner {
|
||||
@@ -989,6 +1038,7 @@ pub(crate) mod tests {
|
||||
to_prepare_queue_tx,
|
||||
from_prepare_queue_rx,
|
||||
to_execute_queue_tx,
|
||||
from_execute_queue_rx,
|
||||
to_sweeper_tx,
|
||||
awaiting_prepare: AwaitingPrepare::default(),
|
||||
})
|
||||
@@ -999,6 +1049,7 @@ pub(crate) mod tests {
|
||||
to_prepare_queue_rx,
|
||||
from_prepare_queue_tx,
|
||||
to_execute_queue_rx,
|
||||
from_execute_queue_tx,
|
||||
to_sweeper_rx,
|
||||
run,
|
||||
}
|
||||
|
||||
@@ -21,13 +21,14 @@ use parity_scale_codec::Encode as _;
|
||||
#[cfg(all(feature = "ci-only-tests", target_os = "linux"))]
|
||||
use polkadot_node_core_pvf::SecurityStatus;
|
||||
use polkadot_node_core_pvf::{
|
||||
start, testing::build_workers_and_get_paths, Config, InvalidCandidate, Metrics, PrepareError,
|
||||
PrepareJobKind, PvfPrepData, ValidationError, ValidationHost, JOB_TIMEOUT_WALL_CLOCK_FACTOR,
|
||||
start, testing::build_workers_and_get_paths, Config, InvalidCandidate, Metrics,
|
||||
PossiblyInvalidError, PrepareError, PrepareJobKind, PvfPrepData, ValidationError,
|
||||
ValidationHost, JOB_TIMEOUT_WALL_CLOCK_FACTOR,
|
||||
};
|
||||
use polkadot_parachain_primitives::primitives::{BlockData, ValidationParams, ValidationResult};
|
||||
use polkadot_primitives::{ExecutorParam, ExecutorParams};
|
||||
|
||||
use std::time::Duration;
|
||||
use std::{io::Write, time::Duration};
|
||||
use tokio::sync::Mutex;
|
||||
|
||||
mod adder;
|
||||
@@ -352,10 +353,80 @@ async fn deleting_prepared_artifact_does_not_dispute() {
|
||||
)
|
||||
.await;
|
||||
|
||||
match result {
|
||||
Err(ValidationError::Invalid(InvalidCandidate::HardTimeout)) => {},
|
||||
r => panic!("{:?}", r),
|
||||
assert_matches!(result, Err(ValidationError::Invalid(InvalidCandidate::HardTimeout)));
|
||||
}
|
||||
|
||||
// Test that corruption of a prepared artifact does not lead to a dispute when we try to execute it.
|
||||
#[tokio::test]
|
||||
async fn corrupted_prepared_artifact_does_not_dispute() {
|
||||
let host = TestHost::new().await;
|
||||
let cache_dir = host.cache_dir.path();
|
||||
|
||||
let _stats = host.precheck_pvf(halt::wasm_binary_unwrap(), Default::default()).await.unwrap();
|
||||
|
||||
// Manually corrupting the prepared artifact from disk. The in-memory artifacts table won't
|
||||
// change.
|
||||
let artifact_path = {
|
||||
// Get the artifact path (asserting it exists).
|
||||
let mut cache_dir: Vec<_> = std::fs::read_dir(cache_dir).unwrap().collect();
|
||||
// Should contain the artifact and the worker dir.
|
||||
assert_eq!(cache_dir.len(), 2);
|
||||
let mut artifact_path = cache_dir.pop().unwrap().unwrap();
|
||||
if artifact_path.path().is_dir() {
|
||||
artifact_path = cache_dir.pop().unwrap().unwrap();
|
||||
}
|
||||
|
||||
// Corrupt the artifact.
|
||||
let mut f = std::fs::OpenOptions::new()
|
||||
.write(true)
|
||||
.truncate(true)
|
||||
.open(artifact_path.path())
|
||||
.unwrap();
|
||||
f.write_all(b"corrupted wasm").unwrap();
|
||||
f.flush().unwrap();
|
||||
artifact_path
|
||||
};
|
||||
|
||||
assert!(artifact_path.path().exists());
|
||||
|
||||
// Try to validate, artifact should get removed because of the corruption.
|
||||
let result = host
|
||||
.validate_candidate(
|
||||
halt::wasm_binary_unwrap(),
|
||||
ValidationParams {
|
||||
block_data: BlockData(Vec::new()),
|
||||
parent_head: Default::default(),
|
||||
relay_parent_number: 1,
|
||||
relay_parent_storage_root: Default::default(),
|
||||
},
|
||||
Default::default(),
|
||||
)
|
||||
.await;
|
||||
|
||||
assert_matches!(
|
||||
result,
|
||||
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::RuntimeConstruction(_)))
|
||||
);
|
||||
|
||||
// because of RuntimeConstruction we may retry
|
||||
host.precheck_pvf(halt::wasm_binary_unwrap(), Default::default()).await.unwrap();
|
||||
|
||||
// The actual artifact removal is done concurrently
|
||||
// with sending of the result of the execution
|
||||
// it is not a problem for further re-preparation as
|
||||
// artifact filenames are random
|
||||
for _ in 1..5 {
|
||||
if !artifact_path.path().exists() {
|
||||
break;
|
||||
}
|
||||
tokio::time::sleep(Duration::from_secs(1)).await;
|
||||
}
|
||||
|
||||
assert!(
|
||||
!artifact_path.path().exists(),
|
||||
"the corrupted artifact ({}) should be deleted by the host",
|
||||
artifact_path.path().display()
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
|
||||
@@ -159,7 +159,9 @@ impl sp_std::fmt::LowerHex for ExecutorParamsHash {
|
||||
// into individual fields of the structure. Thus, complex migrations shall be avoided when adding
|
||||
// new entries and removing old ones. At the moment, there's no mandatory parameters defined. If
|
||||
// they show up, they must be clearly documented as mandatory ones.
|
||||
#[derive(Clone, Debug, Encode, Decode, PartialEq, Eq, TypeInfo, Serialize, Deserialize)]
|
||||
#[derive(
|
||||
Clone, Debug, Default, Encode, Decode, PartialEq, Eq, TypeInfo, Serialize, Deserialize,
|
||||
)]
|
||||
pub struct ExecutorParams(Vec<ExecutorParam>);
|
||||
|
||||
impl ExecutorParams {
|
||||
@@ -334,9 +336,3 @@ impl From<&[ExecutorParam]> for ExecutorParams {
|
||||
ExecutorParams(arr.to_vec())
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for ExecutorParams {
|
||||
fn default() -> Self {
|
||||
ExecutorParams(vec![])
|
||||
}
|
||||
}
|
||||
|
||||
@@ -125,6 +125,14 @@ execution request:
|
||||
reason, which may or may not be independent of the candidate or PVF.
|
||||
5. **Internal errors:** See "Internal Errors" section. In this case, after the
|
||||
retry we abstain from voting.
|
||||
6. **RuntimeConstruction** error. The precheck handles a general case of a wrong
|
||||
artifact but doesn't guarantee its consistency between the preparation and
|
||||
the execution. Something may happen to the artifact between
|
||||
the preparation of the artifact and its execution (e.g. the artifact was
|
||||
corrupted on disk or a dirty node upgrade happened when the prepare worker
|
||||
has a wasmtime version different from the execute worker's wasmtime version).
|
||||
We treat such an error as possibly transient due to local issues and retry
|
||||
one time.
|
||||
|
||||
### Preparation timeouts
|
||||
|
||||
|
||||
@@ -0,0 +1,16 @@
|
||||
# Schema: Polkadot SDK PRDoc Schema (prdoc) v1.0.0
|
||||
# See doc at https://raw.githubusercontent.com/paritytech/polkadot-sdk/master/prdoc/schema_user.json
|
||||
|
||||
title: Retrying an execution on failed runtime construction
|
||||
|
||||
doc:
|
||||
- audience: Node Dev
|
||||
description: |
|
||||
If a runtime construction error happened during the execution request, then the artifact is re-prepared
|
||||
and the execution request is retried at most once. See also the related issue.
|
||||
|
||||
crates:
|
||||
- name: polkadot-node-core-candidate-validation
|
||||
- name: polkadot-node-core-pvf
|
||||
- name: polkadot-node-core-pvf-execute-worker
|
||||
- name: polkadot-node-core-pvf-common
|
||||
Reference in New Issue
Block a user