PVF: re-preparing artifact on failed runtime construction (#3187)

Resolves https://github.com/paritytech/polkadot-sdk/issues/3139

- [x] use a distinguishable error for `execute_artifact`
- [x] remove the artifact when a `RuntimeConstruction` error occurs during
execution
- [x] extend `validate_candidate_with_retry` of `ValidationBackend` to
retry when a `RuntimeConstruction` error occurs during
execution
- [x] update the book
(https://paritytech.github.io/polkadot-sdk/book/node/utility/pvf-host-and-workers.html#retrying-execution-requests)
- [x] add a test
- [x] run zombienet tests

---------

Co-authored-by: s0me0ne-unkn0wn <48632512+s0me0ne-unkn0wn@users.noreply.github.com>
This commit is contained in:
maksimryndin
2024-02-28 17:29:27 +01:00
committed by GitHub
parent 14530269b7
commit 426136671a
15 changed files with 294 additions and 49 deletions
+1 -1
View File
@@ -23,4 +23,4 @@
mod queue;
mod worker_interface;
pub use queue::{start, PendingExecutionRequest, ToQueue};
pub use queue::{start, FromQueue, PendingExecutionRequest, ToQueue};
+57 -11
View File
@@ -25,7 +25,7 @@ use crate::{
InvalidCandidate, PossiblyInvalidError, ValidationError, LOG_TARGET,
};
use futures::{
channel::mpsc,
channel::{mpsc, oneshot},
future::BoxFuture,
stream::{FuturesUnordered, StreamExt as _},
Future, FutureExt,
@@ -54,6 +54,12 @@ pub enum ToQueue {
Enqueue { artifact: ArtifactPathId, pending_execution_request: PendingExecutionRequest },
}
/// A message sent from the execution queue back to the validation host.
///
/// Messages of this type travel over the unbounded channel whose receiving half is returned
/// by `start`.
#[derive(Debug)]
pub enum FromQueue {
/// Asks the host to remove the artifact identified by `artifact`.
///
/// Emitted by `handle_job_finish` when a job finishes with `Outcome::RuntimeConstruction`.
/// The queue then awaits the receiving half of `reply_to` before forwarding the execution
/// result; a dropped sender unblocks the queue just as an explicit reply does.
///
/// NOTE(review): presumably the host replies on `reply_to` once the artifact has been
/// removed — confirm against the host-side handler.
RemoveArtifact { artifact: ArtifactId, reply_to: oneshot::Sender<()> },
}
/// An execution request that should execute the PVF (known in the context) and send the results
/// to the given result sender.
#[derive(Debug)]
@@ -137,6 +143,8 @@ struct Queue {
/// The receiver that receives messages to the pool.
to_queue_rx: mpsc::Receiver<ToQueue>,
/// The sender to send messages back to validation host.
from_queue_tx: mpsc::UnboundedSender<FromQueue>,
// Some variables related to the current session.
program_path: PathBuf,
@@ -161,6 +169,7 @@ impl Queue {
node_version: Option<String>,
security_status: SecurityStatus,
to_queue_rx: mpsc::Receiver<ToQueue>,
from_queue_tx: mpsc::UnboundedSender<FromQueue>,
) -> Self {
Self {
metrics,
@@ -170,6 +179,7 @@ impl Queue {
node_version,
security_status,
to_queue_rx,
from_queue_tx,
queue: VecDeque::new(),
mux: Mux::new(),
workers: Workers {
@@ -301,7 +311,7 @@ async fn handle_mux(queue: &mut Queue, event: QueueEvent) {
handle_worker_spawned(queue, idle, handle, job);
},
QueueEvent::StartWork(worker, outcome, artifact_id, result_tx) => {
handle_job_finish(queue, worker, outcome, artifact_id, result_tx);
handle_job_finish(queue, worker, outcome, artifact_id, result_tx).await;
},
}
}
@@ -327,42 +337,69 @@ fn handle_worker_spawned(
/// If there are pending jobs in the queue, schedules the next of them onto the just freed up
/// worker. Otherwise, puts back into the available workers list.
fn handle_job_finish(
async fn handle_job_finish(
queue: &mut Queue,
worker: Worker,
outcome: Outcome,
artifact_id: ArtifactId,
result_tx: ResultSender,
) {
let (idle_worker, result, duration) = match outcome {
let (idle_worker, result, duration, sync_channel) = match outcome {
Outcome::Ok { result_descriptor, duration, idle_worker } => {
// TODO: propagate the soft timeout
(Some(idle_worker), Ok(result_descriptor), Some(duration))
(Some(idle_worker), Ok(result_descriptor), Some(duration), None)
},
Outcome::InvalidCandidate { err, idle_worker } => (
Some(idle_worker),
Err(ValidationError::Invalid(InvalidCandidate::WorkerReportedInvalid(err))),
None,
None,
),
Outcome::InternalError { err } => (None, Err(ValidationError::Internal(err)), None),
Outcome::RuntimeConstruction { err, idle_worker } => {
// The task for artifact removal is executed concurrently with
// the message to the host on the execution result.
let (result_tx, result_rx) = oneshot::channel();
queue
.from_queue_tx
.unbounded_send(FromQueue::RemoveArtifact {
artifact: artifact_id.clone(),
reply_to: result_tx,
})
.expect("from execute queue receiver is listened by the host; qed");
(
Some(idle_worker),
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::RuntimeConstruction(
err,
))),
None,
Some(result_rx),
)
},
Outcome::InternalError { err } => (None, Err(ValidationError::Internal(err)), None, None),
// Either the worker or the job timed out. Kill the worker in either case. Treated as
// definitely-invalid, because if we timed out, there's no time left for a retry.
Outcome::HardTimeout =>
(None, Err(ValidationError::Invalid(InvalidCandidate::HardTimeout)), None),
(None, Err(ValidationError::Invalid(InvalidCandidate::HardTimeout)), None, None),
// "Maybe invalid" errors (will retry).
Outcome::WorkerIntfErr => (
None,
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::AmbiguousWorkerDeath)),
None,
None,
),
Outcome::JobDied { err } => (
None,
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::AmbiguousJobDeath(err))),
None,
None,
),
Outcome::JobError { err } => (
None,
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::JobError(err))),
None,
None,
),
Outcome::JobError { err } =>
(None, Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::JobError(err))), None),
};
queue.metrics.execute_finished();
@@ -386,6 +423,12 @@ fn handle_job_finish(
);
}
if let Some(sync_channel) = sync_channel {
// err means the sender is dropped (the artifact is already removed from the cache)
// so that's legitimate to ignore the result
let _ = sync_channel.await;
}
// First we send the result. It may fail due to the other end of the channel being dropped,
// that's legitimate and we don't treat that as an error.
let _ = result_tx.send(result);
@@ -521,8 +564,10 @@ pub fn start(
spawn_timeout: Duration,
node_version: Option<String>,
security_status: SecurityStatus,
) -> (mpsc::Sender<ToQueue>, impl Future<Output = ()>) {
) -> (mpsc::Sender<ToQueue>, mpsc::UnboundedReceiver<FromQueue>, impl Future<Output = ()>) {
let (to_queue_tx, to_queue_rx) = mpsc::channel(20);
let (from_queue_tx, from_queue_rx) = mpsc::unbounded();
let run = Queue::new(
metrics,
program_path,
@@ -532,7 +577,8 @@ pub fn start(
node_version,
security_status,
to_queue_rx,
from_queue_tx,
)
.run();
(to_queue_tx, run)
(to_queue_tx, from_queue_rx, run)
}
@@ -87,6 +87,10 @@ pub enum Outcome {
/// a trap. Errors related to the preparation process are not expected to be encountered by the
/// execution workers.
InvalidCandidate { err: String, idle_worker: IdleWorker },
/// The error is probably transient. It may be for example
/// because the artifact was prepared with a Wasmtime version different from the version
/// in the current execution environment.
RuntimeConstruction { err: String, idle_worker: IdleWorker },
/// The execution time exceeded the hard limit. The worker is terminated.
HardTimeout,
/// An I/O error happened during communication with the worker. This may mean that the worker
@@ -193,6 +197,10 @@ pub async fn start_work(
err,
idle_worker: IdleWorker { stream, pid, worker_dir },
},
WorkerResponse::RuntimeConstruction(err) => Outcome::RuntimeConstruction {
err,
idle_worker: IdleWorker { stream, pid, worker_dir },
},
WorkerResponse::JobTimedOut => Outcome::HardTimeout,
WorkerResponse::JobDied { err, job_pid: _ } => Outcome::JobDied { err },
WorkerResponse::JobError(err) => Outcome::JobError { err },