mirror of
https://github.com/pezkuwichain/pezkuwi-subxt.git
synced 2026-06-15 17:21:08 +00:00
PVF: re-preparing artifact on failed runtime construction (#3187)
Resolves https://github.com/paritytech/polkadot-sdk/issues/3139

- [x] use a distinguishable error for `execute_artifact`
- [x] remove the artifact in case of a `RuntimeConstruction` error during execution
- [x] augment `validate_candidate_with_retry` of `ValidationBackend` to handle the retriable `RuntimeConstruction` error during execution
- [x] update the book (https://paritytech.github.io/polkadot-sdk/book/node/utility/pvf-host-and-workers.html#retrying-execution-requests)
- [x] add a test
- [x] run zombienet tests

---------

Co-authored-by: s0me0ne-unkn0wn <48632512+s0me0ne-unkn0wn@users.noreply.github.com>
This commit is contained in:
@@ -23,4 +23,4 @@
|
||||
mod queue;
|
||||
mod worker_interface;
|
||||
|
||||
pub use queue::{start, PendingExecutionRequest, ToQueue};
|
||||
pub use queue::{start, FromQueue, PendingExecutionRequest, ToQueue};
|
||||
|
||||
@@ -25,7 +25,7 @@ use crate::{
|
||||
InvalidCandidate, PossiblyInvalidError, ValidationError, LOG_TARGET,
|
||||
};
|
||||
use futures::{
|
||||
channel::mpsc,
|
||||
channel::{mpsc, oneshot},
|
||||
future::BoxFuture,
|
||||
stream::{FuturesUnordered, StreamExt as _},
|
||||
Future, FutureExt,
|
||||
@@ -54,6 +54,12 @@ pub enum ToQueue {
|
||||
Enqueue { artifact: ArtifactPathId, pending_execution_request: PendingExecutionRequest },
|
||||
}
|
||||
|
||||
/// A message sent by the execution queue back to the validation host.
///
/// Messages of this type travel over the queue's unbounded `from_queue_tx` channel; the
/// receiving half is returned from `start`.
///
/// `RemoveArtifact` is sent by `handle_job_finish` when a job ends with
/// `Outcome::RuntimeConstruction`. It names the `artifact` to be removed and carries a
/// `reply_to` oneshot sender. The queue awaits the matching receiver before it sends the
/// execution result to the requester, and treats a dropped sender the same as a reply.
///
/// NOTE(review): presumably the host signals or drops `reply_to` once the artifact has been
/// removed from its cache; confirm against the host-side handler, which is not visible here.
|
||||
#[derive(Debug)]
|
||||
pub enum FromQueue {
|
||||
RemoveArtifact { artifact: ArtifactId, reply_to: oneshot::Sender<()> },
|
||||
}
|
||||
|
||||
/// An execution request that should execute the PVF (known in the context) and send the results
|
||||
/// to the given result sender.
|
||||
#[derive(Debug)]
|
||||
@@ -137,6 +143,8 @@ struct Queue {
|
||||
|
||||
/// The receiver that receives messages to the pool.
|
||||
to_queue_rx: mpsc::Receiver<ToQueue>,
|
||||
/// The sender to send messages back to validation host.
|
||||
from_queue_tx: mpsc::UnboundedSender<FromQueue>,
|
||||
|
||||
// Some variables related to the current session.
|
||||
program_path: PathBuf,
|
||||
@@ -161,6 +169,7 @@ impl Queue {
|
||||
node_version: Option<String>,
|
||||
security_status: SecurityStatus,
|
||||
to_queue_rx: mpsc::Receiver<ToQueue>,
|
||||
from_queue_tx: mpsc::UnboundedSender<FromQueue>,
|
||||
) -> Self {
|
||||
Self {
|
||||
metrics,
|
||||
@@ -170,6 +179,7 @@ impl Queue {
|
||||
node_version,
|
||||
security_status,
|
||||
to_queue_rx,
|
||||
from_queue_tx,
|
||||
queue: VecDeque::new(),
|
||||
mux: Mux::new(),
|
||||
workers: Workers {
|
||||
@@ -301,7 +311,7 @@ async fn handle_mux(queue: &mut Queue, event: QueueEvent) {
|
||||
handle_worker_spawned(queue, idle, handle, job);
|
||||
},
|
||||
QueueEvent::StartWork(worker, outcome, artifact_id, result_tx) => {
|
||||
handle_job_finish(queue, worker, outcome, artifact_id, result_tx);
|
||||
handle_job_finish(queue, worker, outcome, artifact_id, result_tx).await;
|
||||
},
|
||||
}
|
||||
}
|
||||
@@ -327,42 +337,69 @@ fn handle_worker_spawned(
|
||||
|
||||
/// If there are pending jobs in the queue, schedules the next of them onto the just freed up
|
||||
/// worker. Otherwise, puts back into the available workers list.
|
||||
fn handle_job_finish(
|
||||
async fn handle_job_finish(
|
||||
queue: &mut Queue,
|
||||
worker: Worker,
|
||||
outcome: Outcome,
|
||||
artifact_id: ArtifactId,
|
||||
result_tx: ResultSender,
|
||||
) {
|
||||
let (idle_worker, result, duration) = match outcome {
|
||||
let (idle_worker, result, duration, sync_channel) = match outcome {
|
||||
Outcome::Ok { result_descriptor, duration, idle_worker } => {
|
||||
// TODO: propagate the soft timeout
|
||||
|
||||
(Some(idle_worker), Ok(result_descriptor), Some(duration))
|
||||
(Some(idle_worker), Ok(result_descriptor), Some(duration), None)
|
||||
},
|
||||
Outcome::InvalidCandidate { err, idle_worker } => (
|
||||
Some(idle_worker),
|
||||
Err(ValidationError::Invalid(InvalidCandidate::WorkerReportedInvalid(err))),
|
||||
None,
|
||||
None,
|
||||
),
|
||||
Outcome::InternalError { err } => (None, Err(ValidationError::Internal(err)), None),
|
||||
Outcome::RuntimeConstruction { err, idle_worker } => {
|
||||
// The task for artifact removal is executed concurrently with
|
||||
// the message to the host on the execution result.
|
||||
let (result_tx, result_rx) = oneshot::channel();
|
||||
queue
|
||||
.from_queue_tx
|
||||
.unbounded_send(FromQueue::RemoveArtifact {
|
||||
artifact: artifact_id.clone(),
|
||||
reply_to: result_tx,
|
||||
})
|
||||
.expect("from execute queue receiver is listened by the host; qed");
|
||||
(
|
||||
Some(idle_worker),
|
||||
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::RuntimeConstruction(
|
||||
err,
|
||||
))),
|
||||
None,
|
||||
Some(result_rx),
|
||||
)
|
||||
},
|
||||
Outcome::InternalError { err } => (None, Err(ValidationError::Internal(err)), None, None),
|
||||
// Either the worker or the job timed out. Kill the worker in either case. Treated as
|
||||
// definitely-invalid, because if we timed out, there's no time left for a retry.
|
||||
Outcome::HardTimeout =>
|
||||
(None, Err(ValidationError::Invalid(InvalidCandidate::HardTimeout)), None),
|
||||
(None, Err(ValidationError::Invalid(InvalidCandidate::HardTimeout)), None, None),
|
||||
// "Maybe invalid" errors (will retry).
|
||||
Outcome::WorkerIntfErr => (
|
||||
None,
|
||||
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::AmbiguousWorkerDeath)),
|
||||
None,
|
||||
None,
|
||||
),
|
||||
Outcome::JobDied { err } => (
|
||||
None,
|
||||
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::AmbiguousJobDeath(err))),
|
||||
None,
|
||||
None,
|
||||
),
|
||||
Outcome::JobError { err } => (
|
||||
None,
|
||||
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::JobError(err))),
|
||||
None,
|
||||
None,
|
||||
),
|
||||
Outcome::JobError { err } =>
|
||||
(None, Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::JobError(err))), None),
|
||||
};
|
||||
|
||||
queue.metrics.execute_finished();
|
||||
@@ -386,6 +423,12 @@ fn handle_job_finish(
|
||||
);
|
||||
}
|
||||
|
||||
if let Some(sync_channel) = sync_channel {
|
||||
// err means the sender is dropped (the artifact is already removed from the cache)
|
||||
// so that's legitimate to ignore the result
|
||||
let _ = sync_channel.await;
|
||||
}
|
||||
|
||||
// First we send the result. It may fail due to the other end of the channel being dropped,
|
||||
// that's legitimate and we don't treat that as an error.
|
||||
let _ = result_tx.send(result);
|
||||
@@ -521,8 +564,10 @@ pub fn start(
|
||||
spawn_timeout: Duration,
|
||||
node_version: Option<String>,
|
||||
security_status: SecurityStatus,
|
||||
) -> (mpsc::Sender<ToQueue>, impl Future<Output = ()>) {
|
||||
) -> (mpsc::Sender<ToQueue>, mpsc::UnboundedReceiver<FromQueue>, impl Future<Output = ()>) {
|
||||
let (to_queue_tx, to_queue_rx) = mpsc::channel(20);
|
||||
let (from_queue_tx, from_queue_rx) = mpsc::unbounded();
|
||||
|
||||
let run = Queue::new(
|
||||
metrics,
|
||||
program_path,
|
||||
@@ -532,7 +577,8 @@ pub fn start(
|
||||
node_version,
|
||||
security_status,
|
||||
to_queue_rx,
|
||||
from_queue_tx,
|
||||
)
|
||||
.run();
|
||||
(to_queue_tx, run)
|
||||
(to_queue_tx, from_queue_rx, run)
|
||||
}
|
||||
|
||||
@@ -87,6 +87,10 @@ pub enum Outcome {
|
||||
/// a trap. Errors related to the preparation process are not expected to be encountered by the
|
||||
/// execution workers.
|
||||
InvalidCandidate { err: String, idle_worker: IdleWorker },
|
||||
/// The error is probably transient. It may be for example
|
||||
/// because the artifact was prepared with a Wasmtime version different from the version
|
||||
/// in the current execution environment.
|
||||
RuntimeConstruction { err: String, idle_worker: IdleWorker },
|
||||
/// The execution time exceeded the hard limit. The worker is terminated.
|
||||
HardTimeout,
|
||||
/// An I/O error happened during communication with the worker. This may mean that the worker
|
||||
@@ -193,6 +197,10 @@ pub async fn start_work(
|
||||
err,
|
||||
idle_worker: IdleWorker { stream, pid, worker_dir },
|
||||
},
|
||||
WorkerResponse::RuntimeConstruction(err) => Outcome::RuntimeConstruction {
|
||||
err,
|
||||
idle_worker: IdleWorker { stream, pid, worker_dir },
|
||||
},
|
||||
WorkerResponse::JobTimedOut => Outcome::HardTimeout,
|
||||
WorkerResponse::JobDied { err, job_pid: _ } => Outcome::JobDied { err },
|
||||
WorkerResponse::JobError(err) => Outcome::JobError { err },
|
||||
|
||||
Reference in New Issue
Block a user