PVF: re-preparing artifact on failed runtime construction (#3187)

resolve https://github.com/paritytech/polkadot-sdk/issues/3139 - [x] use a distinguishable error for `execute_artifact` - [x] remove artifact in case of a `RuntimeConstruction` error during the execution - [x] augment the `validate_candidate_with_retry` of `ValidationBackend` with the case of retriable `RuntimeConstruction` error during the execution - [x] update the book (https://paritytech.github.io/polkadot-sdk/book/node/utility/pvf-host-and-workers.html#retrying-execution-requests) - [x] add a test - [x] run zombienet tests --------- Co-authored-by: s0me0ne-unkn0wn <48632512+s0me0ne-unkn0wn@users.noreply.github.com>
2026-06-13 17:31:05 +00:00 · 2024-02-28 17:29:27 +01:00
parent 14530269b7
commit 426136671a
15 changed files with 294 additions and 49 deletions
@@ -695,6 +695,8 @@ async fn validate_candidate_exhaustive(
 			))),
 		Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::JobError(err))) =>
 			Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError(err))),
+		Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::RuntimeConstruction(err))) =>
+			Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError(err))),

 		Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::AmbiguousJobDeath(err))) =>
 			Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError(format!(
@@ -780,40 +782,50 @@ trait ValidationBackend {
 			return validation_result
 		}

+		macro_rules! break_if_no_retries_left {
+			($counter:ident) => {
+				if $counter > 0 {
+					$counter -= 1;
+				} else {
+					break
+				}
+			};
+		}
+
 		// Allow limited retries for each kind of error.
 		let mut num_death_retries_left = 1;
 		let mut num_job_error_retries_left = 1;
 		let mut num_internal_retries_left = 1;
+		let mut num_runtime_construction_retries_left = 1;
 		loop {
 			// Stop retrying if we exceeded the timeout.
 			if total_time_start.elapsed() + retry_delay > exec_timeout {
 				break
 			}
-
+			let mut retry_immediately = false;
 			match validation_result {
 				Err(ValidationError::PossiblyInvalid(
 					PossiblyInvalidError::AmbiguousWorkerDeath |
 					PossiblyInvalidError::AmbiguousJobDeath(_),
-				)) =>
-					if num_death_retries_left > 0 {
-						num_death_retries_left -= 1;
-					} else {
-						break
-					},
+				)) => break_if_no_retries_left!(num_death_retries_left),

 				Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::JobError(_))) =>
-					if num_job_error_retries_left > 0 {
-						num_job_error_retries_left -= 1;
-					} else {
-						break
-					},
+					break_if_no_retries_left!(num_job_error_retries_left),

 				Err(ValidationError::Internal(_)) =>
-					if num_internal_retries_left > 0 {
-						num_internal_retries_left -= 1;
-					} else {
-						break
-					},
+					break_if_no_retries_left!(num_internal_retries_left),
+
+				Err(ValidationError::PossiblyInvalid(
+					PossiblyInvalidError::RuntimeConstruction(_),
+				)) => {
+					break_if_no_retries_left!(num_runtime_construction_retries_left);
+					self.precheck_pvf(pvf.clone()).await?;
+					// In this case the error is deterministic,
+					// and a retry forces the `ValidationBackend`
+					// to re-prepare the artifact, so
+					// there is no need to wait before the retry.
+					retry_immediately = true;
+				},

 				Ok(_) | Err(ValidationError::Invalid(_) | ValidationError::Preparation(_)) => break,
 			}
@@ -821,8 +833,11 @@ trait ValidationBackend {
 			// If we got a possibly transient error, retry once after a brief delay, on the
 			// assumption that the conditions that caused this error may have resolved on their own.
 			{
-				// Wait a brief delay before retrying.
-				futures_timer::Delay::new(retry_delay).await;
+				// For many transient errors it is necessary to wait a little bit
+				// to give the error a chance to resolve on its own.
+				if !retry_immediately {
+					futures_timer::Delay::new(retry_delay).await;
+				}

 				let new_timeout = exec_timeout.saturating_sub(total_time_start.elapsed());

@@ -16,6 +16,7 @@

 use crate::prepare::{PrepareSuccess, PrepareWorkerSuccess};
 use parity_scale_codec::{Decode, Encode};
+pub use sc_executor_common::error::Error as ExecuteError;

 /// Result of PVF preparation from a worker, with checksum of the compiled PVF and stats of the
 /// preparation if successful.
@@ -40,6 +40,9 @@ pub enum WorkerResponse {
 	},
 	/// The candidate is invalid.
 	InvalidCandidate(String),
+	/// Instantiation of the WASM module instance failed during an execution.
+	/// Possibly related to local issues or dirty node update. May be retried with re-preparation.
+	RuntimeConstruction(String),
 	/// The job timed out.
 	JobTimedOut,
 	/// The job process has died. We must kill the worker just in case.
@@ -68,6 +71,9 @@ pub enum JobResponse {
 		/// The result of parachain validation.
 		result_descriptor: ValidationResult,
 	},
+	/// A possibly transient runtime instantiation error happened during the execution; may be
+	/// retried with re-preparation.
+	RuntimeConstruction(String),
 	/// The candidate is invalid.
 	InvalidCandidate(String),
 }
@@ -81,6 +87,15 @@ impl JobResponse {
 			Self::InvalidCandidate(format!("{}: {}", ctx, msg))
 		}
 	}
+
+	/// Creates a retriable response from a context `ctx` and a message `msg` (which can be empty).
+	pub fn runtime_construction(ctx: &'static str, msg: &str) -> Self {
+		if msg.is_empty() {
+			Self::RuntimeConstruction(ctx.to_string())
+		} else {
+			Self::RuntimeConstruction(format!("{}: {}", ctx, msg))
+		}
+	}
 }

 /// An unexpected error occurred in the execution job process. Because this comes from the job,
@@ -16,6 +16,7 @@

 //! Interface to the Substrate Executor

+use crate::error::ExecuteError;
 use polkadot_primitives::{
 	executor_params::{DEFAULT_LOGICAL_STACK_MAX, DEFAULT_NATIVE_STACK_MAX},
 	ExecutorParam, ExecutorParams,
@@ -109,7 +110,7 @@ pub unsafe fn execute_artifact(
 	compiled_artifact_blob: &[u8],
 	executor_params: &ExecutorParams,
 	params: &[u8],
-) -> Result<Vec<u8>, String> {
+) -> Result<Vec<u8>, ExecuteError> {
 	let mut extensions = sp_externalities::Extensions::new();

 	extensions.register(sp_core::traits::ReadRuntimeVersionExt::new(ReadRuntimeVersion));
@@ -123,7 +124,6 @@ pub unsafe fn execute_artifact(
 		Ok(Ok(ok)) => Ok(ok),
 		Ok(Err(err)) | Err(err) => Err(err),
 	}
-	.map_err(|err| format!("execute error: {:?}", err))
 }

 /// Constructs the runtime for the given PVF, given the artifact bytes.
@@ -16,7 +16,9 @@

 //! Contains the logic for executing PVFs. Used by the polkadot-execute-worker binary.

-pub use polkadot_node_core_pvf_common::executor_interface::execute_artifact;
+pub use polkadot_node_core_pvf_common::{
+	error::ExecuteError, executor_interface::execute_artifact,
+};

 // NOTE: Initializing logging in e.g. tests will not have an effect in the workers, as they are
 //       separate spawned processes. Run with e.g. `RUST_LOG=parachain::pvf-execute-worker=trace`.
@@ -237,7 +239,9 @@ fn validate_using_artifact(
 		//         [`executor_interface::prepare`].
 		execute_artifact(compiled_artifact_blob, executor_params, params)
 	} {
-		Err(err) => return JobResponse::format_invalid("execute", &err),
+		Err(ExecuteError::RuntimeConstruction(wasmerr)) =>
+			return JobResponse::runtime_construction("execute", &wasmerr.to_string()),
+		Err(err) => return JobResponse::format_invalid("execute", &err.to_string()),
 		Ok(d) => d,
 	};

@@ -550,6 +554,8 @@ fn handle_parent_process(
 					Ok(WorkerResponse::Ok { result_descriptor, duration: cpu_tv })
 				},
 				Ok(JobResponse::InvalidCandidate(err)) => Ok(WorkerResponse::InvalidCandidate(err)),
+				Ok(JobResponse::RuntimeConstruction(err)) =>
+					Ok(WorkerResponse::RuntimeConstruction(err)),
 				Err(job_error) => {
 					gum::warn!(
 						target: LOG_TARGET,
@@ -238,6 +238,14 @@ impl Artifacts {
 			.is_none());
 	}

+	/// Remove artifact by its id.
+	pub fn remove(&mut self, artifact_id: ArtifactId) -> Option<(ArtifactId, PathBuf)> {
+		self.inner.remove(&artifact_id).and_then(|state| match state {
+			ArtifactState::Prepared { path, .. } => Some((artifact_id, path)),
+			_ => None,
+		})
+	}
+
 	/// Remove artifacts older than the given TTL and return id and path of the removed ones.
 	pub fn prune(&mut self, artifact_ttl: Duration) -> Vec<(ArtifactId, PathBuf)> {
 		let now = SystemTime::now();
@@ -86,6 +86,10 @@ pub enum PossiblyInvalidError {
 	/// vote invalid.
 	#[error("possibly invalid: job error: {0}")]
 	JobError(String),
+	/// Instantiation of the WASM module instance failed during an execution.
+	/// Possibly related to local issues or dirty node update. May be retried with re-preparation.
+	#[error("possibly invalid: runtime construction: {0}")]
+	RuntimeConstruction(String),
 }

 impl From<PrepareError> for ValidationError {
@@ -23,4 +23,4 @@
 mod queue;
 mod worker_interface;

-pub use queue::{start, PendingExecutionRequest, ToQueue};
+pub use queue::{start, FromQueue, PendingExecutionRequest, ToQueue};
@@ -25,7 +25,7 @@ use crate::{
 	InvalidCandidate, PossiblyInvalidError, ValidationError, LOG_TARGET,
 };
 use futures::{
-	channel::mpsc,
+	channel::{mpsc, oneshot},
 	future::BoxFuture,
 	stream::{FuturesUnordered, StreamExt as _},
 	Future, FutureExt,
@@ -54,6 +54,12 @@ pub enum ToQueue {
 	Enqueue { artifact: ArtifactPathId, pending_execution_request: PendingExecutionRequest },
 }

+/// A response from the queue.
+#[derive(Debug)]
+pub enum FromQueue {
+	RemoveArtifact { artifact: ArtifactId, reply_to: oneshot::Sender<()> },
+}
+
 /// An execution request that should execute the PVF (known in the context) and send the results
 /// to the given result sender.
 #[derive(Debug)]
@@ -137,6 +143,8 @@ struct Queue {

 	/// The receiver that receives messages to the pool.
 	to_queue_rx: mpsc::Receiver<ToQueue>,
+	/// The sender to send messages back to the validation host.
+	from_queue_tx: mpsc::UnboundedSender<FromQueue>,

 	// Some variables related to the current session.
 	program_path: PathBuf,
@@ -161,6 +169,7 @@ impl Queue {
 		node_version: Option<String>,
 		security_status: SecurityStatus,
 		to_queue_rx: mpsc::Receiver<ToQueue>,
+		from_queue_tx: mpsc::UnboundedSender<FromQueue>,
 	) -> Self {
 		Self {
 			metrics,
@@ -170,6 +179,7 @@ impl Queue {
 			node_version,
 			security_status,
 			to_queue_rx,
+			from_queue_tx,
 			queue: VecDeque::new(),
 			mux: Mux::new(),
 			workers: Workers {
@@ -301,7 +311,7 @@ async fn handle_mux(queue: &mut Queue, event: QueueEvent) {
 			handle_worker_spawned(queue, idle, handle, job);
 		},
 		QueueEvent::StartWork(worker, outcome, artifact_id, result_tx) => {
-			handle_job_finish(queue, worker, outcome, artifact_id, result_tx);
+			handle_job_finish(queue, worker, outcome, artifact_id, result_tx).await;
 		},
 	}
 }
@@ -327,42 +337,69 @@ fn handle_worker_spawned(

 /// If there are pending jobs in the queue, schedules the next of them onto the just freed up
 /// worker. Otherwise, puts back into the available workers list.
-fn handle_job_finish(
+async fn handle_job_finish(
 	queue: &mut Queue,
 	worker: Worker,
 	outcome: Outcome,
 	artifact_id: ArtifactId,
 	result_tx: ResultSender,
 ) {
-	let (idle_worker, result, duration) = match outcome {
+	let (idle_worker, result, duration, sync_channel) = match outcome {
 		Outcome::Ok { result_descriptor, duration, idle_worker } => {
 			// TODO: propagate the soft timeout

-			(Some(idle_worker), Ok(result_descriptor), Some(duration))
+			(Some(idle_worker), Ok(result_descriptor), Some(duration), None)
 		},
 		Outcome::InvalidCandidate { err, idle_worker } => (
 			Some(idle_worker),
 			Err(ValidationError::Invalid(InvalidCandidate::WorkerReportedInvalid(err))),
 			None,
+			None,
 		),
-		Outcome::InternalError { err } => (None, Err(ValidationError::Internal(err)), None),
+		Outcome::RuntimeConstruction { err, idle_worker } => {
+			// The task for artifact removal is executed concurrently with
+			// the message to the host on the execution result.
+			let (result_tx, result_rx) = oneshot::channel();
+			queue
+				.from_queue_tx
+				.unbounded_send(FromQueue::RemoveArtifact {
+					artifact: artifact_id.clone(),
+					reply_to: result_tx,
+				})
+				.expect("from execute queue receiver is listened by the host; qed");
+			(
+				Some(idle_worker),
+				Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::RuntimeConstruction(
+					err,
+				))),
+				None,
+				Some(result_rx),
+			)
+		},
+		Outcome::InternalError { err } => (None, Err(ValidationError::Internal(err)), None, None),
 		// Either the worker or the job timed out. Kill the worker in either case. Treated as
 		// definitely-invalid, because if we timed out, there's no time left for a retry.
 		Outcome::HardTimeout =>
-			(None, Err(ValidationError::Invalid(InvalidCandidate::HardTimeout)), None),
+			(None, Err(ValidationError::Invalid(InvalidCandidate::HardTimeout)), None, None),
 		// "Maybe invalid" errors (will retry).
 		Outcome::WorkerIntfErr => (
 			None,
 			Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::AmbiguousWorkerDeath)),
 			None,
+			None,
 		),
 		Outcome::JobDied { err } => (
 			None,
 			Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::AmbiguousJobDeath(err))),
 			None,
+			None,
+		),
+		Outcome::JobError { err } => (
+			None,
+			Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::JobError(err))),
+			None,
+			None,
 		),
-		Outcome::JobError { err } =>
-			(None, Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::JobError(err))), None),
 	};

 	queue.metrics.execute_finished();
@@ -386,6 +423,12 @@ fn handle_job_finish(
 		);
 	}

+	if let Some(sync_channel) = sync_channel {
+		// An error means the sender was dropped (the artifact is already removed from the cache),
+		// so it is legitimate to ignore the result.
+		let _ = sync_channel.await;
+	}
+
 	// First we send the result. It may fail due to the other end of the channel being dropped,
 	// that's legitimate and we don't treat that as an error.
 	let _ = result_tx.send(result);
@@ -521,8 +564,10 @@ pub fn start(
 	spawn_timeout: Duration,
 	node_version: Option<String>,
 	security_status: SecurityStatus,
-) -> (mpsc::Sender<ToQueue>, impl Future<Output = ()>) {
+) -> (mpsc::Sender<ToQueue>, mpsc::UnboundedReceiver<FromQueue>, impl Future<Output = ()>) {
 	let (to_queue_tx, to_queue_rx) = mpsc::channel(20);
+	let (from_queue_tx, from_queue_rx) = mpsc::unbounded();
+
 	let run = Queue::new(
 		metrics,
 		program_path,
@@ -532,7 +577,8 @@ pub fn start(
 		node_version,
 		security_status,
 		to_queue_rx,
+		from_queue_tx,
 	)
 	.run();
-	(to_queue_tx, run)
+	(to_queue_tx, from_queue_rx, run)
 }
@@ -87,6 +87,10 @@ pub enum Outcome {
 	/// a trap. Errors related to the preparation process are not expected to be encountered by the
 	/// execution workers.
 	InvalidCandidate { err: String, idle_worker: IdleWorker },
+	/// The error is probably transient. It may happen, for example,
+	/// because the artifact was prepared with a Wasmtime version different from the version
+	/// in the current execution environment.
+	RuntimeConstruction { err: String, idle_worker: IdleWorker },
 	/// The execution time exceeded the hard limit. The worker is terminated.
 	HardTimeout,
 	/// An I/O error happened during communication with the worker. This may mean that the worker
@@ -193,6 +197,10 @@ pub async fn start_work(
 				err,
 				idle_worker: IdleWorker { stream, pid, worker_dir },
 			},
+			WorkerResponse::RuntimeConstruction(err) => Outcome::RuntimeConstruction {
+				err,
+				idle_worker: IdleWorker { stream, pid, worker_dir },
+			},
 			WorkerResponse::JobTimedOut => Outcome::HardTimeout,
 			WorkerResponse::JobDied { err, job_pid: _ } => Outcome::JobDied { err },
 			WorkerResponse::JobError(err) => Outcome::JobError { err },
@@ -274,7 +274,7 @@ pub async fn start(
 		from_prepare_pool,
 	);

-	let (to_execute_queue_tx, run_execute_queue) = execute::start(
+	let (to_execute_queue_tx, from_execute_queue_rx, run_execute_queue) = execute::start(
 		metrics,
 		config.execute_worker_program_path.to_owned(),
 		config.cache_path.clone(),
@@ -296,6 +296,7 @@ pub async fn start(
 			to_prepare_queue_tx,
 			from_prepare_queue_rx,
 			to_execute_queue_tx,
+			from_execute_queue_rx,
 			to_sweeper_tx,
 			awaiting_prepare: AwaitingPrepare::default(),
 		})
@@ -342,6 +343,8 @@ struct Inner {
 	from_prepare_queue_rx: mpsc::UnboundedReceiver<prepare::FromQueue>,

 	to_execute_queue_tx: mpsc::Sender<execute::ToQueue>,
+	from_execute_queue_rx: mpsc::UnboundedReceiver<execute::FromQueue>,
+
 	to_sweeper_tx: mpsc::Sender<PathBuf>,

 	awaiting_prepare: AwaitingPrepare,
@@ -358,6 +361,7 @@ async fn run(
 		to_host_rx,
 		from_prepare_queue_rx,
 		mut to_prepare_queue_tx,
+		from_execute_queue_rx,
 		mut to_execute_queue_tx,
 		mut to_sweeper_tx,
 		mut awaiting_prepare,
@@ -384,10 +388,21 @@ async fn run(

 	let mut to_host_rx = to_host_rx.fuse();
 	let mut from_prepare_queue_rx = from_prepare_queue_rx.fuse();
+	let mut from_execute_queue_rx = from_execute_queue_rx.fuse();

 	loop {
 		// biased to make it behave deterministically for tests.
 		futures::select_biased! {
+			from_execute_queue_rx = from_execute_queue_rx.next() => {
+				let from_queue = break_if_fatal!(from_execute_queue_rx.ok_or(Fatal));
+				let execute::FromQueue::RemoveArtifact { artifact, reply_to } = from_queue;
+				break_if_fatal!(handle_artifact_removal(
+					&mut to_sweeper_tx,
+					&mut artifacts,
+					artifact,
+					reply_to,
+				).await);
+			},
 			() = cleanup_pulse.select_next_some() => {
 				// `select_next_some` because we don't expect this to fail, but if it does, we
 				// still don't fail. The trade-off is that the compiled cache will start growing
@@ -861,6 +876,37 @@ async fn handle_cleanup_pulse(
 	Ok(())
 }

+async fn handle_artifact_removal(
+	sweeper_tx: &mut mpsc::Sender<PathBuf>,
+	artifacts: &mut Artifacts,
+	artifact_id: ArtifactId,
+	reply_to: oneshot::Sender<()>,
+) -> Result<(), Fatal> {
+	let (artifact_id, path) = if let Some(artifact) = artifacts.remove(artifact_id) {
+		artifact
+	} else {
+		// If we haven't found the artifact by its id,
+		// it has probably been removed already.
+		// Anyway, thanks to the randomness of the artifact name,
+		// it is safe to ignore this case.
+		return Ok(());
+	};
+	reply_to
+		.send(())
+		.expect("the execute queue waits for the artifact remove confirmation; qed");
+	// Thanks to the randomness of the artifact name (see
+	// `artifacts::generate_artifact_path`), there is no risk of a name conflict on a
+	// future re-preparation, so the artifact removal could already be confirmed above,
+	// before the file is actually handed over to the sweeper.
+	gum::debug!(
+		target: LOG_TARGET,
+		validation_code_hash = ?artifact_id.code_hash,
+		"PVF pruning: pruning artifact by request from the execute queue",
+	);
+	sweeper_tx.send(path).await.map_err(|_| Fatal)?;
+	Ok(())
+}
+
 /// A simple task which sole purpose is to delete files thrown at it.
 async fn sweeper_task(mut sweeper_rx: mpsc::Receiver<PathBuf>) {
 	loop {
@@ -968,6 +1014,8 @@ pub(crate) mod tests {
 		to_prepare_queue_rx: mpsc::Receiver<prepare::ToQueue>,
 		from_prepare_queue_tx: mpsc::UnboundedSender<prepare::FromQueue>,
 		to_execute_queue_rx: mpsc::Receiver<execute::ToQueue>,
+		#[allow(unused)]
+		from_execute_queue_tx: mpsc::UnboundedSender<execute::FromQueue>,
 		to_sweeper_rx: mpsc::Receiver<PathBuf>,

 		run: BoxFuture<'static, ()>,
@@ -979,6 +1027,7 @@ pub(crate) mod tests {
 			let (to_prepare_queue_tx, to_prepare_queue_rx) = mpsc::channel(10);
 			let (from_prepare_queue_tx, from_prepare_queue_rx) = mpsc::unbounded();
 			let (to_execute_queue_tx, to_execute_queue_rx) = mpsc::channel(10);
+			let (from_execute_queue_tx, from_execute_queue_rx) = mpsc::unbounded();
 			let (to_sweeper_tx, to_sweeper_rx) = mpsc::channel(10);

 			let run = run(Inner {
@@ -989,6 +1038,7 @@ pub(crate) mod tests {
 				to_prepare_queue_tx,
 				from_prepare_queue_rx,
 				to_execute_queue_tx,
+				from_execute_queue_rx,
 				to_sweeper_tx,
 				awaiting_prepare: AwaitingPrepare::default(),
 			})
@@ -999,6 +1049,7 @@ pub(crate) mod tests {
 				to_prepare_queue_rx,
 				from_prepare_queue_tx,
 				to_execute_queue_rx,
+				from_execute_queue_tx,
 				to_sweeper_rx,
 				run,
 			}
@@ -21,13 +21,14 @@ use parity_scale_codec::Encode as _;
 #[cfg(all(feature = "ci-only-tests", target_os = "linux"))]
 use polkadot_node_core_pvf::SecurityStatus;
 use polkadot_node_core_pvf::{
-	start, testing::build_workers_and_get_paths, Config, InvalidCandidate, Metrics, PrepareError,
-	PrepareJobKind, PvfPrepData, ValidationError, ValidationHost, JOB_TIMEOUT_WALL_CLOCK_FACTOR,
+	start, testing::build_workers_and_get_paths, Config, InvalidCandidate, Metrics,
+	PossiblyInvalidError, PrepareError, PrepareJobKind, PvfPrepData, ValidationError,
+	ValidationHost, JOB_TIMEOUT_WALL_CLOCK_FACTOR,
 };
 use polkadot_parachain_primitives::primitives::{BlockData, ValidationParams, ValidationResult};
 use polkadot_primitives::{ExecutorParam, ExecutorParams};

-use std::time::Duration;
+use std::{io::Write, time::Duration};
 use tokio::sync::Mutex;

 mod adder;
@@ -352,10 +353,80 @@ async fn deleting_prepared_artifact_does_not_dispute() {
 		)
 		.await;

-	match result {
-		Err(ValidationError::Invalid(InvalidCandidate::HardTimeout)) => {},
-		r => panic!("{:?}", r),
+	assert_matches!(result, Err(ValidationError::Invalid(InvalidCandidate::HardTimeout)));
+}
+
+// Test that corruption of a prepared artifact does not lead to a dispute when we try to execute it.
+#[tokio::test]
+async fn corrupted_prepared_artifact_does_not_dispute() {
+	let host = TestHost::new().await;
+	let cache_dir = host.cache_dir.path();
+
+	let _stats = host.precheck_pvf(halt::wasm_binary_unwrap(), Default::default()).await.unwrap();
+
+	// Manually corrupt the prepared artifact on disk. The in-memory artifacts table won't
+	// change.
+	let artifact_path = {
+		// Get the artifact path (asserting it exists).
+		let mut cache_dir: Vec<_> = std::fs::read_dir(cache_dir).unwrap().collect();
+		// Should contain the artifact and the worker dir.
+		assert_eq!(cache_dir.len(), 2);
+		let mut artifact_path = cache_dir.pop().unwrap().unwrap();
+		if artifact_path.path().is_dir() {
+			artifact_path = cache_dir.pop().unwrap().unwrap();
+		}
+
+		// Corrupt the artifact.
+		let mut f = std::fs::OpenOptions::new()
+			.write(true)
+			.truncate(true)
+			.open(artifact_path.path())
+			.unwrap();
+		f.write_all(b"corrupted wasm").unwrap();
+		f.flush().unwrap();
+		artifact_path
+	};
+
+	assert!(artifact_path.path().exists());
+
+	// Try to validate, artifact should get removed because of the corruption.
+	let result = host
+		.validate_candidate(
+			halt::wasm_binary_unwrap(),
+			ValidationParams {
+				block_data: BlockData(Vec::new()),
+				parent_head: Default::default(),
+				relay_parent_number: 1,
+				relay_parent_storage_root: Default::default(),
+			},
+			Default::default(),
+		)
+		.await;
+
+	assert_matches!(
+		result,
+		Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::RuntimeConstruction(_)))
+	);
+
+	// Because of the `RuntimeConstruction` error we may retry.
+	host.precheck_pvf(halt::wasm_binary_unwrap(), Default::default()).await.unwrap();
+
+	// The actual artifact removal is done concurrently
+	// with sending the result of the execution.
+	// This is not a problem for further re-preparation, as
+	// artifact filenames are random.
+	for _ in 1..5 {
+		if !artifact_path.path().exists() {
+			break;
+		}
+		tokio::time::sleep(Duration::from_secs(1)).await;
 	}
+
+	assert!(
+		!artifact_path.path().exists(),
+		"the corrupted artifact ({}) should be deleted by the host",
+		artifact_path.path().display()
+	);
 }

 #[tokio::test]
@@ -159,7 +159,9 @@ impl sp_std::fmt::LowerHex for ExecutorParamsHash {
 // into individual fields of the structure. Thus, complex migrations shall be avoided when adding
 // new entries and removing old ones. At the moment, there's no mandatory parameters defined. If
 // they show up, they must be clearly documented as mandatory ones.
-#[derive(Clone, Debug, Encode, Decode, PartialEq, Eq, TypeInfo, Serialize, Deserialize)]
+#[derive(
+	Clone, Debug, Default, Encode, Decode, PartialEq, Eq, TypeInfo, Serialize, Deserialize,
+)]
 pub struct ExecutorParams(Vec<ExecutorParam>);

 impl ExecutorParams {
@@ -334,9 +336,3 @@ impl From<&[ExecutorParam]> for ExecutorParams {
 		ExecutorParams(arr.to_vec())
 	}
 }
-
-impl Default for ExecutorParams {
-	fn default() -> Self {
-		ExecutorParams(vec![])
-	}
-}
@@ -125,6 +125,14 @@ execution request:
   reason, which may or may not be independent of the candidate or PVF.
 5. **Internal errors:** See "Internal Errors" section. In this case, after the
   retry we abstain from voting.
+6. **RuntimeConstruction errors:** The precheck handles the general case of a
+   wrong artifact but doesn't guarantee its consistency between the preparation
+   and the execution. Something may happen to the artifact between its
+   preparation and its execution (e.g. the artifact was corrupted on disk, or a
+   dirty node upgrade left the prepare worker with a wasmtime version different
+   from the execute worker's wasmtime version).
+   We treat such an error as possibly transient due to local issues and retry
+   one time.

 ### Preparation timeouts

@@ -0,0 +1,16 @@
+# Schema: Polkadot SDK PRDoc Schema (prdoc) v1.0.0
+# See doc at https://raw.githubusercontent.com/paritytech/polkadot-sdk/master/prdoc/schema_user.json
+
+title: Retrying an execution on failed runtime construction
+
+doc:
+  - audience: Node Dev
+    description: |
+      If a runtime construction error happened during the execution request, then the artifact is re-prepared 
+      and the execution request is retried at most once. See also the related issue.
+
+crates:
+  - name: polkadot-node-core-candidate-validation
+  - name: polkadot-node-core-pvf
+  - name: polkadot-node-core-pvf-execute-worker
+  - name: polkadot-node-core-pvf-common