mirror of
https://github.com/pezkuwichain/pezkuwi-subxt.git
synced 2026-06-17 18:21:02 +00:00
Retry failed PVF execution (AmbiguousWorkerDeath) (#6235)
* Fix a couple of typos

* Retry failed PVF execution

  PVF execution that fails due to AmbiguousWorkerDeath should be retried once. This should reduce the occurrence of failures due to transient conditions.

  Closes #6195

* Address a couple of nits

* Write tests; refactor (add `validate_candidate_with_retry`)

* Update node/core/candidate-validation/src/lib.rs

  Co-authored-by: Andronik <write@reusable.software>

Co-authored-by: eskimor <eskimor@users.noreply.github.com>
Co-authored-by: Andronik <write@reusable.software>
This commit is contained in:
@@ -60,6 +60,12 @@ mod tests;
|
||||
|
||||
const LOG_TARGET: &'static str = "parachain::candidate-validation";
|
||||
|
||||
/// The amount of time to wait before retrying after an `AmbiguousWorkerDeath` validation error.
/// Non-test builds wait 3 seconds; test builds use a shorter 200 ms delay.
|
||||
#[cfg(not(test))]
|
||||
const PVF_EXECUTION_RETRY_DELAY: Duration = Duration::from_secs(3);
|
||||
#[cfg(test)]
|
||||
const PVF_EXECUTION_RETRY_DELAY: Duration = Duration::from_millis(200);
|
||||
|
||||
/// Configuration for the candidate validation subsystem
|
||||
#[derive(Clone)]
|
||||
pub struct Config {
|
||||
@@ -490,7 +496,7 @@ where
|
||||
}
|
||||
|
||||
async fn validate_candidate_exhaustive(
|
||||
mut validation_backend: impl ValidationBackend,
|
||||
mut validation_backend: impl ValidationBackend + Send,
|
||||
persisted_validation_data: PersistedValidationData,
|
||||
validation_code: ValidationCode,
|
||||
candidate_receipt: CandidateReceipt,
|
||||
@@ -551,7 +557,7 @@ async fn validate_candidate_exhaustive(
|
||||
};
|
||||
|
||||
let result = validation_backend
|
||||
.validate_candidate(raw_validation_code.to_vec(), timeout, params)
|
||||
.validate_candidate_with_retry(raw_validation_code.to_vec(), timeout, params)
|
||||
.await;
|
||||
|
||||
if let Err(ref error) = result {
|
||||
@@ -604,45 +610,63 @@ async fn validate_candidate_exhaustive(
|
||||
#[async_trait]
|
||||
trait ValidationBackend {
|
||||
async fn validate_candidate(
|
||||
&mut self,
|
||||
pvf: Pvf,
|
||||
timeout: Duration,
|
||||
encoded_params: Vec<u8>,
|
||||
) -> Result<WasmValidationResult, ValidationError>;
|
||||
|
||||
async fn validate_candidate_with_retry(
|
||||
&mut self,
|
||||
raw_validation_code: Vec<u8>,
|
||||
timeout: Duration,
|
||||
params: ValidationParams,
|
||||
) -> Result<WasmValidationResult, ValidationError>;
|
||||
) -> Result<WasmValidationResult, ValidationError> {
|
||||
// Construct the PVF a single time, since it is an expensive operation. Cloning it is cheap.
|
||||
let pvf = Pvf::from_code(raw_validation_code);
|
||||
|
||||
let validation_result =
|
||||
self.validate_candidate(pvf.clone(), timeout, params.encode()).await;
|
||||
|
||||
// If we get an AmbiguousWorkerDeath error, retry once after a brief delay, on the
|
||||
// assumption that the conditions that caused this error may have been transient.
|
||||
if let Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::AmbiguousWorkerDeath)) =
|
||||
validation_result
|
||||
{
|
||||
// Wait briefly before retrying.
|
||||
futures_timer::Delay::new(PVF_EXECUTION_RETRY_DELAY).await;
|
||||
// Encode the params again when retrying. We expect the retry case to be relatively
|
||||
// rare, and we want to avoid unconditionally cloning data.
|
||||
self.validate_candidate(pvf, timeout, params.encode()).await
|
||||
} else {
|
||||
validation_result
|
||||
}
|
||||
}
|
||||
|
||||
async fn precheck_pvf(&mut self, pvf: Pvf) -> Result<(), PrepareError>;
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl ValidationBackend for ValidationHost {
|
||||
/// Tries executing a PVF a single time (no retries).
|
||||
async fn validate_candidate(
|
||||
&mut self,
|
||||
raw_validation_code: Vec<u8>,
|
||||
pvf: Pvf,
|
||||
timeout: Duration,
|
||||
params: ValidationParams,
|
||||
encoded_params: Vec<u8>,
|
||||
) -> Result<WasmValidationResult, ValidationError> {
|
||||
let priority = polkadot_node_core_pvf::Priority::Normal;
|
||||
|
||||
let (tx, rx) = oneshot::channel();
|
||||
if let Err(err) = self
|
||||
.execute_pvf(
|
||||
Pvf::from_code(raw_validation_code),
|
||||
timeout,
|
||||
params.encode(),
|
||||
polkadot_node_core_pvf::Priority::Normal,
|
||||
tx,
|
||||
)
|
||||
.await
|
||||
{
|
||||
if let Err(err) = self.execute_pvf(pvf, timeout, encoded_params, priority, tx).await {
|
||||
return Err(ValidationError::InternalError(format!(
|
||||
"cannot send pvf to the validation host: {:?}",
|
||||
err
|
||||
)))
|
||||
}
|
||||
|
||||
let validation_result = rx
|
||||
.await
|
||||
.map_err(|_| ValidationError::InternalError("validation was cancelled".into()))?;
|
||||
|
||||
validation_result
|
||||
rx.await
|
||||
.map_err(|_| ValidationError::InternalError("validation was cancelled".into()))?
|
||||
}
|
||||
|
||||
async fn precheck_pvf(&mut self, pvf: Pvf) -> Result<(), PrepareError> {
|
||||
|
||||
Reference in New Issue
Block a user