Retry failed PVF execution (AmbiguousWorkerDeath) (#6235)

* Fix a couple of typos

* Retry failed PVF execution

PVF execution that fails due to AmbiguousWorkerDeath should be retried once.
This should reduce the occurrence of failures due to transient conditions.

Closes #6195

* Address a couple of nits

* Write tests; refactor (add `validate_candidate_with_retry`)

* Update node/core/candidate-validation/src/lib.rs

Co-authored-by: Andronik <write@reusable.software>

Co-authored-by: eskimor <eskimor@users.noreply.github.com>
Co-authored-by: Andronik <write@reusable.software>
This commit is contained in:
Marcin S
2022-11-08 15:36:36 -05:00
committed by GitHub
parent c261353380
commit 6d7f33e612
5 changed files with 185 additions and 31 deletions
@@ -60,6 +60,12 @@ mod tests;
const LOG_TARGET: &'static str = "parachain::candidate-validation";
/// The amount of time to wait before retrying after an AmbiguousWorkerDeath validation error.
#[cfg(not(test))]
const PVF_EXECUTION_RETRY_DELAY: Duration = Duration::from_secs(3);
#[cfg(test)]
const PVF_EXECUTION_RETRY_DELAY: Duration = Duration::from_millis(200);
/// Configuration for the candidate validation subsystem
#[derive(Clone)]
pub struct Config {
@@ -490,7 +496,7 @@ where
}
async fn validate_candidate_exhaustive(
mut validation_backend: impl ValidationBackend,
mut validation_backend: impl ValidationBackend + Send,
persisted_validation_data: PersistedValidationData,
validation_code: ValidationCode,
candidate_receipt: CandidateReceipt,
@@ -551,7 +557,7 @@ async fn validate_candidate_exhaustive(
};
let result = validation_backend
.validate_candidate(raw_validation_code.to_vec(), timeout, params)
.validate_candidate_with_retry(raw_validation_code.to_vec(), timeout, params)
.await;
if let Err(ref error) = result {
@@ -604,45 +610,63 @@ async fn validate_candidate_exhaustive(
#[async_trait]
trait ValidationBackend {
async fn validate_candidate(
&mut self,
pvf: Pvf,
timeout: Duration,
encoded_params: Vec<u8>,
) -> Result<WasmValidationResult, ValidationError>;
async fn validate_candidate_with_retry(
&mut self,
raw_validation_code: Vec<u8>,
timeout: Duration,
params: ValidationParams,
) -> Result<WasmValidationResult, ValidationError>;
) -> Result<WasmValidationResult, ValidationError> {
// Construct the PVF a single time, since it is an expensive operation. Cloning it is cheap.
let pvf = Pvf::from_code(raw_validation_code);
let validation_result =
self.validate_candidate(pvf.clone(), timeout, params.encode()).await;
// If we get an AmbiguousWorkerDeath error, retry once after a brief delay, on the
// assumption that the conditions that caused this error may have been transient.
if let Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::AmbiguousWorkerDeath)) =
validation_result
{
// Wait a brief delay before retrying.
futures_timer::Delay::new(PVF_EXECUTION_RETRY_DELAY).await;
// Encode the params again when re-trying. We expect the retry case to be relatively
// rare, and we want to avoid unconditionally cloning data.
self.validate_candidate(pvf, timeout, params.encode()).await
} else {
validation_result
}
}
async fn precheck_pvf(&mut self, pvf: Pvf) -> Result<(), PrepareError>;
}
#[async_trait]
impl ValidationBackend for ValidationHost {
/// Tries executing a PVF a single time (no retries).
async fn validate_candidate(
&mut self,
raw_validation_code: Vec<u8>,
pvf: Pvf,
timeout: Duration,
params: ValidationParams,
encoded_params: Vec<u8>,
) -> Result<WasmValidationResult, ValidationError> {
let priority = polkadot_node_core_pvf::Priority::Normal;
let (tx, rx) = oneshot::channel();
if let Err(err) = self
.execute_pvf(
Pvf::from_code(raw_validation_code),
timeout,
params.encode(),
polkadot_node_core_pvf::Priority::Normal,
tx,
)
.await
{
if let Err(err) = self.execute_pvf(pvf, timeout, encoded_params, priority, tx).await {
return Err(ValidationError::InternalError(format!(
"cannot send pvf to the validation host: {:?}",
err
)))
}
let validation_result = rx
.await
.map_err(|_| ValidationError::InternalError("validation was cancelled".into()))?;
validation_result
rx.await
.map_err(|_| ValidationError::InternalError("validation was cancelled".into()))?
}
async fn precheck_pvf(&mut self, pvf: Pvf) -> Result<(), PrepareError> {