PVF: re-preparing artifact on failed runtime construction (#3187)

resolve https://github.com/paritytech/polkadot-sdk/issues/3139

- [x] use a distinguishable error for `execute_artifact`
- [x] remove the artifact when a `RuntimeConstruction` error occurs during
execution
- [x] extend `validate_candidate_with_retry` of `ValidationBackend` to
handle the retriable `RuntimeConstruction` error raised during
execution
- [x] update the book
(https://paritytech.github.io/polkadot-sdk/book/node/utility/pvf-host-and-workers.html#retrying-execution-requests)
- [x] add a test
- [x] run zombienet tests

---------

Co-authored-by: s0me0ne-unkn0wn <48632512+s0me0ne-unkn0wn@users.noreply.github.com>
This commit is contained in:
maksimryndin
2024-02-28 17:29:27 +01:00
committed by GitHub
parent 14530269b7
commit 426136671a
15 changed files with 294 additions and 49 deletions
@@ -695,6 +695,8 @@ async fn validate_candidate_exhaustive(
))),
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::JobError(err))) =>
Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError(err))),
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::RuntimeConstruction(err))) =>
Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError(err))),
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::AmbiguousJobDeath(err))) =>
Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError(format!(
@@ -780,40 +782,50 @@ trait ValidationBackend {
return validation_result
}
// Consumes one retry from the given counter: if the counter is greater than zero it is
// decremented, otherwise the macro `break`s out of the loop enclosing the call site.
macro_rules! break_if_no_retries_left {
($counter:ident) => {
if $counter > 0 {
$counter -= 1;
} else {
break
}
};
}
// Allow limited retries for each kind of error.
let mut num_death_retries_left = 1;
let mut num_job_error_retries_left = 1;
let mut num_internal_retries_left = 1;
let mut num_runtime_construction_retries_left = 1;
loop {
// Stop retrying if we exceeded the timeout.
if total_time_start.elapsed() + retry_delay > exec_timeout {
break
}
let mut retry_immediately = false;
match validation_result {
Err(ValidationError::PossiblyInvalid(
PossiblyInvalidError::AmbiguousWorkerDeath |
PossiblyInvalidError::AmbiguousJobDeath(_),
)) =>
if num_death_retries_left > 0 {
num_death_retries_left -= 1;
} else {
break
},
)) => break_if_no_retries_left!(num_death_retries_left),
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::JobError(_))) =>
if num_job_error_retries_left > 0 {
num_job_error_retries_left -= 1;
} else {
break
},
break_if_no_retries_left!(num_job_error_retries_left),
Err(ValidationError::Internal(_)) =>
if num_internal_retries_left > 0 {
num_internal_retries_left -= 1;
} else {
break
},
break_if_no_retries_left!(num_internal_retries_left),
Err(ValidationError::PossiblyInvalid(
PossiblyInvalidError::RuntimeConstruction(_),
)) => {
break_if_no_retries_left!(num_runtime_construction_retries_left);
self.precheck_pvf(pvf.clone()).await?;
// In this case the error is deterministic, and a retry
// forces the ValidationBackend to re-prepare the artifact,
// so there is no need to wait before the retry.
retry_immediately = true;
},
Ok(_) | Err(ValidationError::Invalid(_) | ValidationError::Preparation(_)) => break,
}
@@ -821,8 +833,11 @@ trait ValidationBackend {
// If we got a possibly transient error, retry once after a brief delay, on the
// assumption that the conditions that caused this error may have resolved on their own.
{
// Wait a brief delay before retrying.
futures_timer::Delay::new(retry_delay).await;
// For many transient errors it is necessary to wait a little bit
// to give the error a chance to resolve on its own.
if !retry_immediately {
futures_timer::Delay::new(retry_delay).await;
}
let new_timeout = exec_timeout.saturating_sub(total_time_start.elapsed());