PVF: Don't dispute on missing artifact (#7011)

* PVF: Don't dispute on missing artifact

A dispute should never be raised if the local cache doesn't provide a certain
artifact. You can not dispute based on this reason, as it is a local hardware
issue and not related to the candidate to check.

Design:

Currently we assume that if we prepared an artifact, it remains there on-disk
until we prune it, i.e. we never check again if it's still there.

We can change it so that instead of artifact-not-found triggering a dispute, we
retry once (like we do for AmbiguousWorkerDeath, except we don't dispute if it
still doesn't work). And when enqueuing an execute job, we check for the
artifact on-disk, and start preparation if not found.

Changes:

- [x] Integration test (should fail without the following changes)
- [x] Check if artifact exists when executing, prepare if not
- [x] Return an internal error when file is missing
- [x] Retry once on internal errors
- [x] Document design (update impl guide)

* Add some context to wasm error message (it is quite long)

* Fix impl guide

* Add check for missing/inaccessible file

* Add comment referencing Substrate issue

* Add test for retrying internal errors

---------

Co-authored-by: parity-processbot <>
This commit is contained in:
Marcin S
2023-04-20 15:38:31 +02:00
committed by GitHub
parent 023d459857
commit 0940cdd1d7
8 changed files with 286 additions and 99 deletions
@@ -691,6 +691,9 @@ trait ValidationBackend {
/// Tries executing a PVF. Will retry once if an error is encountered that may have been
/// transient.
///
/// NOTE: Should retry only on errors that are a result of execution itself, and not of
/// preparation.
async fn validate_candidate_with_retry(
&mut self,
raw_validation_code: Vec<u8>,
@@ -698,31 +701,44 @@ trait ValidationBackend {
params: ValidationParams,
executor_params: ExecutorParams,
) -> Result<WasmValidationResult, ValidationError> {
// Construct the PVF a single time, since it is an expensive operation. Cloning it is cheap.
let prep_timeout = pvf_prep_timeout(&executor_params, PvfPrepTimeoutKind::Lenient);
// Construct the PVF a single time, since it is an expensive operation. Cloning it is cheap.
let pvf = PvfPrepData::from_code(raw_validation_code, executor_params, prep_timeout);
let mut validation_result =
self.validate_candidate(pvf.clone(), exec_timeout, params.encode()).await;
// If we get an AmbiguousWorkerDeath error, retry once after a brief delay, on the
// assumption that the conditions that caused this error may have been transient. Note that
// this error is only a result of execution itself and not of preparation.
if let Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::AmbiguousWorkerDeath)) =
validation_result
{
// Wait a brief delay before retrying.
futures_timer::Delay::new(PVF_EXECUTION_RETRY_DELAY).await;
// Allow limited retries for each kind of error.
let mut num_internal_retries_left = 1;
let mut num_awd_retries_left = 1;
loop {
match validation_result {
Err(ValidationError::InvalidCandidate(
WasmInvalidCandidate::AmbiguousWorkerDeath,
)) if num_awd_retries_left > 0 => num_awd_retries_left -= 1,
Err(ValidationError::InternalError(_)) if num_internal_retries_left > 0 =>
num_internal_retries_left -= 1,
_ => break,
}
gum::warn!(
target: LOG_TARGET,
?pvf,
"Re-trying failed candidate validation due to AmbiguousWorkerDeath."
);
// If we got a possibly transient error, retry once after a brief delay, on the assumption
// that the conditions that caused this error may have resolved on their own.
{
// Wait a brief delay before retrying.
futures_timer::Delay::new(PVF_EXECUTION_RETRY_DELAY).await;
// Encode the params again when re-trying. We expect the retry case to be relatively
// rare, and we want to avoid unconditionally cloning data.
validation_result = self.validate_candidate(pvf, exec_timeout, params.encode()).await;
gum::warn!(
target: LOG_TARGET,
?pvf,
"Re-trying failed candidate validation due to possible transient error: {:?}",
validation_result
);
// Encode the params again when re-trying. We expect the retry case to be relatively
// rare, and we want to avoid unconditionally cloning data.
validation_result =
self.validate_candidate(pvf.clone(), exec_timeout, params.encode()).await;
}
}
validation_result
@@ -672,6 +672,62 @@ fn candidate_validation_multiple_ambiguous_errors_is_invalid() {
assert_matches!(v, ValidationResult::Invalid(InvalidCandidate::ExecutionError(_)));
}
// Test that we retry on internal errors.
#[test]
fn candidate_validation_retry_internal_errors() {
let validation_data = PersistedValidationData { max_pov_size: 1024, ..Default::default() };
let pov = PoV { block_data: BlockData(vec![1; 32]) };
let validation_code = ValidationCode(vec![2; 16]);
let descriptor = make_valid_candidate_descriptor(
ParaId::from(1_u32),
dummy_hash(),
validation_data.hash(),
pov.hash(),
validation_code.hash(),
dummy_hash(),
dummy_hash(),
Sr25519Keyring::Alice,
);
let check = perform_basic_checks(
&descriptor,
validation_data.max_pov_size,
&pov,
&validation_code.hash(),
);
assert!(check.is_ok());
let candidate_receipt = CandidateReceipt { descriptor, commitments_hash: Hash::zero() };
let pool = TaskExecutor::new();
let (mut ctx, ctx_handle) =
test_helpers::make_subsystem_context::<AllMessages, _>(pool.clone());
let metrics = Metrics::default();
let v = test_with_executor_params(ctx_handle, || {
validate_candidate_exhaustive(
ctx.sender(),
MockValidateCandidateBackend::with_hardcoded_result_list(vec![
Err(ValidationError::InternalError("foo".into())),
// Throw an AWD error, we should still retry again.
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::AmbiguousWorkerDeath)),
// Throw another internal error.
Err(ValidationError::InternalError("bar".into())),
]),
validation_data,
validation_code,
candidate_receipt,
Arc::new(pov),
PvfExecTimeoutKind::Backing,
&metrics,
)
});
assert_matches!(v, Err(ValidationFailed(s)) if s == "bar".to_string());
}
#[test]
fn candidate_validation_timeout_is_internal_error() {
let validation_data = PersistedValidationData { max_pov_size: 1024, ..Default::default() };