mirror of
https://github.com/pezkuwichain/pezkuwi-subxt.git
synced 2026-04-27 23:18:01 +00:00
PVF: Don't dispute on missing artifact (#7011)
* PVF: Don't dispute on missing artifact

  A dispute should never be raised if the local cache doesn't provide a certain artifact. You cannot dispute for this reason, as it is a local hardware issue and not related to the candidate being checked.

  Design: Currently we assume that if we prepared an artifact, it remains on disk until we prune it, i.e. we never check again whether it is still there. We can change this so that, instead of artifact-not-found triggering a dispute, we retry once (like we do for AmbiguousWorkerDeath, except that we don't dispute if it still doesn't work). And when enqueuing an execute job, we check for the artifact on disk and start preparation if it is not found.

  Changes:
  - [x] Integration test (should fail without the following changes)
  - [x] Check if artifact exists when executing, prepare if not
  - [x] Return an internal error when file is missing
  - [x] Retry once on internal errors
  - [x] Document design (update impl guide)
* Add some context to wasm error message (it is quite long)
* Fix impl guide
* Add check for missing/inaccessible file
* Add comment referencing Substrate issue
* Add test for retrying internal errors

---------

Co-authored-by: parity-processbot <>
This commit is contained in:
@@ -691,6 +691,9 @@ trait ValidationBackend {
|
||||
|
||||
/// Tries executing a PVF. Will retry once if an error is encountered that may have been
|
||||
/// transient.
|
||||
///
|
||||
/// NOTE: Should retry only on errors that are a result of execution itself, and not of
|
||||
/// preparation.
|
||||
async fn validate_candidate_with_retry(
|
||||
&mut self,
|
||||
raw_validation_code: Vec<u8>,
|
||||
@@ -698,31 +701,44 @@ trait ValidationBackend {
|
||||
params: ValidationParams,
|
||||
executor_params: ExecutorParams,
|
||||
) -> Result<WasmValidationResult, ValidationError> {
|
||||
// Construct the PVF a single time, since it is an expensive operation. Cloning it is cheap.
|
||||
let prep_timeout = pvf_prep_timeout(&executor_params, PvfPrepTimeoutKind::Lenient);
|
||||
// Construct the PVF a single time, since it is an expensive operation. Cloning it is cheap.
|
||||
let pvf = PvfPrepData::from_code(raw_validation_code, executor_params, prep_timeout);
|
||||
|
||||
let mut validation_result =
|
||||
self.validate_candidate(pvf.clone(), exec_timeout, params.encode()).await;
|
||||
|
||||
// If we get an AmbiguousWorkerDeath error, retry once after a brief delay, on the
|
||||
// assumption that the conditions that caused this error may have been transient. Note that
|
||||
// this error is only a result of execution itself and not of preparation.
|
||||
if let Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::AmbiguousWorkerDeath)) =
|
||||
validation_result
|
||||
{
|
||||
// Wait a brief delay before retrying.
|
||||
futures_timer::Delay::new(PVF_EXECUTION_RETRY_DELAY).await;
|
||||
// Allow limited retries for each kind of error.
|
||||
let mut num_internal_retries_left = 1;
|
||||
let mut num_awd_retries_left = 1;
|
||||
loop {
|
||||
match validation_result {
|
||||
Err(ValidationError::InvalidCandidate(
|
||||
WasmInvalidCandidate::AmbiguousWorkerDeath,
|
||||
)) if num_awd_retries_left > 0 => num_awd_retries_left -= 1,
|
||||
Err(ValidationError::InternalError(_)) if num_internal_retries_left > 0 =>
|
||||
num_internal_retries_left -= 1,
|
||||
_ => break,
|
||||
}
|
||||
|
||||
gum::warn!(
|
||||
target: LOG_TARGET,
|
||||
?pvf,
|
||||
"Re-trying failed candidate validation due to AmbiguousWorkerDeath."
|
||||
);
|
||||
// If we got a possibly transient error, retry once after a brief delay, on the assumption
|
||||
// that the conditions that caused this error may have resolved on their own.
|
||||
{
|
||||
// Wait a brief delay before retrying.
|
||||
futures_timer::Delay::new(PVF_EXECUTION_RETRY_DELAY).await;
|
||||
|
||||
// Encode the params again when re-trying. We expect the retry case to be relatively
|
||||
// rare, and we want to avoid unconditionally cloning data.
|
||||
validation_result = self.validate_candidate(pvf, exec_timeout, params.encode()).await;
|
||||
gum::warn!(
|
||||
target: LOG_TARGET,
|
||||
?pvf,
|
||||
"Re-trying failed candidate validation due to possible transient error: {:?}",
|
||||
validation_result
|
||||
);
|
||||
|
||||
// Encode the params again when re-trying. We expect the retry case to be relatively
|
||||
// rare, and we want to avoid unconditionally cloning data.
|
||||
validation_result =
|
||||
self.validate_candidate(pvf.clone(), exec_timeout, params.encode()).await;
|
||||
}
|
||||
}
|
||||
|
||||
validation_result
|
||||
|
||||
@@ -672,6 +672,62 @@ fn candidate_validation_multiple_ambiguous_errors_is_invalid() {
|
||||
assert_matches!(v, ValidationResult::Invalid(InvalidCandidate::ExecutionError(_)));
|
||||
}
|
||||
|
||||
// Test that we retry on internal errors.
|
||||
#[test]
|
||||
fn candidate_validation_retry_internal_errors() {
|
||||
let validation_data = PersistedValidationData { max_pov_size: 1024, ..Default::default() };
|
||||
|
||||
let pov = PoV { block_data: BlockData(vec![1; 32]) };
|
||||
let validation_code = ValidationCode(vec![2; 16]);
|
||||
|
||||
let descriptor = make_valid_candidate_descriptor(
|
||||
ParaId::from(1_u32),
|
||||
dummy_hash(),
|
||||
validation_data.hash(),
|
||||
pov.hash(),
|
||||
validation_code.hash(),
|
||||
dummy_hash(),
|
||||
dummy_hash(),
|
||||
Sr25519Keyring::Alice,
|
||||
);
|
||||
|
||||
let check = perform_basic_checks(
|
||||
&descriptor,
|
||||
validation_data.max_pov_size,
|
||||
&pov,
|
||||
&validation_code.hash(),
|
||||
);
|
||||
assert!(check.is_ok());
|
||||
|
||||
let candidate_receipt = CandidateReceipt { descriptor, commitments_hash: Hash::zero() };
|
||||
|
||||
let pool = TaskExecutor::new();
|
||||
let (mut ctx, ctx_handle) =
|
||||
test_helpers::make_subsystem_context::<AllMessages, _>(pool.clone());
|
||||
let metrics = Metrics::default();
|
||||
|
||||
let v = test_with_executor_params(ctx_handle, || {
|
||||
validate_candidate_exhaustive(
|
||||
ctx.sender(),
|
||||
MockValidateCandidateBackend::with_hardcoded_result_list(vec![
|
||||
Err(ValidationError::InternalError("foo".into())),
|
||||
// Throw an AWD error, we should still retry again.
|
||||
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::AmbiguousWorkerDeath)),
|
||||
// Throw another internal error.
|
||||
Err(ValidationError::InternalError("bar".into())),
|
||||
]),
|
||||
validation_data,
|
||||
validation_code,
|
||||
candidate_receipt,
|
||||
Arc::new(pov),
|
||||
PvfExecTimeoutKind::Backing,
|
||||
&metrics,
|
||||
)
|
||||
});
|
||||
|
||||
assert_matches!(v, Err(ValidationFailed(s)) if s == "bar".to_string());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn candidate_validation_timeout_is_internal_error() {
|
||||
let validation_data = PersistedValidationData { max_pov_size: 1024, ..Default::default() };
|
||||
|
||||
Reference in New Issue
Block a user