remove retry from backers on failed candidate validation (#2182)

Hey guys, as discussed I've changed the name to a more general one
`PvfExecKind`, is this good or too general?
Creating this as a draft, I still have to fix the tests.

Closes #1585

Kusama address: FkB6QEo8VnV3oifugNj5NeVG3Mvq1zFbrUu4P5YwRoe5mQN

---------

Co-authored-by: command-bot <>
Co-authored-by: Marcin S <marcin@realemail.net>
This commit is contained in:
jserrat
2023-11-20 11:00:19 +00:00
committed by GitHub
parent b35300c377
commit ede4a36262
18 changed files with 276 additions and 248 deletions
@@ -49,8 +49,8 @@ use polkadot_primitives::{
DEFAULT_LENIENT_PREPARATION_TIMEOUT, DEFAULT_PRECHECK_PREPARATION_TIMEOUT,
},
CandidateCommitments, CandidateDescriptor, CandidateReceipt, ExecutorParams, Hash,
OccupiedCoreAssumption, PersistedValidationData, PvfExecTimeoutKind, PvfPrepTimeoutKind,
ValidationCode, ValidationCodeHash,
OccupiedCoreAssumption, PersistedValidationData, PvfExecKind, PvfPrepKind, ValidationCode,
ValidationCodeHash,
};
use parity_scale_codec::Encode;
@@ -73,12 +73,6 @@ mod tests;
const LOG_TARGET: &'static str = "parachain::candidate-validation";
/// The amount of time to wait before retrying after a retry-able backing validation error. We use a
/// lower value for the backing case, to fit within the lower backing timeout.
#[cfg(not(test))]
const PVF_BACKING_EXECUTION_RETRY_DELAY: Duration = Duration::from_millis(500);
#[cfg(test)]
const PVF_BACKING_EXECUTION_RETRY_DELAY: Duration = Duration::from_millis(200);
/// The amount of time to wait before retrying after a retry-able approval validation error. We use
/// a higher value for the approval case since we have more time, and if we wait longer it is more
/// likely that transient conditions will resolve.
@@ -163,7 +157,7 @@ async fn run<Context>(
candidate_receipt,
pov,
executor_params,
exec_timeout_kind,
exec_kind,
response_sender,
..
} => {
@@ -180,7 +174,7 @@ async fn run<Context>(
candidate_receipt,
pov,
executor_params,
exec_timeout_kind,
exec_kind,
&metrics,
)
.await;
@@ -198,7 +192,7 @@ async fn run<Context>(
candidate_receipt,
pov,
executor_params,
exec_timeout_kind,
exec_kind,
response_sender,
..
} => {
@@ -215,7 +209,7 @@ async fn run<Context>(
candidate_receipt,
pov,
executor_params,
exec_timeout_kind,
exec_kind,
&metrics,
)
.await;
@@ -357,7 +351,7 @@ where
return PreCheckOutcome::Invalid
};
let timeout = pvf_prep_timeout(&executor_params, PvfPrepTimeoutKind::Precheck);
let timeout = pvf_prep_timeout(&executor_params, PvfPrepKind::Precheck);
let pvf = match sp_maybe_compressed_blob::decompress(
&validation_code.0,
@@ -501,7 +495,7 @@ async fn validate_from_chain_state<Sender>(
candidate_receipt: CandidateReceipt,
pov: Arc<PoV>,
executor_params: ExecutorParams,
exec_timeout_kind: PvfExecTimeoutKind,
exec_kind: PvfExecKind,
metrics: &Metrics,
) -> Result<ValidationResult, ValidationFailed>
where
@@ -521,7 +515,7 @@ where
candidate_receipt.clone(),
pov,
executor_params,
exec_timeout_kind,
exec_kind,
metrics,
)
.await;
@@ -557,7 +551,7 @@ async fn validate_candidate_exhaustive(
candidate_receipt: CandidateReceipt,
pov: Arc<PoV>,
executor_params: ExecutorParams,
exec_timeout_kind: PvfExecTimeoutKind,
exec_kind: PvfExecKind,
metrics: &Metrics,
) -> Result<ValidationResult, ValidationFailed> {
let _timer = metrics.time_validate_candidate_exhaustive();
@@ -616,15 +610,32 @@ async fn validate_candidate_exhaustive(
relay_parent_storage_root: persisted_validation_data.relay_parent_storage_root,
};
let result = validation_backend
.validate_candidate_with_retry(
raw_validation_code.to_vec(),
pvf_exec_timeout(&executor_params, exec_timeout_kind),
exec_timeout_kind,
params,
executor_params,
)
.await;
let result = match exec_kind {
// Retry is disabled to reduce the chance of nondeterministic blocks getting backed and
// honest backers getting slashed.
PvfExecKind::Backing => {
let prep_timeout = pvf_prep_timeout(&executor_params, PvfPrepKind::Prepare);
let exec_timeout = pvf_exec_timeout(&executor_params, exec_kind);
let pvf = PvfPrepData::from_code(
raw_validation_code.to_vec(),
executor_params,
prep_timeout,
PrepareJobKind::Compilation,
);
validation_backend.validate_candidate(pvf, exec_timeout, params.encode()).await
},
PvfExecKind::Approval =>
validation_backend
.validate_candidate_with_retry(
raw_validation_code.to_vec(),
pvf_exec_timeout(&executor_params, exec_kind),
params,
executor_params,
PVF_APPROVAL_EXECUTION_RETRY_DELAY,
)
.await,
};
if let Err(ref error) = result {
gum::info!(target: LOG_TARGET, ?para_id, ?error, "Failed to validate candidate");
@@ -709,8 +720,8 @@ trait ValidationBackend {
encoded_params: Vec<u8>,
) -> Result<WasmValidationResult, ValidationError>;
/// Tries executing a PVF. Will retry once if an error is encountered that may have been
/// transient.
/// Tries executing a PVF for the approval subsystem. Will retry once if an error is encountered
/// that may have been transient.
///
/// NOTE: Should retry only on errors that are a result of execution itself, and not of
/// preparation.
@@ -718,11 +729,11 @@ trait ValidationBackend {
&mut self,
raw_validation_code: Vec<u8>,
exec_timeout: Duration,
exec_timeout_kind: PvfExecTimeoutKind,
params: ValidationParams,
executor_params: ExecutorParams,
retry_delay: Duration,
) -> Result<WasmValidationResult, ValidationError> {
let prep_timeout = pvf_prep_timeout(&executor_params, PvfPrepTimeoutKind::Lenient);
let prep_timeout = pvf_prep_timeout(&executor_params, PvfPrepKind::Prepare);
// Construct the PVF a single time, since it is an expensive operation. Cloning it is cheap.
let pvf = PvfPrepData::from_code(
raw_validation_code,
@@ -740,11 +751,6 @@ trait ValidationBackend {
return validation_result
}
let retry_delay = match exec_timeout_kind {
PvfExecTimeoutKind::Backing => PVF_BACKING_EXECUTION_RETRY_DELAY,
PvfExecTimeoutKind::Approval => PVF_APPROVAL_EXECUTION_RETRY_DELAY,
};
// Allow limited retries for each kind of error.
let mut num_death_retries_left = 1;
let mut num_job_error_retries_left = 1;
@@ -867,22 +873,41 @@ fn perform_basic_checks(
Ok(())
}
fn pvf_prep_timeout(executor_params: &ExecutorParams, kind: PvfPrepTimeoutKind) -> Duration {
/// To determine the amount of timeout time for the pvf execution.
///
/// Precheck
/// The time period after which the preparation worker is considered
/// unresponsive and will be killed.
///
/// Prepare
///The time period after which the preparation worker is considered
/// unresponsive and will be killed.
fn pvf_prep_timeout(executor_params: &ExecutorParams, kind: PvfPrepKind) -> Duration {
if let Some(timeout) = executor_params.pvf_prep_timeout(kind) {
return timeout
}
match kind {
PvfPrepTimeoutKind::Precheck => DEFAULT_PRECHECK_PREPARATION_TIMEOUT,
PvfPrepTimeoutKind::Lenient => DEFAULT_LENIENT_PREPARATION_TIMEOUT,
PvfPrepKind::Precheck => DEFAULT_PRECHECK_PREPARATION_TIMEOUT,
PvfPrepKind::Prepare => DEFAULT_LENIENT_PREPARATION_TIMEOUT,
}
}
fn pvf_exec_timeout(executor_params: &ExecutorParams, kind: PvfExecTimeoutKind) -> Duration {
/// To determine the amount of timeout time for the pvf execution.
///
/// Backing subsystem
/// The amount of time to spend on execution during backing.
///
/// Approval subsystem
/// The amount of time to spend on execution during approval or disputes.
/// This should be much longer than the backing execution timeout to ensure that in the
/// absence of extremely large disparities between hardware, blocks that pass backing are
/// considered executable by approval checkers or dispute participants.
fn pvf_exec_timeout(executor_params: &ExecutorParams, kind: PvfExecKind) -> Duration {
if let Some(timeout) = executor_params.pvf_exec_timeout(kind) {
return timeout
}
match kind {
PvfExecTimeoutKind::Backing => DEFAULT_BACKING_EXECUTION_TIMEOUT,
PvfExecTimeoutKind::Approval => DEFAULT_APPROVAL_EXECUTION_TIMEOUT,
PvfExecKind::Backing => DEFAULT_BACKING_EXECUTION_TIMEOUT,
PvfExecKind::Approval => DEFAULT_APPROVAL_EXECUTION_TIMEOUT,
}
}
@@ -436,7 +436,7 @@ fn candidate_validation_ok_is_ok() {
candidate_receipt,
Arc::new(pov),
ExecutorParams::default(),
PvfExecTimeoutKind::Backing,
PvfExecKind::Backing,
&Default::default(),
))
.unwrap();
@@ -488,7 +488,7 @@ fn candidate_validation_bad_return_is_invalid() {
candidate_receipt,
Arc::new(pov),
ExecutorParams::default(),
PvfExecTimeoutKind::Backing,
PvfExecKind::Backing,
&Default::default(),
))
.unwrap();
@@ -496,6 +496,33 @@ fn candidate_validation_bad_return_is_invalid() {
assert_matches!(v, ValidationResult::Invalid(InvalidCandidate::Timeout));
}
fn perform_basic_checks_on_valid_candidate(
pov: &PoV,
validation_code: &ValidationCode,
validation_data: &PersistedValidationData,
head_data_hash: Hash,
) -> CandidateDescriptor {
let descriptor = make_valid_candidate_descriptor(
ParaId::from(1_u32),
dummy_hash(),
validation_data.hash(),
pov.hash(),
validation_code.hash(),
head_data_hash,
head_data_hash,
Sr25519Keyring::Alice,
);
let check = perform_basic_checks(
&descriptor,
validation_data.max_pov_size,
&pov,
&validation_code.hash(),
);
assert!(check.is_ok());
descriptor
}
// Test that we vote valid if we get `AmbiguousWorkerDeath`, retry, and then succeed.
#[test]
fn candidate_validation_one_ambiguous_error_is_valid() {
@@ -505,24 +532,12 @@ fn candidate_validation_one_ambiguous_error_is_valid() {
let head_data = HeadData(vec![1, 1, 1]);
let validation_code = ValidationCode(vec![2; 16]);
let descriptor = make_valid_candidate_descriptor(
ParaId::from(1_u32),
dummy_hash(),
validation_data.hash(),
pov.hash(),
validation_code.hash(),
head_data.hash(),
dummy_hash(),
Sr25519Keyring::Alice,
);
let check = perform_basic_checks(
&descriptor,
validation_data.max_pov_size,
let descriptor = perform_basic_checks_on_valid_candidate(
&pov,
&validation_code.hash(),
&validation_code,
&validation_data,
head_data.hash(),
);
assert!(check.is_ok());
let validation_result = WasmValidationResult {
head_data,
@@ -554,7 +569,7 @@ fn candidate_validation_one_ambiguous_error_is_valid() {
candidate_receipt,
Arc::new(pov),
ExecutorParams::default(),
PvfExecTimeoutKind::Backing,
PvfExecKind::Approval,
&Default::default(),
))
.unwrap();
@@ -576,24 +591,12 @@ fn candidate_validation_multiple_ambiguous_errors_is_invalid() {
let pov = PoV { block_data: BlockData(vec![1; 32]) };
let validation_code = ValidationCode(vec![2; 16]);
let descriptor = make_valid_candidate_descriptor(
ParaId::from(1_u32),
dummy_hash(),
validation_data.hash(),
pov.hash(),
validation_code.hash(),
dummy_hash(),
dummy_hash(),
Sr25519Keyring::Alice,
);
let check = perform_basic_checks(
&descriptor,
validation_data.max_pov_size,
let descriptor = perform_basic_checks_on_valid_candidate(
&pov,
&validation_code.hash(),
&validation_code,
&validation_data,
dummy_hash(),
);
assert!(check.is_ok());
let candidate_receipt = CandidateReceipt { descriptor, commitments_hash: Hash::zero() };
@@ -607,7 +610,7 @@ fn candidate_validation_multiple_ambiguous_errors_is_invalid() {
candidate_receipt,
Arc::new(pov),
ExecutorParams::default(),
PvfExecTimeoutKind::Backing,
PvfExecKind::Approval,
&Default::default(),
))
.unwrap();
@@ -615,58 +618,79 @@ fn candidate_validation_multiple_ambiguous_errors_is_invalid() {
assert_matches!(v, ValidationResult::Invalid(InvalidCandidate::ExecutionError(_)));
}
// Test that we retry on internal errors.
// Test that we retry for approval on internal errors.
#[test]
fn candidate_validation_retry_internal_errors() {
let validation_data = PersistedValidationData { max_pov_size: 1024, ..Default::default() };
let pov = PoV { block_data: BlockData(vec![1; 32]) };
let validation_code = ValidationCode(vec![2; 16]);
let descriptor = make_valid_candidate_descriptor(
ParaId::from(1_u32),
dummy_hash(),
validation_data.hash(),
pov.hash(),
validation_code.hash(),
dummy_hash(),
dummy_hash(),
Sr25519Keyring::Alice,
let v = candidate_validation_retry_on_error_helper(
PvfExecKind::Approval,
vec![
Err(InternalValidationError::HostCommunication("foo".into()).into()),
// Throw an AJD error, we should still retry again.
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::AmbiguousJobDeath(
"baz".into(),
))),
// Throw another internal error.
Err(InternalValidationError::HostCommunication("bar".into()).into()),
],
);
assert_matches!(v, Err(ValidationFailed(s)) if s.contains("bar"));
}
let check = perform_basic_checks(
&descriptor,
validation_data.max_pov_size,
&pov,
&validation_code.hash(),
);
assert!(check.is_ok());
let candidate_receipt = CandidateReceipt { descriptor, commitments_hash: Hash::zero() };
let v = executor::block_on(validate_candidate_exhaustive(
MockValidateCandidateBackend::with_hardcoded_result_list(vec![
// Test that we don't retry for backing on internal errors.
#[test]
fn candidate_validation_dont_retry_internal_errors() {
let v = candidate_validation_retry_on_error_helper(
PvfExecKind::Backing,
vec![
Err(InternalValidationError::HostCommunication("foo".into()).into()),
// Throw an AWD error, we should still retry again.
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::AmbiguousWorkerDeath)),
// Throw another internal error.
Err(InternalValidationError::HostCommunication("bar".into()).into()),
]),
validation_data,
validation_code,
candidate_receipt,
Arc::new(pov),
ExecutorParams::default(),
PvfExecTimeoutKind::Backing,
&Default::default(),
));
],
);
assert_matches!(v, Err(ValidationFailed(s)) if s.contains("bar"));
assert_matches!(v, Err(ValidationFailed(s)) if s.contains("foo"));
}
// Test that we retry on panic errors.
// Test that we retry for approval on panic errors.
#[test]
fn candidate_validation_retry_panic_errors() {
let v = candidate_validation_retry_on_error_helper(
PvfExecKind::Approval,
vec![
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::JobError("foo".into()))),
// Throw an AWD error, we should still retry again.
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::AmbiguousWorkerDeath)),
// Throw another panic error.
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::JobError("bar".into()))),
],
);
assert_matches!(v, Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError(s))) if s == "bar".to_string());
}
// Test that we don't retry for backing on panic errors.
#[test]
fn candidate_validation_dont_retry_panic_errors() {
let v = candidate_validation_retry_on_error_helper(
PvfExecKind::Backing,
vec![
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::JobError("foo".into()))),
// Throw an AWD error, we should still retry again.
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::AmbiguousWorkerDeath)),
// Throw another panic error.
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::JobError("bar".into()))),
],
);
assert_matches!(v, Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError(s))) if s == "foo".to_string());
}
fn candidate_validation_retry_on_error_helper(
exec_kind: PvfExecKind,
mock_errors: Vec<Result<WasmValidationResult, ValidationError>>,
) -> Result<ValidationResult, ValidationFailed> {
let validation_data = PersistedValidationData { max_pov_size: 1024, ..Default::default() };
let pov = PoV { block_data: BlockData(vec![1; 32]) };
@@ -693,26 +717,16 @@ fn candidate_validation_retry_panic_errors() {
let candidate_receipt = CandidateReceipt { descriptor, commitments_hash: Hash::zero() };
let v = executor::block_on(validate_candidate_exhaustive(
MockValidateCandidateBackend::with_hardcoded_result_list(vec![
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::JobError("foo".into()))),
// Throw an AJD error, we should still retry again.
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::AmbiguousJobDeath(
"baz".into(),
))),
// Throw another panic error.
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::JobError("bar".into()))),
]),
return executor::block_on(validate_candidate_exhaustive(
MockValidateCandidateBackend::with_hardcoded_result_list(mock_errors),
validation_data,
validation_code,
candidate_receipt,
Arc::new(pov),
ExecutorParams::default(),
PvfExecTimeoutKind::Backing,
exec_kind,
&Default::default(),
));
assert_matches!(v, Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError(s))) if s == "bar".to_string());
}
#[test]
@@ -752,7 +766,7 @@ fn candidate_validation_timeout_is_internal_error() {
candidate_receipt,
Arc::new(pov),
ExecutorParams::default(),
PvfExecTimeoutKind::Backing,
PvfExecKind::Backing,
&Default::default(),
));
@@ -797,7 +811,7 @@ fn candidate_validation_commitment_hash_mismatch_is_invalid() {
candidate_receipt,
Arc::new(pov),
ExecutorParams::default(),
PvfExecTimeoutKind::Backing,
PvfExecKind::Backing,
&Default::default(),
))
.unwrap();
@@ -846,7 +860,7 @@ fn candidate_validation_code_mismatch_is_invalid() {
candidate_receipt,
Arc::new(pov),
ExecutorParams::default(),
PvfExecTimeoutKind::Backing,
PvfExecKind::Backing,
&Default::default(),
))
.unwrap();
@@ -903,7 +917,7 @@ fn compressed_code_works() {
candidate_receipt,
Arc::new(pov),
ExecutorParams::default(),
PvfExecTimeoutKind::Backing,
PvfExecKind::Backing,
&Default::default(),
));
@@ -954,7 +968,7 @@ fn code_decompression_failure_is_error() {
candidate_receipt,
Arc::new(pov),
ExecutorParams::default(),
PvfExecTimeoutKind::Backing,
PvfExecKind::Backing,
&Default::default(),
));
@@ -1006,7 +1020,7 @@ fn pov_decompression_failure_is_invalid() {
candidate_receipt,
Arc::new(pov),
ExecutorParams::default(),
PvfExecTimeoutKind::Backing,
PvfExecKind::Backing,
&Default::default(),
));