mirror of
https://github.com/pezkuwichain/pezkuwi-subxt.git
synced 2026-06-13 03:31:10 +00:00
pre-checking: Reject failed PVFs (#6492)
* pre-checking: Reject failed PVFs * paras: immediately reject any PVF that cannot reach a supermajority * Make the `quorum` reject condition a bit more clear semantically * Add comment * Update implementer's guide * Update a link Not related to the rest of the PR, but I randomly noticed and fixed this. * Update runtime/parachains/src/paras/tests.rs Co-authored-by: s0me0ne-unkn0wn <48632512+s0me0ne-unkn0wn@users.noreply.github.com> * Remove unneeded loop * Log PVF retries using `info!` * Change retry logs to `warn!` and add preparation failure log * Log PVF execution failure * Clarify why we reject failed PVFs * Fix PVF reject runtime benchmarks Co-authored-by: s0me0ne-unkn0wn <48632512+s0me0ne-unkn0wn@users.noreply.github.com>
This commit is contained in:
@@ -295,8 +295,7 @@ where
|
||||
_ => {
|
||||
// The reasoning why this is "failed" and not invalid is because we assume that
|
||||
// during pre-checking voting the relay-chain will pin the code. In case the code
|
||||
// actually is not there, we issue failed since this looks more like a bug. This
|
||||
// leads to us abstaining.
|
||||
// actually is not there, we issue failed since this looks more like a bug.
|
||||
gum::warn!(
|
||||
target: LOG_TARGET,
|
||||
?relay_parent,
|
||||
@@ -635,7 +634,7 @@ trait ValidationBackend {
|
||||
// Wait a brief delay before retrying.
|
||||
futures_timer::Delay::new(PVF_EXECUTION_RETRY_DELAY).await;
|
||||
|
||||
gum::debug!(
|
||||
gum::warn!(
|
||||
target: LOG_TARGET,
|
||||
?pvf,
|
||||
"Re-trying failed candidate validation due to AmbiguousWorkerDeath."
|
||||
|
||||
@@ -190,16 +190,18 @@ async fn handle_pvf_check(
|
||||
PreCheckOutcome::Valid => Judgement::Valid,
|
||||
PreCheckOutcome::Invalid => Judgement::Invalid,
|
||||
PreCheckOutcome::Failed => {
|
||||
// Abstain.
|
||||
// Always vote against in case of failures. Voting against a PVF when encountering a
|
||||
// timeout (or an unlikely node-specific issue) can be considered safe, since
|
||||
// there is no slashing for being on the wrong side on a pre-check vote.
|
||||
//
|
||||
// Returning here will leave the PVF in the view dangling. Since it is there, no new
|
||||
// pre-checking request will be sent.
|
||||
// Also, by being more strict here, we can safely be more lenient during preparation and
|
||||
// avoid the risk of getting slashed there.
|
||||
gum::info!(
|
||||
target: LOG_TARGET,
|
||||
?validation_code_hash,
|
||||
"Pre-check failed, abstaining from voting",
|
||||
"Pre-check failed, voting against",
|
||||
);
|
||||
return
|
||||
Judgement::Invalid
|
||||
},
|
||||
};
|
||||
|
||||
|
||||
@@ -157,7 +157,7 @@ impl TestState {
|
||||
self.active_leaves_update(handle, Some(leaf), None, &[]).await
|
||||
}
|
||||
|
||||
async fn deactive_leaves(
|
||||
async fn deactivate_leaves(
|
||||
&mut self,
|
||||
handle: &mut VirtualOverseer,
|
||||
deactivated: impl IntoIterator<Item = &Hash>,
|
||||
@@ -894,8 +894,8 @@ fn unexpected_pvf_check_judgement() {
|
||||
// Catch the pre-check request, but don't reply just yet.
|
||||
let pre_check = test_state.expect_candidate_precheck(&mut handle).await;
|
||||
|
||||
// Now deactive the leaf and reply to the precheck request.
|
||||
test_state.deactive_leaves(&mut handle, &[block_1.block_hash]).await;
|
||||
// Now deactivate the leaf and reply to the precheck request.
|
||||
test_state.deactivate_leaves(&mut handle, &[block_1.block_hash]).await;
|
||||
pre_check.reply(PreCheckOutcome::Invalid);
|
||||
|
||||
// the subsystem must remain silent.
|
||||
@@ -906,14 +906,17 @@ fn unexpected_pvf_check_judgement() {
|
||||
});
|
||||
}
|
||||
|
||||
// Check that we do not abstain for a nondeterministic failure. Currently, this means the behavior
|
||||
// is the same as if the pre-check returned `PreCheckOutcome::Invalid`.
|
||||
#[test]
|
||||
fn abstain_for_nondeterministic_pvfcheck_failure() {
|
||||
fn dont_abstain_for_nondeterministic_pvfcheck_failure() {
|
||||
test_harness(|mut test_state, mut handle| {
|
||||
async move {
|
||||
let block_1 = FakeLeaf::new(dummy_hash(), 1, vec![dummy_validation_code_hash(1)]);
|
||||
test_state
|
||||
.activate_leaf_with_session(
|
||||
&mut handle,
|
||||
FakeLeaf::new(dummy_hash(), 1, vec![dummy_validation_code_hash(1)]),
|
||||
block_1.clone(),
|
||||
StartsNewSession { session_index: 2, validators: vec![OUR_VALIDATOR] },
|
||||
)
|
||||
.await;
|
||||
@@ -922,10 +925,14 @@ fn abstain_for_nondeterministic_pvfcheck_failure() {
|
||||
test_state.expect_session_for_child(&mut handle).await;
|
||||
test_state.expect_validators(&mut handle).await;
|
||||
|
||||
test_state
|
||||
.expect_candidate_precheck(&mut handle)
|
||||
.await
|
||||
.reply(PreCheckOutcome::Failed);
|
||||
// Catch the pre-check request, but don't reply just yet.
|
||||
let pre_check = test_state.expect_candidate_precheck(&mut handle).await;
|
||||
|
||||
// Now deactivate the leaf and reply to the precheck request.
|
||||
test_state.deactivate_leaves(&mut handle, &[block_1.block_hash]).await;
|
||||
pre_check.reply(PreCheckOutcome::Failed);
|
||||
|
||||
// the subsystem must remain silent.
|
||||
|
||||
test_state.send_conclude(&mut handle).await;
|
||||
}
|
||||
|
||||
@@ -223,32 +223,48 @@ fn handle_job_finish(
|
||||
artifact_id: ArtifactId,
|
||||
result_tx: ResultSender,
|
||||
) {
|
||||
let (idle_worker, result) = match outcome {
|
||||
Outcome::Ok { result_descriptor, duration: _, idle_worker } => {
|
||||
let (idle_worker, result, duration) = match outcome {
|
||||
Outcome::Ok { result_descriptor, duration, idle_worker } => {
|
||||
// TODO: propagate the soft timeout
|
||||
|
||||
(Some(idle_worker), Ok(result_descriptor))
|
||||
(Some(idle_worker), Ok(result_descriptor), Some(duration))
|
||||
},
|
||||
Outcome::InvalidCandidate { err, idle_worker } => (
|
||||
Some(idle_worker),
|
||||
Err(ValidationError::InvalidCandidate(InvalidCandidate::WorkerReportedError(err))),
|
||||
None,
|
||||
),
|
||||
Outcome::InternalError { err, idle_worker } =>
|
||||
(Some(idle_worker), Err(ValidationError::InternalError(err))),
|
||||
(Some(idle_worker), Err(ValidationError::InternalError(err)), None),
|
||||
Outcome::HardTimeout =>
|
||||
(None, Err(ValidationError::InvalidCandidate(InvalidCandidate::HardTimeout))),
|
||||
Outcome::IoErr =>
|
||||
(None, Err(ValidationError::InvalidCandidate(InvalidCandidate::AmbiguousWorkerDeath))),
|
||||
(None, Err(ValidationError::InvalidCandidate(InvalidCandidate::HardTimeout)), None),
|
||||
Outcome::IoErr => (
|
||||
None,
|
||||
Err(ValidationError::InvalidCandidate(InvalidCandidate::AmbiguousWorkerDeath)),
|
||||
None,
|
||||
),
|
||||
};
|
||||
|
||||
queue.metrics.execute_finished();
|
||||
gum::debug!(
|
||||
target: LOG_TARGET,
|
||||
validation_code_hash = ?artifact_id.code_hash,
|
||||
?worker,
|
||||
worker_rip = idle_worker.is_none(),
|
||||
"execute worker concluded",
|
||||
);
|
||||
if let Err(ref err) = result {
|
||||
gum::warn!(
|
||||
target: LOG_TARGET,
|
||||
?artifact_id,
|
||||
?worker,
|
||||
worker_rip = idle_worker.is_none(),
|
||||
"execution worker concluded, error occurred: {:?}",
|
||||
err
|
||||
);
|
||||
} else {
|
||||
gum::debug!(
|
||||
target: LOG_TARGET,
|
||||
?artifact_id,
|
||||
?worker,
|
||||
worker_rip = idle_worker.is_none(),
|
||||
?duration,
|
||||
"execute worker concluded successfully",
|
||||
);
|
||||
}
|
||||
|
||||
// First we send the result. It may fail due to the other end of the channel being dropped,
|
||||
// that's legitimate and we don't treat that as an error.
|
||||
|
||||
@@ -525,7 +525,7 @@ async fn handle_execute_pvf(
|
||||
},
|
||||
ArtifactState::FailedToProcess { last_time_failed, num_failures, error } => {
|
||||
if can_retry_prepare_after_failure(*last_time_failed, *num_failures, error) {
|
||||
gum::debug!(
|
||||
gum::warn!(
|
||||
target: LOG_TARGET,
|
||||
?pvf,
|
||||
?artifact_id,
|
||||
@@ -599,7 +599,7 @@ async fn handle_heads_up(
|
||||
},
|
||||
ArtifactState::FailedToProcess { last_time_failed, num_failures, error } => {
|
||||
if can_retry_prepare_after_failure(*last_time_failed, *num_failures, error) {
|
||||
gum::debug!(
|
||||
gum::warn!(
|
||||
target: LOG_TARGET,
|
||||
?active_pvf,
|
||||
?artifact_id,
|
||||
@@ -728,18 +728,18 @@ async fn handle_prepare_done(
|
||||
Ok(cpu_time_elapsed) =>
|
||||
ArtifactState::Prepared { last_time_needed: SystemTime::now(), cpu_time_elapsed },
|
||||
Err(error) => {
|
||||
gum::debug!(
|
||||
let last_time_failed = SystemTime::now();
|
||||
let num_failures = *num_failures + 1;
|
||||
|
||||
gum::warn!(
|
||||
target: LOG_TARGET,
|
||||
artifact_id = ?artifact_id,
|
||||
num_failures = ?num_failures,
|
||||
"Failed to process artifact: {}",
|
||||
?artifact_id,
|
||||
time_failed = ?last_time_failed,
|
||||
%num_failures,
|
||||
"artifact preparation failed: {}",
|
||||
error
|
||||
);
|
||||
ArtifactState::FailedToProcess {
|
||||
last_time_failed: SystemTime::now(),
|
||||
num_failures: *num_failures + 1,
|
||||
error,
|
||||
}
|
||||
ArtifactState::FailedToProcess { last_time_failed, num_failures, error }
|
||||
},
|
||||
};
|
||||
|
||||
|
||||
Reference in New Issue
Block a user