pre-checking: Reject failed PVFs (#6492)

* pre-checking: Reject failed PVFs

* paras: immediately reject any PVF that cannot reach a supermajority

* Make the `quorum` reject condition a bit more clear semantically

* Add comment

* Update implementer's guide

* Update a link

Not related to the rest of the PR, but I randomly noticed and fixed this.

* Update runtime/parachains/src/paras/tests.rs

Co-authored-by: s0me0ne-unkn0wn <48632512+s0me0ne-unkn0wn@users.noreply.github.com>

* Remove unneeded loop

* Log PVF retries using `info!`

* Change retry logs to `warn!` and add preparation failure log

* Log PVF execution failure

* Clarify why we reject failed PVFs

* Fix PVF reject runtime benchmarks

Co-authored-by: s0me0ne-unkn0wn <48632512+s0me0ne-unkn0wn@users.noreply.github.com>
This commit is contained in:
Marcin S
2023-01-12 04:24:42 -05:00
committed by GitHub
parent ae74d33a93
commit 2efa3bab98
11 changed files with 126 additions and 74 deletions
@@ -295,8 +295,7 @@ where
_ => {
// The reasoning why this is "failed" and not invalid is because we assume that
// during pre-checking voting the relay-chain will pin the code. In case the code
// actually is not there, we issue failed since this looks more like a bug. This
// leads to us abstaining.
// actually is not there, we issue failed since this looks more like a bug.
gum::warn!(
target: LOG_TARGET,
?relay_parent,
@@ -635,7 +634,7 @@ trait ValidationBackend {
// Wait a brief delay before retrying.
futures_timer::Delay::new(PVF_EXECUTION_RETRY_DELAY).await;
gum::debug!(
gum::warn!(
target: LOG_TARGET,
?pvf,
"Re-trying failed candidate validation due to AmbiguousWorkerDeath."
+7 -5
View File
@@ -190,16 +190,18 @@ async fn handle_pvf_check(
PreCheckOutcome::Valid => Judgement::Valid,
PreCheckOutcome::Invalid => Judgement::Invalid,
PreCheckOutcome::Failed => {
// Abstain.
// Always vote against in case of failures. Voting against a PVF when encountering a
// timeout (or an unlikely node-specific issue) can be considered safe, since
// there is no slashing for being on the wrong side on a pre-check vote.
//
// Returning here will leave the PVF in the view dangling. Since it is there, no new
// pre-checking request will be sent.
// Also, by being more strict here, we can safely be more lenient during preparation and
// avoid the risk of getting slashed there.
gum::info!(
target: LOG_TARGET,
?validation_code_hash,
"Pre-check failed, abstaining from voting",
"Pre-check failed, voting against",
);
return
Judgement::Invalid
},
};
+16 -9
View File
@@ -157,7 +157,7 @@ impl TestState {
self.active_leaves_update(handle, Some(leaf), None, &[]).await
}
async fn deactive_leaves(
async fn deactivate_leaves(
&mut self,
handle: &mut VirtualOverseer,
deactivated: impl IntoIterator<Item = &Hash>,
@@ -894,8 +894,8 @@ fn unexpected_pvf_check_judgement() {
// Catch the pre-check request, but don't reply just yet.
let pre_check = test_state.expect_candidate_precheck(&mut handle).await;
// Now deactive the leaf and reply to the precheck request.
test_state.deactive_leaves(&mut handle, &[block_1.block_hash]).await;
// Now deactivate the leaf and reply to the precheck request.
test_state.deactivate_leaves(&mut handle, &[block_1.block_hash]).await;
pre_check.reply(PreCheckOutcome::Invalid);
// the subsystem must remain silent.
@@ -906,14 +906,17 @@ fn unexpected_pvf_check_judgement() {
});
}
// Check that we do not abstain for a nondeterministic failure. Currently, this means the behavior
// is the same as if the pre-check returned `PreCheckOutcome::Invalid`.
#[test]
fn abstain_for_nondeterministic_pvfcheck_failure() {
fn dont_abstain_for_nondeterministic_pvfcheck_failure() {
test_harness(|mut test_state, mut handle| {
async move {
let block_1 = FakeLeaf::new(dummy_hash(), 1, vec![dummy_validation_code_hash(1)]);
test_state
.activate_leaf_with_session(
&mut handle,
FakeLeaf::new(dummy_hash(), 1, vec![dummy_validation_code_hash(1)]),
block_1.clone(),
StartsNewSession { session_index: 2, validators: vec![OUR_VALIDATOR] },
)
.await;
@@ -922,10 +925,14 @@ fn abstain_for_nondeterministic_pvfcheck_failure() {
test_state.expect_session_for_child(&mut handle).await;
test_state.expect_validators(&mut handle).await;
test_state
.expect_candidate_precheck(&mut handle)
.await
.reply(PreCheckOutcome::Failed);
// Catch the pre-check request, but don't reply just yet.
let pre_check = test_state.expect_candidate_precheck(&mut handle).await;
// Now deactivate the leaf and reply to the precheck request.
test_state.deactivate_leaves(&mut handle, &[block_1.block_hash]).await;
pre_check.reply(PreCheckOutcome::Failed);
// the subsystem must remain silent.
test_state.send_conclude(&mut handle).await;
}
+30 -14
View File
@@ -223,32 +223,48 @@ fn handle_job_finish(
artifact_id: ArtifactId,
result_tx: ResultSender,
) {
let (idle_worker, result) = match outcome {
Outcome::Ok { result_descriptor, duration: _, idle_worker } => {
let (idle_worker, result, duration) = match outcome {
Outcome::Ok { result_descriptor, duration, idle_worker } => {
// TODO: propagate the soft timeout
(Some(idle_worker), Ok(result_descriptor))
(Some(idle_worker), Ok(result_descriptor), Some(duration))
},
Outcome::InvalidCandidate { err, idle_worker } => (
Some(idle_worker),
Err(ValidationError::InvalidCandidate(InvalidCandidate::WorkerReportedError(err))),
None,
),
Outcome::InternalError { err, idle_worker } =>
(Some(idle_worker), Err(ValidationError::InternalError(err))),
(Some(idle_worker), Err(ValidationError::InternalError(err)), None),
Outcome::HardTimeout =>
(None, Err(ValidationError::InvalidCandidate(InvalidCandidate::HardTimeout))),
Outcome::IoErr =>
(None, Err(ValidationError::InvalidCandidate(InvalidCandidate::AmbiguousWorkerDeath))),
(None, Err(ValidationError::InvalidCandidate(InvalidCandidate::HardTimeout)), None),
Outcome::IoErr => (
None,
Err(ValidationError::InvalidCandidate(InvalidCandidate::AmbiguousWorkerDeath)),
None,
),
};
queue.metrics.execute_finished();
gum::debug!(
target: LOG_TARGET,
validation_code_hash = ?artifact_id.code_hash,
?worker,
worker_rip = idle_worker.is_none(),
"execute worker concluded",
);
if let Err(ref err) = result {
gum::warn!(
target: LOG_TARGET,
?artifact_id,
?worker,
worker_rip = idle_worker.is_none(),
"execution worker concluded, error occurred: {:?}",
err
);
} else {
gum::debug!(
target: LOG_TARGET,
?artifact_id,
?worker,
worker_rip = idle_worker.is_none(),
?duration,
"execute worker concluded successfully",
);
}
// First we send the result. It may fail due to the other end of the channel being dropped,
// that's legitimate and we don't treat that as an error.
+11 -11
View File
@@ -525,7 +525,7 @@ async fn handle_execute_pvf(
},
ArtifactState::FailedToProcess { last_time_failed, num_failures, error } => {
if can_retry_prepare_after_failure(*last_time_failed, *num_failures, error) {
gum::debug!(
gum::warn!(
target: LOG_TARGET,
?pvf,
?artifact_id,
@@ -599,7 +599,7 @@ async fn handle_heads_up(
},
ArtifactState::FailedToProcess { last_time_failed, num_failures, error } => {
if can_retry_prepare_after_failure(*last_time_failed, *num_failures, error) {
gum::debug!(
gum::warn!(
target: LOG_TARGET,
?active_pvf,
?artifact_id,
@@ -728,18 +728,18 @@ async fn handle_prepare_done(
Ok(cpu_time_elapsed) =>
ArtifactState::Prepared { last_time_needed: SystemTime::now(), cpu_time_elapsed },
Err(error) => {
gum::debug!(
let last_time_failed = SystemTime::now();
let num_failures = *num_failures + 1;
gum::warn!(
target: LOG_TARGET,
artifact_id = ?artifact_id,
num_failures = ?num_failures,
"Failed to process artifact: {}",
?artifact_id,
time_failed = ?last_time_failed,
%num_failures,
"artifact preparation failed: {}",
error
);
ArtifactState::FailedToProcess {
last_time_failed: SystemTime::now(),
num_failures: *num_failures + 1,
error,
}
ArtifactState::FailedToProcess { last_time_failed, num_failures, error }
},
};