Log PVF retries (#6504)

This commit is contained in:
Marcin S
2023-01-04 17:18:41 -05:00
committed by GitHub
parent 6ba23078c9
commit f0106b30fa
2 changed files with 35 additions and 5 deletions
@@ -604,6 +604,7 @@ async fn validate_candidate_exhaustive(
#[async_trait]
trait ValidationBackend {
/// Tries executing a PVF a single time (no retries).
async fn validate_candidate(
&mut self,
pvf: Pvf,
@@ -611,6 +612,8 @@ trait ValidationBackend {
encoded_params: Vec<u8>,
) -> Result<WasmValidationResult, ValidationError>;
/// Tries executing a PVF. Will retry once if an error is encountered that may have been
/// transient.
async fn validate_candidate_with_retry(
&mut self,
raw_validation_code: Vec<u8>,
@@ -620,7 +623,7 @@ trait ValidationBackend {
// Construct the PVF a single time, since it is an expensive operation. Cloning it is cheap.
let pvf = Pvf::from_code(raw_validation_code);
let validation_result =
let mut validation_result =
self.validate_candidate(pvf.clone(), timeout, params.encode()).await;
// If we get an AmbiguousWorkerDeath error, retry once after a brief delay, on the
@@ -630,12 +633,19 @@ trait ValidationBackend {
{
// Wait a brief delay before retrying.
futures_timer::Delay::new(PVF_EXECUTION_RETRY_DELAY).await;
gum::debug!(
target: LOG_TARGET,
?pvf,
"Re-trying failed candidate validation due to AmbiguousWorkerDeath."
);
// Encode the params again when re-trying. We expect the retry case to be relatively
// rare, and we want to avoid unconditionally cloning data.
self.validate_candidate(pvf, timeout, params.encode()).await
} else {
validation_result
validation_result = self.validate_candidate(pvf, timeout, params.encode()).await;
}
validation_result
}
async fn precheck_pvf(&mut self, pvf: Pvf) -> Result<Duration, PrepareError>;
+21 -1
View File
@@ -525,6 +525,16 @@ async fn handle_execute_pvf(
},
ArtifactState::FailedToProcess { last_time_failed, num_failures, error } => {
if can_retry_prepare_after_failure(*last_time_failed, *num_failures, error) {
gum::debug!(
target: LOG_TARGET,
?pvf,
?artifact_id,
?last_time_failed,
%num_failures,
%error,
"handle_execute_pvf: Re-trying failed PVF preparation."
);
// If we are allowed to retry the failed prepare job, change the state to
// Preparing and re-queue this job.
*state = ArtifactState::Preparing {
@@ -585,6 +595,16 @@ async fn handle_heads_up(
},
ArtifactState::FailedToProcess { last_time_failed, num_failures, error } => {
if can_retry_prepare_after_failure(*last_time_failed, *num_failures, error) {
gum::debug!(
target: LOG_TARGET,
?active_pvf,
?artifact_id,
?last_time_failed,
%num_failures,
%error,
"handle_heads_up: Re-trying failed PVF preparation."
);
// If we are allowed to retry the failed prepare job, change the state to
// Preparing and re-queue this job.
*state = ArtifactState::Preparing {
@@ -1393,7 +1413,7 @@ mod tests {
}
// Test that multiple execution requests don't trigger preparation retries if the first one
// failed due to reproducible error (e.g. Prevalidation).
// failed due to a reproducible error (e.g. Prevalidation).
#[async_std::test]
async fn test_execute_prepare_no_retry() {
let mut test = Builder::default().build();