PVF preparation: do not conflate errors (#6384)

* PVF preparation: do not conflate errors

+ Adds some more granularity to the prepare errors.
+ Better distinguish whether errors occur on the host side or the worker.
+ Do not kill the worker if the error happened on the host side.
+ Do not retry preparation if the error was `Panic`.
+ Removes unnecessary indirection with `Selected` type.

* Add missing docs, resolve TODOs

* Address review comments and remove TODOs

* Fix error in CI

* Undo unnecessary change

* Update a couple of comments

* Don't return error for stream shutdown

* Update node/core/pvf/src/worker_common.rs
This commit is contained in:
Marcin S
2022-12-20 08:32:12 -05:00
committed by GitHub
parent fcc26d42e4
commit e0a0475a05
8 changed files with 173 additions and 108 deletions
+8 -9
View File
@@ -776,16 +776,15 @@ fn can_retry_prepare_after_failure(
num_failures: u32,
error: &PrepareError,
) -> bool {
use PrepareError::*;
match error {
// Gracefully returned an error, so it will probably be reproducible. Don't retry.
Prevalidation(_) | Preparation(_) => false,
// Retry if the retry cooldown has elapsed and if we have already retried less than
// `NUM_PREPARE_RETRIES` times. IO errors may resolve themselves.
Panic(_) | TimedOut | DidNotMakeIt =>
SystemTime::now() >= last_time_failed + PREPARE_FAILURE_COOLDOWN &&
num_failures <= NUM_PREPARE_RETRIES,
if error.is_deterministic() {
// This error is considered deterministic, so it will probably be reproducible. Don't retry.
return false
}
// Retry if the retry cooldown has elapsed and if we have already retried less than `NUM_PREPARE_RETRIES` times. IO
// errors may resolve themselves.
SystemTime::now() >= last_time_failed + PREPARE_FAILURE_COOLDOWN &&
num_failures <= NUM_PREPARE_RETRIES
}
/// A stream that yields a pulse continuously at a given interval.