PVF: unresponsive worker doesn't mean the candidate is bad (#3418)

* PVF: unresponsive worker doesn't mean the candidate is bad

* s/if let Some/.is_some
This commit is contained in:
Sergei Shulepov
2021-07-07 11:28:07 +03:00
committed by GitHub
parent b71ec24815
commit b7b2276555
2 changed files with 19 additions and 5 deletions
+15 -4
View File
@@ -60,8 +60,11 @@ pub enum ToPool {
/// Request the given worker to start working on the given code.
///
/// Once the job either succeeded or failed, a [`FromPool::Concluded`] message will be sent back.
/// It's also possible that the worker dies before handling the message, in which case
/// [`FromPool::Rip`] will be sent back.
///
/// This should not be sent again until the concluded message is received.
/// In either case, the worker is considered busy and no further `StartWork` messages should be
/// sent until either a `Concluded` or a `Rip` message is received.
StartWork {
worker: Worker,
code: Arc<Vec<u8>>,
@@ -176,8 +179,9 @@ async fn purge_dead(
}
}
for w in to_remove {
let _ = spawned.remove(w);
reply(from_pool, FromPool::Rip(w))?;
if spawned.remove(w).is_some() {
reply(from_pool, FromPool::Rip(w))?;
}
}
Ok(())
}
@@ -308,8 +312,15 @@ fn handle_mux(
Ok(())
}
Outcome::Unreachable => {
if spawned.remove(worker).is_some() {
reply(from_pool, FromPool::Rip(worker))?;
}
Ok(())
}
Outcome::DidntMakeIt => {
if let Some(_data) = spawned.remove(worker) {
if spawned.remove(worker).is_some() {
reply(from_pool, FromPool::Concluded(worker, true))?;
}
+4 -1
View File
@@ -55,6 +55,9 @@ pub async fn spawn(
pub enum Outcome {
/// The worker has finished the work assigned to it.
Concluded(IdleWorker),
/// The host tried to reach the worker but failed. This is most likely because the worker was
/// killed by the system.
Unreachable,
/// The execution was interrupted abruptly and the worker is not available anymore. For example,
/// this could've happened because the worker hadn't finished the work by the given deadline.
///
@@ -96,7 +99,7 @@ pub async fn start_work(
"failed to send a prepare request: {:?}",
err,
);
return Outcome::DidntMakeIt;
return Outcome::Unreachable;
}
// Wait for the result from the worker, keeping in mind that there may be a timeout, the