PVF: unresponsive worker doesn't mean the candidate is bad (#3418)

* PVF: unresponsive worker doesn't mean the candidate is bad

* s/if let Some/.is_some
This commit is contained in:
Sergei Shulepov
2021-07-07 11:28:07 +03:00
committed by GitHub
parent b71ec24815
commit b7b2276555
2 changed files with 19 additions and 5 deletions
+15 -4
View File
@@ -60,8 +60,11 @@ pub enum ToPool {
/// Request the given worker to start working on the given code. /// Request the given worker to start working on the given code.
/// ///
/// Once the job either succeeded or failed, a [`FromPool::Concluded`] message will be sent back. /// Once the job either succeeded or failed, a [`FromPool::Concluded`] message will be sent back.
/// It's also possible that the worker dies before handling the message in which case [`FromPool::Rip`]
/// will be sent back.
/// ///
/// This should not be sent again until the concluded message is received. /// In either case, the worker is considered busy and no further `StartWork` messages should be
/// sent until either `Concluded` or `Rip` message is received.
StartWork { StartWork {
worker: Worker, worker: Worker,
code: Arc<Vec<u8>>, code: Arc<Vec<u8>>,
@@ -176,8 +179,9 @@ async fn purge_dead(
} }
} }
for w in to_remove { for w in to_remove {
let _ = spawned.remove(w); if spawned.remove(w).is_some() {
reply(from_pool, FromPool::Rip(w))?; reply(from_pool, FromPool::Rip(w))?;
}
} }
Ok(()) Ok(())
} }
@@ -308,8 +312,15 @@ fn handle_mux(
Ok(()) Ok(())
} }
Outcome::Unreachable => {
if spawned.remove(worker).is_some() {
reply(from_pool, FromPool::Rip(worker))?;
}
Ok(())
}
Outcome::DidntMakeIt => { Outcome::DidntMakeIt => {
if let Some(_data) = spawned.remove(worker) { if spawned.remove(worker).is_some() {
reply(from_pool, FromPool::Concluded(worker, true))?; reply(from_pool, FromPool::Concluded(worker, true))?;
} }
+4 -1
View File
@@ -55,6 +55,9 @@ pub async fn spawn(
pub enum Outcome { pub enum Outcome {
/// The worker has finished the work assigned to it. /// The worker has finished the work assigned to it.
Concluded(IdleWorker), Concluded(IdleWorker),
/// The host tried to reach the worker but failed. This is most likely because the worked was
/// killed by the system.
Unreachable,
/// The execution was interrupted abruptly and the worker is not available anymore. For example, /// The execution was interrupted abruptly and the worker is not available anymore. For example,
/// this could've happen because the worker hadn't finished the work until the given deadline. /// this could've happen because the worker hadn't finished the work until the given deadline.
/// ///
@@ -96,7 +99,7 @@ pub async fn start_work(
"failed to send a prepare request: {:?}", "failed to send a prepare request: {:?}",
err, err,
); );
return Outcome::DidntMakeIt; return Outcome::Unreachable;
} }
// Wait for the result from the worker, keeping in mind that there may be a timeout, the // Wait for the result from the worker, keeping in mind that there may be a timeout, the