diff --git a/polkadot/node/core/pvf/src/prepare/pool.rs b/polkadot/node/core/pvf/src/prepare/pool.rs index 618c71e253..ca7b6f65d5 100644 --- a/polkadot/node/core/pvf/src/prepare/pool.rs +++ b/polkadot/node/core/pvf/src/prepare/pool.rs @@ -60,8 +60,11 @@ pub enum ToPool { /// Request the given worker to start working on the given code. /// /// Once the job either succeeded or failed, a [`FromPool::Concluded`] message will be sent back. + /// It's also possible that the worker dies before handling the message in which case [`FromPool::Rip`] + /// will be sent back. /// - /// This should not be sent again until the concluded message is received. + /// In either case, the worker is considered busy and no further `StartWork` messages should be + /// sent until either `Concluded` or `Rip` message is received. StartWork { worker: Worker, code: Arc>, @@ -176,8 +179,9 @@ async fn purge_dead( } } for w in to_remove { - let _ = spawned.remove(w); - reply(from_pool, FromPool::Rip(w))?; + if spawned.remove(w).is_some() { + reply(from_pool, FromPool::Rip(w))?; + } } Ok(()) } @@ -308,8 +312,15 @@ fn handle_mux( Ok(()) } + Outcome::Unreachable => { + if spawned.remove(worker).is_some() { + reply(from_pool, FromPool::Rip(worker))?; + } + + Ok(()) + } Outcome::DidntMakeIt => { - if let Some(_data) = spawned.remove(worker) { + if spawned.remove(worker).is_some() { reply(from_pool, FromPool::Concluded(worker, true))?; } diff --git a/polkadot/node/core/pvf/src/prepare/worker.rs b/polkadot/node/core/pvf/src/prepare/worker.rs index 307396b01a..a7854e83e6 100644 --- a/polkadot/node/core/pvf/src/prepare/worker.rs +++ b/polkadot/node/core/pvf/src/prepare/worker.rs @@ -55,6 +55,9 @@ pub async fn spawn( pub enum Outcome { /// The worker has finished the work assigned to it. Concluded(IdleWorker), + /// The host tried to reach the worker but failed. This is most likely because the worker was + /// killed by the system. 
+ Unreachable, /// The execution was interrupted abruptly and the worker is not available anymore. For example, /// this could've happen because the worker hadn't finished the work until the given deadline. /// @@ -96,7 +99,7 @@ pub async fn start_work( "failed to send a prepare request: {:?}", err, ); - return Outcome::DidntMakeIt; + return Outcome::Unreachable; } // Wait for the result from the worker, keeping in mind that there may be a timeout, the