mirror of
https://github.com/pezkuwichain/pezkuwi-subxt.git
synced 2026-05-31 12:11:02 +00:00
New PVF validation host (#2710)
* Implement PVF validation host * WIP: Diener * Increase the alloted compilation time * Add more comments * Minor clean up * Apply suggestions from code review Co-authored-by: Bastian Köcher <bkchr@users.noreply.github.com> * Fix pruning artifact removal * Fix formatting and newlines * Fix the thread pool * Update node/core/pvf/src/executor_intf.rs Co-authored-by: Bastian Köcher <bkchr@users.noreply.github.com> * Remove redundant test declaration * Don't convert the path into an intermediate string * Try to workaround the test failure * Use the puppet_worker trick again * Fix a blip * Move `ensure_wasmtime_version` under the tests mod * Add a macro for puppet_workers * fix build for not real-overseer * Rename the puppet worker for adder collator * play it safe with the name of adder puppet worker * Typo: triggered * Add more comments * Do not kill exec worker on every error * Plumb Duration for timeouts * typo: critical * Add proofs * Clean unused imports * Revert "WIP: Diener" This reverts commit b9f54e513366c7a6dfdd117ac19fbdc46b900b4d. * Sync version of wasmtime * Update cargo.lock * Update Substrate * Merge fixes still * Update wasmtime version in test * bastifmt Co-authored-by: Bastian Köcher <bkchr@users.noreply.github.com> * Squash spaces * Trailing new line for testing.rs * Remove controversial code * comment about biasing * Fix suggestion * Add comments * make it more clear why unwrap_err * tmpfile retry * proper proofs for claim_idle * Remove mutex from ValidationHost * Add some more logging * Extract exec timeout into a constant * Add some clarifying logging * Use blake2_256 * Clean up the merge Specifically the leftovers after removing real-overseer * Update parachain/test-parachains/adder/collator/Cargo.toml Co-authored-by: Andronik Ordian <write@reusable.software> Co-authored-by: Bastian Köcher <bkchr@users.noreply.github.com> Co-authored-by: Andronik Ordian <write@reusable.software>
This commit is contained in:
@@ -0,0 +1,31 @@
|
||||
// Copyright 2021 Parity Technologies (UK) Ltd.
|
||||
// This file is part of Polkadot.
|
||||
|
||||
// Polkadot is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
|
||||
// Polkadot is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with Polkadot. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
//! Preparation part of pipeline
|
||||
//!
|
||||
//! The validation host spins up two processes: the queue (by running [`start_queue`]) and the pool
|
||||
//! (by running [`start_pool`]).
|
||||
//!
|
||||
//! The pool will spawn workers in new processes, and those processes should pass control to
|
||||
//! [`worker_entrypoint`].
|
||||
|
||||
mod pool;
|
||||
mod queue;
|
||||
mod worker;
|
||||
|
||||
pub use queue::{ToQueue, FromQueue, start as start_queue};
|
||||
pub use pool::start as start_pool;
|
||||
pub use worker::worker_entrypoint;
|
||||
@@ -0,0 +1,336 @@
|
||||
// Copyright 2021 Parity Technologies (UK) Ltd.
|
||||
// This file is part of Polkadot.
|
||||
|
||||
// Polkadot is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
|
||||
// Polkadot is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with Polkadot. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
use crate::{
|
||||
worker_common::{IdleWorker, WorkerHandle},
|
||||
LOG_TARGET,
|
||||
};
|
||||
use super::{
|
||||
worker::{self, Outcome},
|
||||
};
|
||||
use std::{fmt, sync::Arc, task::Poll, time::Duration};
|
||||
use async_std::path::{Path, PathBuf};
|
||||
use futures::{
|
||||
Future, FutureExt, StreamExt, channel::mpsc, future::BoxFuture, stream::FuturesUnordered,
|
||||
};
|
||||
use slotmap::HopSlotMap;
|
||||
use assert_matches::assert_matches;
|
||||
use always_assert::never;
|
||||
|
||||
slotmap::new_key_type! {
    /// An opaque key under which the pool keeps track of a spawned worker.
    pub struct Worker;
}
|
||||
|
||||
/// Messages that the pool handles.
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
pub enum ToPool {
|
||||
/// Request a new worker to spawn.
|
||||
///
|
||||
/// This request won't fail in case if the worker cannot be created. Instead, we consider
|
||||
/// the failures transient and we try to spawn a worker after a delay.
|
||||
///
|
||||
/// [`FromPool::Spawned`] will be returned as soon as the worker is spawned.
|
||||
///
|
||||
/// The client should anticipate a [`FromPool::Rip`] message, in case the spawned worker was
|
||||
/// stopped for some reason.
|
||||
Spawn,
|
||||
|
||||
/// Kill the given worker. No-op if the given worker is not running.
|
||||
///
|
||||
/// [`FromPool::Rip`] won't be sent in this case. However, the client should be prepared to
|
||||
/// receive [`FromPool::Rip`] nonetheless, since the worker may be have been ripped before
|
||||
/// this message is processed.
|
||||
Kill(Worker),
|
||||
|
||||
/// If the given worker was started with the background priority, then it will be raised up to
|
||||
/// normal priority. Otherwise, it's no-op.
|
||||
BumpPriority(Worker),
|
||||
|
||||
/// Request the given worker to start working on the given code.
|
||||
///
|
||||
/// Once the job either succeeded or failed, a [`FromPool::Concluded`] message will be sent back.
|
||||
///
|
||||
/// This should not be sent again until the concluded message is received.
|
||||
StartWork {
|
||||
worker: Worker,
|
||||
code: Arc<Vec<u8>>,
|
||||
artifact_path: PathBuf,
|
||||
background_priority: bool,
|
||||
},
|
||||
}
|
||||
|
||||
/// A message sent from pool to its client.
|
||||
#[derive(Debug)]
|
||||
pub enum FromPool {
|
||||
/// The given worker was just spawned and is ready to be used.
|
||||
Spawned(Worker),
|
||||
|
||||
/// The given worker either succeeded or failed the given job. Under any circumstances the
|
||||
/// artifact file has been written. The bool says whether the worker ripped.
|
||||
Concluded(Worker, bool),
|
||||
|
||||
/// The given worker ceased to exist.
|
||||
Rip(Worker),
|
||||
}
|
||||
|
||||
struct WorkerData {
|
||||
idle: Option<IdleWorker>,
|
||||
handle: WorkerHandle,
|
||||
}
|
||||
|
||||
impl fmt::Debug for WorkerData {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write!(f, "WorkerData(pid={})", self.handle.id())
|
||||
}
|
||||
}
|
||||
|
||||
enum PoolEvent {
|
||||
Spawn(IdleWorker, WorkerHandle),
|
||||
StartWork(Worker, Outcome),
|
||||
}
|
||||
|
||||
/// The set of in-flight spawn and work tasks; each of them resolves into a [`PoolEvent`].
type Mux = FuturesUnordered<BoxFuture<'static, PoolEvent>>;
|
||||
|
||||
struct Pool {
|
||||
program_path: PathBuf,
|
||||
spawn_timeout: Duration,
|
||||
to_pool: mpsc::Receiver<ToPool>,
|
||||
from_pool: mpsc::UnboundedSender<FromPool>,
|
||||
spawned: HopSlotMap<Worker, WorkerData>,
|
||||
mux: Mux,
|
||||
}
|
||||
|
||||
/// Marker error: something unrecoverable happened and the pool event loop must stop.
struct Fatal;
|
||||
|
||||
async fn run(
|
||||
Pool {
|
||||
program_path,
|
||||
spawn_timeout,
|
||||
to_pool,
|
||||
mut from_pool,
|
||||
mut spawned,
|
||||
mut mux,
|
||||
}: Pool,
|
||||
) {
|
||||
macro_rules! break_if_fatal {
|
||||
($expr:expr) => {
|
||||
match $expr {
|
||||
Err(Fatal) => break,
|
||||
Ok(v) => v,
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
let mut to_pool = to_pool.fuse();
|
||||
|
||||
loop {
|
||||
futures::select! {
|
||||
to_pool = to_pool.next() => {
|
||||
let to_pool = break_if_fatal!(to_pool.ok_or(Fatal));
|
||||
handle_to_pool(
|
||||
&program_path,
|
||||
spawn_timeout,
|
||||
&mut spawned,
|
||||
&mut mux,
|
||||
to_pool,
|
||||
)
|
||||
}
|
||||
ev = mux.select_next_some() => break_if_fatal!(handle_mux(&mut from_pool, &mut spawned, ev)),
|
||||
}
|
||||
|
||||
break_if_fatal!(purge_dead(&mut from_pool, &mut spawned).await);
|
||||
}
|
||||
}
|
||||
|
||||
async fn purge_dead(
|
||||
from_pool: &mut mpsc::UnboundedSender<FromPool>,
|
||||
spawned: &mut HopSlotMap<Worker, WorkerData>,
|
||||
) -> Result<(), Fatal> {
|
||||
let mut to_remove = vec![];
|
||||
for (worker, data) in spawned.iter_mut() {
|
||||
if data.idle.is_none() {
|
||||
// The idle token is missing, meaning this worker is now occupied: skip it. This is
|
||||
// because the worker process is observed by the work task and should it reach the
|
||||
// deadline or be terminated it will be handled by the corresponding mux event.
|
||||
continue;
|
||||
}
|
||||
|
||||
if let Poll::Ready(()) = futures::poll!(&mut data.handle) {
|
||||
// a resolved future means that the worker has terminated. Weed it out.
|
||||
to_remove.push(worker);
|
||||
}
|
||||
}
|
||||
for w in to_remove {
|
||||
let _ = spawned.remove(w);
|
||||
reply(from_pool, FromPool::Rip(w))?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn handle_to_pool(
|
||||
program_path: &Path,
|
||||
spawn_timeout: Duration,
|
||||
spawned: &mut HopSlotMap<Worker, WorkerData>,
|
||||
mux: &mut Mux,
|
||||
to_pool: ToPool,
|
||||
) {
|
||||
match to_pool {
|
||||
ToPool::Spawn => {
|
||||
mux.push(spawn_worker_task(program_path.to_owned(), spawn_timeout).boxed());
|
||||
}
|
||||
ToPool::StartWork {
|
||||
worker,
|
||||
code,
|
||||
artifact_path,
|
||||
background_priority,
|
||||
} => {
|
||||
if let Some(data) = spawned.get_mut(worker) {
|
||||
if let Some(idle) = data.idle.take() {
|
||||
mux.push(
|
||||
start_work_task(worker, idle, code, artifact_path, background_priority)
|
||||
.boxed(),
|
||||
);
|
||||
} else {
|
||||
// idle token is present after spawn and after a job is concluded;
|
||||
// the precondition for `StartWork` is it should be sent only if all previous work
|
||||
// items concluded;
|
||||
// thus idle token is Some;
|
||||
// qed.
|
||||
never!("unexpected abscence of the idle token in prepare pool");
|
||||
}
|
||||
} else {
|
||||
// That's a relatively normal situation since the queue may send `start_work` and
|
||||
// before receiving it the pool would report that the worker died.
|
||||
}
|
||||
}
|
||||
ToPool::Kill(worker) => {
|
||||
// It may be absent if it were previously already removed by `purge_dead`.
|
||||
let _ = spawned.remove(worker);
|
||||
}
|
||||
ToPool::BumpPriority(worker) => {
|
||||
if let Some(data) = spawned.get(worker) {
|
||||
worker::bump_priority(&data.handle);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn spawn_worker_task(program_path: PathBuf, spawn_timeout: Duration) -> PoolEvent {
|
||||
use futures_timer::Delay;
|
||||
|
||||
loop {
|
||||
match worker::spawn(&program_path, spawn_timeout).await {
|
||||
Ok((idle, handle)) => break PoolEvent::Spawn(idle, handle),
|
||||
Err(err) => {
|
||||
tracing::warn!(
|
||||
target: LOG_TARGET,
|
||||
"failed to spawn a prepare worker: {:?}",
|
||||
err,
|
||||
);
|
||||
|
||||
// Assume that the failure intermittent and retry after a delay.
|
||||
Delay::new(Duration::from_secs(3)).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn start_work_task(
|
||||
worker: Worker,
|
||||
idle: IdleWorker,
|
||||
code: Arc<Vec<u8>>,
|
||||
artifact_path: PathBuf,
|
||||
background_priority: bool,
|
||||
) -> PoolEvent {
|
||||
let outcome = worker::start_work(idle, code, artifact_path, background_priority).await;
|
||||
PoolEvent::StartWork(worker, outcome)
|
||||
}
|
||||
|
||||
fn handle_mux(
|
||||
from_pool: &mut mpsc::UnboundedSender<FromPool>,
|
||||
spawned: &mut HopSlotMap<Worker, WorkerData>,
|
||||
event: PoolEvent,
|
||||
) -> Result<(), Fatal> {
|
||||
match event {
|
||||
PoolEvent::Spawn(idle, handle) => {
|
||||
let worker = spawned.insert(WorkerData {
|
||||
idle: Some(idle),
|
||||
handle,
|
||||
});
|
||||
|
||||
reply(from_pool, FromPool::Spawned(worker))?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
PoolEvent::StartWork(worker, outcome) => {
|
||||
match outcome {
|
||||
Outcome::Concluded(idle) => {
|
||||
let data = match spawned.get_mut(worker) {
|
||||
None => {
|
||||
// Perhaps the worker was killed meanwhile and the result is no longer
|
||||
// relevant.
|
||||
return Ok(());
|
||||
}
|
||||
Some(data) => data,
|
||||
};
|
||||
|
||||
// We just replace the idle worker that was loaned from this option during
|
||||
// the work starting.
|
||||
let old = data.idle.replace(idle);
|
||||
assert_matches!(old, None, "attempt to overwrite an idle worker");
|
||||
|
||||
reply(from_pool, FromPool::Concluded(worker, false))?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
Outcome::DidntMakeIt => {
|
||||
if let Some(_data) = spawned.remove(worker) {
|
||||
reply(from_pool, FromPool::Concluded(worker, true))?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn reply(from_pool: &mut mpsc::UnboundedSender<FromPool>, m: FromPool) -> Result<(), Fatal> {
|
||||
from_pool.unbounded_send(m).map_err(|_| Fatal)
|
||||
}
|
||||
|
||||
/// Spins up the pool and returns the future that should be polled to make the pool functional.
|
||||
pub fn start(
|
||||
program_path: PathBuf,
|
||||
spawn_timeout: Duration,
|
||||
) -> (
|
||||
mpsc::Sender<ToPool>,
|
||||
mpsc::UnboundedReceiver<FromPool>,
|
||||
impl Future<Output = ()>,
|
||||
) {
|
||||
let (to_pool_tx, to_pool_rx) = mpsc::channel(10);
|
||||
let (from_pool_tx, from_pool_rx) = mpsc::unbounded();
|
||||
|
||||
let run = run(Pool {
|
||||
program_path,
|
||||
spawn_timeout,
|
||||
to_pool: to_pool_rx,
|
||||
from_pool: from_pool_tx,
|
||||
spawned: HopSlotMap::with_capacity_and_key(20),
|
||||
mux: Mux::new(),
|
||||
});
|
||||
|
||||
(to_pool_tx, from_pool_rx, run)
|
||||
}
|
||||
@@ -0,0 +1,894 @@
|
||||
// Copyright 2021 Parity Technologies (UK) Ltd.
|
||||
// This file is part of Polkadot.
|
||||
|
||||
// Polkadot is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
|
||||
// Polkadot is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with Polkadot. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
//! A queue that handles requests for PVF preparation.
|
||||
|
||||
use super::{
|
||||
pool::{self, Worker},
|
||||
};
|
||||
use crate::{LOG_TARGET, Priority, Pvf, artifacts::ArtifactId};
|
||||
use futures::{Future, SinkExt, channel::mpsc, stream::StreamExt as _};
|
||||
use std::collections::{HashMap, VecDeque};
|
||||
use async_std::path::PathBuf;
|
||||
use always_assert::{always, never};
|
||||
|
||||
/// A request to pool.
|
||||
#[derive(Debug)]
|
||||
pub enum ToQueue {
|
||||
/// This schedules preparation of the given PVF.
|
||||
///
|
||||
/// Note that it is incorrect to enqueue the same PVF again without first receiving the
|
||||
/// [`FromQueue::Prepared`] response. In case there is a need to bump the priority, use
|
||||
/// [`ToQueue::Amend`].
|
||||
Enqueue { priority: Priority, pvf: Pvf },
|
||||
/// Amends the priority for the given [`ArtifactId`] if it is running. If it's not, then it's noop.
|
||||
Amend {
|
||||
priority: Priority,
|
||||
artifact_id: ArtifactId,
|
||||
},
|
||||
}
|
||||
|
||||
/// A response from queue.
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
pub enum FromQueue {
|
||||
Prepared(ArtifactId),
|
||||
}
|
||||
|
||||
/// Bounds on the number of workers the queue is willing to keep.
#[derive(Default)]
struct Limits {
    /// The maximum number of workers this pool can ever host. This is expected to be a small
    /// number, e.g. within a dozen.
    hard_capacity: usize,

    /// The number of workers we aim to have. If there is a critical job and we are already
    /// at `soft_capacity`, we are allowed to grow up to `hard_capacity`. Thus this should be
    /// equal to or smaller than `hard_capacity`.
    soft_capacity: usize,
}

impl Limits {
    /// Returns `true` if the queue is allowed to request one more worker.
    ///
    /// `spawned_num` is the number of workers already accounted for. A critical request is
    /// checked against `hard_capacity`, any other request against `soft_capacity`.
    fn can_afford_one_more(&self, spawned_num: usize, critical: bool) -> bool {
        let cap = if critical {
            self.hard_capacity
        } else {
            self.soft_capacity
        };
        spawned_num < cap
    }

    /// Returns `true` if `spawned_num` exceeds `soft_capacity`, i.e. there are more workers
    /// than we aim to have and one of them should be culled.
    fn should_cull(&self, spawned_num: usize) -> bool {
        spawned_num > self.soft_capacity
    }
}
|
||||
|
||||
slotmap::new_key_type! {
    /// An opaque key under which the queue keeps track of an enqueued job.
    pub struct Job;
}
|
||||
|
||||
struct JobData {
|
||||
/// The priority of this job. Can be bumped.
|
||||
priority: Priority,
|
||||
pvf: Pvf,
|
||||
worker: Option<Worker>,
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
struct WorkerData {
|
||||
job: Option<Job>,
|
||||
}
|
||||
|
||||
impl WorkerData {
|
||||
fn is_idle(&self) -> bool {
|
||||
self.job.is_none()
|
||||
}
|
||||
}
|
||||
|
||||
/// A queue structured like this is prone to starving, however, we don't care that much since we expect
|
||||
/// there is going to be a limited number of critical jobs and we don't really care if background starve.
|
||||
#[derive(Default)]
|
||||
struct Unscheduled {
|
||||
background: VecDeque<Job>,
|
||||
normal: VecDeque<Job>,
|
||||
critical: VecDeque<Job>,
|
||||
}
|
||||
|
||||
impl Unscheduled {
|
||||
fn queue_mut(&mut self, prio: Priority) -> &mut VecDeque<Job> {
|
||||
match prio {
|
||||
Priority::Background => &mut self.background,
|
||||
Priority::Normal => &mut self.normal,
|
||||
Priority::Critical => &mut self.critical,
|
||||
}
|
||||
}
|
||||
|
||||
fn add(&mut self, prio: Priority, job: Job) {
|
||||
self.queue_mut(prio).push_back(job);
|
||||
}
|
||||
|
||||
fn readd(&mut self, prio: Priority, job: Job) {
|
||||
self.queue_mut(prio).push_front(job);
|
||||
}
|
||||
|
||||
fn is_empty(&self) -> bool {
|
||||
self.background.is_empty() && self.normal.is_empty() && self.critical.is_empty()
|
||||
}
|
||||
|
||||
fn next(&mut self) -> Option<Job> {
|
||||
let mut check = |prio: Priority| self.queue_mut(prio).pop_front();
|
||||
check(Priority::Critical)
|
||||
.or_else(|| check(Priority::Normal))
|
||||
.or_else(|| check(Priority::Background))
|
||||
}
|
||||
}
|
||||
|
||||
struct Queue {
|
||||
to_queue_rx: mpsc::Receiver<ToQueue>,
|
||||
from_queue_tx: mpsc::UnboundedSender<FromQueue>,
|
||||
|
||||
to_pool_tx: mpsc::Sender<pool::ToPool>,
|
||||
from_pool_rx: mpsc::UnboundedReceiver<pool::FromPool>,
|
||||
|
||||
cache_path: PathBuf,
|
||||
limits: Limits,
|
||||
|
||||
jobs: slotmap::SlotMap<Job, JobData>,
|
||||
|
||||
/// A mapping from artifact id to a job.
|
||||
artifact_id_to_job: HashMap<ArtifactId, Job>,
|
||||
/// The registry of all workers.
|
||||
workers: slotmap::SparseSecondaryMap<Worker, WorkerData>,
|
||||
/// The number of workers requested to spawn but not yet spawned.
|
||||
spawn_inflight: usize,
|
||||
|
||||
/// The jobs that are not yet scheduled. These are waiting until the next `poll` where they are
|
||||
/// processed all at once.
|
||||
unscheduled: Unscheduled,
|
||||
}
|
||||
|
||||
/// Marker error: something unrecoverable happened and the queue must stop.
struct Fatal;
|
||||
|
||||
impl Queue {
|
||||
fn new(
|
||||
soft_capacity: usize,
|
||||
hard_capacity: usize,
|
||||
cache_path: PathBuf,
|
||||
to_queue_rx: mpsc::Receiver<ToQueue>,
|
||||
from_queue_tx: mpsc::UnboundedSender<FromQueue>,
|
||||
to_pool_tx: mpsc::Sender<pool::ToPool>,
|
||||
from_pool_rx: mpsc::UnboundedReceiver<pool::FromPool>,
|
||||
) -> Self {
|
||||
Self {
|
||||
to_queue_rx,
|
||||
from_queue_tx,
|
||||
to_pool_tx,
|
||||
from_pool_rx,
|
||||
cache_path,
|
||||
spawn_inflight: 0,
|
||||
limits: Limits {
|
||||
hard_capacity,
|
||||
soft_capacity,
|
||||
},
|
||||
jobs: slotmap::SlotMap::with_key(),
|
||||
unscheduled: Unscheduled::default(),
|
||||
artifact_id_to_job: HashMap::new(),
|
||||
workers: slotmap::SparseSecondaryMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
async fn run(mut self) {
|
||||
macro_rules! break_if_fatal {
|
||||
($expr:expr) => {
|
||||
if let Err(Fatal) = $expr {
|
||||
break;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
loop {
|
||||
// biased to make it behave deterministically for tests.
|
||||
futures::select_biased! {
|
||||
to_queue = self.to_queue_rx.select_next_some() =>
|
||||
break_if_fatal!(handle_to_queue(&mut self, to_queue).await),
|
||||
from_pool = self.from_pool_rx.select_next_some() =>
|
||||
break_if_fatal!(handle_from_pool(&mut self, from_pool).await),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn handle_to_queue(queue: &mut Queue, to_queue: ToQueue) -> Result<(), Fatal> {
|
||||
match to_queue {
|
||||
ToQueue::Enqueue { priority, pvf } => {
|
||||
handle_enqueue(queue, priority, pvf).await?;
|
||||
}
|
||||
ToQueue::Amend {
|
||||
priority,
|
||||
artifact_id,
|
||||
} => {
|
||||
handle_amend(queue, priority, artifact_id).await?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn handle_enqueue(queue: &mut Queue, priority: Priority, pvf: Pvf) -> Result<(), Fatal> {
|
||||
let artifact_id = pvf.as_artifact_id();
|
||||
if never!(
|
||||
queue.artifact_id_to_job.contains_key(&artifact_id),
|
||||
"second Enqueue sent for a known artifact"
|
||||
) {
|
||||
// This function is called in response to a `Enqueue` message;
|
||||
// Precondtion for `Enqueue` is that it is sent only once for a PVF;
|
||||
// Thus this should always be `false`;
|
||||
// qed.
|
||||
tracing::warn!(
|
||||
target: LOG_TARGET,
|
||||
"duplicate `enqueue` command received for {:?}",
|
||||
artifact_id,
|
||||
);
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let job = queue.jobs.insert(JobData {
|
||||
priority,
|
||||
pvf,
|
||||
worker: None,
|
||||
});
|
||||
queue.artifact_id_to_job.insert(artifact_id, job);
|
||||
|
||||
if let Some(available) = find_idle_worker(queue) {
|
||||
// This may seem not fair (w.r.t priority) on the first glance, but it should be. This is
|
||||
// because as soon as a worker finishes with the job it's immediatelly given the next one.
|
||||
assign(queue, available, job).await?;
|
||||
} else {
|
||||
spawn_extra_worker(queue, priority.is_critical()).await?;
|
||||
queue.unscheduled.add(priority, job);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Returns the first registered worker that has no job assigned, if there is one.
fn find_idle_worker(queue: &mut Queue) -> Option<Worker> {
    queue
        .workers
        .iter()
        .find(|(_, data)| data.is_idle())
        .map(|(worker, _)| worker)
}
|
||||
|
||||
async fn handle_amend(
|
||||
queue: &mut Queue,
|
||||
priority: Priority,
|
||||
artifact_id: ArtifactId,
|
||||
) -> Result<(), Fatal> {
|
||||
if let Some(&job) = queue.artifact_id_to_job.get(&artifact_id) {
|
||||
let mut job_data: &mut JobData = &mut queue.jobs[job];
|
||||
|
||||
if job_data.priority < priority {
|
||||
// The new priority is higher. We should do two things:
|
||||
// - if the worker was already spawned with the background prio and the new one is not
|
||||
// (it's already the case, if we are in this branch but we still do the check for
|
||||
// clarity), then we should tell the pool to bump the priority for the worker.
|
||||
//
|
||||
// - save the new priority in the job.
|
||||
|
||||
if let Some(worker) = job_data.worker {
|
||||
if job_data.priority.is_background() && !priority.is_background() {
|
||||
send_pool(&mut queue.to_pool_tx, pool::ToPool::BumpPriority(worker)).await?;
|
||||
}
|
||||
}
|
||||
|
||||
job_data.priority = priority;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn handle_from_pool(queue: &mut Queue, from_pool: pool::FromPool) -> Result<(), Fatal> {
|
||||
use pool::FromPool::*;
|
||||
match from_pool {
|
||||
Spawned(worker) => handle_worker_spawned(queue, worker).await?,
|
||||
Concluded(worker, rip) => handle_worker_concluded(queue, worker, rip).await?,
|
||||
Rip(worker) => handle_worker_rip(queue, worker).await?,
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn handle_worker_spawned(queue: &mut Queue, worker: Worker) -> Result<(), Fatal> {
|
||||
queue.workers.insert(worker, WorkerData::default());
|
||||
queue.spawn_inflight -= 1;
|
||||
|
||||
if let Some(job) = queue.unscheduled.next() {
|
||||
assign(queue, worker, job).await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn handle_worker_concluded(
|
||||
queue: &mut Queue,
|
||||
worker: Worker,
|
||||
rip: bool,
|
||||
) -> Result<(), Fatal> {
|
||||
macro_rules! never_none {
|
||||
($expr:expr) => {
|
||||
match $expr {
|
||||
Some(v) => v,
|
||||
None => {
|
||||
// Precondition of calling this is that the $expr is never none;
|
||||
// Assume the conditions holds, then this never is not hit;
|
||||
// qed.
|
||||
never!("never_none, {}", stringify!($expr));
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// Find out on which artifact was the worker working.
|
||||
|
||||
// workers are registered upon spawn and removed in one of the following cases:
|
||||
// 1. received rip signal
|
||||
// 2. received concluded signal with rip=true;
|
||||
// concluded signal only comes from a spawned worker and only once;
|
||||
// rip signal is not sent after conclusion with rip=true;
|
||||
// the worker should be registered;
|
||||
// this can't be None;
|
||||
// qed.
|
||||
let worker_data = never_none!(queue.workers.get_mut(worker));
|
||||
|
||||
// worker_data.job is set only by `assign` and removed only here for a worker;
|
||||
// concluded signal only comes for a worker that was previously assigned and only once;
|
||||
// the worker should have the job;
|
||||
// this can't be None;
|
||||
// qed.
|
||||
let job = never_none!(worker_data.job.take());
|
||||
|
||||
// job_data is inserted upon enqueue and removed only here;
|
||||
// as was established above, this worker was previously `assign`ed to the job;
|
||||
// that implies that the job was enqueued;
|
||||
// conclude signal only comes once;
|
||||
// we are just to remove the job for the first and the only time;
|
||||
// this can't be None;
|
||||
// qed.
|
||||
let job_data = never_none!(queue.jobs.remove(job));
|
||||
let artifact_id = job_data.pvf.as_artifact_id();
|
||||
|
||||
queue.artifact_id_to_job.remove(&artifact_id);
|
||||
|
||||
reply(&mut queue.from_queue_tx, FromQueue::Prepared(artifact_id))?;
|
||||
|
||||
// Figure out what to do with the worker.
|
||||
if rip {
|
||||
let worker_data = queue.workers.remove(worker);
|
||||
// worker should exist, it's asserted above;
|
||||
// qed.
|
||||
always!(worker_data.is_some());
|
||||
|
||||
if !queue.unscheduled.is_empty() {
|
||||
// That is unconditionally not critical just to not accidentally fill up
|
||||
// the pool up to the hard cap.
|
||||
spawn_extra_worker(queue, false).await?;
|
||||
}
|
||||
} else {
|
||||
if queue
|
||||
.limits
|
||||
.should_cull(queue.workers.len() + queue.spawn_inflight)
|
||||
{
|
||||
// We no longer need services of this worker. Kill it.
|
||||
queue.workers.remove(worker);
|
||||
send_pool(&mut queue.to_pool_tx, pool::ToPool::Kill(worker)).await?;
|
||||
} else {
|
||||
// see if there are more work available and schedule it.
|
||||
if let Some(job) = queue.unscheduled.next() {
|
||||
assign(queue, worker, job).await?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn handle_worker_rip(queue: &mut Queue, worker: Worker) -> Result<(), Fatal> {
|
||||
let worker_data = queue.workers.remove(worker);
|
||||
|
||||
if let Some(WorkerData { job: Some(job), .. }) = worker_data {
|
||||
// This is an edge case where the worker ripped after we sent assignment but before it
|
||||
// was received by the pool.
|
||||
let priority = queue
|
||||
.jobs
|
||||
.get(job)
|
||||
.map(|data| data.priority)
|
||||
.unwrap_or_else(|| {
|
||||
// job is inserted upon enqueue and removed on concluded signal;
|
||||
// this is enclosed in the if statement that narrows the situation to before
|
||||
// conclusion;
|
||||
// that means that the job still exists and is known;
|
||||
// this path cannot be hit;
|
||||
// qed.
|
||||
never!("the job of the ripped worker must be known but it is not");
|
||||
Priority::Normal
|
||||
});
|
||||
queue.unscheduled.readd(priority, job);
|
||||
}
|
||||
|
||||
// If there are still jobs left, spawn another worker to replace the ripped one (but only if it
|
||||
// was indeed removed). That is unconditionally not critical just to not accidentally fill up
|
||||
// the pool up to the hard cap.
|
||||
if worker_data.is_some() && !queue.unscheduled.is_empty() {
|
||||
spawn_extra_worker(queue, false).await?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Spawns an extra worker if possible.
|
||||
async fn spawn_extra_worker(queue: &mut Queue, critical: bool) -> Result<(), Fatal> {
|
||||
if queue
|
||||
.limits
|
||||
.can_afford_one_more(queue.workers.len() + queue.spawn_inflight, critical)
|
||||
{
|
||||
queue.spawn_inflight += 1;
|
||||
send_pool(&mut queue.to_pool_tx, pool::ToPool::Spawn).await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Attaches the work to the given worker telling the poll about the job.
|
||||
async fn assign(queue: &mut Queue, worker: Worker, job: Job) -> Result<(), Fatal> {
|
||||
let job_data = &mut queue.jobs[job];
|
||||
|
||||
let artifact_id = job_data.pvf.as_artifact_id();
|
||||
let artifact_path = artifact_id.path(&queue.cache_path);
|
||||
|
||||
job_data.worker = Some(worker);
|
||||
|
||||
queue.workers[worker].job = Some(job);
|
||||
|
||||
send_pool(
|
||||
&mut queue.to_pool_tx,
|
||||
pool::ToPool::StartWork {
|
||||
worker,
|
||||
code: job_data.pvf.code.clone(),
|
||||
artifact_path,
|
||||
background_priority: job_data.priority.is_background(),
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn reply(from_queue_tx: &mut mpsc::UnboundedSender<FromQueue>, m: FromQueue) -> Result<(), Fatal> {
|
||||
from_queue_tx.unbounded_send(m).map_err(|_| {
|
||||
// The host has hung up and thus it's fatal and we should shutdown ourselves.
|
||||
Fatal
|
||||
})
|
||||
}
|
||||
|
||||
async fn send_pool(
|
||||
to_pool_tx: &mut mpsc::Sender<pool::ToPool>,
|
||||
m: pool::ToPool,
|
||||
) -> Result<(), Fatal> {
|
||||
to_pool_tx.send(m).await.map_err(|_| {
|
||||
// The pool has hung up and thus we are no longer are able to fulfill our duties. Shutdown.
|
||||
Fatal
|
||||
})
|
||||
}
|
||||
|
||||
/// Spins up the queue and returns the future that should be polled to make the queue functional.
|
||||
pub fn start(
|
||||
soft_capacity: usize,
|
||||
hard_capacity: usize,
|
||||
cache_path: PathBuf,
|
||||
to_pool_tx: mpsc::Sender<pool::ToPool>,
|
||||
from_pool_rx: mpsc::UnboundedReceiver<pool::FromPool>,
|
||||
) -> (
|
||||
mpsc::Sender<ToQueue>,
|
||||
mpsc::UnboundedReceiver<FromQueue>,
|
||||
impl Future<Output = ()>,
|
||||
) {
|
||||
let (to_queue_tx, to_queue_rx) = mpsc::channel(150);
|
||||
let (from_queue_tx, from_queue_rx) = mpsc::unbounded();
|
||||
|
||||
let run = Queue::new(
|
||||
soft_capacity,
|
||||
hard_capacity,
|
||||
cache_path,
|
||||
to_queue_rx,
|
||||
from_queue_tx,
|
||||
to_pool_tx,
|
||||
from_pool_rx,
|
||||
)
|
||||
.run();
|
||||
|
||||
(to_queue_tx, from_queue_rx, run)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use slotmap::SlotMap;
|
||||
use assert_matches::assert_matches;
|
||||
use futures::{FutureExt, future::BoxFuture};
|
||||
use std::task::Poll;
|
||||
use super::*;
|
||||
|
||||
/// Creates a new pvf which artifact id can be uniquely identified by the given number.
|
||||
fn pvf(descriminator: u32) -> Pvf {
|
||||
Pvf::from_discriminator(descriminator)
|
||||
}
|
||||
|
||||
async fn run_until<R>(
|
||||
task: &mut (impl Future<Output = ()> + Unpin),
|
||||
mut fut: (impl Future<Output = R> + Unpin),
|
||||
) -> R {
|
||||
let start = std::time::Instant::now();
|
||||
let fut = &mut fut;
|
||||
loop {
|
||||
if start.elapsed() > std::time::Duration::from_secs(1) {
|
||||
// We expect that this will take only a couple of iterations and thus to take way
|
||||
// less than a second.
|
||||
panic!("timeout");
|
||||
}
|
||||
|
||||
if let Poll::Ready(r) = futures::poll!(&mut *fut) {
|
||||
break r;
|
||||
}
|
||||
|
||||
if futures::poll!(&mut *task).is_ready() {
|
||||
panic!()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct Test {
|
||||
_tempdir: tempfile::TempDir,
|
||||
run: BoxFuture<'static, ()>,
|
||||
workers: SlotMap<Worker, ()>,
|
||||
from_pool_tx: mpsc::UnboundedSender<pool::FromPool>,
|
||||
to_pool_rx: mpsc::Receiver<pool::ToPool>,
|
||||
to_queue_tx: mpsc::Sender<ToQueue>,
|
||||
from_queue_rx: mpsc::UnboundedReceiver<FromQueue>,
|
||||
}
|
||||
|
||||
impl Test {
|
||||
fn new(soft_capacity: usize, hard_capacity: usize) -> Self {
|
||||
let tempdir = tempfile::tempdir().unwrap();
|
||||
|
||||
let (to_pool_tx, to_pool_rx) = mpsc::channel(10);
|
||||
let (from_pool_tx, from_pool_rx) = mpsc::unbounded();
|
||||
|
||||
let workers: SlotMap<Worker, ()> = SlotMap::with_key();
|
||||
|
||||
let (to_queue_tx, from_queue_rx, run) = start(
|
||||
soft_capacity,
|
||||
hard_capacity,
|
||||
tempdir.path().to_owned().into(),
|
||||
to_pool_tx,
|
||||
from_pool_rx,
|
||||
);
|
||||
|
||||
Self {
|
||||
_tempdir: tempdir,
|
||||
run: run.boxed(),
|
||||
workers,
|
||||
from_pool_tx,
|
||||
to_pool_rx,
|
||||
to_queue_tx,
|
||||
from_queue_rx,
|
||||
}
|
||||
}
|
||||
|
||||
fn send_queue(&mut self, to_queue: ToQueue) {
|
||||
self.to_queue_tx
|
||||
.send(to_queue)
|
||||
.now_or_never()
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
async fn poll_and_recv_from_queue(&mut self) -> FromQueue {
|
||||
let from_queue_rx = &mut self.from_queue_rx;
|
||||
run_until(
|
||||
&mut self.run,
|
||||
async { from_queue_rx.next().await.unwrap() }.boxed(),
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
fn send_from_pool(&mut self, from_pool: pool::FromPool) {
|
||||
self.from_pool_tx
|
||||
.send(from_pool)
|
||||
.now_or_never()
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
async fn poll_and_recv_to_pool(&mut self) -> pool::ToPool {
|
||||
let to_pool_rx = &mut self.to_pool_rx;
|
||||
run_until(
|
||||
&mut self.run,
|
||||
async { to_pool_rx.next().await.unwrap() }.boxed(),
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
async fn poll_ensure_to_pool_is_empty(&mut self) {
|
||||
use futures_timer::Delay;
|
||||
use std::time::Duration;
|
||||
|
||||
let to_pool_rx = &mut self.to_pool_rx;
|
||||
run_until(
|
||||
&mut self.run,
|
||||
async {
|
||||
futures::select! {
|
||||
_ = Delay::new(Duration::from_millis(500)).fuse() => (),
|
||||
_ = to_pool_rx.next().fuse() => {
|
||||
panic!("to pool supposed to be empty")
|
||||
}
|
||||
}
|
||||
}
|
||||
.boxed(),
|
||||
)
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
#[async_std::test]
|
||||
async fn properly_concludes() {
|
||||
let mut test = Test::new(2, 2);
|
||||
|
||||
test.send_queue(ToQueue::Enqueue {
|
||||
priority: Priority::Background,
|
||||
pvf: pvf(1),
|
||||
});
|
||||
assert_eq!(test.poll_and_recv_to_pool().await, pool::ToPool::Spawn);
|
||||
|
||||
let w = test.workers.insert(());
|
||||
test.send_from_pool(pool::FromPool::Spawned(w));
|
||||
test.send_from_pool(pool::FromPool::Concluded(w, false));
|
||||
|
||||
assert_eq!(
|
||||
test.poll_and_recv_from_queue().await,
|
||||
FromQueue::Prepared(pvf(1).as_artifact_id())
|
||||
);
|
||||
}
|
||||
|
||||
#[async_std::test]
|
||||
async fn dont_spawn_over_soft_limit_unless_critical() {
|
||||
let mut test = Test::new(2, 3);
|
||||
|
||||
test.send_queue(ToQueue::Enqueue {
|
||||
priority: Priority::Normal,
|
||||
pvf: pvf(1),
|
||||
});
|
||||
test.send_queue(ToQueue::Enqueue {
|
||||
priority: Priority::Normal,
|
||||
pvf: pvf(2),
|
||||
});
|
||||
test.send_queue(ToQueue::Enqueue {
|
||||
priority: Priority::Normal,
|
||||
pvf: pvf(3),
|
||||
});
|
||||
|
||||
// Receive only two spawns.
|
||||
assert_eq!(test.poll_and_recv_to_pool().await, pool::ToPool::Spawn);
|
||||
assert_eq!(test.poll_and_recv_to_pool().await, pool::ToPool::Spawn);
|
||||
|
||||
let w1 = test.workers.insert(());
|
||||
let w2 = test.workers.insert(());
|
||||
|
||||
test.send_from_pool(pool::FromPool::Spawned(w1));
|
||||
test.send_from_pool(pool::FromPool::Spawned(w2));
|
||||
|
||||
// Get two start works.
|
||||
assert_matches!(
|
||||
test.poll_and_recv_to_pool().await,
|
||||
pool::ToPool::StartWork { .. }
|
||||
);
|
||||
assert_matches!(
|
||||
test.poll_and_recv_to_pool().await,
|
||||
pool::ToPool::StartWork { .. }
|
||||
);
|
||||
|
||||
test.send_from_pool(pool::FromPool::Concluded(w1, false));
|
||||
|
||||
assert_matches!(
|
||||
test.poll_and_recv_to_pool().await,
|
||||
pool::ToPool::StartWork { .. }
|
||||
);
|
||||
|
||||
// Enqueue a critical job.
|
||||
test.send_queue(ToQueue::Enqueue {
|
||||
priority: Priority::Critical,
|
||||
pvf: pvf(4),
|
||||
});
|
||||
|
||||
// 2 out of 2 are working, but there is a critical job incoming. That means that spawning
|
||||
// another worker is warranted.
|
||||
assert_eq!(test.poll_and_recv_to_pool().await, pool::ToPool::Spawn);
|
||||
}
|
||||
|
||||
#[async_std::test]
|
||||
async fn cull_unwanted() {
|
||||
let mut test = Test::new(1, 2);
|
||||
|
||||
test.send_queue(ToQueue::Enqueue {
|
||||
priority: Priority::Normal,
|
||||
pvf: pvf(1),
|
||||
});
|
||||
assert_eq!(test.poll_and_recv_to_pool().await, pool::ToPool::Spawn);
|
||||
let w1 = test.workers.insert(());
|
||||
test.send_from_pool(pool::FromPool::Spawned(w1));
|
||||
assert_matches!(
|
||||
test.poll_and_recv_to_pool().await,
|
||||
pool::ToPool::StartWork { .. }
|
||||
);
|
||||
|
||||
// Enqueue a critical job, which warrants spawning over the soft limit.
|
||||
test.send_queue(ToQueue::Enqueue {
|
||||
priority: Priority::Critical,
|
||||
pvf: pvf(2),
|
||||
});
|
||||
assert_eq!(test.poll_and_recv_to_pool().await, pool::ToPool::Spawn);
|
||||
|
||||
// However, before the new worker had a chance to spawn, the first worker finishes with its
|
||||
// job. The old worker will be killed while the new worker will be let live, even though
|
||||
// it's not instantiated.
|
||||
//
|
||||
// That's a bit silly in this context, but in production there will be an entire pool up
|
||||
// to the `soft_capacity` of workers and it doesn't matter which one to cull. Either way,
|
||||
// we just check that edge case of an edge case works.
|
||||
test.send_from_pool(pool::FromPool::Concluded(w1, false));
|
||||
assert_eq!(test.poll_and_recv_to_pool().await, pool::ToPool::Kill(w1));
|
||||
}
|
||||
|
||||
#[async_std::test]
|
||||
async fn bump_prio_on_urgency_change() {
|
||||
let mut test = Test::new(2, 2);
|
||||
|
||||
test.send_queue(ToQueue::Enqueue {
|
||||
priority: Priority::Background,
|
||||
pvf: pvf(1),
|
||||
});
|
||||
|
||||
assert_eq!(test.poll_and_recv_to_pool().await, pool::ToPool::Spawn);
|
||||
|
||||
let w = test.workers.insert(());
|
||||
test.send_from_pool(pool::FromPool::Spawned(w));
|
||||
|
||||
assert_matches!(
|
||||
test.poll_and_recv_to_pool().await,
|
||||
pool::ToPool::StartWork { .. }
|
||||
);
|
||||
test.send_queue(ToQueue::Amend {
|
||||
priority: Priority::Normal,
|
||||
artifact_id: pvf(1).as_artifact_id(),
|
||||
});
|
||||
|
||||
assert_eq!(
|
||||
test.poll_and_recv_to_pool().await,
|
||||
pool::ToPool::BumpPriority(w)
|
||||
);
|
||||
}
|
||||
|
||||
#[async_std::test]
|
||||
async fn worker_mass_die_out_doesnt_stall_queue() {
|
||||
let mut test = Test::new(2, 2);
|
||||
|
||||
test.send_queue(ToQueue::Enqueue {
|
||||
priority: Priority::Normal,
|
||||
pvf: pvf(1),
|
||||
});
|
||||
test.send_queue(ToQueue::Enqueue {
|
||||
priority: Priority::Normal,
|
||||
pvf: pvf(2),
|
||||
});
|
||||
test.send_queue(ToQueue::Enqueue {
|
||||
priority: Priority::Normal,
|
||||
pvf: pvf(3),
|
||||
});
|
||||
|
||||
assert_eq!(test.poll_and_recv_to_pool().await, pool::ToPool::Spawn);
|
||||
assert_eq!(test.poll_and_recv_to_pool().await, pool::ToPool::Spawn);
|
||||
|
||||
let w1 = test.workers.insert(());
|
||||
let w2 = test.workers.insert(());
|
||||
|
||||
test.send_from_pool(pool::FromPool::Spawned(w1));
|
||||
test.send_from_pool(pool::FromPool::Spawned(w2));
|
||||
|
||||
assert_matches!(
|
||||
test.poll_and_recv_to_pool().await,
|
||||
pool::ToPool::StartWork { .. }
|
||||
);
|
||||
assert_matches!(
|
||||
test.poll_and_recv_to_pool().await,
|
||||
pool::ToPool::StartWork { .. }
|
||||
);
|
||||
|
||||
// Conclude worker 1 and rip it.
|
||||
test.send_from_pool(pool::FromPool::Concluded(w1, true));
|
||||
|
||||
// Since there is still work, the queue requested one extra worker to spawn to handle the
|
||||
// remaining enqueued work items.
|
||||
assert_eq!(test.poll_and_recv_to_pool().await, pool::ToPool::Spawn);
|
||||
assert_eq!(
|
||||
test.poll_and_recv_from_queue().await,
|
||||
FromQueue::Prepared(pvf(1).as_artifact_id())
|
||||
);
|
||||
}
|
||||
|
||||
#[async_std::test]
|
||||
async fn doesnt_resurrect_ripped_worker_if_no_work() {
|
||||
let mut test = Test::new(2, 2);
|
||||
|
||||
test.send_queue(ToQueue::Enqueue {
|
||||
priority: Priority::Normal,
|
||||
pvf: pvf(1),
|
||||
});
|
||||
|
||||
assert_eq!(test.poll_and_recv_to_pool().await, pool::ToPool::Spawn);
|
||||
|
||||
let w1 = test.workers.insert(());
|
||||
test.send_from_pool(pool::FromPool::Spawned(w1));
|
||||
|
||||
assert_matches!(
|
||||
test.poll_and_recv_to_pool().await,
|
||||
pool::ToPool::StartWork { .. }
|
||||
);
|
||||
|
||||
test.send_from_pool(pool::FromPool::Concluded(w1, true));
|
||||
test.poll_ensure_to_pool_is_empty().await;
|
||||
}
|
||||
|
||||
#[async_std::test]
|
||||
async fn rip_for_start_work() {
|
||||
let mut test = Test::new(2, 2);
|
||||
|
||||
test.send_queue(ToQueue::Enqueue {
|
||||
priority: Priority::Normal,
|
||||
pvf: pvf(1),
|
||||
});
|
||||
|
||||
assert_eq!(test.poll_and_recv_to_pool().await, pool::ToPool::Spawn);
|
||||
|
||||
let w1 = test.workers.insert(());
|
||||
test.send_from_pool(pool::FromPool::Spawned(w1));
|
||||
|
||||
// Now, to the interesting part. After the queue normally issues the start_work command to
|
||||
// the pool, before receiving the command the queue may report that the worker ripped.
|
||||
assert_matches!(
|
||||
test.poll_and_recv_to_pool().await,
|
||||
pool::ToPool::StartWork { .. }
|
||||
);
|
||||
test.send_from_pool(pool::FromPool::Rip(w1));
|
||||
|
||||
// In this case, the pool should spawn a new worker and request it to work on the item.
|
||||
assert_eq!(test.poll_and_recv_to_pool().await, pool::ToPool::Spawn);
|
||||
|
||||
let w2 = test.workers.insert(());
|
||||
test.send_from_pool(pool::FromPool::Spawned(w2));
|
||||
assert_matches!(
|
||||
test.poll_and_recv_to_pool().await,
|
||||
pool::ToPool::StartWork { .. }
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,213 @@
|
||||
// Copyright 2021 Parity Technologies (UK) Ltd.
|
||||
// This file is part of Polkadot.
|
||||
|
||||
// Polkadot is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
|
||||
// Polkadot is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with Polkadot. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
use crate::{
|
||||
LOG_TARGET,
|
||||
artifacts::Artifact,
|
||||
worker_common::{
|
||||
IdleWorker, SpawnErr, WorkerHandle, bytes_to_path, framed_recv, framed_send, path_to_bytes,
|
||||
spawn_with_program_path, tmpfile, worker_event_loop,
|
||||
},
|
||||
};
|
||||
use async_std::{
|
||||
io,
|
||||
os::unix::net::UnixStream,
|
||||
path::{PathBuf, Path},
|
||||
};
|
||||
use futures::FutureExt as _;
|
||||
use futures_timer::Delay;
|
||||
use std::{sync::Arc, time::Duration};
|
||||
|
||||
/// Niceness assigned to a worker before it starts a background-priority job.
const NICENESS_BACKGROUND: i32 = 10;
/// Niceness assigned to a worker once it has concluded a job or when its priority is bumped.
const NICENESS_FOREGROUND: i32 = 0;

/// How long the host waits for a worker to report the result of a preparation job.
const COMPILATION_TIMEOUT: Duration = Duration::from_secs(10);
|
||||
|
||||
/// Spawns a new worker with the given program path that acts as the worker and the spawn timeout.
|
||||
///
|
||||
/// The program should be able to handle `<program-path> prepare-worker <socket-path>` invocation.
|
||||
pub async fn spawn(
|
||||
program_path: &Path,
|
||||
spawn_timeout: Duration,
|
||||
) -> Result<(IdleWorker, WorkerHandle), SpawnErr> {
|
||||
spawn_with_program_path(
|
||||
"prepare",
|
||||
program_path,
|
||||
&["prepare-worker"],
|
||||
spawn_timeout,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
pub enum Outcome {
|
||||
/// The worker has finished the work assigned to it.
|
||||
Concluded(IdleWorker),
|
||||
/// The execution was interrupted abruptly and the worker is not available anymore. For example,
|
||||
/// this could've happen because the worker hadn't finished the work until the given deadline.
|
||||
///
|
||||
/// Note that in this case the artifact file is written (unless there was an error writing the
|
||||
/// the artifact).
|
||||
///
|
||||
/// This doesn't return an idle worker instance, thus this worker is no longer usable.
|
||||
DidntMakeIt,
|
||||
}
|
||||
|
||||
/// Given the idle token of a worker and parameters of work, communicates with the worker and
|
||||
/// returns the outcome.
|
||||
pub async fn start_work(
|
||||
worker: IdleWorker,
|
||||
code: Arc<Vec<u8>>,
|
||||
artifact_path: PathBuf,
|
||||
background_priority: bool,
|
||||
) -> Outcome {
|
||||
let IdleWorker { mut stream, pid } = worker;
|
||||
|
||||
tracing::debug!(
|
||||
target: LOG_TARGET,
|
||||
worker_pid = %pid,
|
||||
%background_priority,
|
||||
"starting prepare for {}",
|
||||
artifact_path.display(),
|
||||
);
|
||||
|
||||
if background_priority {
|
||||
renice(pid, NICENESS_BACKGROUND);
|
||||
}
|
||||
|
||||
if let Err(err) = send_request(&mut stream, code).await {
|
||||
tracing::warn!("failed to send a prepare request to pid={}: {:?}", pid, err);
|
||||
return Outcome::DidntMakeIt;
|
||||
}
|
||||
|
||||
// Wait for the result from the worker, keeping in mind that there may be a timeout, the
|
||||
// worker may get killed, or something along these lines.
|
||||
//
|
||||
// In that case we should handle these gracefully by writing the artifact file by ourselves.
|
||||
// We may potentially overwrite the artifact in rare cases where the worker didn't make
|
||||
// it to report back the result.
|
||||
|
||||
enum Selected {
|
||||
Done,
|
||||
IoErr,
|
||||
Deadline,
|
||||
}
|
||||
|
||||
let selected = futures::select! {
|
||||
artifact_path_bytes = framed_recv(&mut stream).fuse() => {
|
||||
match artifact_path_bytes {
|
||||
Ok(bytes) => {
|
||||
if let Some(tmp_path) = bytes_to_path(&bytes) {
|
||||
async_std::fs::rename(tmp_path, &artifact_path)
|
||||
.await
|
||||
.map(|_| Selected::Done)
|
||||
.unwrap_or(Selected::IoErr)
|
||||
} else {
|
||||
Selected::IoErr
|
||||
}
|
||||
},
|
||||
Err(_) => Selected::IoErr,
|
||||
}
|
||||
},
|
||||
_ = Delay::new(COMPILATION_TIMEOUT).fuse() => Selected::Deadline,
|
||||
};
|
||||
|
||||
match selected {
|
||||
Selected::Done => {
|
||||
renice(pid, NICENESS_FOREGROUND);
|
||||
Outcome::Concluded(IdleWorker { stream, pid })
|
||||
}
|
||||
Selected::IoErr | Selected::Deadline => {
|
||||
let bytes = Artifact::DidntMakeIt.serialize();
|
||||
// best effort: there is nothing we can do here if the write fails.
|
||||
let _ = async_std::fs::write(&artifact_path, &bytes).await;
|
||||
Outcome::DidntMakeIt
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn send_request(stream: &mut UnixStream, code: Arc<Vec<u8>>) -> io::Result<()> {
|
||||
framed_send(stream, &*code).await
|
||||
}
|
||||
|
||||
async fn recv_request(stream: &mut UnixStream) -> io::Result<Vec<u8>> {
|
||||
framed_recv(stream).await
|
||||
}
|
||||
|
||||
pub fn bump_priority(handle: &WorkerHandle) {
|
||||
let pid = handle.id();
|
||||
renice(pid, NICENESS_FOREGROUND);
|
||||
}
|
||||
|
||||
fn renice(pid: u32, niceness: i32) {
|
||||
tracing::debug!(
|
||||
target: LOG_TARGET,
|
||||
worker_pid = %pid,
|
||||
"changing niceness to {}",
|
||||
niceness,
|
||||
);
|
||||
|
||||
// Consider upstreaming this to the `nix` crate.
|
||||
unsafe {
|
||||
if -1 == libc::setpriority(libc::PRIO_PROCESS, pid, niceness) {
|
||||
let err = std::io::Error::last_os_error();
|
||||
tracing::warn!(target: LOG_TARGET, "failed to set the priority: {:?}", err,);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// The entrypoint that the spawned prepare worker should start with. The socket_path specifies
|
||||
/// the path to the socket used to communicate with the host.
|
||||
pub fn worker_entrypoint(socket_path: &str) {
|
||||
worker_event_loop("prepare", socket_path, |mut stream| async move {
|
||||
loop {
|
||||
let code = recv_request(&mut stream).await?;
|
||||
|
||||
tracing::debug!(
|
||||
target: LOG_TARGET,
|
||||
worker_pid = %std::process::id(),
|
||||
"worker: preparing artifact",
|
||||
);
|
||||
let artifact_bytes = prepare_artifact(&code).serialize();
|
||||
|
||||
// Write the serialized artifact into into a temp file.
|
||||
let dest = tmpfile("prepare-artifact-").await?;
|
||||
tracing::debug!(
|
||||
target: LOG_TARGET,
|
||||
worker_pid = %std::process::id(),
|
||||
"worker: writing artifact to {}",
|
||||
dest.display(),
|
||||
);
|
||||
async_std::fs::write(&dest, &artifact_bytes).await?;
|
||||
|
||||
// Communicate the results back to the host.
|
||||
framed_send(&mut stream, &path_to_bytes(&dest)).await?;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
fn prepare_artifact(code: &[u8]) -> Artifact {
|
||||
let blob = match crate::executor_intf::prevalidate(code) {
|
||||
Err(err) => {
|
||||
return Artifact::PrevalidationErr(format!("{:?}", err));
|
||||
}
|
||||
Ok(b) => b,
|
||||
};
|
||||
|
||||
match crate::executor_intf::prepare(blob) {
|
||||
Ok(compiled_artifact) => Artifact::Compiled { compiled_artifact },
|
||||
Err(err) => Artifact::PreparationErr(format!("{:?}", err)),
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user