98.6% OF DEVELOPERS CANNOT REVIEW THIS PR! [read more...] (#7337)

* [WIP] PVF: Split out worker binaries

* Address compilation problems and re-design a bit

* Reorganize once more, fix tests

* Reformat with new nightly to make `cargo fmt` test happy

* Address `clippy` warnings

* Add temporary trace to debug zombienet tests

* Fix zombienet node upgrade test

* Fix malus and its CI

* Fix building worker binaries with malus

* More fixes for malus

* Remove unneeded cli subcommands

* Support placing auxiliary binaries to `/usr/libexec`

* Fix spelling

* Spelling

Co-authored-by: Marcin S. <marcin@realemail.net>

* Implement review comments (mostly nits)

* Fix worker node version flag

* Rework getting the worker paths

* Address a couple of review comments

* Minor restructuring

* Fix CI error

* Add tests for worker binaries detection

* Improve tests; try to fix CI

* Move workers module into separate file

* Try to fix failing test and workers not printing latest version

- Tests were not finding the worker binaries
- Workers were not being rebuilt when the version changed
- Made some errors easier to read

* Make a bunch of fixes

* Rebuild nodes on version change

* Fix more issues

* Fix tests

* Pass node version from node into dependencies to avoid recompiles

- [X] get version in CLI
- [X] pass it in to service
- [X] pass version along to PVF
- [X] remove rerun from service
- [X] add rerun to CLI

- [X] don’t rerun pvf/worker’s (these should be built by nodes which have rerun enabled)

* Some more improvements for smoother tests

- [X] Fix tests
- [X] Make puppet workers pass None for version and remove rerun
- [X] Make test collators self-contained

* Add back rerun to PVF workers

* Move worker binaries into files in cli crate

As a final optimization I've separated out each worker binary from its own crate
into the CLI crate. Before, the worker bin shared a crate with the worker lib,
so when the binaries got recompiled so did the libs and everything transitively
depending on the libs. This commit fixes this regression that was causing
recompiles after every commit.

* Fix bug (was passing worker version for node version)

* Move workers out of cli into root src/bin/ dir

- [X] Pass in node version from top-level (polkadot)
- [X] Add build.rs with rerun-git-head to root dir

* Add some sanity checks for workers to dockerfiles

* Update malus

  + [X] Make it self-contained
  + [X] Undo multiple binary changes

* Try to fix clippy errors

* Address `cargo run` issue

- [X] Add default-run for polkadot
- [X] Add note about installation to error

* Update readme (installation instructions)

* Allow disabling external workers for local/testing setups

  + [X] cli flag to enable single-binary mode
  + [X] Add message to error

* Revert unnecessary Cargo.lock changes

* Remove unnecessary build scripts from collators

* Add back missing malus commands (should fix failing ZN job)

* Some minor fixes

* Update Cargo.lock

* Fix some build errors

* Undo self-contained binaries; cli flag to disable version check

  + [X] Remove --dont-run-external-workers
  + [X] Add --disable-worker-version-check
  + [X] Remove PVF subcommands
  + [X] Redo malus changes

* Try to fix failing job and add some docs for local tests

---------

Co-authored-by: Dmitry Sinyavin <dmitry.sinyavin@parity.io>
Co-authored-by: s0me0ne-unkn0wn <48632512+s0me0ne-unkn0wn@users.noreply.github.com>
Co-authored-by: parity-processbot <>
This commit is contained in:
Marcin S
2023-07-31 15:35:42 +02:00
committed by GitHub
parent 9d2cee1b08
commit 85b06f18b9
48 changed files with 1418 additions and 557 deletions
+31 -6
View File
@@ -137,8 +137,10 @@ struct Queue {
/// The receiver that receives messages to the pool.
to_queue_rx: mpsc::Receiver<ToQueue>,
// Some variables related to the current session.
program_path: PathBuf,
spawn_timeout: Duration,
node_version: Option<String>,
/// The queue of jobs that are waiting for a worker to pick up.
queue: VecDeque<ExecuteJob>,
@@ -152,12 +154,14 @@ impl Queue {
program_path: PathBuf,
worker_capacity: usize,
spawn_timeout: Duration,
node_version: Option<String>,
to_queue_rx: mpsc::Receiver<ToQueue>,
) -> Self {
Self {
metrics,
program_path,
spawn_timeout,
node_version,
to_queue_rx,
queue: VecDeque::new(),
mux: Mux::new(),
@@ -398,9 +402,15 @@ fn spawn_extra_worker(queue: &mut Queue, job: ExecuteJob) {
queue.metrics.execute_worker().on_begin_spawn();
gum::debug!(target: LOG_TARGET, "spawning an extra worker");
queue
.mux
.push(spawn_worker_task(queue.program_path.clone(), job, queue.spawn_timeout).boxed());
queue.mux.push(
spawn_worker_task(
queue.program_path.clone(),
job,
queue.spawn_timeout,
queue.node_version.clone(),
)
.boxed(),
);
queue.workers.spawn_inflight += 1;
}
@@ -414,12 +424,18 @@ async fn spawn_worker_task(
program_path: PathBuf,
job: ExecuteJob,
spawn_timeout: Duration,
node_version: Option<String>,
) -> QueueEvent {
use futures_timer::Delay;
loop {
match super::worker_intf::spawn(&program_path, job.executor_params.clone(), spawn_timeout)
.await
match super::worker_intf::spawn(
&program_path,
job.executor_params.clone(),
spawn_timeout,
node_version.as_deref(),
)
.await
{
Ok((idle, handle)) => break QueueEvent::Spawn(idle, handle, job),
Err(err) => {
@@ -481,8 +497,17 @@ pub fn start(
program_path: PathBuf,
worker_capacity: usize,
spawn_timeout: Duration,
node_version: Option<String>,
) -> (mpsc::Sender<ToQueue>, impl Future<Output = ()>) {
let (to_queue_tx, to_queue_rx) = mpsc::channel(20);
let run = Queue::new(metrics, program_path, worker_capacity, spawn_timeout, to_queue_rx).run();
let run = Queue::new(
metrics,
program_path,
worker_capacity,
spawn_timeout,
node_version,
to_queue_rx,
)
.run();
(to_queue_tx, run)
}
@@ -45,14 +45,14 @@ pub async fn spawn(
program_path: &Path,
executor_params: ExecutorParams,
spawn_timeout: Duration,
node_version: Option<&str>,
) -> Result<(IdleWorker, WorkerHandle), SpawnErr> {
let (mut idle_worker, worker_handle) = spawn_with_program_path(
"execute",
program_path,
&["execute-worker", "--node-impl-version", env!("SUBSTRATE_CLI_IMPL_VERSION")],
spawn_timeout,
)
.await?;
let mut extra_args = vec!["execute-worker"];
if let Some(node_version) = node_version {
extra_args.extend_from_slice(&["--node-impl-version", node_version]);
}
let (mut idle_worker, worker_handle) =
spawn_with_program_path("execute", program_path, &extra_args, spawn_timeout).await?;
send_handshake(&mut idle_worker.stream, Handshake { executor_params })
.await
.map_err(|error| {
+19 -7
View File
@@ -52,6 +52,12 @@ pub const PREPARE_FAILURE_COOLDOWN: Duration = Duration::from_millis(200);
/// The amount of times we will retry failed prepare jobs.
pub const NUM_PREPARE_RETRIES: u32 = 5;
/// The name of binary spawned to prepare a PVF artifact
pub const PREPARE_BINARY_NAME: &str = "polkadot-prepare-worker";
/// The name of binary spawned to execute a PVF
pub const EXECUTE_BINARY_NAME: &str = "polkadot-execute-worker";
/// An alias to not spell the type for the oneshot sender for the PVF execution result.
pub(crate) type ResultSender = oneshot::Sender<Result<ValidationResult, ValidationError>>;
@@ -144,6 +150,8 @@ struct ExecutePvfInputs {
pub struct Config {
/// The root directory where the prepared artifacts can be stored.
pub cache_path: PathBuf,
/// The version of the node. `None` can be passed to skip the version check (only for tests).
pub node_version: Option<String>,
/// The path to the program that can be used to spawn the prepare workers.
pub prepare_worker_program_path: PathBuf,
/// The time allotted for a prepare worker to spawn and report to the host.
@@ -163,18 +171,20 @@ pub struct Config {
impl Config {
/// Create a new instance of the configuration.
pub fn new(cache_path: std::path::PathBuf, program_path: std::path::PathBuf) -> Self {
// Do not contaminate the other parts of the codebase with the types from `tokio`.
let cache_path = PathBuf::from(cache_path);
let program_path = PathBuf::from(program_path);
pub fn new(
cache_path: PathBuf,
node_version: Option<String>,
prepare_worker_program_path: PathBuf,
execute_worker_program_path: PathBuf,
) -> Self {
Self {
cache_path,
prepare_worker_program_path: program_path.clone(),
node_version,
prepare_worker_program_path,
prepare_worker_spawn_timeout: Duration::from_secs(3),
prepare_workers_soft_max_num: 1,
prepare_workers_hard_max_num: 1,
execute_worker_program_path: program_path,
execute_worker_program_path,
execute_worker_spawn_timeout: Duration::from_secs(3),
execute_workers_max_num: 2,
}
@@ -204,6 +214,7 @@ pub fn start(config: Config, metrics: Metrics) -> (ValidationHost, impl Future<O
config.prepare_worker_program_path.clone(),
config.cache_path.clone(),
config.prepare_worker_spawn_timeout,
config.node_version.clone(),
);
let (to_prepare_queue_tx, from_prepare_queue_rx, run_prepare_queue) = prepare::start_queue(
@@ -220,6 +231,7 @@ pub fn start(config: Config, metrics: Metrics) -> (ValidationHost, impl Future<O
config.execute_worker_program_path.to_owned(),
config.execute_workers_max_num,
config.execute_worker_spawn_timeout,
config.node_version,
);
let (to_sweeper_tx, to_sweeper_rx) = mpsc::channel(100);
+1 -1
View File
@@ -105,7 +105,7 @@ pub mod testing;
pub use sp_tracing;
pub use error::{InvalidCandidate, ValidationError};
pub use host::{start, Config, ValidationHost};
pub use host::{start, Config, ValidationHost, EXECUTE_BINARY_NAME, PREPARE_BINARY_NAME};
pub use metrics::Metrics;
pub use priority::Priority;
pub use worker_intf::{framed_recv, framed_send, JOB_TIMEOUT_WALL_CLOCK_FACTOR};
+17 -3
View File
@@ -113,10 +113,13 @@ struct Pool {
program_path: PathBuf,
cache_path: PathBuf,
spawn_timeout: Duration,
node_version: Option<String>,
to_pool: mpsc::Receiver<ToPool>,
from_pool: mpsc::UnboundedSender<FromPool>,
spawned: HopSlotMap<Worker, WorkerData>,
mux: Mux,
metrics: Metrics,
}
@@ -128,6 +131,7 @@ async fn run(
program_path,
cache_path,
spawn_timeout,
node_version,
to_pool,
mut from_pool,
mut spawned,
@@ -155,6 +159,7 @@ async fn run(
&program_path,
&cache_path,
spawn_timeout,
node_version.clone(),
&mut spawned,
&mut mux,
to_pool,
@@ -201,6 +206,7 @@ fn handle_to_pool(
program_path: &Path,
cache_path: &Path,
spawn_timeout: Duration,
node_version: Option<String>,
spawned: &mut HopSlotMap<Worker, WorkerData>,
mux: &mut Mux,
to_pool: ToPool,
@@ -209,7 +215,9 @@ fn handle_to_pool(
ToPool::Spawn => {
gum::debug!(target: LOG_TARGET, "spawning a new prepare worker");
metrics.prepare_worker().on_begin_spawn();
mux.push(spawn_worker_task(program_path.to_owned(), spawn_timeout).boxed());
mux.push(
spawn_worker_task(program_path.to_owned(), spawn_timeout, node_version).boxed(),
);
},
ToPool::StartWork { worker, pvf, artifact_path } => {
if let Some(data) = spawned.get_mut(worker) {
@@ -248,11 +256,15 @@ fn handle_to_pool(
}
}
async fn spawn_worker_task(program_path: PathBuf, spawn_timeout: Duration) -> PoolEvent {
async fn spawn_worker_task(
program_path: PathBuf,
spawn_timeout: Duration,
node_version: Option<String>,
) -> PoolEvent {
use futures_timer::Delay;
loop {
match worker_intf::spawn(&program_path, spawn_timeout).await {
match worker_intf::spawn(&program_path, spawn_timeout, node_version.as_deref()).await {
Ok((idle, handle)) => break PoolEvent::Spawn(idle, handle),
Err(err) => {
gum::warn!(target: LOG_TARGET, "failed to spawn a prepare worker: {:?}", err);
@@ -419,6 +431,7 @@ pub fn start(
program_path: PathBuf,
cache_path: PathBuf,
spawn_timeout: Duration,
node_version: Option<String>,
) -> (mpsc::Sender<ToPool>, mpsc::UnboundedReceiver<FromPool>, impl Future<Output = ()>) {
let (to_pool_tx, to_pool_rx) = mpsc::channel(10);
let (from_pool_tx, from_pool_rx) = mpsc::unbounded();
@@ -428,6 +441,7 @@ pub fn start(
program_path,
cache_path,
spawn_timeout,
node_version,
to_pool: to_pool_rx,
from_pool: from_pool_tx,
spawned: HopSlotMap::with_capacity_and_key(20),
@@ -45,14 +45,13 @@ use tokio::{io, net::UnixStream};
pub async fn spawn(
program_path: &Path,
spawn_timeout: Duration,
node_version: Option<&str>,
) -> Result<(IdleWorker, WorkerHandle), SpawnErr> {
spawn_with_program_path(
"prepare",
program_path,
&["prepare-worker", "--node-impl-version", env!("SUBSTRATE_CLI_IMPL_VERSION")],
spawn_timeout,
)
.await
let mut extra_args = vec!["prepare-worker"];
if let Some(node_version) = node_version {
extra_args.extend_from_slice(&["--node-impl-version", node_version]);
}
spawn_with_program_path("prepare", program_path, &extra_args, spawn_timeout).await
}
pub enum Outcome {
+18 -20
View File
@@ -58,37 +58,35 @@ macro_rules! decl_puppet_worker_main {
$crate::sp_tracing::try_init_simple();
let args = std::env::args().collect::<Vec<_>>();
if args.len() < 3 {
if args.len() == 1 {
panic!("wrong number of arguments");
}
let mut version = None;
let mut socket_path: &str = "";
for i in 2..args.len() {
match args[i].as_ref() {
"--socket-path" => socket_path = args[i + 1].as_str(),
"--node-version" => version = Some(args[i + 1].as_str()),
_ => (),
}
}
let subcommand = &args[1];
match subcommand.as_ref() {
let entrypoint = match args[1].as_ref() {
"exit" => {
std::process::exit(1);
},
"sleep" => {
std::thread::sleep(std::time::Duration::from_secs(5));
return
},
"prepare-worker" => {
$crate::prepare_worker_entrypoint(&socket_path, version);
},
"execute-worker" => {
$crate::execute_worker_entrypoint(&socket_path, version);
},
"prepare-worker" => $crate::prepare_worker_entrypoint,
"execute-worker" => $crate::execute_worker_entrypoint,
other => panic!("unknown subcommand: {}", other),
};
let mut node_version = None;
let mut socket_path: &str = "";
for i in (2..args.len()).step_by(2) {
match args[i].as_ref() {
"--socket-path" => socket_path = args[i + 1].as_str(),
"--node-impl-version" => node_version = Some(args[i + 1].as_str()),
arg => panic!("Unexpected argument found: {}", arg),
}
}
entrypoint(&socket_path, node_version, None);
}
};
}
+5 -3
View File
@@ -43,12 +43,14 @@ pub const JOB_TIMEOUT_WALL_CLOCK_FACTOR: u32 = 4;
pub async fn spawn_with_program_path(
debug_id: &'static str,
program_path: impl Into<PathBuf>,
extra_args: &'static [&'static str],
extra_args: &[&str],
spawn_timeout: Duration,
) -> Result<(IdleWorker, WorkerHandle), SpawnErr> {
let program_path = program_path.into();
with_transient_socket_path(debug_id, |socket_path| {
let socket_path = socket_path.to_owned();
let extra_args: Vec<String> = extra_args.iter().map(|arg| arg.to_string()).collect();
async move {
let listener = UnixListener::bind(&socket_path).map_err(|err| {
gum::warn!(
@@ -63,7 +65,7 @@ pub async fn spawn_with_program_path(
})?;
let handle =
WorkerHandle::spawn(&program_path, extra_args, socket_path).map_err(|err| {
WorkerHandle::spawn(&program_path, &extra_args, socket_path).map_err(|err| {
gum::warn!(
target: LOG_TARGET,
%debug_id,
@@ -214,7 +216,7 @@ pub struct WorkerHandle {
impl WorkerHandle {
fn spawn(
program: impl AsRef<Path>,
extra_args: &[&str],
extra_args: &[String],
socket_path: impl AsRef<Path>,
) -> io::Result<Self> {
let mut child = process::Command::new(program.as_ref())