98.6% OF DEVELOPERS CANNOT REVIEW THIS PR! [read more...] (#7337)

* [WIP] PVF: Split out worker binaries

* Address compilation problems and re-design a bit

* Reorganize once more, fix tests

* Reformat with new nightly to make `cargo fmt` test happy

* Address `clippy` warnings

* Add temporary trace to debug zombienet tests

* Fix zombienet node upgrade test

* Fix malus and its CI

* Fix building worker binaries with malus

* More fixes for malus

* Remove unneeded cli subcommands

* Support placing auxiliary binaries to `/usr/libexec`

* Fix spelling

* Spelling

Co-authored-by: Marcin S. <marcin@realemail.net>

* Implement review comments (mostly nits)

* Fix worker node version flag

* Rework getting the worker paths

* Address a couple of review comments

* Minor restructuring

* Fix CI error

* Add tests for worker binaries detection

* Improve tests; try to fix CI

* Move workers module into separate file

* Try to fix failing test and workers not printing latest version

- Tests were not finding the worker binaries
- Workers were not being rebuilt when the version changed
- Made some errors easier to read

* Make a bunch of fixes

* Rebuild nodes on version change

* Fix more issues

* Fix tests

* Pass node version from node into dependencies to avoid recompiles

- [X] get version in CLI
- [X] pass it in to service
- [X] pass version along to PVF
- [X] remove rerun from service
- [X] add rerun to CLI

- [X] don’t rerun pvf/worker’s (these should be built by nodes which have rerun enabled)

* Some more improvements for smoother tests

- [X] Fix tests
- [X] Make puppet workers pass None for version and remove rerun
- [X] Make test collators self-contained

* Add back rerun to PVF workers

* Move worker binaries into files in cli crate

As a final optimization I've separated out each worker binary from its own crate
into the CLI crate. Before, the worker bin shared a crate with the worker lib,
so when the binaries got recompiled so did the libs and everything transitively
depending on the libs. This commit fixes this regression that was causing
recompiles after every commit.

* Fix bug (was passing worker version for node version)

* Move workers out of cli into root src/bin/ dir

- [X] Pass in node version from top-level (polkadot)
- [X] Add build.rs with rerun-git-head to root dir

* Add some sanity checks for workers to dockerfiles

* Update malus

  + [X] Make it self-contained
  + [X] Undo multiple binary changes

* Try to fix clippy errors

* Address `cargo run` issue

- [X] Add default-run for polkadot
- [X] Add note about installation to error

* Update readme (installation instructions)

* Allow disabling external workers for local/testing setups

  + [X] cli flag to enable single-binary mode
  + [X] Add message to error

* Revert unnecessary Cargo.lock changes

* Remove unnecessary build scripts from collators

* Add back missing malus commands (should fix failing ZN job)

* Some minor fixes

* Update Cargo.lock

* Fix some build errors

* Undo self-contained binaries; cli flag to disable version check

  + [X] Remove --dont-run-external-workers
  + [X] Add --disable-worker-version-check
  + [X] Remove PVF subcommands
  + [X] Redo malus changes

* Try to fix failing job and add some docs for local tests

---------

Co-authored-by: Dmitry Sinyavin <dmitry.sinyavin@parity.io>
Co-authored-by: s0me0ne-unkn0wn <48632512+s0me0ne-unkn0wn@users.noreply.github.com>
Co-authored-by: parity-processbot <>
This commit is contained in:
Marcin S
2023-07-31 15:35:42 +02:00
committed by GitHub
parent 9d2cee1b08
commit 85b06f18b9
48 changed files with 1418 additions and 557 deletions
+92 -53
View File
@@ -27,6 +27,8 @@ mod relay_chain_selection;
#[cfg(feature = "full-node")]
pub mod overseer;
#[cfg(feature = "full-node")]
pub mod workers;
#[cfg(feature = "full-node")]
pub use self::overseer::{OverseerGen, OverseerGenArgs, RealOverseerGen};
@@ -73,7 +75,7 @@ pub use {
#[cfg(feature = "full-node")]
use polkadot_node_subsystem::jaeger;
use std::{sync::Arc, time::Duration};
use std::{path::PathBuf, sync::Arc, time::Duration};
use prometheus_endpoint::Registry;
#[cfg(feature = "full-node")]
@@ -235,6 +237,26 @@ pub enum Error {
#[cfg(feature = "full-node")]
#[error("Expected at least one of polkadot, kusama, westend or rococo runtime feature")]
NoRuntime,
#[cfg(feature = "full-node")]
#[error("Worker binaries not executable, prepare binary: {prep_worker_path:?}, execute binary: {exec_worker_path:?}")]
InvalidWorkerBinaries { prep_worker_path: PathBuf, exec_worker_path: PathBuf },
#[cfg(feature = "full-node")]
#[error("Worker binaries could not be found, make sure polkadot was built/installed correctly. Searched given workers path ({given_workers_path:?}), polkadot binary path ({current_exe_path:?}), and lib path (/usr/lib/polkadot), workers names: {workers_names:?}")]
MissingWorkerBinaries {
given_workers_path: Option<PathBuf>,
current_exe_path: PathBuf,
workers_names: Option<(String, String)>,
},
#[cfg(feature = "full-node")]
#[error("Version of worker binary ({worker_version}) is different from node version ({node_version}), worker_path: {worker_path}. TESTING ONLY: this check can be disabled with --disable-worker-version-check")]
WorkerBinaryVersionMismatch {
worker_version: String,
node_version: String,
worker_path: PathBuf,
},
}
/// Identifies the variant of the chain.
@@ -603,6 +625,28 @@ where
})
}
#[cfg(feature = "full-node")]
pub struct NewFullParams<OverseerGenerator: OverseerGen> {
pub is_collator: IsCollator,
pub grandpa_pause: Option<(u32, u32)>,
pub enable_beefy: bool,
pub jaeger_agent: Option<std::net::SocketAddr>,
pub telemetry_worker_handle: Option<TelemetryWorkerHandle>,
/// The version of the node. TESTING ONLY: `None` can be passed to skip the node/worker version
/// check, both on startup and in the workers.
pub node_version: Option<String>,
/// An optional path to a directory containing the workers.
pub workers_path: Option<std::path::PathBuf>,
/// Optional custom names for the prepare and execute workers.
pub workers_names: Option<(String, String)>,
pub overseer_enable_anyways: bool,
pub overseer_gen: OverseerGenerator,
pub overseer_message_channel_capacity_override: Option<usize>,
#[allow(dead_code)]
pub malus_finality_delay: Option<u32>,
pub hwbench: Option<sc_sysinfo::HwBench>,
}
#[cfg(feature = "full-node")]
pub struct NewFull {
pub task_manager: TaskManager,
@@ -656,24 +700,30 @@ pub const AVAILABILITY_CONFIG: AvailabilityConfig = AvailabilityConfig {
/// `overseer_enable_anyways` always enables the overseer, based on the provided `OverseerGenerator`,
/// regardless of the role the node has. The relay chain selection (longest or disputes-aware) is
/// still determined based on the role of the node. Likewise for authority discovery.
///
/// `workers_path` is used to get the path to the directory where auxiliary worker binaries reside.
/// If not specified, the main binary's directory is searched first, then `/usr/lib/polkadot` is
/// searched. If the path points to an executable rather then directory, that executable is used
/// both as preparation and execution worker (supposed to be used for tests only).
#[cfg(feature = "full-node")]
pub fn new_full<OverseerGenerator>(
pub fn new_full<OverseerGenerator: OverseerGen>(
mut config: Configuration,
is_collator: IsCollator,
grandpa_pause: Option<(u32, u32)>,
enable_beefy: bool,
jaeger_agent: Option<std::net::SocketAddr>,
telemetry_worker_handle: Option<TelemetryWorkerHandle>,
program_path: Option<std::path::PathBuf>,
overseer_enable_anyways: bool,
overseer_gen: OverseerGenerator,
overseer_message_channel_capacity_override: Option<usize>,
_malus_finality_delay: Option<u32>,
hwbench: Option<sc_sysinfo::HwBench>,
) -> Result<NewFull, Error>
where
OverseerGenerator: OverseerGen,
{
NewFullParams {
is_collator,
grandpa_pause,
enable_beefy,
jaeger_agent,
telemetry_worker_handle,
node_version,
workers_path,
workers_names,
overseer_enable_anyways,
overseer_gen,
overseer_message_channel_capacity_override,
malus_finality_delay: _malus_finality_delay,
hwbench,
}: NewFullParams<OverseerGenerator>,
) -> Result<NewFull, Error> {
use polkadot_node_network_protocol::request_response::IncomingRequest;
use sc_network_common::sync::warp::WarpSyncParams;
@@ -859,16 +909,24 @@ where
slot_duration_millis: slot_duration.as_millis() as u64,
};
let candidate_validation_config = CandidateValidationConfig {
artifacts_cache_path: config
.database
.path()
.ok_or(Error::DatabasePathRequired)?
.join("pvf-artifacts"),
program_path: match program_path {
None => std::env::current_exe()?,
Some(p) => p,
},
let candidate_validation_config = if is_collator.is_collator() {
None
} else {
let (prep_worker_path, exec_worker_path) =
workers::determine_workers_paths(workers_path, workers_names, node_version.clone())?;
log::info!("🚀 Using prepare-worker binary at: {:?}", prep_worker_path);
log::info!("🚀 Using execute-worker binary at: {:?}", exec_worker_path);
Some(CandidateValidationConfig {
artifacts_cache_path: config
.database
.path()
.ok_or(Error::DatabasePathRequired)?
.join("pvf-artifacts"),
node_version,
prep_worker_path,
exec_worker_path,
})
};
let chain_selection_config = ChainSelectionConfig {
@@ -1278,40 +1336,21 @@ pub fn new_chain_ops(
/// regardless of the role the node has. The relay chain selection (longest or disputes-aware) is
/// still determined based on the role of the node. Likewise for authority discovery.
#[cfg(feature = "full-node")]
pub fn build_full(
pub fn build_full<OverseerGenerator: OverseerGen>(
config: Configuration,
is_collator: IsCollator,
grandpa_pause: Option<(u32, u32)>,
enable_beefy: bool,
jaeger_agent: Option<std::net::SocketAddr>,
telemetry_worker_handle: Option<TelemetryWorkerHandle>,
overseer_enable_anyways: bool,
overseer_gen: impl OverseerGen,
overseer_message_channel_override: Option<usize>,
malus_finality_delay: Option<u32>,
hwbench: Option<sc_sysinfo::HwBench>,
mut params: NewFullParams<OverseerGenerator>,
) -> Result<NewFull, Error> {
let is_polkadot = config.chain_spec.is_polkadot();
new_full(
config,
is_collator,
grandpa_pause,
enable_beefy,
jaeger_agent,
telemetry_worker_handle,
None,
overseer_enable_anyways,
overseer_gen,
overseer_message_channel_override.map(move |capacity| {
params.overseer_message_channel_capacity_override =
params.overseer_message_channel_capacity_override.map(move |capacity| {
if is_polkadot {
gum::warn!("Channel capacity should _never_ be tampered with on polkadot!");
}
capacity
}),
malus_finality_delay,
hwbench,
)
});
new_full(config, params)
}
/// Reverts the node state down to at most the last finalized block.