From 9a0049d0da59b8b842f64fae441b34dba3408430 Mon Sep 17 00:00:00 2001 From: Alexandru Gheorghe <49718502+alexggh@users.noreply.github.com> Date: Wed, 24 Apr 2024 09:15:39 +0300 Subject: [PATCH] Plumbing to increase pvf workers configuration based on chain id (#4252) Part of https://github.com/paritytech/polkadot-sdk/issues/4126 we want to safely increase the execute_workers_max_num gradually from chain to chain and assess if there are any negative impacts. This PR performs the necessary plumbing to be able to increase it based on the chain id, it increase the number of execution workers from 2 to 4 on test network but lives kusama and polkadot unchanged until we gather more data. --------- Signed-off-by: Alexandru Gheorghe --- .../src/lib.rs | 3 +++ polkadot/cli/src/cli.rs | 17 ++++++++++++++++ polkadot/cli/src/command.rs | 3 +++ .../node/core/candidate-validation/src/lib.rs | 13 ++++++++++++ .../benches/host_prepare_rococo_runtime.rs | 3 +++ polkadot/node/core/pvf/src/host.rs | 9 ++++++--- polkadot/node/core/pvf/tests/it/main.rs | 3 +++ polkadot/node/service/src/lib.rs | 20 +++++++++++++++++++ polkadot/node/test/service/src/lib.rs | 6 ++++++ .../adder/collator/src/main.rs | 3 +++ .../undying/collator/src/main.rs | 3 +++ prdoc/pr_4252.prdoc | 15 ++++++++++++++ 12 files changed, 95 insertions(+), 3 deletions(-) create mode 100644 prdoc/pr_4252.prdoc diff --git a/cumulus/client/relay-chain-inprocess-interface/src/lib.rs b/cumulus/client/relay-chain-inprocess-interface/src/lib.rs index 6ea02b2e7c..578b942776 100644 --- a/cumulus/client/relay-chain-inprocess-interface/src/lib.rs +++ b/cumulus/client/relay-chain-inprocess-interface/src/lib.rs @@ -312,6 +312,9 @@ fn build_polkadot_full_node( overseer_message_channel_capacity_override: None, malus_finality_delay: None, hwbench, + execute_workers_max_num: None, + prepare_workers_hard_max_num: None, + prepare_workers_soft_max_num: None, }, )?; diff --git a/polkadot/cli/src/cli.rs b/polkadot/cli/src/cli.rs index 3737942e6e..3e5a6ccdd3 100644 --- a/polkadot/cli/src/cli.rs +++ b/polkadot/cli/src/cli.rs @@ -131,6 +131,23 @@ pub struct RunCmd { #[arg(long, value_name = "PATH")] pub workers_path: Option, + /// Override the maximum number of pvf execute workers. + /// + /// **Dangerous!** Do not touch unless explicitly advised to. + #[arg(long)] + pub execute_workers_max_num: Option, + /// Override the maximum number of pvf workers that can be spawned in the pvf prepare + /// pool for tasks with the priority below critical. + /// + /// **Dangerous!** Do not touch unless explicitly advised to. + + #[arg(long)] + pub prepare_workers_soft_max_num: Option, + /// Override the absolute number of pvf workers that can be spawned in the pvf prepare pool. + /// + /// **Dangerous!** Do not touch unless explicitly advised to. + #[arg(long)] + pub prepare_workers_hard_max_num: Option, /// TESTING ONLY: disable the version check between nodes and workers. #[arg(long, hide = true)] pub disable_worker_version_check: bool, diff --git a/polkadot/cli/src/command.rs b/polkadot/cli/src/command.rs index 6af93a7563..f5ee538e8c 100644 --- a/polkadot/cli/src/command.rs +++ b/polkadot/cli/src/command.rs @@ -253,6 +253,9 @@ where .overseer_channel_capacity_override, malus_finality_delay: maybe_malus_finality_delay, hwbench, + execute_workers_max_num: cli.run.execute_workers_max_num, + prepare_workers_hard_max_num: cli.run.prepare_workers_hard_max_num, + prepare_workers_soft_max_num: cli.run.prepare_workers_soft_max_num, }, ) .map(|full| full.task_manager)?; diff --git a/polkadot/node/core/candidate-validation/src/lib.rs b/polkadot/node/core/candidate-validation/src/lib.rs index 8663dc4383..08881dad19 100644 --- a/polkadot/node/core/candidate-validation/src/lib.rs +++ b/polkadot/node/core/candidate-validation/src/lib.rs @@ -100,6 +100,13 @@ pub struct Config { pub prep_worker_path: PathBuf, /// Path to the execution worker binary pub exec_worker_path: PathBuf, + /// The maximum number of pvf execution workers. + pub pvf_execute_workers_max_num: usize, + /// The maximum number of pvf workers that can be spawned in the pvf prepare pool for tasks + /// with the priority below critical. + pub pvf_prepare_workers_soft_max_num: usize, + /// The absolute number of pvf workers that can be spawned in the pvf prepare pool. + pub pvf_prepare_workers_hard_max_num: usize, } /// The candidate validation subsystem. @@ -224,6 +231,9 @@ async fn run( secure_validator_mode, prep_worker_path, exec_worker_path, + pvf_execute_workers_max_num, + pvf_prepare_workers_soft_max_num, + pvf_prepare_workers_hard_max_num, }: Config, ) -> SubsystemResult<()> { let (validation_host, task) = polkadot_node_core_pvf::start( @@ -233,6 +243,9 @@ async fn run( secure_validator_mode, prep_worker_path, exec_worker_path, + pvf_execute_workers_max_num, + pvf_prepare_workers_soft_max_num, + pvf_prepare_workers_hard_max_num, ), pvf_metrics, ) diff --git a/polkadot/node/core/pvf/benches/host_prepare_rococo_runtime.rs b/polkadot/node/core/pvf/benches/host_prepare_rococo_runtime.rs index 2aea21361a..97a03e6596 100644 --- a/polkadot/node/core/pvf/benches/host_prepare_rococo_runtime.rs +++ b/polkadot/node/core/pvf/benches/host_prepare_rococo_runtime.rs @@ -48,6 +48,9 @@ impl TestHost { false, prepare_worker_path, execute_worker_path, + 2, + 1, + 2, ); f(&mut config); let (host, task) = start(config, Metrics::default()).await.unwrap(); diff --git a/polkadot/node/core/pvf/src/host.rs b/polkadot/node/core/pvf/src/host.rs index 2d180fc592..4065598a3a 100644 --- a/polkadot/node/core/pvf/src/host.rs +++ b/polkadot/node/core/pvf/src/host.rs @@ -188,6 +188,9 @@ impl Config { secure_validator_mode: bool, prepare_worker_program_path: PathBuf, execute_worker_program_path: PathBuf, + execute_workers_max_num: usize, + prepare_workers_soft_max_num: usize, + prepare_workers_hard_max_num: usize, ) -> Self { Self { cache_path, @@ -196,12 +199,12 @@ impl Config { prepare_worker_program_path, prepare_worker_spawn_timeout: Duration::from_secs(3), - prepare_workers_soft_max_num: 1, - prepare_workers_hard_max_num: 2, + prepare_workers_soft_max_num, + prepare_workers_hard_max_num, execute_worker_program_path, execute_worker_spawn_timeout: Duration::from_secs(3), - execute_workers_max_num: 2, + execute_workers_max_num, } } } diff --git a/polkadot/node/core/pvf/tests/it/main.rs b/polkadot/node/core/pvf/tests/it/main.rs index 16ef23c69c..56cc681aff 100644 --- a/polkadot/node/core/pvf/tests/it/main.rs +++ b/polkadot/node/core/pvf/tests/it/main.rs @@ -63,6 +63,9 @@ impl TestHost { false, prepare_worker_path, execute_worker_path, + 2, + 1, + 2, ); f(&mut config); let (host, task) = start(config, Metrics::default()).await.unwrap(); diff --git a/polkadot/node/service/src/lib.rs b/polkadot/node/service/src/lib.rs index 22231c84b1..e5c2917209 100644 --- a/polkadot/node/service/src/lib.rs +++ b/polkadot/node/service/src/lib.rs @@ -643,6 +643,13 @@ pub struct NewFullParams { pub workers_path: Option, /// Optional custom names for the prepare and execute workers. pub workers_names: Option<(String, String)>, + /// An optional number of the maximum number of pvf execute workers. + pub execute_workers_max_num: Option, + /// An optional maximum number of pvf workers that can be spawned in the pvf prepare pool for + /// tasks with the priority below critical. + pub prepare_workers_soft_max_num: Option, + /// An optional absolute number of pvf workers that can be spawned in the pvf prepare pool. + pub prepare_workers_hard_max_num: Option, pub overseer_gen: OverseerGenerator, pub overseer_message_channel_capacity_override: Option, #[allow(dead_code)] @@ -738,6 +745,9 @@ pub fn new_full< overseer_message_channel_capacity_override, malus_finality_delay: _malus_finality_delay, hwbench, + execute_workers_max_num, + prepare_workers_soft_max_num, + prepare_workers_hard_max_num, }: NewFullParams, ) -> Result { use polkadot_node_network_protocol::request_response::IncomingRequest; @@ -943,6 +953,16 @@ pub fn new_full< secure_validator_mode, prep_worker_path, exec_worker_path, + pvf_execute_workers_max_num: execute_workers_max_num.unwrap_or_else( + || match config.chain_spec.identify_chain() { + // The intention is to use this logic for gradual increasing from 2 to 4 + // of this configuration chain by chain untill it reaches production chain. + Chain::Polkadot | Chain::Kusama => 2, + Chain::Rococo | Chain::Westend | Chain::Unknown => 4, + }, + ), + pvf_prepare_workers_soft_max_num: prepare_workers_soft_max_num.unwrap_or(1), + pvf_prepare_workers_hard_max_num: prepare_workers_hard_max_num.unwrap_or(2), }) } else { None diff --git a/polkadot/node/test/service/src/lib.rs b/polkadot/node/test/service/src/lib.rs index d313c19333..87fbc7c20f 100644 --- a/polkadot/node/test/service/src/lib.rs +++ b/polkadot/node/test/service/src/lib.rs @@ -97,6 +97,9 @@ pub fn new_full( overseer_message_channel_capacity_override: None, malus_finality_delay: None, hwbench: None, + execute_workers_max_num: None, + prepare_workers_hard_max_num: None, + prepare_workers_soft_max_num: None, }, ), sc_network::config::NetworkBackendType::Litep2p => @@ -116,6 +119,9 @@ pub fn new_full( overseer_message_channel_capacity_override: None, malus_finality_delay: None, hwbench: None, + execute_workers_max_num: None, + prepare_workers_hard_max_num: None, + prepare_workers_soft_max_num: None, }, ), } diff --git a/polkadot/parachain/test-parachains/adder/collator/src/main.rs b/polkadot/parachain/test-parachains/adder/collator/src/main.rs index fec90fc41c..e8588274df 100644 --- a/polkadot/parachain/test-parachains/adder/collator/src/main.rs +++ b/polkadot/parachain/test-parachains/adder/collator/src/main.rs @@ -95,6 +95,9 @@ fn main() -> Result<()> { overseer_message_channel_capacity_override: None, malus_finality_delay: None, hwbench: None, + execute_workers_max_num: None, + prepare_workers_hard_max_num: None, + prepare_workers_soft_max_num: None, }, ) .map_err(|e| e.to_string())?; diff --git a/polkadot/parachain/test-parachains/undying/collator/src/main.rs b/polkadot/parachain/test-parachains/undying/collator/src/main.rs index 45f21e7b85..7198a831a4 100644 --- a/polkadot/parachain/test-parachains/undying/collator/src/main.rs +++ b/polkadot/parachain/test-parachains/undying/collator/src/main.rs @@ -97,6 +97,9 @@ fn main() -> Result<()> { overseer_message_channel_capacity_override: None, malus_finality_delay: None, hwbench: None, + execute_workers_max_num: None, + prepare_workers_hard_max_num: None, + prepare_workers_soft_max_num: None, }, ) .map_err(|e| e.to_string())?; diff --git a/prdoc/pr_4252.prdoc b/prdoc/pr_4252.prdoc new file mode 100644 index 0000000000..22987b4684 --- /dev/null +++ b/prdoc/pr_4252.prdoc @@ -0,0 +1,15 @@ +title: "Add logic to increase pvf worker based on chain" + +doc: + - audience: Node Operator + description: | + A new logic and cli parameters were added to allow increasing the number of pvf + workers based on the chain-id. + +crates: + - name: polkadot-node-core-candidate-validation + bump: minor + - name: polkadot-cli + bump: minor + - name: polkadot-service + bump: minor