Follow ups for benchmark machine (#11270)

* Follow ups for the MachineCmd

Signed-off-by: Oliver Tale-Yazdi <oliver.tale-yazdi@parity.io>

* Fix CI

Signed-off-by: Oliver Tale-Yazdi <oliver.tale-yazdi@parity.io>

* Review fixes

Signed-off-by: Oliver Tale-Yazdi <oliver.tale-yazdi@parity.io>

* Add to node-template

Signed-off-by: Oliver Tale-Yazdi <oliver.tale-yazdi@parity.io>

* Fix test with feature flag

Signed-off-by: Oliver Tale-Yazdi <oliver.tale-yazdi@parity.io>

* Review fixes

Signed-off-by: Oliver Tale-Yazdi <oliver.tale-yazdi@parity.io>

* Lower disk requirements

Signed-off-by: Oliver Tale-Yazdi <oliver.tale-yazdi@parity.io>

* Add ExecutionLimit to the disk benchmarks

Signed-off-by: Oliver Tale-Yazdi <oliver.tale-yazdi@parity.io>

* fmt

Signed-off-by: Oliver Tale-Yazdi <oliver.tale-yazdi@parity.io>

* Add doc

Signed-off-by: Oliver Tale-Yazdi <oliver.tale-yazdi@parity.io>

* Review fixes

Signed-off-by: Oliver Tale-Yazdi <oliver.tale-yazdi@parity.io>

* Rename DISK_WRITE_LIMIT -> DEFAULT_DISK_EXECUTION_LIMIT

Signed-off-by: Oliver Tale-Yazdi <oliver.tale-yazdi@parity.io>

* Rename POLKADOT_REFERENCE_HARDWARE -> SUBSTRATE_REFERENCE_HARDWARE

Signed-off-by: Oliver Tale-Yazdi <oliver.tale-yazdi@parity.io>

* Fix build profile + add license

Signed-off-by: Oliver Tale-Yazdi <oliver.tale-yazdi@parity.io>

* Remove deps

Signed-off-by: Oliver Tale-Yazdi <oliver.tale-yazdi@parity.io>

* Set tolerance to 10%

Signed-off-by: Oliver Tale-Yazdi <oliver.tale-yazdi@parity.io>

* Fix tests

Signed-off-by: Oliver Tale-Yazdi <oliver.tale-yazdi@parity.io>

* Ignore test

I cannot reproduce the CI error, even with the full command:
cargo test --workspace --locked --release --verbose --features runtime-benchmarks --manifest-path ./bin/node/cli/Cargo.toml

I will put an 'ignore' on that test for now, since it works for me and is worth having.

Signed-off-by: Oliver Tale-Yazdi <oliver.tale-yazdi@parity.io>

* Remove test

Still cannot reproduce the error and it fails in the CI.
Removing it now.

Signed-off-by: Oliver Tale-Yazdi <oliver.tale-yazdi@parity.io>

Co-authored-by: Shawn Tabrizi <shawntabrizi@gmail.com>
This commit is contained in:
Oliver Tale-Yazdi
2022-04-26 16:31:26 +02:00
committed by GitHub
parent 9a3201ef3d
commit 9980d314b1
14 changed files with 512 additions and 42 deletions
@@ -18,6 +18,8 @@
//! Contains the [`MachineCmd`] as entry point for the node
//! and the core benchmarking logic.
pub mod hardware;
use sc_cli::{CliConfiguration, Result, SharedParams};
use sc_service::Configuration;
use sc_sysinfo::{
@@ -26,9 +28,12 @@ use sc_sysinfo::{
};
use clap::Parser;
use log::info;
use log::{error, info, warn};
use prettytable::{cell, row, table};
use std::{fmt::Debug, fs, time::Duration};
use std::{boxed::Box, fmt::Debug, fs, path::Path};
use crate::shared::check_build_profile;
pub use hardware::{Metric, Requirement, Requirements, Throughput, SUBSTRATE_REFERENCE_HARDWARE};
/// Command to benchmark the hardware.
///
// NOTE(review): the `@@ … @@` line below is a diff-hunk marker from the
// rendered patch, not Rust source — the `pub struct MachineCmd {` header
// lives in its trailing context text. Confirm against the repository file.
@@ -44,38 +49,174 @@ pub struct MachineCmd {
#[clap(flatten)]
pub shared_params: SharedParams,
/// Do not return an error if any check fails.
///
/// Should only be used for debugging.
#[clap(long)]
pub allow_fail: bool,
/// Set a fault tolerance for passing a requirement.
///
/// 10% means that the test would pass even when only 90% score was achieved.
/// Can be used to mitigate outliers of the benchmarks.
#[clap(long, default_value = "10.0", value_name = "PERCENT")]
pub tolerance: f64,
/// Time limit for the verification benchmark, in seconds.
#[clap(long, default_value = "2.0", value_name = "SECONDS")]
pub verify_duration: f32,
/// Time limit for each disk benchmark, in seconds.
#[clap(long, default_value = "5.0", value_name = "SECONDS")]
pub disk_duration: f32,
}
/// Helper for the result of a concrete benchmark.
struct BenchResult {
/// Did the hardware pass the benchmark?
passed: bool,
/// The absolute score that was achieved.
score: Throughput,
/// The score relative to the minimal required score.
///
/// A value of 1.0 means the requirement was exactly met; larger is better.
/// Values outside (0.01, 100) trip the sanity check in `run_benchmark`.
rel_score: f64,
}
/// Errors that can be returned by this command.
#[derive(Debug, thiserror::Error)]
#[allow(missing_docs)]
pub enum Error {
/// At least one benchmark scored below its requirement (after tolerance).
#[error("One of the benchmarks had a score that was lower than its requirement")]
UnmetRequirement,
/// `check_build_profile` judged the build profile unfit for benchmarking.
#[error("The build profile is unfit for benchmarking: {0}")]
BadBuildProfile(String),
/// Sanity check tripped: a score was >=100x above or below its requirement.
#[error("Benchmark results are off by at least factor 100")]
BadResults,
}
impl MachineCmd {
/// Execute the benchmark and print the results.
// NOTE(review): the next two lines show both the pre- and post-change
// signature of `run` — this is residue of the diff rendering; only the
// `requirements`-taking variant should exist. Confirm in the repository.
pub fn run(&self, cfg: &Configuration) -> Result<()> {
pub fn run(&self, cfg: &Configuration, requirements: Requirements) -> Result<()> {
self.validate_args()?;
// Ensure that the dir exists since the node is not started to take care of it.
let dir = cfg.database.path().ok_or("No DB directory provided")?;
fs::create_dir_all(dir)?;
info!("Running machine benchmarks...");
// NOTE(review): `write`/`read`/`verify` below are not used by the loop or
// `print_summary` — these look like removed pre-change lines kept by the
// diff view; verify against the repository before relying on them.
let write = benchmark_disk_sequential_writes(dir)?;
let read = benchmark_disk_random_writes(dir)?;
let verify_limit =
ExecutionLimit::MaxDuration(Duration::from_secs_f32(self.verify_duration));
let verify = benchmark_sr25519_verify(verify_limit) * 1024.0;
// Run each benchmark from the requirement set and collect its result.
let mut results = Vec::new();
for requirement in &requirements.0 {
let result = self.run_benchmark(requirement, &dir)?;
results.push(result);
}
// Print the table; errors out if any requirement was unmet.
self.print_summary(requirements, results)
}
/// Runs the benchmark for one [`Requirement`] and judges the achieved score.
fn run_benchmark(&self, requirement: &Requirement, dir: &Path) -> Result<BenchResult> {
	// `measure` dispatches to the matching benchmark function from `sc-sysinfo`.
	let score = self.measure(&requirement.metric, dir)?;
	// Score relative to the required minimum; 1.0 means the bar was exactly met.
	let rel_score = score.to_bs() / requirement.minimum.to_bs();

	// Sanity check: results off by a factor of 100 or more in either direction
	// are treated as bogus rather than a legitimate pass/fail.
	let implausible = rel_score >= 100.0 || rel_score <= 0.01;
	if implausible {
		self.check_failed(Error::BadResults)?;
	}

	// Judge the score while granting the configured fault tolerance (percent).
	let threshold = 1.0 - (self.tolerance / 100.0);
	let passed = rel_score >= threshold;
	Ok(BenchResult { passed, score, rel_score })
}
/// Measures a single hardware metric and returns the achieved throughput.
fn measure(&self, metric: &Metric, dir: &Path) -> Result<Throughput> {
	// Per-benchmark execution time limits taken from the CLI arguments.
	let verify_limit = ExecutionLimit::from_secs_f32(self.verify_duration);
	let disk_limit = ExecutionLimit::from_secs_f32(self.disk_duration);
	// Dispatch to the concrete `sc-sysinfo` benchmark for this metric.
	Ok(match metric {
		Metric::Blake2256 => Throughput::MiBs(benchmark_cpu() as f64),
		Metric::Sr25519Verify => Throughput::MiBs(benchmark_sr25519_verify(verify_limit)),
		Metric::MemCopy => Throughput::MiBs(benchmark_memory() as f64),
		Metric::DiskSeqWrite =>
			Throughput::MiBs(benchmark_disk_sequential_writes(disk_limit, dir)? as f64),
		Metric::DiskRndWrite =>
			Throughput::MiBs(benchmark_disk_random_writes(disk_limit, dir)? as f64),
	})
}
/// Prints a human-readable summary.
// NOTE(review): this body interleaves pre- and post-change lines from the
// diff rendering — the first `table!` (with `verify`/`write`/`read`, which
// are not defined here) and the `info!` inside the loop look like removed
// old code. Confirm against the repository file.
fn print_summary(&self, requirements: Requirements, results: Vec<BenchResult>) -> Result<()> {
// Use a table for nicer console output.
let table = table!(
["Category", "Function", "Score", "Unit"],
["CPU", "BLAKE2-256", benchmark_cpu(), "MB/s"],
["CPU", "SR25519 Verify", format!("{:.1}", verify), "KB/s"],
["Memory", "Copy", benchmark_memory(), "MB/s"],
["Disk", "Seq Write", write, "MB/s"],
["Disk", "Rnd Write", read, "MB/s"]
);
// One row per benchmark: score, required minimum, and pass/fail verdict.
let mut table = table!(["Category", "Function", "Score", "Minimum", "Result"]);
// Count how many passed and how many failed.
let (mut passed, mut failed) = (0, 0);
for (requirement, result) in requirements.0.iter().zip(results.iter()) {
if result.passed {
passed += 1
} else {
failed += 1
}
info!("\n{}", table);
table.add_row(result.to_row(requirement));
}
// Print the table and a summary.
info!(
"\n{}\nFrom {} benchmarks in total, {} passed and {} failed ({:.0?}% fault tolerance).",
table,
passed + failed,
passed,
failed,
self.tolerance
);
// Print the final result.
if failed != 0 {
info!("The hardware fails to meet the requirements");
self.check_failed(Error::UnmetRequirement)?;
} else {
// NOTE(review): trailing space in this message — possibly a stripped
// emoji from the rendering; confirm against the repository.
info!("The hardware meets the requirements ");
}
// Check that the results were not created by a bad build profile.
if let Err(err) = check_build_profile() {
self.check_failed(Error::BadBuildProfile(err))?;
}
Ok(())
}
/// Either swallows or propagates the error, depending on `--allow-fail`.
///
/// With `--allow-fail` set the error is only logged as a warning and `Ok` is
/// returned; otherwise it is wrapped and returned to the caller.
fn check_failed(&self, e: Error) -> Result<()> {
	if self.allow_fail {
		warn!("Ignoring error since --allow-fail is set: {:?}", e);
		Ok(())
	} else {
		error!("Failing since --allow-fail is not set");
		Err(sc_cli::Error::Application(Box::new(e)))
	}
}
/// Validates the CLI arguments.
///
/// Returns an error if `--tolerance` is not a percentage in `[0, 100]`.
fn validate_args(&self) -> Result<()> {
	// Using `contains` (rather than `>` / `<` comparisons) also rejects NaN:
	// every comparison against NaN is false, so the original form
	// `tolerance > 100.0 || tolerance < 0.0` silently accepted `--tolerance NaN`.
	if !(0.0..=100.0).contains(&self.tolerance) {
		return Err("The --tolerance argument is out of range".into())
	}
	Ok(())
}
}
impl BenchResult {
	/// Renders `self` as a `prettytable` row for the summary table.
	fn to_row(&self, req: &Requirement) -> prettytable::Row {
		// Human-readable verdict for the last column.
		let verdict = if self.passed { "✅ Pass" } else { "❌ Fail" };
		// Relative score expressed as a percentage.
		let percent = self.rel_score * 100.0;
		row![
			req.metric.category(),
			req.metric.name(),
			format!("{}", self.score),
			format!("{}", req.minimum),
			format!("{} ({: >5.1?} %)", verdict, percent)
		]
	}
}
// Boilerplate