Follow ups for benchmark machine (#11270)

* Follow ups for the MachineCmd

Signed-off-by: Oliver Tale-Yazdi <oliver.tale-yazdi@parity.io>

* Fix CI

Signed-off-by: Oliver Tale-Yazdi <oliver.tale-yazdi@parity.io>

* Review fixes

Signed-off-by: Oliver Tale-Yazdi <oliver.tale-yazdi@parity.io>

* Add to node-template

Signed-off-by: Oliver Tale-Yazdi <oliver.tale-yazdi@parity.io>

* Fix test with feature flag

Signed-off-by: Oliver Tale-Yazdi <oliver.tale-yazdi@parity.io>

* Review fixes

Signed-off-by: Oliver Tale-Yazdi <oliver.tale-yazdi@parity.io>

* Lower disk requirements

Signed-off-by: Oliver Tale-Yazdi <oliver.tale-yazdi@parity.io>

* Add ExecutionLimit to the disk benchmarks

Signed-off-by: Oliver Tale-Yazdi <oliver.tale-yazdi@parity.io>

* fmt

Signed-off-by: Oliver Tale-Yazdi <oliver.tale-yazdi@parity.io>

* Add doc

Signed-off-by: Oliver Tale-Yazdi <oliver.tale-yazdi@parity.io>

* Review fixes

Signed-off-by: Oliver Tale-Yazdi <oliver.tale-yazdi@parity.io>

* Rename DISK_WRITE_LIMIT -> DEFAULT_DISK_EXECUTION_LIMIT

Signed-off-by: Oliver Tale-Yazdi <oliver.tale-yazdi@parity.io>

* Rename POLKADOT_REFERENCE_HARDWARE -> SUBSTRATE_REFERENCE_HARDWARE

Signed-off-by: Oliver Tale-Yazdi <oliver.tale-yazdi@parity.io>

* Fix build profile + add license

Signed-off-by: Oliver Tale-Yazdi <oliver.tale-yazdi@parity.io>

* Remove deps

Signed-off-by: Oliver Tale-Yazdi <oliver.tale-yazdi@parity.io>

* Set tolerance to 10%

Signed-off-by: Oliver Tale-Yazdi <oliver.tale-yazdi@parity.io>

* Fix tests

Signed-off-by: Oliver Tale-Yazdi <oliver.tale-yazdi@parity.io>

* Ignore test

I cannot reproduce the CI error, even with the full command:
cargo test --workspace --locked --release --verbose --features runtime-benchmarks --manifest-path ./bin/node/cli/Cargo.toml

I will put an 'ignore' on that test for now, since it works for me and is worth having.

Signed-off-by: Oliver Tale-Yazdi <oliver.tale-yazdi@parity.io>

* Remove test

Still cannot reproduce the error and it fails in the CI.
Removing it now.

Signed-off-by: Oliver Tale-Yazdi <oliver.tale-yazdi@parity.io>

Co-authored-by: Shawn Tabrizi <shawntabrizi@gmail.com>
This commit is contained in:
Oliver Tale-Yazdi
2022-04-26 16:31:26 +02:00
committed by GitHub
parent 9a3201ef3d
commit 9980d314b1
14 changed files with 512 additions and 42 deletions
@@ -18,6 +18,8 @@
//! Contains the [`MachineCmd`] as entry point for the node
//! and the core benchmarking logic.
pub mod hardware;
use sc_cli::{CliConfiguration, Result, SharedParams};
use sc_service::Configuration;
use sc_sysinfo::{
@@ -26,9 +28,12 @@ use sc_sysinfo::{
};
use clap::Parser;
use log::info;
use log::{error, info, warn};
use prettytable::{cell, row, table};
use std::{fmt::Debug, fs, time::Duration};
use std::{boxed::Box, fmt::Debug, fs, path::Path};
use crate::shared::check_build_profile;
pub use hardware::{Metric, Requirement, Requirements, Throughput, SUBSTRATE_REFERENCE_HARDWARE};
/// Command to benchmark the hardware.
///
// NOTE(review): the `@@ … @@` line below is a diff-hunk marker from the
// rendered patch, not Rust source — the `pub struct MachineCmd {` header
// lives in its trailing context text. Confirm against the repository file.
@@ -44,38 +49,174 @@ pub struct MachineCmd {
#[clap(flatten)]
pub shared_params: SharedParams,
/// Do not return an error if any check fails.
///
/// Should only be used for debugging.
#[clap(long)]
pub allow_fail: bool,
/// Set a fault tolerance for passing a requirement.
///
/// 10% means that the test would pass even when only 90% score was achieved.
/// Can be used to mitigate outliers of the benchmarks.
#[clap(long, default_value = "10.0", value_name = "PERCENT")]
pub tolerance: f64,
/// Time limit for the verification benchmark, in seconds.
#[clap(long, default_value = "2.0", value_name = "SECONDS")]
pub verify_duration: f32,
/// Time limit for each disk benchmark, in seconds.
#[clap(long, default_value = "5.0", value_name = "SECONDS")]
pub disk_duration: f32,
}
/// Helper for the result of a concrete benchmark.
struct BenchResult {
/// Did the hardware pass the benchmark?
passed: bool,
/// The absolute score that was achieved.
score: Throughput,
/// The score relative to the minimal required score.
///
/// A value of 1.0 means the requirement was exactly met; larger is better.
/// Values outside (0.01, 100) trip the sanity check in `run_benchmark`.
rel_score: f64,
}
/// Errors that can be returned by this command.
#[derive(Debug, thiserror::Error)]
#[allow(missing_docs)]
pub enum Error {
/// At least one benchmark scored below its requirement (after tolerance).
#[error("One of the benchmarks had a score that was lower than its requirement")]
UnmetRequirement,
/// `check_build_profile` judged the build profile unfit for benchmarking.
#[error("The build profile is unfit for benchmarking: {0}")]
BadBuildProfile(String),
/// Sanity check tripped: a score was >=100x above or below its requirement.
#[error("Benchmark results are off by at least factor 100")]
BadResults,
}
impl MachineCmd {
/// Execute the benchmark and print the results.
// NOTE(review): the next two lines show both the pre- and post-change
// signature of `run` — this is residue of the diff rendering; only the
// `requirements`-taking variant should exist. Confirm in the repository.
pub fn run(&self, cfg: &Configuration) -> Result<()> {
pub fn run(&self, cfg: &Configuration, requirements: Requirements) -> Result<()> {
self.validate_args()?;
// Ensure that the dir exists since the node is not started to take care of it.
let dir = cfg.database.path().ok_or("No DB directory provided")?;
fs::create_dir_all(dir)?;
info!("Running machine benchmarks...");
// NOTE(review): `write`/`read`/`verify` below are not used by the loop or
// `print_summary` — these look like removed pre-change lines kept by the
// diff view; verify against the repository before relying on them.
let write = benchmark_disk_sequential_writes(dir)?;
let read = benchmark_disk_random_writes(dir)?;
let verify_limit =
ExecutionLimit::MaxDuration(Duration::from_secs_f32(self.verify_duration));
let verify = benchmark_sr25519_verify(verify_limit) * 1024.0;
// Run each benchmark from the requirement set and collect its result.
let mut results = Vec::new();
for requirement in &requirements.0 {
let result = self.run_benchmark(requirement, &dir)?;
results.push(result);
}
// Print the table; errors out if any requirement was unmet.
self.print_summary(requirements, results)
}
/// Runs the benchmark for one [`Requirement`] and judges the achieved score.
fn run_benchmark(&self, requirement: &Requirement, dir: &Path) -> Result<BenchResult> {
	// `measure` dispatches to the matching benchmark function from `sc-sysinfo`.
	let score = self.measure(&requirement.metric, dir)?;
	// Score relative to the required minimum; 1.0 means the bar was exactly met.
	let rel_score = score.to_bs() / requirement.minimum.to_bs();

	// Sanity check: results off by a factor of 100 or more in either direction
	// are treated as bogus rather than a legitimate pass/fail.
	let implausible = rel_score >= 100.0 || rel_score <= 0.01;
	if implausible {
		self.check_failed(Error::BadResults)?;
	}

	// Judge the score while granting the configured fault tolerance (percent).
	let threshold = 1.0 - (self.tolerance / 100.0);
	let passed = rel_score >= threshold;
	Ok(BenchResult { passed, score, rel_score })
}
/// Measures a single hardware metric and returns the achieved throughput.
fn measure(&self, metric: &Metric, dir: &Path) -> Result<Throughput> {
	// Per-benchmark execution time limits taken from the CLI arguments.
	let verify_limit = ExecutionLimit::from_secs_f32(self.verify_duration);
	let disk_limit = ExecutionLimit::from_secs_f32(self.disk_duration);
	// Dispatch to the concrete `sc-sysinfo` benchmark for this metric.
	Ok(match metric {
		Metric::Blake2256 => Throughput::MiBs(benchmark_cpu() as f64),
		Metric::Sr25519Verify => Throughput::MiBs(benchmark_sr25519_verify(verify_limit)),
		Metric::MemCopy => Throughput::MiBs(benchmark_memory() as f64),
		Metric::DiskSeqWrite =>
			Throughput::MiBs(benchmark_disk_sequential_writes(disk_limit, dir)? as f64),
		Metric::DiskRndWrite =>
			Throughput::MiBs(benchmark_disk_random_writes(disk_limit, dir)? as f64),
	})
}
/// Prints a human-readable summary.
// NOTE(review): this body interleaves pre- and post-change lines from the
// diff rendering — the first `table!` (with `verify`/`write`/`read`, which
// are not defined here) and the `info!` inside the loop look like removed
// old code. Confirm against the repository file.
fn print_summary(&self, requirements: Requirements, results: Vec<BenchResult>) -> Result<()> {
// Use a table for nicer console output.
let table = table!(
["Category", "Function", "Score", "Unit"],
["CPU", "BLAKE2-256", benchmark_cpu(), "MB/s"],
["CPU", "SR25519 Verify", format!("{:.1}", verify), "KB/s"],
["Memory", "Copy", benchmark_memory(), "MB/s"],
["Disk", "Seq Write", write, "MB/s"],
["Disk", "Rnd Write", read, "MB/s"]
);
// One row per benchmark: score, required minimum, and pass/fail verdict.
let mut table = table!(["Category", "Function", "Score", "Minimum", "Result"]);
// Count how many passed and how many failed.
let (mut passed, mut failed) = (0, 0);
for (requirement, result) in requirements.0.iter().zip(results.iter()) {
if result.passed {
passed += 1
} else {
failed += 1
}
info!("\n{}", table);
table.add_row(result.to_row(requirement));
}
// Print the table and a summary.
info!(
"\n{}\nFrom {} benchmarks in total, {} passed and {} failed ({:.0?}% fault tolerance).",
table,
passed + failed,
passed,
failed,
self.tolerance
);
// Print the final result.
if failed != 0 {
info!("The hardware fails to meet the requirements");
self.check_failed(Error::UnmetRequirement)?;
} else {
// NOTE(review): trailing space in this message — possibly a stripped
// emoji from the rendering; confirm against the repository.
info!("The hardware meets the requirements ");
}
// Check that the results were not created by a bad build profile.
if let Err(err) = check_build_profile() {
self.check_failed(Error::BadBuildProfile(err))?;
}
Ok(())
}
/// Either swallows or propagates the error, depending on `--allow-fail`.
///
/// With `--allow-fail` set the error is only logged as a warning and `Ok` is
/// returned; otherwise it is wrapped and returned to the caller.
fn check_failed(&self, e: Error) -> Result<()> {
	if self.allow_fail {
		warn!("Ignoring error since --allow-fail is set: {:?}", e);
		Ok(())
	} else {
		error!("Failing since --allow-fail is not set");
		Err(sc_cli::Error::Application(Box::new(e)))
	}
}
/// Validates the CLI arguments.
///
/// Returns an error if `--tolerance` is not a percentage in `[0, 100]`.
fn validate_args(&self) -> Result<()> {
	// Using `contains` (rather than `>` / `<` comparisons) also rejects NaN:
	// every comparison against NaN is false, so the original form
	// `tolerance > 100.0 || tolerance < 0.0` silently accepted `--tolerance NaN`.
	if !(0.0..=100.0).contains(&self.tolerance) {
		return Err("The --tolerance argument is out of range".into())
	}
	Ok(())
}
}
impl BenchResult {
	/// Renders `self` as a `prettytable` row for the summary table.
	fn to_row(&self, req: &Requirement) -> prettytable::Row {
		// Human-readable verdict for the last column.
		let verdict = if self.passed { "✅ Pass" } else { "❌ Fail" };
		// Relative score expressed as a percentage.
		let percent = self.rel_score * 100.0;
		row![
			req.metric.category(),
			req.metric.name(),
			format!("{}", self.score),
			format!("{}", req.minimum),
			format!("{} ({: >5.1?} %)", verdict, percent)
		]
	}
}
// Boilerplate