mirror of
https://github.com/pezkuwichain/pezkuwi-subxt.git
synced 2026-04-30 15:27:57 +00:00
subsystem benchmarks: add cpu profiling (#2734)
Ready-to-merge version of https://github.com/paritytech/polkadot-sdk/pull/2601 - Added optional CPU profiling - Updated instructions on how to set up Prometheus, Pyroscope and Grafana - Added a flamegraph dashboard <img width="1470" alt="image" src="https://github.com/paritytech/polkadot-sdk/assets/27277055/c8f3b33d-3c01-4ec0-ac34-72d52325b6e6"> --------- Co-authored-by: ordian <write@reusable.software>
This commit is contained in:
Generated
+2
@@ -13322,6 +13322,8 @@ dependencies = [
|
||||
"polkadot-primitives",
|
||||
"polkadot-primitives-test-helpers",
|
||||
"prometheus",
|
||||
"pyroscope",
|
||||
"pyroscope_pprofrs",
|
||||
"rand 0.8.5",
|
||||
"sc-keystore",
|
||||
"sc-network",
|
||||
|
||||
@@ -56,6 +56,8 @@ serde = "1.0.192"
|
||||
serde_yaml = "0.9"
|
||||
paste = "1.0.14"
|
||||
orchestra = { version = "0.3.3", default-features = false, features = ["futures_channel"] }
|
||||
pyroscope = "0.5.7"
|
||||
pyroscope_pprofrs = "0.2.7"
|
||||
|
||||
[features]
|
||||
default = []
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# Subsystem benchmark client
|
||||
|
||||
Run parachain consensus stress and performance tests on your development machine.
|
||||
Run parachain consensus stress and performance tests on your development machine.
|
||||
|
||||
## Motivation
|
||||
|
||||
@@ -26,17 +26,26 @@ The output binary will be placed in `target/testnet/subsystem-bench`.
|
||||
|
||||
### Test metrics
|
||||
|
||||
Subsystem, CPU usage and network metrics are exposed via a prometheus endpoint during the test execution.
|
||||
Subsystem, CPU usage and network metrics are exposed via a prometheus endpoint during the test execution.
|
||||
A small subset of these collected metrics is displayed in the CLI, but for an in-depth analysis of the test results,
|
||||
a local Grafana/Prometheus stack is needed.
|
||||
|
||||
### Run Prometheus, Pyroscope and Grafana in Docker
|
||||
|
||||
If Docker is not usable, then follow the next sections to manually install Prometheus, Pyroscope and Grafana on your machine.
|
||||
|
||||
```bash
|
||||
cd polkadot/node/subsystem-bench/docker
|
||||
docker compose up
|
||||
```
|
||||
|
||||
### Install Prometheus
|
||||
|
||||
Please follow the [official installation guide](https://prometheus.io/docs/prometheus/latest/installation/) for your
|
||||
platform/OS.
|
||||
|
||||
After successfully installing and starting up Prometheus, we need to alter its configuration such that it
|
||||
will scrape the benchmark prometheus endpoint `127.0.0.1:9999`. Please check the prometheus official documentation
|
||||
will scrape the benchmark prometheus endpoint `127.0.0.1:9999`. Please check the prometheus official documentation
|
||||
regarding the location of `prometheus.yml`. On macOS, for example, the full path is `/opt/homebrew/etc/prometheus.yml`
|
||||
|
||||
prometheus.yml:
|
||||
@@ -57,13 +66,29 @@ scrape_configs:
|
||||
|
||||
To complete this step restart Prometheus server such that it picks up the new configuration.
|
||||
|
||||
### Install and setup Grafana
|
||||
### Install Pyroscope
|
||||
|
||||
To collect CPU profiling data, you must be running the Pyroscope server.
|
||||
Follow the [installation guide](https://grafana.com/docs/pyroscope/latest/get-started/)
|
||||
relevant to your operating system.
|
||||
|
||||
### Install Grafana
|
||||
|
||||
Follow the [installation guide](https://grafana.com/docs/grafana/latest/setup-grafana/installation/) relevant
|
||||
to your operating system.
|
||||
|
||||
Once you have the installation up and running, configure the local Prometheus as a data source by following
|
||||
[this guide](https://grafana.com/docs/grafana/latest/datasources/prometheus/configure-prometheus-data-source/)
|
||||
### Setup Grafana
|
||||
|
||||
Once you have the installation up and running, configure the local Prometheus and Pyroscope (if needed)
|
||||
as data sources by following these guides:
|
||||
|
||||
- [Prometheus](https://grafana.com/docs/grafana/latest/datasources/prometheus/configure-prometheus-data-source/)
|
||||
- [Pyroscope](https://grafana.com/docs/grafana/latest/datasources/grafana-pyroscope/)
|
||||
|
||||
If you are running the servers in Docker, use the following URLs:
|
||||
|
||||
- Prometheus `http://prometheus:9090/`
|
||||
- Pyroscope `http://pyroscope:4040/`
|
||||
|
||||
#### Import dashboards
|
||||
|
||||
@@ -86,26 +111,29 @@ Commands:
|
||||
```
|
||||
|
||||
Note: `test-sequence` is a special test objective that wraps up an arbitrary number of test objectives. It is typically
|
||||
used to run a suite of tests defined in a `yaml` file like in this [example](examples/availability_read.yaml).
|
||||
used to run a suite of tests defined in a `yaml` file like in this [example](examples/availability_read.yaml).
|
||||
|
||||
### Standard test options
|
||||
|
||||
|
||||
```
|
||||
Options:
|
||||
--network <NETWORK> The type of network to be emulated [default: ideal] [possible values:
|
||||
ideal, healthy, degraded]
|
||||
--n-cores <N_CORES> Number of cores to fetch availability for [default: 100]
|
||||
--n-validators <N_VALIDATORS> Number of validators to fetch chunks from [default: 500]
|
||||
--min-pov-size <MIN_POV_SIZE> The minimum pov size in KiB [default: 5120]
|
||||
--max-pov-size <MAX_POV_SIZE> The maximum pov size bytes [default: 5120]
|
||||
-n, --num-blocks <NUM_BLOCKS> The number of blocks the test is going to run [default: 1]
|
||||
-p, --peer-bandwidth <PEER_BANDWIDTH> The bandwidth of simulated remote peers in KiB
|
||||
-b, --bandwidth <BANDWIDTH> The bandwidth of our simulated node in KiB
|
||||
--peer-error <PEER_ERROR> Simulated conection error ratio [0-100]
|
||||
--peer-min-latency <PEER_MIN_LATENCY> Minimum remote peer latency in milliseconds [0-5000]
|
||||
--peer-max-latency <PEER_MAX_LATENCY> Maximum remote peer latency in milliseconds [0-5000]
|
||||
-h, --help Print help
|
||||
-V, --version Print version
|
||||
--network <NETWORK> The type of network to be emulated [default: ideal] [possible values:
|
||||
ideal, healthy, degraded]
|
||||
--n-cores <N_CORES> Number of cores to fetch availability for [default: 100]
|
||||
--n-validators <N_VALIDATORS> Number of validators to fetch chunks from [default: 500]
|
||||
--min-pov-size <MIN_POV_SIZE> The minimum pov size in KiB [default: 5120]
|
||||
--max-pov-size <MAX_POV_SIZE> The maximum pov size bytes [default: 5120]
|
||||
-n, --num-blocks <NUM_BLOCKS> The number of blocks the test is going to run [default: 1]
|
||||
-p, --peer-bandwidth <PEER_BANDWIDTH> The bandwidth of simulated remote peers in KiB
|
||||
-b, --bandwidth <BANDWIDTH> The bandwidth of our simulated node in KiB
|
||||
--peer-error <PEER_ERROR> Simulated conection error ratio [0-100]
|
||||
--peer-min-latency <PEER_MIN_LATENCY> Minimum remote peer latency in milliseconds [0-5000]
|
||||
--peer-max-latency <PEER_MAX_LATENCY> Maximum remote peer latency in milliseconds [0-5000]
|
||||
--profile Enable CPU Profiling with Pyroscope
|
||||
--pyroscope-url <PYROSCOPE_URL> Pyroscope Server URL [default: http://localhost:4040]
|
||||
--pyroscope-sample-rate <PYROSCOPE_SAMPLE_RATE> Pyroscope Sample Rate [default: 113]
|
||||
-h, --help Print help
|
||||
-V, --version Print version
|
||||
```
|
||||
|
||||
These apply to all test objectives, except `test-sequence` which relies on the values being specified in a file.
|
||||
@@ -123,8 +151,8 @@ Benchmark availability recovery strategies
|
||||
Usage: subsystem-bench data-availability-read [OPTIONS]
|
||||
|
||||
Options:
|
||||
-f, --fetch-from-backers Turbo boost AD Read by fetching the full availability datafrom backers first. Saves CPU
|
||||
as we don't need to re-construct from chunks. Tipically this is only faster if nodes
|
||||
-f, --fetch-from-backers Turbo boost AD Read by fetching the full availability datafrom backers first. Saves CPU
|
||||
as we don't need to re-construct from chunks. Tipically this is only faster if nodes
|
||||
have enough bandwidth
|
||||
-h, --help Print help
|
||||
```
|
||||
@@ -152,8 +180,8 @@ Let's run an availabilty read test which will recover availability for 10 cores
|
||||
node validator network.
|
||||
|
||||
```
|
||||
target/testnet/subsystem-bench --n-cores 10 data-availability-read
|
||||
[2023-11-28T09:01:59Z INFO subsystem_bench::core::display] n_validators = 500, n_cores = 10, pov_size = 5120 - 5120,
|
||||
target/testnet/subsystem-bench --n-cores 10 data-availability-read
|
||||
[2023-11-28T09:01:59Z INFO subsystem_bench::core::display] n_validators = 500, n_cores = 10, pov_size = 5120 - 5120,
|
||||
error = 0, latency = None
|
||||
[2023-11-28T09:01:59Z INFO subsystem-bench::availability] Generating template candidate index=0 pov_size=5242880
|
||||
[2023-11-28T09:01:59Z INFO subsystem-bench::availability] Created test environment.
|
||||
@@ -167,8 +195,8 @@ node validator network.
|
||||
[2023-11-28T09:02:07Z INFO subsystem_bench::availability] All blocks processed in 6001ms
|
||||
[2023-11-28T09:02:07Z INFO subsystem_bench::availability] Throughput: 51200 KiB/block
|
||||
[2023-11-28T09:02:07Z INFO subsystem_bench::availability] Block time: 6001 ms
|
||||
[2023-11-28T09:02:07Z INFO subsystem_bench::availability]
|
||||
|
||||
[2023-11-28T09:02:07Z INFO subsystem_bench::availability]
|
||||
|
||||
Total received from network: 66 MiB
|
||||
Total sent to network: 58 KiB
|
||||
Total subsystem CPU usage 4.16s
|
||||
@@ -192,8 +220,7 @@ view the test progress in real time by accessing [this link](http://localhost:30
|
||||
|
||||
Now run
|
||||
`target/testnet/subsystem-bench test-sequence --path polkadot/node/subsystem-bench/examples/availability_read.yaml`
|
||||
and view the metrics in real time and spot differences between different `n_valiator` values.
|
||||
|
||||
and view the metrics in real time and spot differences between different `n_validators` values.
|
||||
## Create new test objectives
|
||||
|
||||
This tool is intended to make it easy to write new test objectives that focus individual subsystems,
|
||||
|
||||
@@ -0,0 +1,35 @@
|
||||
services:
|
||||
grafana:
|
||||
image: grafana/grafana-enterprise:latest
|
||||
container_name: grafana
|
||||
restart: always
|
||||
networks:
|
||||
- subsystem-bench
|
||||
ports:
|
||||
- "3000:3000"
|
||||
|
||||
prometheus:
|
||||
image: prom/prometheus:latest
|
||||
container_name: prometheus
|
||||
restart: always
|
||||
networks:
|
||||
- subsystem-bench
|
||||
volumes:
|
||||
- ./prometheus:/etc/prometheus
|
||||
extra_hosts:
|
||||
- "host.docker.internal:host-gateway"
|
||||
ports:
|
||||
- "9090:9090"
|
||||
- "9999:9999"
|
||||
|
||||
pyroscope:
|
||||
container_name: pyroscope
|
||||
image: grafana/pyroscope:latest
|
||||
restart: always
|
||||
networks:
|
||||
- subsystem-bench
|
||||
ports:
|
||||
- "4040:4040"
|
||||
|
||||
networks:
|
||||
subsystem-bench:
|
||||
@@ -0,0 +1,11 @@
|
||||
global:
|
||||
scrape_interval: 5s
|
||||
|
||||
scrape_configs:
|
||||
- job_name: "prometheus"
|
||||
static_configs:
|
||||
- targets: ["localhost:9090"]
|
||||
- job_name: "subsystem-bench"
|
||||
    scrape_interval: 500ms
|
||||
static_configs:
|
||||
- targets: ['host.docker.internal:9999']
|
||||
@@ -0,0 +1,70 @@
|
||||
{
|
||||
"annotations": {
|
||||
"list": [
|
||||
{
|
||||
"builtIn": 1,
|
||||
"datasource": {
|
||||
"type": "grafana",
|
||||
"uid": "-- Grafana --"
|
||||
},
|
||||
"enable": true,
|
||||
"hide": true,
|
||||
"iconColor": "rgba(0, 211, 255, 1)",
|
||||
"name": "Annotations & Alerts",
|
||||
"type": "dashboard"
|
||||
}
|
||||
]
|
||||
},
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 0,
|
||||
"id": 1,
|
||||
"links": [],
|
||||
"liveNow": false,
|
||||
"panels": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "grafana-pyroscope-datasource",
|
||||
"uid": "bc3bc04f-85f9-464b-8ae3-fbe0949063f6"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 18,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"id": 1,
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "grafana-pyroscope-datasource",
|
||||
"uid": "bc3bc04f-85f9-464b-8ae3-fbe0949063f6"
|
||||
},
|
||||
"groupBy": [],
|
||||
"labelSelector": "{service_name=\"subsystem-bench\"}",
|
||||
"profileTypeId": "process_cpu:cpu:nanoseconds:cpu:nanoseconds",
|
||||
"queryType": "profile",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "CPU Profiling",
|
||||
"type": "flamegraph"
|
||||
}
|
||||
],
|
||||
"refresh": "",
|
||||
"schemaVersion": 38,
|
||||
"tags": [],
|
||||
"templating": {
|
||||
"list": []
|
||||
},
|
||||
"time": {
|
||||
"from": "now-6h",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {},
|
||||
"timezone": "",
|
||||
"title": "CPU Profiling",
|
||||
"uid": "c31191d5-fe2b-49e2-8b1c-1451f31d1628",
|
||||
"version": 1,
|
||||
"weekStart": ""
|
||||
}
|
||||
@@ -18,6 +18,8 @@
|
||||
//! CI regression testing.
|
||||
use clap::Parser;
|
||||
use color_eyre::eyre;
|
||||
use pyroscope::PyroscopeAgent;
|
||||
use pyroscope_pprofrs::{pprof_backend, PprofConfig};
|
||||
|
||||
use colored::Colorize;
|
||||
use std::{path::Path, time::Duration};
|
||||
@@ -76,12 +78,34 @@ struct BenchCli {
|
||||
/// Maximum remote peer latency in milliseconds [0-5000].
|
||||
pub peer_max_latency: Option<u64>,
|
||||
|
||||
#[clap(long, default_value_t = false)]
|
||||
/// Enable CPU Profiling with Pyroscope
|
||||
pub profile: bool,
|
||||
|
||||
#[clap(long, requires = "profile", default_value_t = String::from("http://localhost:4040"))]
|
||||
/// Pyroscope Server URL
|
||||
pub pyroscope_url: String,
|
||||
|
||||
#[clap(long, requires = "profile", default_value_t = 113)]
|
||||
/// Pyroscope Sample Rate
|
||||
pub pyroscope_sample_rate: u32,
|
||||
|
||||
#[command(subcommand)]
|
||||
pub objective: cli::TestObjective,
|
||||
}
|
||||
|
||||
impl BenchCli {
|
||||
fn launch(self) -> eyre::Result<()> {
|
||||
let agent_running = if self.profile {
|
||||
let agent = PyroscopeAgent::builder(self.pyroscope_url.as_str(), "subsystem-bench")
|
||||
.backend(pprof_backend(PprofConfig::new().sample_rate(self.pyroscope_sample_rate)))
|
||||
.build()?;
|
||||
|
||||
Some(agent.start()?)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let configuration = self.standard_configuration;
|
||||
let mut test_config = match self.objective {
|
||||
TestObjective::TestSequence(options) => {
|
||||
@@ -165,6 +189,11 @@ impl BenchCli {
|
||||
env.runtime()
|
||||
.block_on(availability::benchmark_availability_read(&mut env, state));
|
||||
|
||||
if let Some(agent_running) = agent_running {
|
||||
let agent_ready = agent_running.stop()?;
|
||||
agent_ready.shutdown();
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user