From ec7bfae00a0ec0ce0841274acc707ad9b8d5b1c0 Mon Sep 17 00:00:00 2001
From: Andrei Eres <eresav@me.com>
Date: Tue, 16 Jan 2024 18:14:29 +0100
Subject: [PATCH] subsystem-bench: cache misses profiling (#2893)

## Why we need it
To provide another level of understanding to why polkadot's subsystems
may perform slower than expected. Cache misses occur when processing
large amounts of data, such as during availability recovery.

## Why Cachegrind
Cachegrind has many drawbacks: it is slow, it uses its own cache
simulation, which is very basic. But unlike `perf`, which is a great
tool, Cachegrind can run in a virtual machine. This means we can easily
run it in remote installations and even use it in CI/CD to catch
possible regressions.

Why Cachegrind and not Callgrind, another part of Valgrind? It is simply
empirically proven that profiling runs faster with Cachegrind.

## First results
First results were obtained while testing the approach. Here is
an example.

```
$ target/testnet/subsystem-bench --n-cores 10 --cache-misses data-availability-read
$ cat cachegrind_report.txt
I refs:        64,622,081,485
I1  misses:         3,018,168
LLi misses:           437,654
I1  miss rate:           0.00%
LLi miss rate:           0.00%

D refs:        12,161,833,115  (9,868,356,364 rd   + 2,293,476,751 wr)
D1  misses:       167,940,701  (   71,060,073 rd   +    96,880,628 wr)
LLd misses:        33,550,018  (   16,685,853 rd   +    16,864,165 wr)
D1  miss rate:            1.4% (          0.7%     +           4.2%  )
LLd miss rate:            0.3% (          0.2%     +           0.7%  )

LL refs:          170,958,869  (   74,078,241 rd   +    96,880,628 wr)
LL misses:         33,987,672  (   17,123,507 rd   +    16,864,165 wr)
LL miss rate:             0.0% (          0.0%     +           0.7%  )
```

The CLI output shows that 1.4% of L1 data cache accesses missed, which is
not so bad given that the last-level cache held the data most of the
time, missing only 0.3%. The L1 instruction cache miss rate is 0.00%.
Looking at an output file with `cg_annotate` shows that most of
the misses occur during reed-solomon, which is expected.
---
 polkadot/node/subsystem-bench/README.md       | 77 +++++++++++++++----
 .../subsystem-bench/src/subsystem-bench.rs    | 13 +++-
 polkadot/node/subsystem-bench/src/valgrind.rs | 49 ++++++++++++
 3 files changed, 121 insertions(+), 18 deletions(-)
 create mode 100644 polkadot/node/subsystem-bench/src/valgrind.rs
diff --git a/polkadot/node/subsystem-bench/README.md b/polkadot/node/subsystem-bench/README.md
index b1476db275..1ff5b129e1 100644
--- a/polkadot/node/subsystem-bench/README.md
+++ b/polkadot/node/subsystem-bench/README.md
@@ -117,23 +117,24 @@ used to run a suite of tests defined in a `yaml` file like in this [example](exa
 
 ```
 Options:
-      --network <NETWORK>                              The type of network to be emulated [default: ideal] [possible values:
-                                                       ideal, healthy, degraded]
-      --n-cores <N_CORES>                              Number of cores to fetch availability for [default: 100]
-      --n-validators <N_VALIDATORS>                    Number of validators to fetch chunks from [default: 500]
-      --min-pov-size <MIN_POV_SIZE>                    The minimum pov size in KiB [default: 5120]
-      --max-pov-size <MAX_POV_SIZE>                    The maximum pov size bytes [default: 5120]
-  -n, --num-blocks <NUM_BLOCKS>                        The number of blocks the test is going to run [default: 1]
-  -p, --peer-bandwidth <PEER_BANDWIDTH>                The bandwidth of simulated remote peers in KiB
-  -b, --bandwidth <BANDWIDTH>                          The bandwidth of our simulated node in KiB
-      --peer-error <PEER_ERROR>                        Simulated conection error ratio [0-100]
-      --peer-min-latency <PEER_MIN_LATENCY>            Minimum remote peer latency in milliseconds [0-5000]
-      --peer-max-latency <PEER_MAX_LATENCY>            Maximum remote peer latency in milliseconds [0-5000]
-      --profile                                        Enable CPU Profiling with Pyroscope
-      --pyroscope-url <PYROSCOPE_URL>                  Pyroscope Server URL [default: http://localhost:4040]
-      --pyroscope-sample-rate <PYROSCOPE_SAMPLE_RATE>  Pyroscope Sample Rate [default: 113]
-  -h, --help                                           Print help
-  -V, --version                                        Print version
+    --network <NETWORK>                              The type of network to be emulated [default: ideal] [possible
+                                                     values: ideal, healthy, degraded]
+    --n-cores <N_CORES>                              Number of cores to fetch availability for [default: 100]
+    --n-validators <N_VALIDATORS>                    Number of validators to fetch chunks from [default: 500]
+    --min-pov-size <MIN_POV_SIZE>                    The minimum pov size in KiB [default: 5120]
+    --max-pov-size <MAX_POV_SIZE>                    The maximum pov size bytes [default: 5120]
+-n, --num-blocks <NUM_BLOCKS>                        The number of blocks the test is going to run [default: 1]
+-p, --peer-bandwidth <PEER_BANDWIDTH>                The bandwidth of simulated remote peers in KiB
+-b, --bandwidth <BANDWIDTH>                          The bandwidth of our simulated node in KiB
+    --peer-error <PEER_ERROR>                        Simulated conection error ratio [0-100]
+    --peer-min-latency <PEER_MIN_LATENCY>            Minimum remote peer latency in milliseconds [0-5000]
+    --peer-max-latency <PEER_MAX_LATENCY>            Maximum remote peer latency in milliseconds [0-5000]
+    --profile                                        Enable CPU Profiling with Pyroscope
+    --pyroscope-url <PYROSCOPE_URL>                  Pyroscope Server URL [default: http://localhost:4040]
+    --pyroscope-sample-rate <PYROSCOPE_SAMPLE_RATE>  Pyroscope Sample Rate [default: 113]
+    --cache-misses                                   Enable Cache Misses Profiling with Valgrind. Linux only, Valgrind
+                                                     must be in the PATH
+-h, --help                                           Print help
 ```
 
 These apply to all test objectives, except `test-sequence` which relies on the values being specified in a file.
@@ -221,6 +222,48 @@ view the test progress in real time by accessing [this link](http://localhost:30
 Now run
 `target/testnet/subsystem-bench test-sequence --path polkadot/node/subsystem-bench/examples/availability_read.yaml`
 and view the metrics in real time and spot differences between different `n_validators` values.
+
+### Profiling cache misses
+
+Cache misses are profiled using Cachegrind, part of Valgrind. Cachegrind runs slowly, and its cache simulation is basic
+and unlikely to reflect the behavior of a modern machine. However, it still represents the general situation with cache
+usage, and more importantly it doesn't require a bare-metal machine to run on, which means it could be run in CI or in
+a remote virtual installation.
+
+To profile cache misses use the `--cache-misses` flag. Cache simulation is currently tuned for an Intel Ice Lake CPU.
+Since the execution will be very slow, it's recommended not to run it together with other profiling and not to take
+benchmark results into account. A report is saved in a file `cachegrind_report.txt`.
+
+Example run results:
+```
+$ target/testnet/subsystem-bench --n-cores 10 --cache-misses data-availability-read
+$ cat cachegrind_report.txt
+I refs:        64,622,081,485
+I1  misses:         3,018,168
+LLi misses:           437,654
+I1  miss rate:           0.00%
+LLi miss rate:           0.00%
+
+D refs:        12,161,833,115  (9,868,356,364 rd   + 2,293,476,751 wr)
+D1  misses:       167,940,701  (   71,060,073 rd   +    96,880,628 wr)
+LLd misses:        33,550,018  (   16,685,853 rd   +    16,864,165 wr)
+D1  miss rate:            1.4% (          0.7%     +           4.2%  )
+LLd miss rate:            0.3% (          0.2%     +           0.7%  )
+
+LL refs:          170,958,869  (   74,078,241 rd   +    96,880,628 wr)
+LL misses:         33,987,672  (   17,123,507 rd   +    16,864,165 wr)
+LL miss rate:             0.0% (          0.0%     +           0.7%  )
+```
+
+The results show that 1.4% of L1 data cache accesses missed, but the last-level cache only missed 0.3% of the time.
+The L1 instruction cache miss rate is 0.00%.
+
+Cachegrind writes line-by-line cache profiling information to a file named `cachegrind.out.<pid>`.
+This file is best interpreted with `cg_annotate --auto=yes cachegrind.out.<pid>`. For more information see the
+[cachegrind manual](https://www.cs.cmu.edu/afs/cs.cmu.edu/project/cmt-40/Nice/RuleRefinement/bin/valgrind-3.2.0/docs/html/cg-manual.html).
+
+For finer profiling of cache misses, it is better to use `perf` on a bare-metal machine.
+
 ## Create new test objectives
 
 This tool is intended to make it easy to write new test objectives that focus individual subsystems,
diff --git a/polkadot/node/subsystem-bench/src/subsystem-bench.rs b/polkadot/node/subsystem-bench/src/subsystem-bench.rs
index 29b62b2785..8669ee4e8b 100644
--- a/polkadot/node/subsystem-bench/src/subsystem-bench.rs
+++ b/polkadot/node/subsystem-bench/src/subsystem-bench.rs
@@ -16,6 +16,7 @@
 
 //! A tool for running subsystem benchmark tests designed for development and
 //! CI regression testing.
+
 use clap::Parser;
 use color_eyre::eyre;
 use pyroscope::PyroscopeAgent;
@@ -27,6 +28,7 @@ use std::{path::Path, time::Duration};
 pub(crate) mod availability;
 pub(crate) mod cli;
 pub(crate) mod core;
+mod valgrind;
 
 use availability::{prepare_test, NetworkEmulation, TestState};
 use cli::TestObjective;
@@ -90,12 +92,21 @@ struct BenchCli {
 	/// Pyroscope Sample Rate
 	pub pyroscope_sample_rate: u32,
 
+	#[clap(long, default_value_t = false)]
+	/// Enable Cache Misses Profiling with Valgrind. Linux only, Valgrind must be in the PATH
+	pub cache_misses: bool,
+
 	#[command(subcommand)]
 	pub objective: cli::TestObjective,
 }
 
 impl BenchCli {
 	fn launch(self) -> eyre::Result<()> {
+		let is_valgrind_running = valgrind::is_valgrind_running();
+		if !is_valgrind_running && self.cache_misses {
+			return valgrind::relaunch_in_valgrind_mode()
+		}
+
 		let agent_running = if self.profile {
 			let agent = PyroscopeAgent::builder(self.pyroscope_url.as_str(), "subsystem-bench")
 				.backend(pprof_backend(PprofConfig::new().sample_rate(self.pyroscope_sample_rate)))
@@ -185,7 +196,7 @@ impl BenchCli {
 
 		let mut state = TestState::new(&test_config);
 		let (mut env, _protocol_config) = prepare_test(test_config, &mut state);
-		// test_config.write_to_disk();
+
 		env.runtime()
 			.block_on(availability::benchmark_availability_read(&mut env, state));
 
diff --git a/polkadot/node/subsystem-bench/src/valgrind.rs b/polkadot/node/subsystem-bench/src/valgrind.rs
new file mode 100644
index 0000000000..3d0c488355
--- /dev/null
+++ b/polkadot/node/subsystem-bench/src/valgrind.rs
@@ -0,0 +1,49 @@
+// Copyright (C) Parity Technologies (UK) Ltd.
+// This file is part of Polkadot.
+
+// Polkadot is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// Polkadot is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with Polkadot.  If not, see <http://www.gnu.org/licenses/>.
+
+use color_eyre::eyre;
+
+/// Report whether the app appears to be running under Valgrind, judged by `LD_PRELOAD` mentioning it
+pub(crate) fn is_valgrind_running() -> bool {
+	match std::env::var("LD_PRELOAD") {
+		Ok(v) => v.contains("valgrind"),
+		Err(_) => false, // `LD_PRELOAD` is unset or is not valid Unicode
+	}
+}
+
+/// Replace the current process with the same command line executed under Valgrind's Cachegrind.
+/// Returns only if `exec` fails. Cache setup emulates Intel Ice Lake (size, associativity, line):
+///     L1 instruction: 32,768 B, 8-way, 64 B lines
+///     L1 data: 49,152 B, 12-way, 64 B lines
+///     Last-level: 2,097,152 B, 16-way, 64 B lines
+pub(crate) fn relaunch_in_valgrind_mode() -> eyre::Result<()> {
+	use std::os::unix::process::CommandExt;
+	let err = std::process::Command::new("valgrind")
+		.arg("--tool=cachegrind")
+		.arg("--cache-sim=yes")
+		.arg("--log-file=cachegrind_report.txt")
+		.arg("--I1=32768,8,64")
+		.arg("--D1=49152,12,64")
+		.arg("--LL=2097152,16,64")
+		.arg("--verbose")
+		.args(std::env::args_os()) // forwarded verbatim; `args()` would panic on non-Unicode arguments
+		.exec(); // on success the process image is replaced, so a return value always means failure
+
+	Err(eyre::eyre!(
+		"Cannot run Valgrind, check that it is installed and available in the PATH\n{}",
+		err
+	))
+}