pezkuwi-subxt/polkadot/node/subsystem-bench/src/lib/environment.rs

// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Polkadot.

// Polkadot is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.

// Polkadot is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.

// You should have received a copy of the GNU General Public License
// along with Polkadot.  If not, see <http://www.gnu.org/licenses/>.

//! Test environment implementation

use crate::{
	configuration::{TestAuthorities, TestConfiguration},
	mock::AlwaysSupportsParachains,
	network::NetworkEmulatorHandle,
	usage::{BenchmarkUsage, ResourceUsage},
};
use core::time::Duration;
use futures::{Future, FutureExt};
use polkadot_node_subsystem::{messages::AllMessages, Overseer, SpawnGlue, TimeoutExt};
use polkadot_node_subsystem_types::Hash;
use polkadot_node_subsystem_util::metrics::prometheus::{
	self, Gauge, Histogram, PrometheusError, Registry, U64,
};
use polkadot_overseer::{BlockInfo, Handle as OverseerHandle};
use sc_service::{SpawnTaskHandle, TaskManager};
use std::net::{Ipv4Addr, SocketAddr};
use tokio::runtime::Handle;

/// Target used for log output from this module; it is also what `send_message` passes as
/// the second argument of `send_msg`.
const LOG_TARGET: &str = "subsystem-bench::environment";

/// Test environment/configuration metrics
///
/// All metrics are created and registered with a Prometheus registry in
/// `TestEnvironmentMetrics::new`.
#[derive(Clone)]
pub struct TestEnvironmentMetrics {
	/// Total number of validators in the test.
	n_validators: Gauge<U64>,
	/// Number of cores we fetch availability for each block.
	n_cores: Gauge<U64>,
	/// Compressed size of the proof of validity (PoV) of a candidate, as a histogram.
	pov_size: Histogram,
	/// The current test block.
	current_block: Gauge<U64>,
	/// Time it takes for the target subsystem(s) to complete all the requests in a block.
	/// NOTE(review): the setter's parameter is named `block_time_ms`, so presumably
	/// milliseconds; confirm.
	block_time: Gauge<U64>,
}

impl TestEnvironmentMetrics {
	pub fn new(registry: &Registry) -> Result<Self, PrometheusError> {
		let buckets = prometheus::exponential_buckets(16384.0, 2.0, 9)
			.expect("arguments are always valid; qed");

		Ok(Self {
			n_validators: prometheus::register(
				Gauge::new(
					"subsystem_benchmark_n_validators",
					"Total number of validators in the test",
				)?,
				registry,
			)?,
			n_cores: prometheus::register(
				Gauge::new(
					"subsystem_benchmark_n_cores",
					"Number of cores we fetch availability for each block",
				)?,
				registry,
			)?,
			current_block: prometheus::register(
				Gauge::new("subsystem_benchmark_current_block", "The current test block")?,
				registry,
			)?,
			block_time: prometheus::register(
				Gauge::new("subsystem_benchmark_block_time", "The time it takes for the target subsystems(s) to complete all the requests in a block")?,
				registry,
			)?,
			pov_size: prometheus::register(
				Histogram::with_opts(
					prometheus::HistogramOpts::new(
						"subsystem_benchmark_pov_size",
						"The compressed size of the proof of validity of a candidate",
					)
					.buckets(buckets),
				)?,
				registry,
			)?,
		})
	}

	pub fn set_n_validators(&self, n_validators: usize) {
		self.n_validators.set(n_validators as u64);
	}

	pub fn set_n_cores(&self, n_cores: usize) {
		self.n_cores.set(n_cores as u64);
	}

	pub fn set_current_block(&self, current_block: usize) {
		self.current_block.set(current_block as u64);
	}

	pub fn set_block_time(&self, block_time_ms: u64) {
		self.block_time.set(block_time_ms);
	}

	pub fn on_pov_size(&self, pov_size: usize) {
		self.pov_size.observe(pov_size as f64);
	}
}

/// Builds the multi-threaded tokio runtime used by the benchmark, with all tokio drivers
/// enabled and threads named `subsystem-bench`.
///
/// # Panics
/// Panics if the runtime cannot be built.
fn new_runtime() -> tokio::runtime::Runtime {
	// 3 MiB of stack per runtime thread.
	const THREAD_STACK_SIZE: usize = 3 * 1024 * 1024;
	const WORKER_THREADS: usize = 4;

	let mut builder = tokio::runtime::Builder::new_multi_thread();
	builder
		.thread_name("subsystem-bench")
		.enable_all()
		.thread_stack_size(THREAD_STACK_SIZE)
		.worker_threads(WORKER_THREADS);
	builder.build().unwrap()
}

/// Wrapper for dependencies
///
/// Bundles the Prometheus registry, the task manager and the tokio runtime that a
/// `TestEnvironment` is constructed from.
pub struct TestEnvironmentDependencies {
	/// Prometheus registry; the test environment metrics are registered here.
	pub registry: Registry,
	/// Task manager used to spawn the overseer and the other test tasks.
	pub task_manager: TaskManager,
	/// The tokio runtime; `TestEnvironment::new` keeps a clone of its handle.
	pub runtime: tokio::runtime::Runtime,
}

impl Default for TestEnvironmentDependencies {
	/// Creates a fresh runtime, an empty registry and a task manager that runs on that
	/// runtime and reports into that registry.
	///
	/// # Panics
	/// Panics if the runtime or the task manager cannot be created.
	fn default() -> Self {
		let runtime = new_runtime();
		let registry = Registry::new();

		// The task manager spawns onto the runtime created above.
		let tokio_handle = runtime.handle().clone();
		let task_manager = TaskManager::new(tokio_handle, Some(&registry)).unwrap();

		Self { registry, task_manager, runtime }
	}
}

/// A dummy genesis hash: every byte is set to `0xff`.
pub const GENESIS_HASH: Hash = Hash::repeat_byte(0xff);

/// Maximum time a message or signal may take to be handed over to the overseer.
///
/// We use this to bail out of sending messages to the subsystem if it is overloaded such
/// that the time of flight breaches 5s: `send_message` and `import_block` panic when this
/// timeout elapses.
/// This should eventually be a test parameter.
pub const MAX_TIME_OF_FLIGHT: Duration = Duration::from_millis(5000);

/// The test environment is the high level wrapper of all things required to test
/// a certain subsystem.
///
/// ## Mockups
/// The overseer is passed in during construction and it can host an arbitrary number of
/// real subsystem instances and the corresponding mocked instances such that the real
/// subsystems can get their messages answered.
///
/// As the subsystem's performance depends on network connectivity, the test environment
/// emulates validator nodes on the network, see `NetworkEmulator`. The network emulation
/// is configurable in terms of peer bandwidth, latency and connection error rate using
/// uniform distribution sampling.
///
///
/// ## Usage
/// `TestEnvironment` is used in tests to send `Overseer` messages or signals to the subsystem
/// under test.
///
/// ## Collecting test metrics
///
/// ### Prometheus
/// A Prometheus endpoint can be exposed while the test is running (see the
/// `with_prometheus_endpoint` argument of `TestEnvironment::new`). A local Prometheus
/// instance can scrape it every 1s and a Grafana dashboard is the preferred way of
/// visualizing the performance characteristics of the subsystem.
///
/// ### CLI
/// A subset of the Prometheus metrics is printed at the end of the test.
pub struct TestEnvironment {
	/// Test dependencies (registry, task manager and tokio runtime).
	dependencies: TestEnvironmentDependencies,
	/// A handle to the tokio runtime owned by `dependencies`.
	runtime_handle: tokio::runtime::Handle,
	/// A handle to the overseer, used to send messages and signals to it.
	overseer_handle: OverseerHandle,
	/// The test configuration.
	config: TestConfiguration,
	/// A handle to the network emulator.
	network: NetworkEmulatorHandle,
	/// Configuration/env metrics
	metrics: TestEnvironmentMetrics,
	/// Test authorities generated from the configuration.
	authorities: TestAuthorities,
}

impl TestEnvironment {
	/// Create a new test environment.
	///
	/// Registers the environment metrics with `dependencies.registry`, spawns `overseer`
	/// as a blocking task named `overseer` and, when `with_prometheus_endpoint` is `true`,
	/// spawns a blocking task that serves the registry on `127.0.0.1:9999`.
	///
	/// # Panics
	/// Panics if the metrics cannot be registered. The Prometheus task panics if the
	/// endpoint cannot be initialised.
	pub fn new(
		dependencies: TestEnvironmentDependencies,
		config: TestConfiguration,
		network: NetworkEmulatorHandle,
		overseer: Overseer<SpawnGlue<SpawnTaskHandle>, AlwaysSupportsParachains>,
		overseer_handle: OverseerHandle,
		authorities: TestAuthorities,
		with_prometheus_endpoint: bool,
	) -> Self {
		let metrics = TestEnvironmentMetrics::new(&dependencies.registry)
			.expect("Metrics need to be registered");

		let spawn_handle = dependencies.task_manager.spawn_handle();
		spawn_handle.spawn_blocking("overseer", "overseer", overseer.run().boxed());

		if with_prometheus_endpoint {
			let endpoint = SocketAddr::new(std::net::IpAddr::V4(Ipv4Addr::LOCALHOST), 9999);
			// The task needs its own registry handle because it outlives this call.
			let registry = dependencies.registry.clone();
			let serve_metrics = async move {
				prometheus_endpoint::init_prometheus(endpoint, registry).await.unwrap();
			};
			spawn_handle.spawn_blocking("prometheus", "test-environment", serve_metrics);
		}

		// Grab the runtime handle before `dependencies` is moved into the struct.
		let runtime_handle = dependencies.runtime.handle().clone();

		TestEnvironment {
			dependencies,
			runtime_handle,
			overseer_handle,
			config,
			network,
			metrics,
			authorities,
		}
	}

	/// Returns a reference to the test configuration this environment was created with.
	pub fn config(&self) -> &TestConfiguration {
		&self.config
	}

	/// Returns a reference to the network emulator handle held by this environment.
	pub fn network(&self) -> &NetworkEmulatorHandle {
		&self.network
	}

	/// Returns a shared reference to the overseer handle.
	///
	/// Sending through the handle goes via `send_message` / `import_block`, which take
	/// `&mut self`.
	pub fn overseer_handle(&self) -> &OverseerHandle {
		&self.overseer_handle
	}

	/// Returns a reference to the Prometheus registry owned by the test dependencies.
	pub fn registry(&self) -> &Registry {
		&self.dependencies.registry
	}

	/// Spawns `task` under the given `name` in the `test-environment` task group.
	// NOTE(review): `allow(unused)` presumably silences a dead-code warning when no
	// benchmark calls this method. Confirm it is still needed.
	#[allow(unused)]
	pub fn spawn(&self, name: &'static str, task: impl Future<Output = ()> + Send + 'static) {
		let spawner = self.dependencies.task_manager.spawn_handle();
		spawner.spawn(name, "test-environment", task);
	}

	/// Spawns `task` under the given `name` in the `test-environment` task group, using
	/// the task manager's `spawn_blocking`.
	pub fn spawn_blocking(
		&self,
		name: &'static str,
		task: impl Future<Output = ()> + Send + 'static,
	) {
		let spawner = self.dependencies.task_manager.spawn_handle();
		spawner.spawn_blocking(name, "test-environment", task);
	}
	/// Returns a reference to the test environment metrics instance, which is created
	/// and registered in `new`.
	pub fn metrics(&self) -> &TestEnvironmentMetrics {
		&self.metrics
	}

	/// Returns a handle to the tokio runtime.
	///
	/// The returned value is a clone of the handle stored in this environment.
	pub fn runtime(&self) -> Handle {
		self.runtime_handle.clone()
	}

	/// Returns a reference to the test authorities (authority keys) used in the test.
	pub fn authorities(&self) -> &TestAuthorities {
		&self.authorities
	}

	/// Sends `msg` to the subsystem under test through the overseer handle.
	///
	/// # Panics
	/// Panics if the send does not complete within `MAX_TIME_OF_FLIGHT`.
	pub async fn send_message(&mut self, msg: AllMessages) {
		let delivery = self.overseer_handle.send_msg(msg, LOG_TARGET).timeout(MAX_TIME_OF_FLIGHT);

		// `None` means the timeout elapsed before `send_msg` completed.
		if delivery.await.is_none() {
			panic!("{}ms maximum time of flight breached", MAX_TIME_OF_FLIGHT.as_millis())
		}
	}

	/// Reports `block` as imported to the overseer, which sends an `ActiveLeavesUpdate`
	/// signal to all subsystems under test.
	///
	/// # Panics
	/// Panics if the call does not complete within `MAX_TIME_OF_FLIGHT`.
	pub async fn import_block(&mut self, block: BlockInfo) {
		let import = self.overseer_handle.block_imported(block).timeout(MAX_TIME_OF_FLIGHT);

		// `None` means the timeout elapsed before `block_imported` completed.
		if import.await.is_none() {
			panic!("{}ms maximum time of flight breached", MAX_TIME_OF_FLIGHT.as_millis())
		}
	}

	/// Stop overseer and subsystems.
	///
	/// Awaits `stop()` on the overseer handle. Unlike `send_message` and `import_block`,
	/// no timeout is applied here.
	pub async fn stop(&mut self) {
		self.overseer_handle.stop().await;
	}

	/// Tells whether the metric `metric_name` is lower than `value`.
	///
	/// The metrics currently held by `registry` are parsed and the check is delegated to
	/// `metric_lower_than` on the parsed metrics.
	pub fn metric_lower_than(registry: &Registry, metric_name: &str, value: f64) -> bool {
		super::display::parse_metrics(registry).metric_lower_than(metric_name, value)
	}

	/// Waits until `condition` holds for the metric `metric_name`.
	///
	/// On every iteration the metrics are parsed from the registry, optionally narrowed
	/// down to the entries whose label `label.0` has the value `label.1`, and summed by
	/// `metric_name`. The future resolves as soon as `condition` returns `true` for that
	/// sum; otherwise it sleeps for 50ms and tries again.
	///
	/// There is no timeout: if `condition` never holds, this never completes.
	pub async fn wait_until_metric(
		&self,
		metric_name: &str,
		label: Option<(&str, &str)>,
		condition: impl Fn(f64) -> bool,
	) {
		loop {
			let parsed = super::display::parse_metrics(self.registry());
			let test_metrics = match label {
				Some((label_name, label_value)) =>
					parsed.subset_with_label_value(label_name, label_value),
				None => parsed,
			};
			let current_value = test_metrics.sum_by(metric_name);

			gum::debug!(target: LOG_TARGET, metric_name, current_value, "Waiting for metric");
			if condition(current_value) {
				return
			}
			// Check value every 50ms.
			tokio::time::sleep(std::time::Duration::from_millis(50)).await;
		}
	}

	/// Collects the network and CPU usage of the benchmark `benchmark_name`.
	///
	/// CPU usage is reported for every task group listed in `subsystems_under_test` and
	/// for the `test-environment` task group.
	pub fn collect_resource_usage(
		&self,
		benchmark_name: &str,
		subsystems_under_test: &[&str],
	) -> BenchmarkUsage {
		let network_usage = self.network_usage();
		let cpu_usage = self.cpu_usage(subsystems_under_test);

		BenchmarkUsage { benchmark_name: benchmark_name.to_owned(), network_usage, cpu_usage }
	}

	/// Returns the network usage of peer `0`, in total and per block.
	///
	/// Two entries are produced, "Received from peers" and "Sent to peers". The raw
	/// counters are divided by 1024 (KiB, assuming the counters are in bytes; confirm
	/// against the network emulator).
	// NOTE(review): peer index 0 is presumably the node under test; confirm.
	fn network_usage(&self) -> Vec<ResourceUsage> {
		let stats = self.network().peer_stats(0);
		let num_blocks = self.config().num_blocks as f64;

		// Convert to floating point *before* scaling so that the fractional part is kept
		// instead of being truncated by integer division.
		let total_node_received = stats.received() as f64 / 1024.0;
		let total_node_sent = stats.sent() as f64 / 1024.0;

		vec![
			ResourceUsage {
				resource_name: "Received from peers".to_string(),
				total: total_node_received,
				per_block: total_node_received / num_blocks,
			},
			ResourceUsage {
				resource_name: "Sent to peers".to_string(),
				total: total_node_sent,
				per_block: total_node_sent / num_blocks,
			},
		]
	}

	/// Returns the CPU usage of each task group in `subsystems_under_test`, followed by
	/// that of the `test-environment` task group, in total and per block.
	///
	/// The usage of a task group is the sum of `substrate_tasks_polling_duration_sum`
	/// over the metrics whose `task_group` label matches the group name.
	fn cpu_usage(&self, subsystems_under_test: &[&str]) -> Vec<ResourceUsage> {
		let all_metrics = super::display::parse_metrics(self.registry());
		let num_blocks = self.config().num_blocks as f64;

		subsystems_under_test
			.iter()
			.copied()
			// The test environment's own tasks are always reported last.
			.chain(std::iter::once("test-environment"))
			.map(|task_group| {
				let total_cpu = all_metrics
					.subset_with_label_value("task_group", task_group)
					.sum_by("substrate_tasks_polling_duration_sum");

				ResourceUsage {
					resource_name: task_group.to_string(),
					total: total_cpu,
					per_block: total_cpu / num_blocks,
				}
			})
			.collect()
	}
}