Add Prometheus timers to the subsystems (#1923)

* reexport prometheus-super for ease of use of other subsystems

* add some prometheus timers for collation generation subsystem

* add timing metrics to av-store

* add metrics to candidate backing

* add timing metric to bitfield signing

* add timing metrics to candidate selection

* add timing metrics to candidate-validation

* add timing metrics to chain-api

* add timing metrics to provisioner

* add timing metrics to runtime-api

* add timing metrics to availability-distribution

* add timing metrics to bitfield-distribution

* add timing metrics to collator protocol: collator side

* add timing metrics to collator protocol: validator side

* fix candidate validation test failures

* add timing metrics to pov distribution

* add timing metrics to statement-distribution

* use substrate_prometheus_endpoint prometheus reexport instead of prometheus_super

* don't include JOB_DELAY in bitfield-signing metrics

* give adder-collator ability to easily export its genesis-state and validation code

* wip: adder-collator pushbutton script

* don't attempt to register the adder-collator automatically

Instead, get these values with

```sh
target/release/adder-collator export-genesis-state
target/release/adder-collator export-genesis-wasm
```

And then register the parachain on https://polkadot.js.org/apps/?rpc=ws%3A%2F%2F127.0.0.1%3A9944#/explorer

To collect prometheus data, after running the script, create `prometheus.yml` per the instructions
at https://www.notion.so/paritytechnologies/Setting-up-Prometheus-locally-835cb3a9df7541a781c381006252b5ff
and then run:

```sh
docker run -v `pwd`/prometheus.yml:/etc/prometheus/prometheus.yml:z --network host prom/prometheus
```

Demonstrates that data makes it across to prometheus, though it is likely to be useful in the future
to tweak the buckets.

* Update parachain/test-parachains/adder/collator/src/cli.rs

Co-authored-by: Andronik Ordian <write@reusable.software>

* use the grandpa-pause parameter

* skip metrics in tracing instrumentation

* remove unnecessary grandpa_pause cli param

Co-authored-by: Andronik Ordian <write@reusable.software>
This commit is contained in:
Peter Goodspeed-Niklaus
2020-11-20 15:04:51 +01:00
committed by GitHub
parent e49989971d
commit 0a5bc82529
23 changed files with 1199 additions and 87 deletions
@@ -385,6 +385,8 @@ async fn handle_our_view_change<Context>(
where
Context: SubsystemContext<Message = AvailabilityDistributionMessage>,
{
let _timer = metrics.time_handle_our_view_change();
let old_view = std::mem::replace(&mut (state.view), view);
// needed due to borrow rules
@@ -651,6 +653,8 @@ async fn process_incoming_peer_message<Context>(
where
Context: SubsystemContext<Message = AvailabilityDistributionMessage>,
{
let _timer = metrics.time_process_incoming_peer_message();
// obtain the set of candidates we are interested in based on our current view
let live_candidates = state.cached_live_candidates_unioned(state.view.0.iter());
@@ -1180,6 +1184,8 @@ where
#[derive(Clone)]
struct MetricsInner {
gossipped_availability_chunks: prometheus::Counter<prometheus::U64>,
handle_our_view_change: prometheus::Histogram,
process_incoming_peer_message: prometheus::Histogram,
}
/// Availability Distribution metrics.
@@ -1192,6 +1198,16 @@ impl Metrics {
metrics.gossipped_availability_chunks.inc();
}
}
/// Provide a timer for `handle_our_view_change` which observes on drop.
fn time_handle_our_view_change(&self) -> Option<metrics::prometheus::prometheus::HistogramTimer> {
self.0.as_ref().map(|metrics| metrics.handle_our_view_change.start_timer())
}
/// Provide a timer for `process_incoming_peer_message` which observes on drop.
fn time_process_incoming_peer_message(&self) -> Option<metrics::prometheus::prometheus::HistogramTimer> {
self.0.as_ref().map(|metrics| metrics.process_incoming_peer_message.start_timer())
}
}
impl metrics::Metrics for Metrics {
@@ -1206,6 +1222,24 @@ impl metrics::Metrics for Metrics {
)?,
registry,
)?,
handle_our_view_change: prometheus::register(
prometheus::Histogram::with_opts(
prometheus::HistogramOpts::new(
"parachain_availability_distribution_handle_our_view_change",
"Time spent within `availability_distribution::handle_our_view_change`",
)
)?,
registry,
)?,
process_incoming_peer_message: prometheus::register(
prometheus::Histogram::with_opts(
prometheus::HistogramOpts::new(
"parachain_availability_distribution_process_incoming_peer_message",
"Time spent within `availability_distribution::process_incoming_peer_message`",
)
)?,
registry,
)?,
};
Ok(Metrics(Some(metrics)))
}
@@ -183,6 +183,8 @@ impl BitfieldDistribution {
}
}
FromOverseer::Signal(OverseerSignal::ActiveLeaves(ActiveLeavesUpdate { activated, deactivated })) => {
let _timer = self.metrics.time_active_leaves_update();
for relay_parent in activated {
tracing::trace!(target: LOG_TARGET, relay_parent = %relay_parent, "activated");
// query basic system parameters once
@@ -257,6 +259,8 @@ async fn handle_bitfield_distribution<Context>(
where
Context: SubsystemContext<Message = BitfieldDistributionMessage>,
{
let _timer = metrics.time_handle_bitfield_distribution();
// Ignore anything the overseer did not tell this subsystem to work on
let mut job_data = state.per_relay_parent.get_mut(&relay_parent);
let job_data: &mut _ = if let Some(ref mut job_data) = job_data {
@@ -461,6 +465,8 @@ async fn handle_network_msg<Context>(
where
Context: SubsystemContext<Message = BitfieldDistributionMessage>,
{
let _timer = metrics.time_handle_network_msg();
match bridge_message {
NetworkBridgeEvent::PeerConnected(peerid, _role) => {
// insert if none already present
@@ -659,6 +665,9 @@ where
struct MetricsInner {
gossipped_own_availability_bitfields: prometheus::Counter<prometheus::U64>,
received_availability_bitfields: prometheus::Counter<prometheus::U64>,
active_leaves_update: prometheus::Histogram,
handle_bitfield_distribution: prometheus::Histogram,
handle_network_msg: prometheus::Histogram,
}
/// Bitfield Distribution metrics.
@@ -677,6 +686,21 @@ impl Metrics {
metrics.received_availability_bitfields.inc();
}
}
/// Provide a timer for `active_leaves_update` which observes on drop.
fn time_active_leaves_update(&self) -> Option<metrics::prometheus::prometheus::HistogramTimer> {
self.0.as_ref().map(|metrics| metrics.active_leaves_update.start_timer())
}
/// Provide a timer for `handle_bitfield_distribution` which observes on drop.
fn time_handle_bitfield_distribution(&self) -> Option<metrics::prometheus::prometheus::HistogramTimer> {
self.0.as_ref().map(|metrics| metrics.handle_bitfield_distribution.start_timer())
}
/// Provide a timer for `handle_network_msg` which observes on drop.
fn time_handle_network_msg(&self) -> Option<metrics::prometheus::prometheus::HistogramTimer> {
self.0.as_ref().map(|metrics| metrics.handle_network_msg.start_timer())
}
}
impl metrics::Metrics for Metrics {
@@ -696,6 +720,33 @@ impl metrics::Metrics for Metrics {
)?,
registry,
)?,
active_leaves_update: prometheus::register(
prometheus::Histogram::with_opts(
prometheus::HistogramOpts::new(
"parachain_bitfield_distribution_active_leaves_update",
"Time spent within `bitfield_distribution::active_leaves_update`",
)
)?,
registry,
)?,
handle_bitfield_distribution: prometheus::register(
prometheus::Histogram::with_opts(
prometheus::HistogramOpts::new(
"parachain_bitfield_distribution_handle_bitfield_distribution",
"Time spent within `bitfield_distribution::handle_bitfield_distribution`",
)
)?,
registry,
)?,
handle_network_msg: prometheus::register(
prometheus::Histogram::with_opts(
prometheus::HistogramOpts::new(
"parachain_bitfield_distribution_handle_network_msg",
"Time spent within `bitfield_distribution::handle_network_msg`",
)
)?,
registry,
)?,
};
Ok(Metrics(Some(metrics)))
}
@@ -57,12 +57,24 @@ impl Metrics {
metrics.collations_sent.inc();
}
}
/// Provide a timer for handling `ConnectionRequest` which observes on drop.
fn time_handle_connection_request(&self) -> Option<metrics::prometheus::prometheus::HistogramTimer> {
self.0.as_ref().map(|metrics| metrics.handle_connection_request.start_timer())
}
/// Provide a timer for `process_msg` which observes on drop.
fn time_process_msg(&self) -> Option<metrics::prometheus::prometheus::HistogramTimer> {
self.0.as_ref().map(|metrics| metrics.process_msg.start_timer())
}
}
#[derive(Clone)]
struct MetricsInner {
advertisements_made: prometheus::Counter<prometheus::U64>,
collations_sent: prometheus::Counter<prometheus::U64>,
handle_connection_request: prometheus::Histogram,
process_msg: prometheus::Histogram,
}
impl metrics::Metrics for Metrics {
@@ -84,6 +96,24 @@ impl metrics::Metrics for Metrics {
)?,
registry,
)?,
handle_connection_request: prometheus::register(
prometheus::Histogram::with_opts(
prometheus::HistogramOpts::new(
"parachain_collator_protocol_collator_handle_connection_request",
"Time spent within `collator_protocol_collator::handle_connection_request`",
)
)?,
registry,
)?,
process_msg: prometheus::register(
prometheus::Histogram::with_opts(
prometheus::HistogramOpts::new(
"parachain_collator_protocol_collator_process_msg",
"Time spent within `collator_protocol_collator::process_msg`",
)
)?,
registry,
)?,
};
Ok(Metrics(Some(metrics)))
@@ -377,6 +407,8 @@ where
{
use CollatorProtocolMessage::*;
let _timer = state.metrics.time_process_msg();
match msg {
CollateOn(id) => {
state.collating_on = Some(id);
@@ -662,6 +694,8 @@ where
loop {
if let Some(mut request) = state.last_connection_request.take() {
let _timer = state.metrics.time_handle_connection_request();
while let Poll::Ready(Some((validator_id, peer_id))) = futures::poll!(request.next()) {
if let Err(err) = handle_validator_connected(&mut ctx, &mut state, peer_id, validator_id).await {
tracing::warn!(
@@ -63,11 +63,23 @@ impl Metrics {
}
}
}
/// Provide a timer for `process_msg` which observes on drop.
fn time_process_msg(&self) -> Option<metrics::prometheus::prometheus::HistogramTimer> {
self.0.as_ref().map(|metrics| metrics.process_msg.start_timer())
}
/// Provide a timer for `handle_collation_request_result` which observes on drop.
fn time_handle_collation_request_result(&self) -> Option<metrics::prometheus::prometheus::HistogramTimer> {
self.0.as_ref().map(|metrics| metrics.handle_collation_request_result.start_timer())
}
}
#[derive(Clone)]
struct MetricsInner {
collation_requests: prometheus::CounterVec<prometheus::U64>,
process_msg: prometheus::Histogram,
handle_collation_request_result: prometheus::Histogram,
}
impl metrics::Metrics for Metrics {
@@ -84,7 +96,25 @@ impl metrics::Metrics for Metrics {
&["success"],
)?,
registry,
)?
)?,
process_msg: prometheus::register(
prometheus::Histogram::with_opts(
prometheus::HistogramOpts::new(
"parachain_collator_protocol_validator_process_msg",
"Time spent within `collator_protocol_validator::process_msg`",
)
)?,
registry,
)?,
handle_collation_request_result: prometheus::register(
prometheus::Histogram::with_opts(
prometheus::HistogramOpts::new(
"parachain_collator_protocol_validator_handle_collation_request_result",
"Time spent within `collator_protocol_validator::handle_collation_request_result`",
)
)?,
registry,
)?,
};
Ok(Metrics(Some(metrics)))
@@ -628,6 +658,8 @@ where
{
use CollatorProtocolMessage::*;
let _timer = state.metrics.time_process_msg();
match msg {
CollateOn(id) => {
tracing::warn!(
@@ -703,6 +735,8 @@ where
}
while let Poll::Ready(Some(request)) = futures::poll!(state.requests_in_progress.next()) {
let _timer = state.metrics.time_handle_collation_request_result();
// Request has timed out, we need to penalize the collator and re-send the request
// if the chain has not moved on yet.
match request {
@@ -125,6 +125,8 @@ async fn handle_signal(
match signal {
OverseerSignal::Conclude => Ok(true),
OverseerSignal::ActiveLeaves(ActiveLeavesUpdate { activated, deactivated }) => {
let _timer = state.metrics.time_handle_signal();
for relay_parent in activated {
let (vals_tx, vals_rx) = oneshot::channel();
ctx.send_message(AllMessages::RuntimeApi(RuntimeApiMessage::Request(
@@ -267,6 +269,8 @@ async fn handle_fetch(
descriptor: CandidateDescriptor,
response_sender: oneshot::Sender<Arc<PoV>>,
) -> SubsystemResult<()> {
let _timer = state.metrics.time_handle_fetch();
let relay_parent_state = match state.relay_parent_state.get_mut(&relay_parent) {
Some(s) => s,
None => return Ok(()),
@@ -316,6 +320,8 @@ async fn handle_distribute(
descriptor: CandidateDescriptor,
pov: Arc<PoV>,
) -> SubsystemResult<()> {
let _timer = state.metrics.time_handle_distribute();
let relay_parent_state = match state.relay_parent_state.get_mut(&relay_parent) {
None => return Ok(()),
Some(s) => s,
@@ -483,6 +489,8 @@ async fn handle_network_update(
ctx: &mut impl SubsystemContext<Message = PoVDistributionMessage>,
update: NetworkBridgeEvent<protocol_v1::PoVDistributionMessage>,
) -> SubsystemResult<()> {
let _timer = state.metrics.time_handle_network_update();
match update {
NetworkBridgeEvent::PeerConnected(peer, _observed_role) => {
state.peer_state.insert(peer, PeerState { awaited: HashMap::new() });
@@ -600,6 +608,10 @@ impl PoVDistribution {
#[derive(Clone)]
struct MetricsInner {
povs_distributed: prometheus::Counter<prometheus::U64>,
handle_signal: prometheus::Histogram,
handle_fetch: prometheus::Histogram,
handle_distribute: prometheus::Histogram,
handle_network_update: prometheus::Histogram,
}
/// Availability Distribution metrics.
@@ -612,6 +624,26 @@ impl Metrics {
metrics.povs_distributed.inc();
}
}
/// Provide a timer for `handle_signal` which observes on drop.
fn time_handle_signal(&self) -> Option<metrics::prometheus::prometheus::HistogramTimer> {
self.0.as_ref().map(|metrics| metrics.handle_signal.start_timer())
}
/// Provide a timer for `handle_fetch` which observes on drop.
fn time_handle_fetch(&self) -> Option<metrics::prometheus::prometheus::HistogramTimer> {
self.0.as_ref().map(|metrics| metrics.handle_fetch.start_timer())
}
/// Provide a timer for `handle_distribute` which observes on drop.
fn time_handle_distribute(&self) -> Option<metrics::prometheus::prometheus::HistogramTimer> {
self.0.as_ref().map(|metrics| metrics.handle_distribute.start_timer())
}
/// Provide a timer for `handle_network_update` which observes on drop.
fn time_handle_network_update(&self) -> Option<metrics::prometheus::prometheus::HistogramTimer> {
self.0.as_ref().map(|metrics| metrics.handle_network_update.start_timer())
}
}
impl metrics::Metrics for Metrics {
@@ -624,6 +656,42 @@ impl metrics::Metrics for Metrics {
)?,
registry,
)?,
handle_signal: prometheus::register(
prometheus::Histogram::with_opts(
prometheus::HistogramOpts::new(
"parachain_pov_distribution_handle_signal",
"Time spent within `pov_distribution::handle_signal`",
)
)?,
registry,
)?,
handle_fetch: prometheus::register(
prometheus::Histogram::with_opts(
prometheus::HistogramOpts::new(
"parachain_pov_distribution_handle_fetch",
"Time spent within `pov_distribution::handle_fetch`",
)
)?,
registry,
)?,
handle_distribute: prometheus::register(
prometheus::Histogram::with_opts(
prometheus::HistogramOpts::new(
"parachain_pov_distribution_handle_distribute",
"Time spent within `pov_distribution::handle_distribute`",
)
)?,
registry,
)?,
handle_network_update: prometheus::register(
prometheus::Histogram::with_opts(
prometheus::HistogramOpts::new(
"parachain_pov_distribution_handle_network_update",
"Time spent within `pov_distribution::handle_network_update`",
)
)?,
registry,
)?,
};
Ok(Metrics(Some(metrics)))
}
@@ -895,6 +895,8 @@ impl StatementDistribution {
let message = ctx.recv().await?;
match message {
FromOverseer::Signal(OverseerSignal::ActiveLeaves(ActiveLeavesUpdate { activated, .. })) => {
let _timer = metrics.time_active_leaves_update();
for relay_parent in activated {
let (validators, session_index) = {
let (val_tx, val_rx) = oneshot::channel();
@@ -944,6 +946,8 @@ impl StatementDistribution {
FromOverseer::Signal(OverseerSignal::Conclude) => break,
FromOverseer::Communication { msg } => match msg {
StatementDistributionMessage::Share(relay_parent, statement) => {
let _timer = metrics.time_share();
inform_statement_listeners(
&statement,
&mut statement_listeners,
@@ -957,7 +961,9 @@ impl StatementDistribution {
&metrics,
).await?;
}
StatementDistributionMessage::NetworkBridgeUpdateV1(event) =>
StatementDistributionMessage::NetworkBridgeUpdateV1(event) => {
let _timer = metrics.time_network_bridge_update_v1();
handle_network_update(
&mut peers,
&mut active_heads,
@@ -965,7 +971,8 @@ impl StatementDistribution {
&mut our_view,
event,
&metrics,
).await?,
).await?
}
StatementDistributionMessage::RegisterStatementListener(tx) => {
statement_listeners.push(tx);
}
@@ -979,6 +986,9 @@ impl StatementDistribution {
#[derive(Clone)]
struct MetricsInner {
statements_distributed: prometheus::Counter<prometheus::U64>,
active_leaves_update: prometheus::Histogram,
share: prometheus::Histogram,
network_bridge_update_v1: prometheus::Histogram,
}
/// Statement Distribution metrics.
@@ -991,6 +1001,21 @@ impl Metrics {
metrics.statements_distributed.inc();
}
}
/// Provide a timer for `active_leaves_update` which observes on drop.
fn time_active_leaves_update(&self) -> Option<metrics::prometheus::prometheus::HistogramTimer> {
self.0.as_ref().map(|metrics| metrics.active_leaves_update.start_timer())
}
/// Provide a timer for `share` which observes on drop.
fn time_share(&self) -> Option<metrics::prometheus::prometheus::HistogramTimer> {
self.0.as_ref().map(|metrics| metrics.share.start_timer())
}
/// Provide a timer for `network_bridge_update_v1` which observes on drop.
fn time_network_bridge_update_v1(&self) -> Option<metrics::prometheus::prometheus::HistogramTimer> {
self.0.as_ref().map(|metrics| metrics.network_bridge_update_v1.start_timer())
}
}
impl metrics::Metrics for Metrics {
@@ -1003,6 +1028,33 @@ impl metrics::Metrics for Metrics {
)?,
registry,
)?,
active_leaves_update: prometheus::register(
prometheus::Histogram::with_opts(
prometheus::HistogramOpts::new(
"parachain_statement_distribution_active_leaves_update",
"Time spent within `statement_distribution::active_leaves_update`",
)
)?,
registry,
)?,
share: prometheus::register(
prometheus::Histogram::with_opts(
prometheus::HistogramOpts::new(
"parachain_statement_distribution_share",
"Time spent within `statement_distribution::share`",
)
)?,
registry,
)?,
network_bridge_update_v1: prometheus::register(
prometheus::Histogram::with_opts(
prometheus::HistogramOpts::new(
"parachain_statement_distribution_network_bridge_update_v1",
"Time spent within `statement_distribution::network_bridge_update_v1`",
)
)?,
registry,
)?,
};
Ok(Metrics(Some(metrics)))
}