Add metrics for out of view statement distribution errors (#4972)

* Add a simple metric for statements out-of-view

* Avoid repeated out-of-view peer reputation change messages

* Log reporting status

* Address review comments

* Use a counter to store the number of unexpected messages from a peer

* Distinguish different unexpected statements in the metrics

* Fix labels cardinality

* Rename metric name to `statements_unexpected`

* Move metrics to a separate unit, avoid unnecessary enum

* Prefer specific methods in lieu of public constants
This commit is contained in:
Vsevolod Stakhov
2022-02-25 20:29:51 +00:00
committed by GitHub
parent 86f2d65a72
commit 44f66825c7
3 changed files with 218 additions and 115 deletions
@@ -0,0 +1,165 @@
// Copyright 2022 Parity Technologies (UK) Ltd.
// Polkadot is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Polkadot is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Polkadot. If not, see <http://www.gnu.org/licenses/>.
//! Metrics for the statement distribution module
use polkadot_node_subsystem_util::metrics::{self, prometheus};
/// The set of Prometheus collectors used by the statement distribution metrics.
///
/// Stored inside [`Metrics`] as an `Option`, and built by `try_register`.
#[derive(Clone)]
struct MetricsInner {
/// Number of candidate validity statements distributed to other peers.
statements_distributed: prometheus::Counter<prometheus::U64>,
/// Number of large statement fetching requests sent.
sent_requests: prometheus::Counter<prometheus::U64>,
/// Number of received responses for large statement data, split by the
/// `success` label (`succeeded` / `failed`).
received_responses: prometheus::CounterVec<prometheus::U64>,
/// Time spent within `statement_distribution::active_leaves_update`.
active_leaves_update: prometheus::Histogram,
/// Time spent within `statement_distribution::share`.
share: prometheus::Histogram,
/// Time spent within `statement_distribution::network_bridge_update_v1`.
network_bridge_update_v1: prometheus::Histogram,
/// Number of statements that were not expected to be received, split by the
/// `type` label (`valid` / `seconded` / `large`).
statements_unexpected: prometheus::CounterVec<prometheus::U64>,
}
/// Statement Distribution metrics.
///
/// Wraps an optional set of collectors. The derived `Default` value holds `None`,
/// in which case every update method is a no-op and every timer getter returns `None`.
#[derive(Default, Clone)]
pub struct Metrics(Option<MetricsInner>);
impl Metrics {
/// Update statements distributed counter
pub fn on_statement_distributed(&self) {
if let Some(metrics) = &self.0 {
metrics.statements_distributed.inc();
}
}
/// Update sent requests counter
/// This counter is updated merely for the statements sent via request/response method,
/// meaning that it counts large statements only
pub fn on_sent_request(&self) {
if let Some(metrics) = &self.0 {
metrics.sent_requests.inc();
}
}
/// Update counters for the received responses with `succeeded` or `failed` labels
/// These counters are updated merely for the statements received via request/response method,
/// meaning that they count large statements only
pub fn on_received_response(&self, success: bool) {
if let Some(metrics) = &self.0 {
let label = if success { "succeeded" } else { "failed" };
metrics.received_responses.with_label_values(&[label]).inc();
}
}
/// Provide a timer for `active_leaves_update` which observes on drop.
pub fn time_active_leaves_update(
&self,
) -> Option<metrics::prometheus::prometheus::HistogramTimer> {
self.0.as_ref().map(|metrics| metrics.active_leaves_update.start_timer())
}
/// Provide a timer for `share` which observes on drop.
pub fn time_share(&self) -> Option<metrics::prometheus::prometheus::HistogramTimer> {
self.0.as_ref().map(|metrics| metrics.share.start_timer())
}
/// Provide a timer for `network_bridge_update_v1` which observes on drop.
pub fn time_network_bridge_update_v1(
&self,
) -> Option<metrics::prometheus::prometheus::HistogramTimer> {
self.0.as_ref().map(|metrics| metrics.network_bridge_update_v1.start_timer())
}
/// Update the out-of-view statements counter for unexpected valid statements
pub fn on_unexpected_statement_valid(&self) {
if let Some(metrics) = &self.0 {
metrics.statements_unexpected.with_label_values(&["valid"]).inc();
}
}
/// Update the out-of-view statements counter for unexpected seconded statements
pub fn on_unexpected_statement_seconded(&self) {
if let Some(metrics) = &self.0 {
metrics.statements_unexpected.with_label_values(&["seconded"]).inc();
}
}
/// Update the out-of-view statements counter for unexpected large statements
pub fn on_unexpected_statement_large(&self) {
if let Some(metrics) = &self.0 {
metrics.statements_unexpected.with_label_values(&["large"]).inc();
}
}
}
impl metrics::Metrics for Metrics {
    /// Create every statement distribution collector and register it with `registry`.
    ///
    /// Collectors are created and registered one at a time, in the order of the
    /// fields of `MetricsInner`. The first error from constructing or
    /// registering a collector is returned immediately via `?`.
    fn try_register(
        registry: &prometheus::Registry,
    ) -> std::result::Result<Self, prometheus::PrometheusError> {
        // Plain counters.
        let statements_distributed: prometheus::Counter<prometheus::U64> = {
            let counter = prometheus::Counter::new(
                "polkadot_parachain_statements_distributed_total",
                "Number of candidate validity statements distributed to other peers.",
            )?;
            prometheus::register(counter, registry)?
        };

        let sent_requests: prometheus::Counter<prometheus::U64> = {
            let counter = prometheus::Counter::new(
                "polkadot_parachain_statement_distribution_sent_requests_total",
                "Number of large statement fetching requests sent.",
            )?;
            prometheus::register(counter, registry)?
        };

        // Labelled counter: one series per `success` label value.
        let received_responses: prometheus::CounterVec<prometheus::U64> = {
            let opts = prometheus::Opts::new(
                "polkadot_parachain_statement_distribution_received_responses_total",
                "Number of received responses for large statement data.",
            );
            let counters = prometheus::CounterVec::new(opts, &["success"])?;
            prometheus::register(counters, registry)?
        };

        // Timing histograms.
        let active_leaves_update: prometheus::Histogram = {
            let opts = prometheus::HistogramOpts::new(
                "polkadot_parachain_statement_distribution_active_leaves_update",
                "Time spent within `statement_distribution::active_leaves_update`",
            );
            prometheus::register(prometheus::Histogram::with_opts(opts)?, registry)?
        };

        let share: prometheus::Histogram = {
            let opts = prometheus::HistogramOpts::new(
                "polkadot_parachain_statement_distribution_share",
                "Time spent within `statement_distribution::share`",
            );
            prometheus::register(prometheus::Histogram::with_opts(opts)?, registry)?
        };

        let network_bridge_update_v1: prometheus::Histogram = {
            let opts = prometheus::HistogramOpts::new(
                "polkadot_parachain_statement_distribution_network_bridge_update_v1",
                "Time spent within `statement_distribution::network_bridge_update_v1`",
            );
            prometheus::register(prometheus::Histogram::with_opts(opts)?, registry)?
        };

        // Labelled counter: one series per `type` label value.
        let statements_unexpected: prometheus::CounterVec<prometheus::U64> = {
            let opts = prometheus::Opts::new(
                "polkadot_parachain_statement_distribution_statements_unexpected",
                "Number of statements that were not expected to be received.",
            );
            let counters = prometheus::CounterVec::new(opts, &["type"])?;
            prometheus::register(counters, registry)?
        };

        let inner = MetricsInner {
            statements_distributed,
            sent_requests,
            received_responses,
            active_leaves_update,
            share,
            network_bridge_update_v1,
            statements_unexpected,
        };
        Ok(Metrics(Some(inner)))
    }
}