Better metrics for availability-recovery (#5249)

* Better metrics.

- Fix time metric
- Add counters

* Typo

* Better docs.
This commit is contained in:
Robert Klotzner
2022-04-04 16:31:08 +02:00
committed by GitHub
parent fc041426db
commit 4ca9691dcf
2 changed files with 72 additions and 16 deletions
@@ -489,6 +489,8 @@ impl RequestChunksFromValidators {
}
}
let _recovery_timer = metrics.time_full_recovery();
loop {
if self.is_unavailable(&params) {
gum::debug!(
@@ -502,10 +504,11 @@ impl RequestChunksFromValidators {
"Data recovery is not possible",
);
metrics.on_recovery_failed();
return Err(RecoveryError::Unavailable)
}
let recovery_possible = metrics.time_erasure_recovery_becomes_possible();
self.launch_parallel_requests(params, sender).await;
self.wait_for_chunks(params).await;
@@ -513,7 +516,6 @@ impl RequestChunksFromValidators {
// If that fails, or a re-encoding of it doesn't match the expected erasure root,
// return Err(RecoveryError::Invalid)
if self.received_chunks.len() >= params.threshold {
drop(recovery_possible);
let recovery_duration = metrics.time_erasure_recovery();
return match polkadot_erasure_coding::reconstruct_v1(
@@ -532,6 +534,7 @@ impl RequestChunksFromValidators {
erasure_root = ?params.erasure_root,
"Data recovery complete",
);
metrics.on_recovery_succeeded();
Ok(data)
} else {
@@ -542,6 +545,7 @@ impl RequestChunksFromValidators {
erasure_root = ?params.erasure_root,
"Data recovery - root mismatch",
);
metrics.on_recovery_invalid();
Err(RecoveryError::Invalid)
}
@@ -555,12 +559,11 @@ impl RequestChunksFromValidators {
?err,
"Data recovery error ",
);
metrics.on_recovery_invalid();
Err(RecoveryError::Invalid)
},
}
} else {
recovery_possible.map(|rp| rp.stop_and_discard());
}
}
}
@@ -637,6 +640,8 @@ impl<S: SubsystemSender> RecoveryTask<S> {
}
}
self.params.metrics.on_recovery_started();
loop {
// These only fail if we cannot reach the underlying subsystem, which case there is nothing
// meaningful we can do.
@@ -46,8 +46,18 @@ struct MetricsInner {
/// The duration between the pure recovery and verification.
time_erasure_recovery: Histogram,
/// The duration between the first request and the time when we have a sufficient number of chunks to recover.
time_erasure_recovery_becomes_possible: Histogram,
/// Time of a full recovery, including erasure decoding or until we gave
/// up.
time_full_recovery: Histogram,
/// Number of full recoveries that have been finished one way or the other.
full_recoveries_finished: CounterVec<U64>,
/// Number of full recoveries that have been started on this subsystem.
///
/// Note: Those are only recoveries which could not get served locally already - so in other
/// words: Only real recoveries.
full_recoveries_started: Counter<U64>,
}
impl Metrics {
@@ -108,13 +118,37 @@ impl Metrics {
self.0.as_ref().map(|metrics| metrics.time_erasure_recovery.start_timer())
}
/// Get a timer to measure the time duration until a sufficient amount of chunks were available to attempt recovery.
pub fn time_erasure_recovery_becomes_possible(
&self,
) -> Option<metrics::prometheus::prometheus::HistogramTimer> {
self.0
.as_ref()
.map(|metrics| metrics.time_erasure_recovery_becomes_possible.start_timer())
/// Get a timer to measure the time of the complete recovery process.
pub fn time_full_recovery(&self) -> Option<metrics::prometheus::prometheus::HistogramTimer> {
self.0.as_ref().map(|metrics| metrics.time_full_recovery.start_timer())
}
/// A full recovery succeeded.
pub fn on_recovery_succeeded(&self) {
if let Some(metrics) = &self.0 {
metrics.full_recoveries_finished.with_label_values(&["success"]).inc()
}
}
/// A full recovery failed (data not available).
pub fn on_recovery_failed(&self) {
if let Some(metrics) = &self.0 {
metrics.full_recoveries_finished.with_label_values(&["failure"]).inc()
}
}
/// A full recovery failed (data was recovered, but invalid).
pub fn on_recovery_invalid(&self) {
if let Some(metrics) = &self.0 {
metrics.full_recoveries_finished.with_label_values(&["invalid"]).inc()
}
}
/// A recover was started.
pub fn on_recovery_started(&self) {
if let Some(metrics) = &self.0 {
metrics.full_recoveries_started.inc()
}
}
}
@@ -152,13 +186,30 @@ impl metrics::Metrics for Metrics {
))?,
registry,
)?,
time_erasure_recovery_becomes_possible: prometheus::register(
time_full_recovery: prometheus::register(
prometheus::Histogram::with_opts(prometheus::HistogramOpts::new(
"polkadot_parachain_availability_recovery_time_erasure_recovery_becomes_possible",
"Time spent launching the first request until a sufficient amount of chunks was recovered",
"polkadot_parachain_availability_recovery_time_total",
"Time a full recovery process took, either until failure or successful erasure decoding.",
))?,
registry,
)?,
full_recoveries_finished: prometheus::register(
CounterVec::new(
Opts::new(
"polkadot_parachain_availability_recovery_recoveries_finished",
"Total number of recoveries that finished.",
),
&["result"],
)?,
registry,
)?,
full_recoveries_started: prometheus::register(
Counter::new(
"polkadot_parachain_availability_recovery_recovieries_started",
"Total number of started recoveries.",
)?,
registry,
)?,
};
Ok(Metrics(Some(metrics)))
}