diff --git a/polkadot/node/network/availability-recovery/src/lib.rs b/polkadot/node/network/availability-recovery/src/lib.rs index f33ee90d82..5f7d8de069 100644 --- a/polkadot/node/network/availability-recovery/src/lib.rs +++ b/polkadot/node/network/availability-recovery/src/lib.rs @@ -489,6 +489,8 @@ impl RequestChunksFromValidators { } } + let _recovery_timer = metrics.time_full_recovery(); + loop { if self.is_unavailable(¶ms) { gum::debug!( @@ -502,10 +504,11 @@ impl RequestChunksFromValidators { "Data recovery is not possible", ); + metrics.on_recovery_failed(); + return Err(RecoveryError::Unavailable) } - let recovery_possible = metrics.time_erasure_recovery_becomes_possible(); self.launch_parallel_requests(params, sender).await; self.wait_for_chunks(params).await; @@ -513,7 +516,6 @@ impl RequestChunksFromValidators { // If that fails, or a re-encoding of it doesn't match the expected erasure root, // return Err(RecoveryError::Invalid) if self.received_chunks.len() >= params.threshold { - drop(recovery_possible); let recovery_duration = metrics.time_erasure_recovery(); return match polkadot_erasure_coding::reconstruct_v1( @@ -532,6 +534,7 @@ impl RequestChunksFromValidators { erasure_root = ?params.erasure_root, "Data recovery complete", ); + metrics.on_recovery_succeeded(); Ok(data) } else { @@ -542,6 +545,7 @@ impl RequestChunksFromValidators { erasure_root = ?params.erasure_root, "Data recovery - root mismatch", ); + metrics.on_recovery_invalid(); Err(RecoveryError::Invalid) } @@ -555,12 +559,11 @@ impl RequestChunksFromValidators { ?err, "Data recovery error ", ); + metrics.on_recovery_invalid(); Err(RecoveryError::Invalid) }, } - } else { - recovery_possible.map(|rp| rp.stop_and_discard()); } } } @@ -637,6 +640,8 @@ impl RecoveryTask { } } + self.params.metrics.on_recovery_started(); + loop { // These only fail if we cannot reach the underlying subsystem, which case there is nothing // meaningful we can do. diff --git a/polkadot/node/network/availability-recovery/src/metrics.rs b/polkadot/node/network/availability-recovery/src/metrics.rs index 77d4aeaa64..7acf0c228c 100644 --- a/polkadot/node/network/availability-recovery/src/metrics.rs +++ b/polkadot/node/network/availability-recovery/src/metrics.rs @@ -46,8 +46,18 @@ struct MetricsInner { /// The duration between the pure recovery and verification. time_erasure_recovery: Histogram, - /// The duration between the first request and the time when we have a sufficient number of chunks to recover. - time_erasure_recovery_becomes_possible: Histogram, + /// Time of a full recovery, including erasure decoding or until we gave + /// up. + time_full_recovery: Histogram, + + /// Number of full recoveries that have been finished one way or the other. + full_recoveries_finished: CounterVec, + + /// Number of full recoveries that have been started on this subsystem. + /// + /// Note: Those are only recoveries which could not get served locally already - so in other + /// words: Only real recoveries. + full_recoveries_started: Counter, } impl Metrics { @@ -108,13 +118,37 @@ impl Metrics { self.0.as_ref().map(|metrics| metrics.time_erasure_recovery.start_timer()) } - /// Get a timer to measure the time duration until a sufficient amount of chunks were available to attempt recovery. - pub fn time_erasure_recovery_becomes_possible( - &self, - ) -> Option { - self.0 - .as_ref() - .map(|metrics| metrics.time_erasure_recovery_becomes_possible.start_timer()) + /// Get a timer to measure the time of the complete recovery process. + pub fn time_full_recovery(&self) -> Option { + self.0.as_ref().map(|metrics| metrics.time_full_recovery.start_timer()) + } + + /// A full recovery succeeded. + pub fn on_recovery_succeeded(&self) { + if let Some(metrics) = &self.0 { + metrics.full_recoveries_finished.with_label_values(&["success"]).inc() + } + } + + /// A full recovery failed (data not available). + pub fn on_recovery_failed(&self) { + if let Some(metrics) = &self.0 { + metrics.full_recoveries_finished.with_label_values(&["failure"]).inc() + } + } + + /// A full recovery failed (data was recovered, but invalid). + pub fn on_recovery_invalid(&self) { + if let Some(metrics) = &self.0 { + metrics.full_recoveries_finished.with_label_values(&["invalid"]).inc() + } + } + + /// A recover was started. + pub fn on_recovery_started(&self) { + if let Some(metrics) = &self.0 { + metrics.full_recoveries_started.inc() + } } } @@ -152,13 +186,30 @@ impl metrics::Metrics for Metrics { ))?, registry, )?, - time_erasure_recovery_becomes_possible: prometheus::register( + time_full_recovery: prometheus::register( prometheus::Histogram::with_opts(prometheus::HistogramOpts::new( - "polkadot_parachain_availability_recovery_time_erasure_recovery_becomes_possible", - "Time spent launching the first request until a sufficient amount of chunks was recovered", + "polkadot_parachain_availability_recovery_time_total", + "Time a full recovery process took, either until failure or successful erasure decoding.", ))?, registry, )?, + full_recoveries_finished: prometheus::register( + CounterVec::new( + Opts::new( + "polkadot_parachain_availability_recovery_recoveries_finished", + "Total number of recoveries that finished.", + ), + &["result"], + )?, + registry, + )?, + full_recoveries_started: prometheus::register( + Counter::new( + "polkadot_parachain_availability_recovery_recovieries_started", + "Total number of started recoveries.", + )?, + registry, + )?, }; Ok(Metrics(Some(metrics))) }