diff --git a/polkadot/node/network/availability-recovery/src/lib.rs b/polkadot/node/network/availability-recovery/src/lib.rs index c3d27e5ea3..54971cf32b 100644 --- a/polkadot/node/network/availability-recovery/src/lib.rs +++ b/polkadot/node/network/availability-recovery/src/lib.rs @@ -458,6 +458,8 @@ impl RequestChunksFromValidators { params: &RecoveryParams, sender: &mut impl SubsystemSender, ) -> Result { + let metrics = &params.metrics; + // First query the store for any chunks we've got. { let (tx, rx) = oneshot::channel(); @@ -504,6 +506,7 @@ impl RequestChunksFromValidators { return Err(RecoveryError::Unavailable) } + let recovery_possible = metrics.time_erasure_recovery_becomes_possible(); self.launch_parallel_requests(params, sender).await; self.wait_for_chunks(params).await; @@ -511,6 +514,9 @@ impl RequestChunksFromValidators { // If that fails, or a re-encoding of it doesn't match the expected erasure root, // return Err(RecoveryError::Invalid) if self.received_chunks.len() >= params.threshold { + drop(recovery_possible); + let recovery_duration = metrics.time_erasure_recovery(); + return match polkadot_erasure_coding::reconstruct_v1( params.validators.len(), self.received_chunks.values().map(|c| (&c.chunk[..], c.index.0 as usize)), @@ -530,6 +536,7 @@ impl RequestChunksFromValidators { Ok(data) } else { + recovery_duration.map(|rd| rd.stop_and_discard()); gum::trace!( target: LOG_TARGET, candidate_hash = ?params.candidate_hash, @@ -541,6 +548,7 @@ impl RequestChunksFromValidators { } }, Err(err) => { + recovery_duration.map(|rd| rd.stop_and_discard()); gum::trace!( target: LOG_TARGET, candidate_hash = ?params.candidate_hash, @@ -552,6 +560,8 @@ impl RequestChunksFromValidators { Err(RecoveryError::Invalid) }, } + } else { + recovery_possible.map(|rp| rp.stop_and_discard()); } } } diff --git a/polkadot/node/network/availability-recovery/src/metrics.rs b/polkadot/node/network/availability-recovery/src/metrics.rs index ba0e8cacdf..77d4aeaa64 100644 --- 
a/polkadot/node/network/availability-recovery/src/metrics.rs +++ b/polkadot/node/network/availability-recovery/src/metrics.rs @@ -14,12 +14,9 @@ // You should have received a copy of the GNU General Public License // along with Polkadot. If not, see . -use polkadot_node_subsystem_util::{ - metrics, - metrics::{ - prometheus, - prometheus::{Counter, CounterVec, Opts, PrometheusError, Registry, U64}, - }, +use polkadot_node_subsystem_util::metrics::{ + self, + prometheus::{self, Counter, CounterVec, Histogram, Opts, PrometheusError, Registry, U64}, }; /// Availability Distribution metrics. @@ -42,8 +39,15 @@ struct MetricsInner { /// - `invalid` ... Chunk was received, but not valid. /// - `success` chunk_requests_finished: CounterVec, + /// The duration of request to response. - time_chunk_request: prometheus::Histogram, + time_chunk_request: Histogram, + + /// The duration between the pure recovery and verification. + time_erasure_recovery: Histogram, + + /// The duration between the first request and the time when we have a sufficient number of chunks to recover. + time_erasure_recovery_becomes_possible: Histogram, } impl Metrics { @@ -93,10 +97,25 @@ impl Metrics { metrics.chunk_requests_finished.with_label_values(&["success"]).inc() } } + /// Get a timer to time request/response duration. pub fn time_chunk_request(&self) -> Option { self.0.as_ref().map(|metrics| metrics.time_chunk_request.start_timer()) } + + /// Get a timer to time erasure code recover. + pub fn time_erasure_recovery(&self) -> Option { + self.0.as_ref().map(|metrics| metrics.time_erasure_recovery.start_timer()) + } + + /// Get a timer to measure the time duration until a sufficient amount of chunks were available to attempt recovery. 
+ pub fn time_erasure_recovery_becomes_possible( + &self, + ) -> Option { + self.0 + .as_ref() + .map(|metrics| metrics.time_erasure_recovery_becomes_possible.start_timer()) + } } impl metrics::Metrics for Metrics { @@ -126,6 +145,20 @@ impl metrics::Metrics for Metrics { ))?, registry, )?, + time_erasure_recovery: prometheus::register( + prometheus::Histogram::with_opts(prometheus::HistogramOpts::new( + "polkadot_parachain_availability_recovery_time_erasure_recovery", + "Time spent to recover the erasure code and verify the merkle root by re-encoding as erasure chunks", + ))?, + registry, + )?, + time_erasure_recovery_becomes_possible: prometheus::register( + prometheus::Histogram::with_opts(prometheus::HistogramOpts::new( + "polkadot_parachain_availability_recovery_time_erasure_recovery_becomes_possible", + "Time spent launching the first request until a sufficient amount of chunks was recovered", + ))?, + registry, + )?, }; Ok(Metrics(Some(metrics))) }