Better metrics for approval voting (#3247)

* add more information about what's happening during approval work

* record assignment tranches in histogram
This commit is contained in:
Robert Habermeier
2021-06-14 19:28:22 +02:00
committed by GitHub
parent 659a04cad6
commit faf42fc29d
2 changed files with 63 additions and 17 deletions
+61 -15
View File
@@ -122,8 +122,8 @@ pub struct ApprovalVotingSubsystem {
#[derive(Clone)] #[derive(Clone)]
struct MetricsInner { struct MetricsInner {
imported_candidates_total: prometheus::Counter<prometheus::U64>, imported_candidates_total: prometheus::Counter<prometheus::U64>,
assignments_produced_total: prometheus::Counter<prometheus::U64>, assignments_produced: prometheus::Histogram,
approvals_produced_total: prometheus::Counter<prometheus::U64>, approvals_produced_total: prometheus::CounterVec<prometheus::U64>,
no_shows_total: prometheus::Counter<prometheus::U64>, no_shows_total: prometheus::Counter<prometheus::U64>,
wakeups_triggered_total: prometheus::Counter<prometheus::U64>, wakeups_triggered_total: prometheus::Counter<prometheus::U64>,
candidate_approval_time_ticks: prometheus::Histogram, candidate_approval_time_ticks: prometheus::Histogram,
@@ -143,15 +143,39 @@ impl Metrics {
} }
} }
fn on_assignment_produced(&self) { fn on_assignment_produced(&self, tranche: DelayTranche) {
if let Some(metrics) = &self.0 { if let Some(metrics) = &self.0 {
metrics.assignments_produced_total.inc(); metrics.assignments_produced.observe(tranche as f64);
}
}
fn on_approval_stale(&self) {
if let Some(metrics) = &self.0 {
metrics.approvals_produced_total.with_label_values(&["stale"]).inc()
}
}
fn on_approval_invalid(&self) {
if let Some(metrics) = &self.0 {
metrics.approvals_produced_total.with_label_values(&["invalid"]).inc()
}
}
fn on_approval_unavailable(&self) {
if let Some(metrics) = &self.0 {
metrics.approvals_produced_total.with_label_values(&["unavailable"]).inc()
}
}
fn on_approval_error(&self) {
if let Some(metrics) = &self.0 {
metrics.approvals_produced_total.with_label_values(&["internal error"]).inc()
} }
} }
fn on_approval_produced(&self) { fn on_approval_produced(&self) {
if let Some(metrics) = &self.0 { if let Some(metrics) = &self.0 {
metrics.approvals_produced_total.inc(); metrics.approvals_produced_total.with_label_values(&["success"]).inc()
} }
} }
@@ -200,17 +224,22 @@ impl metrics::Metrics for Metrics {
)?, )?,
registry, registry,
)?, )?,
assignments_produced_total: prometheus::register( assignments_produced: prometheus::register(
prometheus::Counter::new( prometheus::Histogram::with_opts(
"parachain_assignments_produced_total", prometheus::HistogramOpts::new(
"Number of assignments produced by the approval voting subsystem", "parachain_assignments_produced",
"Assignments and tranches produced by the approval voting subsystem",
).buckets(vec![0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 15.0, 25.0, 40.0, 70.0]),
)?, )?,
registry, registry,
)?, )?,
approvals_produced_total: prometheus::register( approvals_produced_total: prometheus::register(
prometheus::Counter::new( prometheus::CounterVec::new(
"parachain_approvals_produced_total", prometheus::Opts::new(
"Number of approvals produced by the approval voting subsystem", "parachain_approvals_produced_total",
"Number of approvals produced by the approval voting subsystem",
),
&["status"]
)?, )?,
registry, registry,
)?, )?,
@@ -583,6 +612,7 @@ enum Action {
WriteCandidateEntry(CandidateHash, CandidateEntry), WriteCandidateEntry(CandidateHash, CandidateEntry),
LaunchApproval { LaunchApproval {
indirect_cert: IndirectAssignmentCert, indirect_cert: IndirectAssignmentCert,
assignment_tranche: DelayTranche,
relay_block_number: BlockNumber, relay_block_number: BlockNumber,
candidate_index: CandidateIndex, candidate_index: CandidateIndex,
session: SessionIndex, session: SessionIndex,
@@ -724,6 +754,7 @@ async fn handle_actions(
} }
Action::LaunchApproval { Action::LaunchApproval {
indirect_cert, indirect_cert,
assignment_tranche,
relay_block_number, relay_block_number,
candidate_index, candidate_index,
session, session,
@@ -733,7 +764,7 @@ async fn handle_actions(
// Don't launch approval work if the node is syncing. // Don't launch approval work if the node is syncing.
if let Mode::Syncing(_) = *mode { continue } if let Mode::Syncing(_) = *mode { continue }
metrics.on_assignment_produced(); metrics.on_assignment_produced(assignment_tranche);
let block_hash = indirect_cert.block_hash; let block_hash = indirect_cert.block_hash;
let validator_index = indirect_cert.validator; let validator_index = indirect_cert.validator;
@@ -1830,7 +1861,7 @@ fn process_wakeup(
(Vec::new(), None) (Vec::new(), None)
}; };
if let Some((cert, val_index)) = maybe_cert { if let Some((cert, val_index, tranche)) = maybe_cert {
let indirect_cert = IndirectAssignmentCert { let indirect_cert = IndirectAssignmentCert {
block_hash: relay_block, block_hash: relay_block,
validator: val_index, validator: val_index,
@@ -1852,6 +1883,7 @@ fn process_wakeup(
// sanity: should always be present. // sanity: should always be present.
actions.push(Action::LaunchApproval { actions.push(Action::LaunchApproval {
indirect_cert, indirect_cert,
assignment_tranche: tranche,
relay_block_number: block_entry.block_number(), relay_block_number: block_entry.block_number(),
candidate_index: i as _, candidate_index: i as _,
session: block_entry.session(), session: block_entry.session(),
@@ -1950,6 +1982,7 @@ async fn launch_approval(
(candidate_hash, candidate.descriptor.para_id), (candidate_hash, candidate.descriptor.para_id),
); );
// do nothing. we'll just be a no-show and that'll cause others to rise up. // do nothing. we'll just be a no-show and that'll cause others to rise up.
metrics.on_approval_unavailable();
return; return;
} }
Ok(Err(RecoveryError::Invalid)) => { Ok(Err(RecoveryError::Invalid)) => {
@@ -1961,6 +1994,7 @@ async fn launch_approval(
// TODO: dispute. Either the merkle trie is bad or the erasure root is. // TODO: dispute. Either the merkle trie is bad or the erasure root is.
// https://github.com/paritytech/polkadot/issues/2176 // https://github.com/paritytech/polkadot/issues/2176
metrics.on_approval_invalid();
return; return;
} }
}; };
@@ -1979,6 +2013,7 @@ async fn launch_approval(
// No dispute necessary, as this indicates that the chain is not behaving // No dispute necessary, as this indicates that the chain is not behaving
// according to expectations. // according to expectations.
metrics.on_approval_unavailable();
return; return;
} }
}; };
@@ -2024,6 +2059,7 @@ async fn launch_approval(
// TODO: issue dispute, but not for timeouts. // TODO: issue dispute, but not for timeouts.
// https://github.com/paritytech/polkadot/issues/2176 // https://github.com/paritytech/polkadot/issues/2176
metrics.on_approval_invalid();
} }
Ok(Err(e)) => { Ok(Err(e)) => {
tracing::error!( tracing::error!(
@@ -2032,6 +2068,7 @@ async fn launch_approval(
"Failed to validate candidate due to internal error", "Failed to validate candidate due to internal error",
); );
metrics.on_approval_error();
return return
} }
} }
@@ -2055,7 +2092,11 @@ fn issue_approval(
let block_entry = match state.db.load_block_entry(&block_hash)? { let block_entry = match state.db.load_block_entry(&block_hash)? {
Some(b) => b, Some(b) => b,
None => return Ok(Vec::new()), // not a cause for alarm - just lost a race with pruning, most likely. None => {
// not a cause for alarm - just lost a race with pruning, most likely.
metrics.on_approval_stale();
return Ok(Vec::new())
}
}; };
let session_info = match state.session_info(block_entry.session()) { let session_info = match state.session_info(block_entry.session()) {
@@ -2068,6 +2109,7 @@ fn issue_approval(
block_entry.session(), block_entry.session(),
); );
metrics.on_approval_error();
return Ok(Vec::new()); return Ok(Vec::new());
} }
}; };
@@ -2082,6 +2124,7 @@ fn issue_approval(
block_hash, block_hash,
); );
metrics.on_approval_error();
return Ok(Vec::new()); return Ok(Vec::new());
} }
}; };
@@ -2096,6 +2139,7 @@ fn issue_approval(
block_hash, block_hash,
); );
metrics.on_approval_error();
return Ok(Vec::new()); return Ok(Vec::new());
} }
}; };
@@ -2110,6 +2154,7 @@ fn issue_approval(
block_entry.session(), block_entry.session(),
); );
metrics.on_approval_error();
return Ok(Vec::new()); return Ok(Vec::new());
} }
}; };
@@ -2129,6 +2174,7 @@ fn issue_approval(
block_entry.session(), block_entry.session(),
); );
metrics.on_approval_error();
return Ok(Vec::new()); return Ok(Vec::new());
} }
}; };
@@ -93,7 +93,7 @@ impl ApprovalEntry {
// Note that our assignment is triggered. No-op if already triggered. // Note that our assignment is triggered. No-op if already triggered.
pub fn trigger_our_assignment(&mut self, tick_now: Tick) pub fn trigger_our_assignment(&mut self, tick_now: Tick)
-> Option<(AssignmentCert, ValidatorIndex)> -> Option<(AssignmentCert, ValidatorIndex, DelayTranche)>
{ {
let our = self.our_assignment.as_mut().and_then(|a| { let our = self.our_assignment.as_mut().and_then(|a| {
if a.triggered() { return None } if a.triggered() { return None }
@@ -105,7 +105,7 @@ impl ApprovalEntry {
our.map(|a| { our.map(|a| {
self.import_assignment(a.tranche(), a.validator_index(), tick_now); self.import_assignment(a.tranche(), a.validator_index(), tick_now);
(a.cert().clone(), a.validator_index()) (a.cert().clone(), a.validator_index(), a.tranche())
}) })
} }