implement remaining subsystem metrics (#1770)

* overseer metrics: messages relayed

* provisioner metrics: cosmetic changes

* candidate selection metrics: cosmetic changes

* availability bitfields metrics

* availability distribution metrics

* PoV distribution metrics

* statement-distribution: small simplification

* statement-distribution: extract log target into a const

* statement-distribution: metrics

* address review nits
This commit is contained in:
Andronik Ordian
2020-10-01 12:08:03 +02:00
committed by GitHub
parent 693d40831d
commit 579614d127
12 changed files with 450 additions and 177 deletions
@@ -22,10 +22,13 @@
use polkadot_subsystem::{
Subsystem, SubsystemResult, SubsystemContext, SpawnedSubsystem,
ActiveLeavesUpdate, FromOverseer, OverseerSignal,
messages::{
AllMessages, NetworkBridgeMessage, StatementDistributionMessage, CandidateBackingMessage,
RuntimeApiMessage, RuntimeApiRequest,
},
};
use polkadot_subsystem::messages::{
AllMessages, NetworkBridgeMessage, StatementDistributionMessage, CandidateBackingMessage,
RuntimeApiMessage, RuntimeApiRequest,
use polkadot_node_subsystem_util::{
metrics::{self, prometheus},
};
use node_primitives::SignedFullStatement;
use polkadot_primitives::v1::{
@@ -59,8 +62,13 @@ const BENEFIT_VALID_STATEMENT_FIRST: Rep = Rep::new(
/// Typically we will only keep 1, but when a validator equivocates we will need to track 2.
const VC_THRESHOLD: usize = 2;
const LOG_TARGET: &str = "statement_distribution";
/// The statement distribution subsystem.
pub struct StatementDistribution;
pub struct StatementDistribution {
// Prometheus metrics
metrics: Metrics,
}
impl<C> Subsystem<C> for StatementDistribution
where C: SubsystemContext<Message=StatementDistributionMessage>
@@ -70,7 +78,7 @@ impl<C> Subsystem<C> for StatementDistribution
// within `run`.
SpawnedSubsystem {
name: "statement-distribution-subsystem",
future: run(ctx).map(|_| ()).boxed(),
future: self.run(ctx).map(|_| ()).boxed(),
}
}
}
@@ -111,14 +119,7 @@ fn note_hash(
) -> bool {
if observed.contains(&h) { return true; }
if observed.is_full() {
false
} else {
observed.try_push(h).expect("length of storage guarded above; \
only panics if length exceeds capacity; qed");
true
}
observed.try_push(h).is_ok()
}
/// knowledge that a peer has about goings-on in a relay parent.
@@ -502,6 +503,7 @@ async fn circulate_statement_and_dependents(
ctx: &mut impl SubsystemContext<Message = StatementDistributionMessage>,
relay_parent: Hash,
statement: SignedFullStatement,
metrics: &Metrics,
) -> SubsystemResult<()> {
if let Some(active_head)= active_heads.get_mut(&relay_parent) {
@@ -529,7 +531,8 @@ async fn circulate_statement_and_dependents(
ctx,
relay_parent,
candidate_hash,
&*active_head
&*active_head,
metrics,
).await?;
}
}
@@ -589,6 +592,7 @@ async fn send_statements_about(
relay_parent: Hash,
candidate_hash: Hash,
active_head: &ActiveHeadData,
metrics: &Metrics,
) -> SubsystemResult<()> {
for statement in active_head.statements_about(candidate_hash) {
if peer_data.send(&relay_parent, &statement.fingerprint()).is_some() {
@@ -600,6 +604,8 @@ async fn send_statements_about(
ctx.send_message(AllMessages::NetworkBridge(
NetworkBridgeMessage::SendValidationMessage(vec![peer.clone()], payload)
)).await?;
metrics.on_statement_distributed();
}
}
@@ -612,7 +618,8 @@ async fn send_statements(
peer_data: &mut PeerData,
ctx: &mut impl SubsystemContext<Message = StatementDistributionMessage>,
relay_parent: Hash,
active_head: &ActiveHeadData
active_head: &ActiveHeadData,
metrics: &Metrics,
) -> SubsystemResult<()> {
for statement in active_head.statements() {
if peer_data.send(&relay_parent, &statement.fingerprint()).is_some() {
@@ -624,6 +631,8 @@ async fn send_statements(
ctx.send_message(AllMessages::NetworkBridge(
NetworkBridgeMessage::SendValidationMessage(vec![peer.clone()], payload)
)).await?;
metrics.on_statement_distributed();
}
}
@@ -652,6 +661,7 @@ async fn handle_incoming_message<'a>(
active_heads: &'a mut HashMap<Hash, ActiveHeadData>,
ctx: &mut impl SubsystemContext<Message = StatementDistributionMessage>,
message: protocol_v1::StatementDistributionMessage,
metrics: &Metrics,
) -> SubsystemResult<Option<(Hash, &'a StoredStatement)>> {
let (relay_parent, statement) = match message {
protocol_v1::StatementDistributionMessage::Statement(r, s) => (r, s),
@@ -697,6 +707,7 @@ async fn handle_incoming_message<'a>(
relay_parent,
fingerprint.0.candidate_hash().clone(),
&*active_head,
metrics,
).await?
}
Ok(false) => {}
@@ -724,6 +735,7 @@ async fn update_peer_view_and_send_unlocked(
ctx: &mut impl SubsystemContext<Message = StatementDistributionMessage>,
active_heads: &HashMap<Hash, ActiveHeadData>,
new_view: View,
metrics: &Metrics,
) -> SubsystemResult<()> {
let old_view = std::mem::replace(&mut peer_data.view, new_view);
@@ -745,6 +757,7 @@ async fn update_peer_view_and_send_unlocked(
ctx,
new,
active_head,
metrics,
).await?;
}
}
@@ -758,6 +771,7 @@ async fn handle_network_update(
ctx: &mut impl SubsystemContext<Message = StatementDistributionMessage>,
our_view: &mut View,
update: NetworkBridgeEvent<protocol_v1::StatementDistributionMessage>,
metrics: &Metrics,
) -> SubsystemResult<()> {
match update {
NetworkBridgeEvent::PeerConnected(peer, _role) => {
@@ -782,6 +796,7 @@ async fn handle_network_update(
active_heads,
ctx,
message,
metrics,
).await?;
if let Some((relay_parent, new)) = new_stored {
@@ -808,6 +823,7 @@ async fn handle_network_update(
ctx,
&*active_heads,
view,
metrics,
).await
}
None => Ok(()),
@@ -819,7 +835,7 @@ async fn handle_network_update(
for new in our_view.difference(&old_view) {
if !active_heads.contains_key(&new) {
log::warn!(target: "statement_distribution", "Our network bridge view update \
log::warn!(target: LOG_TARGET, "Our network bridge view update \
inconsistent with `StartWork` messages we have received from overseer. \
Contains unknown hash {}", new);
}
@@ -831,94 +847,132 @@ async fn handle_network_update(
}
async fn run(
mut ctx: impl SubsystemContext<Message = StatementDistributionMessage>,
) -> SubsystemResult<()> {
let mut peers: HashMap<PeerId, PeerData> = HashMap::new();
let mut our_view = View::default();
let mut active_heads: HashMap<Hash, ActiveHeadData> = HashMap::new();
let mut statement_listeners: Vec<mpsc::Sender<SignedFullStatement>> = Vec::new();
impl StatementDistribution {
async fn run(
self,
mut ctx: impl SubsystemContext<Message = StatementDistributionMessage>,
) -> SubsystemResult<()> {
let mut peers: HashMap<PeerId, PeerData> = HashMap::new();
let mut our_view = View::default();
let mut active_heads: HashMap<Hash, ActiveHeadData> = HashMap::new();
let mut statement_listeners: Vec<mpsc::Sender<SignedFullStatement>> = Vec::new();
let metrics = self.metrics;
loop {
let message = ctx.recv().await?;
match message {
FromOverseer::Signal(OverseerSignal::ActiveLeaves(ActiveLeavesUpdate { activated, .. })) => {
for relay_parent in activated {
let (validators, session_index) = {
let (val_tx, val_rx) = oneshot::channel();
let (session_tx, session_rx) = oneshot::channel();
loop {
let message = ctx.recv().await?;
match message {
FromOverseer::Signal(OverseerSignal::ActiveLeaves(ActiveLeavesUpdate { activated, .. })) => {
for relay_parent in activated {
let (validators, session_index) = {
let (val_tx, val_rx) = oneshot::channel();
let (session_tx, session_rx) = oneshot::channel();
let val_message = AllMessages::RuntimeApi(
RuntimeApiMessage::Request(
relay_parent,
RuntimeApiRequest::Validators(val_tx),
),
);
let session_message = AllMessages::RuntimeApi(
RuntimeApiMessage::Request(
relay_parent,
RuntimeApiRequest::SessionIndexForChild(session_tx),
),
);
let val_message = AllMessages::RuntimeApi(
RuntimeApiMessage::Request(
relay_parent,
RuntimeApiRequest::Validators(val_tx),
),
);
let session_message = AllMessages::RuntimeApi(
RuntimeApiMessage::Request(
relay_parent,
RuntimeApiRequest::SessionIndexForChild(session_tx),
),
);
ctx.send_messages(
std::iter::once(val_message).chain(std::iter::once(session_message))
).await?;
ctx.send_messages(
std::iter::once(val_message).chain(std::iter::once(session_message))
).await?;
match (val_rx.await?, session_rx.await?) {
(Ok(v), Ok(s)) => (v, s),
(Err(e), _) | (_, Err(e)) => {
log::warn!(
target: "statement_distribution",
"Failed to fetch runtime API data for active leaf: {:?}",
e,
);
match (val_rx.await?, session_rx.await?) {
(Ok(v), Ok(s)) => (v, s),
(Err(e), _) | (_, Err(e)) => {
log::warn!(
target: LOG_TARGET,
"Failed to fetch runtime API data for active leaf: {:?}",
e,
);
// Lacking this bookkeeping might make us behave funny, although
// not in any slashable way. But we shouldn't take down the node
// on what are likely spurious runtime API errors.
continue;
// Lacking this bookkeeping might make us behave funny, although
// not in any slashable way. But we shouldn't take down the node
// on what are likely spurious runtime API errors.
continue;
}
}
}
};
};
active_heads.entry(relay_parent)
.or_insert(ActiveHeadData::new(validators, session_index));
active_heads.entry(relay_parent)
.or_insert(ActiveHeadData::new(validators, session_index));
}
}
}
FromOverseer::Signal(OverseerSignal::BlockFinalized(_block_hash)) => {
// do nothing
}
FromOverseer::Signal(OverseerSignal::Conclude) => break,
FromOverseer::Communication { msg } => match msg {
StatementDistributionMessage::Share(relay_parent, statement) => {
inform_statement_listeners(
&statement,
&mut statement_listeners,
).await;
circulate_statement_and_dependents(
&mut peers,
&mut active_heads,
&mut ctx,
relay_parent,
statement,
).await?;
FromOverseer::Signal(OverseerSignal::BlockFinalized(_block_hash)) => {
// do nothing
}
StatementDistributionMessage::NetworkBridgeUpdateV1(event) =>
handle_network_update(
&mut peers,
&mut active_heads,
&mut ctx,
&mut our_view,
event,
).await?,
StatementDistributionMessage::RegisterStatementListener(tx) => {
statement_listeners.push(tx);
FromOverseer::Signal(OverseerSignal::Conclude) => break,
FromOverseer::Communication { msg } => match msg {
StatementDistributionMessage::Share(relay_parent, statement) => {
inform_statement_listeners(
&statement,
&mut statement_listeners,
).await;
circulate_statement_and_dependents(
&mut peers,
&mut active_heads,
&mut ctx,
relay_parent,
statement,
&metrics,
).await?;
}
StatementDistributionMessage::NetworkBridgeUpdateV1(event) =>
handle_network_update(
&mut peers,
&mut active_heads,
&mut ctx,
&mut our_view,
event,
&metrics,
).await?,
StatementDistributionMessage::RegisterStatementListener(tx) => {
statement_listeners.push(tx);
}
}
}
}
Ok(())
}
}
#[derive(Clone)]
struct MetricsInner {
statements_distributed: prometheus::Counter<prometheus::U64>,
}
/// Statement Distribution metrics.
#[derive(Default, Clone)]
pub struct Metrics(Option<MetricsInner>);
impl Metrics {
fn on_statement_distributed(&self) {
if let Some(metrics) = &self.0 {
metrics.statements_distributed.inc();
}
}
}
impl metrics::Metrics for Metrics {
fn try_register(registry: &prometheus::Registry) -> std::result::Result<Self, prometheus::PrometheusError> {
let metrics = MetricsInner {
statements_distributed: prometheus::register(
prometheus::Counter::new(
"parachain_statements_distributed_total",
"Number of candidate validity statements distributed to other peers."
)?,
registry,
)?,
};
Ok(Metrics(Some(metrics)))
}
Ok(())
}
#[cfg(test)]
@@ -1256,6 +1310,7 @@ mod tests {
&mut ctx,
&active_heads,
new_view.clone(),
&Default::default(),
).await.unwrap();
assert_eq!(peer_data.view, new_view);