Add tracing support to node (#1940)

* drop in tracing to replace log

* add structured logging to trace messages

* add structured logging to debug messages

* add structured logging to info messages

* add structured logging to warn messages

* add structured logging to error messages

* normalize spacing and Display vs Debug

* add instrumentation to the various 'fn run'

* use explicit tracing module throughout

* fix availability distribution test

* don't double-print errors

* remove further redundancy from logs

* fix test errors

* fix more test errors

* remove unused kv_log_macro

* fix unused variable

* add tracing spans to collation generation

* add tracing spans to av-store

* add tracing spans to backing

* add tracing spans to bitfield-signing

* add tracing spans to candidate-selection

* add tracing spans to candidate-validation

* add tracing spans to chain-api

* add tracing spans to provisioner

* add tracing spans to runtime-api

* add tracing spans to availability-distribution

* add tracing spans to bitfield-distribution

* add tracing spans to network-bridge

* add tracing spans to collator-protocol

* add tracing spans to pov-distribution

* add tracing spans to statement-distribution

* add tracing spans to overseer

* cleanup
This commit is contained in:
Peter Goodspeed-Niklaus
2020-11-20 12:02:04 +01:00
committed by GitHub
parent 94670d8082
commit e49989971d
53 changed files with 564 additions and 280 deletions
+24 -12
View File
@@ -68,7 +68,7 @@ const MALFORMED_VIEW_COST: ReputationChange
= ReputationChange::new(-500, "Malformed view");
// network bridge log target
const TARGET: &'static str = "network_bridge";
const LOG_TARGET: &'static str = "network_bridge";
/// Messages received on the network.
#[derive(Debug, Encode, Decode, Clone)]
@@ -264,6 +264,7 @@ enum Action {
Nop,
}
#[tracing::instrument(level = "trace", fields(subsystem = LOG_TARGET))]
fn action_from_overseer_message(
res: polkadot_subsystem::SubsystemResult<FromOverseer<NetworkBridgeMessage>>,
) -> Action {
@@ -286,16 +287,17 @@ fn action_from_overseer_message(
Ok(FromOverseer::Signal(OverseerSignal::BlockFinalized(_)))
=> Action::Nop,
Err(e) => {
log::warn!(target: TARGET, "Shutting down Network Bridge due to error {:?}", e);
tracing::warn!(target: LOG_TARGET, err = ?e, "Shutting down Network Bridge due to error");
Action::Abort
}
}
}
#[tracing::instrument(level = "trace", fields(subsystem = LOG_TARGET))]
fn action_from_network_message(event: Option<NetworkEvent>) -> Action {
match event {
None => {
log::info!(target: TARGET, "Shutting down Network Bridge: underlying event stream concluded");
tracing::info!(target: LOG_TARGET, "Shutting down Network Bridge: underlying event stream concluded");
Action::Abort
}
Some(NetworkEvent::Dht(_)) => Action::Nop,
@@ -350,6 +352,7 @@ fn construct_view(live_heads: &[Hash]) -> View {
View(live_heads.iter().rev().take(MAX_VIEW_HEADS).cloned().collect())
}
#[tracing::instrument(level = "trace", skip(net, ctx, validation_peers, collation_peers), fields(subsystem = LOG_TARGET))]
async fn update_view(
net: &mut impl Network,
ctx: &mut impl SubsystemContext<Message = NetworkBridgeMessage>,
@@ -379,7 +382,7 @@ async fn update_view(
NetworkBridgeEvent::OurViewChange(new_view.clone()),
ctx,
).await {
log::warn!(target: TARGET, "Aborting - Failure to dispatch messages to overseer");
tracing::warn!(target: LOG_TARGET, err = ?e, "Aborting - Failure to dispatch messages to overseer");
return Err(e)
}
@@ -387,7 +390,7 @@ async fn update_view(
NetworkBridgeEvent::OurViewChange(new_view.clone()),
ctx,
).await {
log::warn!(target: TARGET, "Aborting - Failure to dispatch messages to overseer");
tracing::warn!(target: LOG_TARGET, err = ?e, "Aborting - Failure to dispatch messages to overseer");
return Err(e)
}
@@ -396,6 +399,7 @@ async fn update_view(
// Handle messages on a specific peer-set. The peer is expected to be connected on that
// peer-set.
#[tracing::instrument(level = "trace", skip(peers, messages, net), fields(subsystem = LOG_TARGET))]
async fn handle_peer_messages<M>(
peer: PeerId,
peers: &mut HashMap<PeerId, PeerData>,
@@ -442,6 +446,7 @@ async fn handle_peer_messages<M>(
Ok(outgoing_messages)
}
#[tracing::instrument(level = "trace", skip(net, peers), fields(subsystem = LOG_TARGET))]
async fn send_validation_message<I>(
net: &mut impl Network,
peers: I,
@@ -454,6 +459,7 @@ async fn send_validation_message<I>(
send_message(net, peers, PeerSet::Validation, message).await
}
#[tracing::instrument(level = "trace", skip(net, peers), fields(subsystem = LOG_TARGET))]
async fn send_collation_message<I>(
net: &mut impl Network,
peers: I,
@@ -516,6 +522,7 @@ async fn dispatch_collation_event_to_all(
dispatch_collation_events_to_all(std::iter::once(event), ctx).await
}
#[tracing::instrument(level = "trace", skip(events, ctx), fields(subsystem = LOG_TARGET))]
async fn dispatch_validation_events_to_all<I>(
events: I,
ctx: &mut impl SubsystemContext<Message=NetworkBridgeMessage>,
@@ -547,6 +554,7 @@ async fn dispatch_validation_events_to_all<I>(
ctx.send_messages(events.into_iter().flat_map(messages_for)).await
}
#[tracing::instrument(level = "trace", skip(events, ctx), fields(subsystem = LOG_TARGET))]
async fn dispatch_collation_events_to_all<I>(
events: I,
ctx: &mut impl SubsystemContext<Message=NetworkBridgeMessage>,
@@ -564,6 +572,7 @@ async fn dispatch_collation_events_to_all<I>(
ctx.send_messages(events.into_iter().flat_map(messages_for)).await
}
#[tracing::instrument(skip(network_service, authority_discovery_service, ctx), fields(subsystem = LOG_TARGET))]
async fn run_network<N, AD>(
mut network_service: N,
mut authority_discovery_service: AD,
@@ -686,7 +695,7 @@ where
};
if let Err(e) = res {
log::warn!("Aborting - Failure to dispatch messages to overseer");
tracing::warn!(err = ?e, "Aborting - Failure to dispatch messages to overseer");
return Err(e);
}
}
@@ -713,8 +722,9 @@ where
};
if let Err(e) = res {
log::warn!(
target: TARGET,
tracing::warn!(
target: LOG_TARGET,
err = ?e,
"Aborting - Failure to dispatch messages to overseer",
);
return Err(e)
@@ -734,8 +744,9 @@ where
events,
&mut ctx,
).await {
log::warn!(
target: TARGET,
tracing::warn!(
target: LOG_TARGET,
err = ?e,
"Aborting - Failure to dispatch messages to overseer",
);
return Err(e)
@@ -754,8 +765,9 @@ where
events,
&mut ctx,
).await {
log::warn!(
target: TARGET,
tracing::warn!(
target: LOG_TARGET,
err = ?e,
"Aborting - Failure to dispatch messages to overseer",
);
return Err(e)
@@ -29,6 +29,7 @@ use polkadot_node_network_protocol::PeerId;
use polkadot_primitives::v1::{AuthorityDiscoveryId, Block, Hash};
const PRIORITY_GROUP: &'static str = "parachain_validators";
const LOG_TARGET: &str = "ValidatorDiscovery";
/// An abstraction over networking for the purposes of validator discovery service.
#[async_trait]
@@ -163,6 +164,7 @@ impl<N: Network, AD: AuthorityDiscovery> Service<N, AD> {
/// Find connected validators using the given `validator_ids`.
///
/// Returns a [`HashMap`] that contains the found [`AuthorityDiscoveryId`]s and their associated [`PeerId`]s.
#[tracing::instrument(level = "trace", skip(self, authority_discovery_service), fields(subsystem = LOG_TARGET))]
async fn find_connected_validators(
&mut self,
validator_ids: &[AuthorityDiscoveryId],
@@ -201,6 +203,7 @@ impl<N: Network, AD: AuthorityDiscovery> Service<N, AD> {
/// This method will also clean up all previously revoked requests.
/// It takes `network_service` and `authority_discovery_service` by value
/// and returns them as a workaround for the Future: Send requirement imposed by async fn impl.
#[tracing::instrument(level = "trace", skip(self, connected, revoke, network_service, authority_discovery_service), fields(subsystem = LOG_TARGET))]
pub async fn on_request(
&mut self,
validator_ids: Vec<AuthorityDiscoveryId>,
@@ -283,7 +286,7 @@ impl<N: Network, AD: AuthorityDiscovery> Service<N, AD> {
PRIORITY_GROUP.to_owned(),
multiaddr_to_add,
).await {
log::warn!(target: super::TARGET, "AuthorityDiscoveryService returned an invalid multiaddress: {}", e);
tracing::warn!(target: LOG_TARGET, err = ?e, "AuthorityDiscoveryService returned an invalid multiaddress");
}
// the addresses are known to be valid
let _ = network_service.remove_from_priority_group(PRIORITY_GROUP.to_owned(), multiaddr_to_remove).await;
@@ -304,6 +307,7 @@ impl<N: Network, AD: AuthorityDiscovery> Service<N, AD> {
}
/// Should be called when a peer has connected.
#[tracing::instrument(level = "trace", skip(self, authority_discovery_service), fields(subsystem = LOG_TARGET))]
pub async fn on_peer_connected(&mut self, peer_id: &PeerId, authority_discovery_service: &mut AD) {
// check if it's an authority we've been waiting for
let maybe_authority = authority_discovery_service.get_authority_id_by_peer_id(peer_id.clone()).await;