Add tracing support to node (#1940)

* drop in tracing to replace log

* add structured logging to trace messages

* add structured logging to debug messages

* add structured logging to info messages

* add structured logging to warn messages

* add structured logging to error messages

* normalize spacing and Display vs Debug

* add instrumentation to the various 'fn run'

* use explicit tracing module throughout

* fix availability distribution test

* don't double-print errors

* remove further redundancy from logs

* fix test errors

* fix more test errors

* remove unused kv_log_macro

* fix unused variable

* add tracing spans to collation generation

* add tracing spans to av-store

* add tracing spans to backing

* add tracing spans to bitfield-signing

* add tracing spans to candidate-selection

* add tracing spans to candidate-validation

* add tracing spans to chain-api

* add tracing spans to provisioner

* add tracing spans to runtime-api

* add tracing spans to availability-distribution

* add tracing spans to bitfield-distribution

* add tracing spans to network-bridge

* add tracing spans to collator-protocol

* add tracing spans to pov-distribution

* add tracing spans to statement-distribution

* add tracing spans to overseer

* cleanup
This commit is contained in:
Peter Goodspeed-Niklaus
2020-11-20 12:02:04 +01:00
committed by GitHub
parent 94670d8082
commit e49989971d
53 changed files with 564 additions and 280 deletions
@@ -19,7 +19,6 @@ use std::collections::HashMap;
use super::{LOG_TARGET, Result};
use futures::{StreamExt, task::Poll};
use log::warn;
use polkadot_primitives::v1::{
CollatorId, CoreIndex, CoreState, Hash, Id as ParaId, CandidateReceipt,
@@ -138,6 +137,7 @@ struct State {
/// or the relay-parent isn't in the active-leaves set, we ignore the message
/// as it must be invalid in that case - although this indicates a logic error
/// elsewhere in the node.
#[tracing::instrument(level = "trace", skip(ctx, state, pov), fields(subsystem = LOG_TARGET))]
async fn distribute_collation<Context>(
ctx: &mut Context,
state: &mut State,
@@ -152,10 +152,10 @@ where
// This collation is not in the active-leaves set.
if !state.view.contains(&relay_parent) {
warn!(
tracing::warn!(
target: LOG_TARGET,
"Distribute collation message parent {:?} is outside of our view",
relay_parent,
relay_parent = %relay_parent,
"distribute collation message parent is outside of our view",
);
return Ok(());
@@ -171,9 +171,11 @@ where
let (our_core, num_cores) = match determine_core(ctx, id, relay_parent).await? {
Some(core) => core,
None => {
warn!(
tracing::warn!(
target: LOG_TARGET,
"Looks like no core is assigned to {:?} at {:?}", id, relay_parent,
para_id = %id,
relay_parent = %relay_parent,
"looks like no core is assigned to {} at {}", id, relay_parent,
);
return Ok(());
}
@@ -183,9 +185,10 @@ where
let our_validators = match determine_our_validators(ctx, our_core, num_cores, relay_parent).await? {
Some(validators) => validators,
None => {
warn!(
tracing::warn!(
target: LOG_TARGET,
"There are no validators assigned to {:?} core", our_core,
core = ?our_core,
"there are no validators assigned to core",
);
return Ok(());
@@ -217,6 +220,7 @@ where
/// Get the Id of the Core that is assigned to the para being collated on if any
/// and the total number of cores.
#[tracing::instrument(level = "trace", skip(ctx), fields(subsystem = LOG_TARGET))]
async fn determine_core<Context>(
ctx: &mut Context,
para_id: ParaId,
@@ -241,6 +245,7 @@ where
/// Figure out a group of validators assigned to the para being collated on.
///
/// This returns validators for the current group and the next group.
#[tracing::instrument(level = "trace", skip(ctx), fields(subsystem = LOG_TARGET))]
async fn determine_our_validators<Context>(
ctx: &mut Context,
core_index: CoreIndex,
@@ -280,6 +285,7 @@ where
}
/// Issue a `Declare` collation message to a set of peers.
#[tracing::instrument(level = "trace", skip(ctx, state), fields(subsystem = LOG_TARGET))]
async fn declare<Context>(
ctx: &mut Context,
state: &mut State,
@@ -302,6 +308,7 @@ where
/// Issue a connection request to a set of validators and
/// revoke the previous connection request.
#[tracing::instrument(level = "trace", skip(ctx, state), fields(subsystem = LOG_TARGET))]
async fn connect_to_validators<Context>(
ctx: &mut Context,
relay_parent: Hash,
@@ -327,6 +334,7 @@ where
}
/// Advertise collation to a set of relay chain validators.
#[tracing::instrument(level = "trace", skip(ctx, state), fields(subsystem = LOG_TARGET))]
async fn advertise_collation<Context>(
ctx: &mut Context,
state: &mut State,
@@ -358,6 +366,7 @@ where
}
/// The main incoming message dispatching switch.
#[tracing::instrument(level = "trace", skip(ctx, state), fields(subsystem = LOG_TARGET))]
async fn process_msg<Context>(
ctx: &mut Context,
state: &mut State,
@@ -377,39 +386,39 @@ where
Some(id) if receipt.descriptor.para_id != id => {
// If the ParaId of a collation requested to be distributed does not match
// the one we expect, we ignore the message.
warn!(
tracing::warn!(
target: LOG_TARGET,
"DistributeCollation message for para {:?} while collating on {:?}",
receipt.descriptor.para_id,
id,
para_id = %receipt.descriptor.para_id,
collating_on = %id,
"DistributeCollation for unexpected para_id",
);
}
Some(id) => {
distribute_collation(ctx, state, id, receipt, pov).await?;
}
None => {
warn!(
tracing::warn!(
target: LOG_TARGET,
"DistributeCollation message for para {:?} while not collating on any",
receipt.descriptor.para_id,
para_id = %receipt.descriptor.para_id,
"DistributeCollation message while not collating on any",
);
}
}
}
FetchCollation(_, _, _, _) => {
warn!(
tracing::warn!(
target: LOG_TARGET,
"FetchCollation message is not expected on the collator side of the protocol",
);
}
ReportCollator(_) => {
warn!(
tracing::warn!(
target: LOG_TARGET,
"ReportCollator message is not expected on the collator side of the protocol",
);
}
NoteGoodCollation(_) => {
warn!(
tracing::warn!(
target: LOG_TARGET,
"NoteGoodCollation message is not expected on the collator side of the protocol",
);
@@ -420,9 +429,10 @@ where
state,
event,
).await {
warn!(
tracing::warn!(
target: LOG_TARGET,
"Failed to handle incoming network message: {:?}", e,
err = ?e,
"Failed to handle incoming network message",
);
}
},
@@ -432,6 +442,7 @@ where
}
/// Issue a response to a previously requested collation.
#[tracing::instrument(level = "trace", skip(ctx, state, pov), fields(subsystem = LOG_TARGET))]
async fn send_collation<Context>(
ctx: &mut Context,
state: &mut State,
@@ -462,6 +473,7 @@ where
}
/// A networking messages switch.
#[tracing::instrument(level = "trace", skip(ctx, state), fields(subsystem = LOG_TARGET))]
async fn handle_incoming_peer_message<Context>(
ctx: &mut Context,
state: &mut State,
@@ -475,13 +487,13 @@ where
match msg {
Declare(_) => {
warn!(
tracing::warn!(
target: LOG_TARGET,
"Declare message is not expected on the collator side of the protocol",
);
}
AdvertiseCollation(_, _) => {
warn!(
tracing::warn!(
target: LOG_TARGET,
"AdvertiseCollation message is not expected on the collator side of the protocol",
);
@@ -494,24 +506,25 @@ where
send_collation(ctx, state, request_id, origin, collation.0, collation.1).await?;
}
} else {
warn!(
tracing::warn!(
target: LOG_TARGET,
"Received a RequestCollation for {:?} while collating on {:?}",
para_id, our_para_id,
for_para_id = %para_id,
our_para_id = %our_para_id,
"received a RequestCollation for unexpected para_id",
);
}
}
None => {
warn!(
tracing::warn!(
target: LOG_TARGET,
"Received a RequestCollation for {:?} while not collating on any para",
para_id,
for_para_id = %para_id,
"received a RequestCollation while not collating on any para",
);
}
}
}
Collation(_, _, _) => {
warn!(
tracing::warn!(
target: LOG_TARGET,
"Collation message is not expected on the collator side of the protocol",
);
@@ -522,6 +535,7 @@ where
}
/// Our view has changed.
#[tracing::instrument(level = "trace", skip(ctx, state), fields(subsystem = LOG_TARGET))]
async fn handle_peer_view_change<Context>(
ctx: &mut Context,
state: &mut State,
@@ -549,6 +563,7 @@ where
/// A validator is connected.
///
/// `Declare` that we are a collator with a given `CollatorId`.
#[tracing::instrument(level = "trace", skip(ctx, state), fields(subsystem = LOG_TARGET))]
async fn handle_validator_connected<Context>(
ctx: &mut Context,
state: &mut State,
@@ -571,6 +586,7 @@ where
}
/// Bridge messages switch.
#[tracing::instrument(level = "trace", skip(ctx, state), fields(subsystem = LOG_TARGET))]
async fn handle_network_msg<Context>(
ctx: &mut Context,
state: &mut State,
@@ -605,6 +621,7 @@ where
}
/// Handles our view changes.
#[tracing::instrument(level = "trace", skip(state), fields(subsystem = LOG_TARGET))]
async fn handle_our_view_change(
state: &mut State,
view: View,
@@ -624,6 +641,7 @@ async fn handle_our_view_change(
}
/// The collator protocol collator side main loop.
#[tracing::instrument(skip(ctx, metrics), fields(subsystem = LOG_TARGET))]
pub(crate) async fn run<Context>(
mut ctx: Context,
our_id: CollatorId,
@@ -646,10 +664,10 @@ where
if let Some(mut request) = state.last_connection_request.take() {
while let Poll::Ready(Some((validator_id, peer_id))) = futures::poll!(request.next()) {
if let Err(err) = handle_validator_connected(&mut ctx, &mut state, peer_id, validator_id).await {
warn!(
tracing::warn!(
target: LOG_TARGET,
"Failed to declare our collator id: {:?}",
err,
err = ?err,
"Failed to declare our collator id",
);
}
}
@@ -661,7 +679,7 @@ where
match msg? {
Communication { msg } => {
if let Err(e) = process_msg(&mut ctx, &mut state, msg).await {
warn!(target: LOG_TARGET, "Failed to process message: {}", e);
tracing::warn!(target: LOG_TARGET, err = ?e, "Failed to process message");
}
},
Signal(ActiveLeaves(_update)) => {}
@@ -682,7 +700,6 @@ mod tests {
use assert_matches::assert_matches;
use futures::{executor, future, Future};
use log::trace;
use smallvec::smallvec;
use sp_core::crypto::Pair;
@@ -839,7 +856,7 @@ mod tests {
overseer: &mut test_helpers::TestSubsystemContextHandle<CollatorProtocolMessage>,
msg: CollatorProtocolMessage,
) {
trace!("Sending message:\n{:?}", &msg);
tracing::trace!(msg = ?msg, "sending message");
overseer
.send(FromOverseer::Communication { msg })
.timeout(TIMEOUT)
@@ -854,7 +871,7 @@ mod tests {
.await
.expect(&format!("{:?} is more than enough to receive messages", TIMEOUT));
trace!("Received message:\n{:?}", &msg);
tracing::trace!(msg = ?msg, "received message");
msg
}
@@ -863,7 +880,7 @@ mod tests {
overseer: &mut test_helpers::TestSubsystemContextHandle<CollatorProtocolMessage>,
timeout: Duration,
) -> Option<AllMessages> {
trace!("Waiting for message...");
tracing::trace!("waiting for message...");
overseer
.recv()
.timeout(timeout)
@@ -21,7 +21,6 @@
use std::time::Duration;
use futures::{channel::oneshot, FutureExt, TryFutureExt};
use log::trace;
use thiserror::Error;
use polkadot_subsystem::{
@@ -96,6 +95,7 @@ impl CollatorProtocolSubsystem {
}
}
#[tracing::instrument(skip(self, ctx), fields(subsystem = LOG_TARGET))]
async fn run<Context>(self, ctx: Context) -> Result<()>
where
Context: SubsystemContext<Message = CollatorProtocolMessage>,
@@ -135,13 +135,16 @@ where
}
/// Modify the reputation of a peer based on its behavior.
#[tracing::instrument(level = "trace", skip(ctx), fields(subsystem = LOG_TARGET))]
async fn modify_reputation<Context>(ctx: &mut Context, peer: PeerId, rep: Rep) -> Result<()>
where
Context: SubsystemContext<Message = CollatorProtocolMessage>,
{
trace!(
tracing::trace!(
target: LOG_TARGET,
"Reputation change of {:?} for peer {:?}", rep, peer,
rep = ?rep,
peer_id = %peer,
"reputation change for peer",
);
ctx.send_message(AllMessages::NetworkBridge(
@@ -25,7 +25,6 @@ use futures::{
future::BoxFuture,
stream::FuturesUnordered,
};
use log::{trace, warn};
use polkadot_primitives::v1::{
Id as ParaId, CandidateReceipt, CollatorId, Hash, PoV,
@@ -188,6 +187,7 @@ struct State {
}
/// Another subsystem has requested to fetch collations on a particular leaf for some para.
#[tracing::instrument(level = "trace", skip(ctx, state, tx), fields(subsystem = LOG_TARGET))]
async fn fetch_collation<Context>(
ctx: &mut Context,
state: &mut State,
@@ -206,9 +206,10 @@ where
if let Err(e) = tx.send((collation.1.clone(), collation.2.clone())) {
// We do not want this to be fatal because the receiving subsystem
// may have closed the results channel for some reason.
trace!(
tracing::trace!(
target: LOG_TARGET,
"Failed to send collation: {:?}", e,
err = ?e,
"Failed to send collation",
);
}
return Ok(());
@@ -238,6 +239,7 @@ where
}
/// Report a collator for some malicious actions.
#[tracing::instrument(level = "trace", skip(ctx, state), fields(subsystem = LOG_TARGET))]
async fn report_collator<Context>(
ctx: &mut Context,
state: &mut State,
@@ -259,6 +261,7 @@ where
}
/// Some other subsystem has reported a collator as a good one, bump reputation.
#[tracing::instrument(level = "trace", skip(ctx, state), fields(subsystem = LOG_TARGET))]
async fn note_good_collation<Context>(
ctx: &mut Context,
state: &mut State,
@@ -279,6 +282,7 @@ where
/// A peer's view has changed. A number of things should be done:
/// - Ongoing collation requests have to be cancelled.
/// - Advertisements by this peer that are no longer relevant have to be removed.
#[tracing::instrument(level = "trace", skip(state), fields(subsystem = LOG_TARGET))]
async fn handle_peer_view_change(
state: &mut State,
peer_id: PeerId,
@@ -320,6 +324,7 @@ async fn handle_peer_view_change(
/// - Cancel all ongoing requests
/// - Reply to interested parties if any
/// - Store collation.
#[tracing::instrument(level = "trace", skip(ctx, state, pov), fields(subsystem = LOG_TARGET))]
async fn received_collation<Context>(
ctx: &mut Context,
state: &mut State,
@@ -368,6 +373,7 @@ where
/// - Check if the requested collation is in our view.
/// - Update PerRequest records with the `result` field if necessary.
/// And as such invocations of this function may rely on that.
#[tracing::instrument(level = "trace", skip(ctx, state, result), fields(subsystem = LOG_TARGET))]
async fn request_collation<Context>(
ctx: &mut Context,
state: &mut State,
@@ -380,19 +386,23 @@ where
Context: SubsystemContext<Message = CollatorProtocolMessage>
{
if !state.view.contains(&relay_parent) {
trace!(
tracing::trace!(
target: LOG_TARGET,
"Collation by {} on {} on relay parent {} is no longer in view",
peer_id, para_id, relay_parent,
peer_id = %peer_id,
para_id = %para_id,
relay_parent = %relay_parent,
"collation is no longer in view",
);
return Ok(());
}
if state.requested_collations.contains_key(&(relay_parent, para_id.clone(), peer_id.clone())) {
trace!(
tracing::trace!(
target: LOG_TARGET,
"Collation by {} on {} on relay parent {} has already been requested",
peer_id, para_id, relay_parent,
peer_id = %peer_id,
para_id = %para_id,
relay_parent = %relay_parent,
"collation has already been requested",
);
return Ok(());
}
@@ -436,6 +446,7 @@ where
}
/// Notify `CandidateSelectionSubsystem` that a collation has been advertised.
#[tracing::instrument(level = "trace", skip(ctx), fields(subsystem = LOG_TARGET))]
async fn notify_candidate_selection<Context>(
ctx: &mut Context,
collator: CollatorId,
@@ -457,6 +468,7 @@ where
}
/// Networking message has been received.
#[tracing::instrument(level = "trace", skip(ctx, state), fields(subsystem = LOG_TARGET))]
async fn process_incoming_peer_message<Context>(
ctx: &mut Context,
state: &mut State,
@@ -495,6 +507,7 @@ where
/// A leaf has become inactive so we want to
/// - Cancel all ongoing collation requests that are on top of that leaf.
/// - Remove all stored collations relevant to that leaf.
#[tracing::instrument(level = "trace", skip(state), fields(subsystem = LOG_TARGET))]
async fn remove_relay_parent(
state: &mut State,
relay_parent: Hash,
@@ -520,6 +533,7 @@ async fn remove_relay_parent(
}
/// Our view has changed.
#[tracing::instrument(level = "trace", skip(state), fields(subsystem = LOG_TARGET))]
async fn handle_our_view_change(
state: &mut State,
view: View,
@@ -543,6 +557,7 @@ async fn handle_our_view_change(
}
/// A request has timed out.
#[tracing::instrument(level = "trace", skip(ctx, state), fields(subsystem = LOG_TARGET))]
async fn request_timed_out<Context>(
ctx: &mut Context,
state: &mut State,
@@ -568,6 +583,7 @@ where
}
/// Bridge event switch.
#[tracing::instrument(level = "trace", skip(ctx, state), fields(subsystem = LOG_TARGET))]
async fn handle_network_msg<Context>(
ctx: &mut Context,
state: &mut State,
@@ -601,6 +617,7 @@ where
}
/// The main message receiver switch.
#[tracing::instrument(level = "trace", skip(ctx, state), fields(subsystem = LOG_TARGET))]
async fn process_msg<Context>(
ctx: &mut Context,
msg: CollatorProtocolMessage,
@@ -613,13 +630,14 @@ where
match msg {
CollateOn(id) => {
warn!(
tracing::warn!(
target: LOG_TARGET,
"CollateOn({}) message is not expected on the validator side of the protocol", id,
para_id = %id,
"CollateOn message is not expected on the validator side of the protocol",
);
}
DistributeCollation(_, _) => {
warn!(
tracing::warn!(
target: LOG_TARGET,
"DistributeCollation message is not expected on the validator side of the protocol",
);
@@ -639,9 +657,10 @@ where
state,
event,
).await {
warn!(
tracing::warn!(
target: LOG_TARGET,
"Failed to handle incoming network message: {:?}", e,
err = ?e,
"Failed to handle incoming network message",
);
}
}
@@ -651,6 +670,7 @@ where
}
/// The main run loop.
#[tracing::instrument(skip(ctx, metrics), fields(subsystem = LOG_TARGET))]
pub(crate) async fn run<Context>(
mut ctx: Context,
request_timeout: Duration,
@@ -671,7 +691,7 @@ where
loop {
if let Poll::Ready(msg) = futures::poll!(ctx.recv()) {
let msg = msg?;
trace!(target: LOG_TARGET, "Received a message {:?}", msg);
tracing::trace!(target: LOG_TARGET, msg = ?msg, "received a message");
match msg {
Communication { msg } => process_msg(&mut ctx, msg, &mut state).await?,
@@ -687,7 +707,7 @@ where
// if the chain has not moved on yet.
match request {
CollationRequestResult::Timeout(id) => {
trace!(target: LOG_TARGET, "Request timed out {}", id);
tracing::trace!(target: LOG_TARGET, id, "request timed out");
request_timed_out(&mut ctx, &mut state, id).await?;
}
CollationRequestResult::Received(id) => {
@@ -784,7 +804,7 @@ mod tests {
overseer: &mut test_helpers::TestSubsystemContextHandle<CollatorProtocolMessage>,
msg: CollatorProtocolMessage,
) {
log::trace!("Sending message:\n{:?}", &msg);
tracing::trace!("Sending message:\n{:?}", &msg);
overseer
.send(FromOverseer::Communication { msg })
.timeout(TIMEOUT)
@@ -799,7 +819,7 @@ mod tests {
.await
.expect(&format!("{:?} is enough to receive messages.", TIMEOUT));
log::trace!("Received message:\n{:?}", &msg);
tracing::trace!("Received message:\n{:?}", &msg);
msg
}
@@ -808,7 +828,7 @@ mod tests {
overseer: &mut test_helpers::TestSubsystemContextHandle<CollatorProtocolMessage>,
timeout: Duration,
) -> Option<AllMessages> {
log::trace!("Waiting for message...");
tracing::trace!("Waiting for message...");
overseer
.recv()
.timeout(timeout)
@@ -826,7 +846,7 @@ mod tests {
} = test_harness;
let pair = CollatorPair::generate().0;
log::trace!("activating");
tracing::trace!("activating");
overseer_send(
&mut virtual_overseer,