Add tracing support to node (#1940)

* drop in tracing to replace log

* add structured logging to trace messages

* add structured logging to debug messages

* add structured logging to info messages

* add structured logging to warn messages

* add structured logging to error messages

* normalize spacing and Display vs Debug

* add instrumentation to the various 'fn run'

* use explicit tracing module throughout

* fix availability distribution test

* don't double-print errors

* remove further redundancy from logs

* fix test errors

* fix more test errors

* remove unused kv_log_macro

* fix unused variable

* add tracing spans to collation generation

* add tracing spans to av-store

* add tracing spans to backing

* add tracing spans to bitfield-signing

* add tracing spans to candidate-selection

* add tracing spans to candidate-validation

* add tracing spans to chain-api

* add tracing spans to provisioner

* add tracing spans to runtime-api

* add tracing spans to availability-distribution

* add tracing spans to bitfield-distribution

* add tracing spans to network-bridge

* add tracing spans to collator-protocol

* add tracing spans to pov-distribution

* add tracing spans to statement-distribution

* add tracing spans to overseer

* cleanup
This commit is contained in:
Peter Goodspeed-Niklaus
2020-11-20 12:02:04 +01:00
committed by GitHub
parent 94670d8082
commit e49989971d
53 changed files with 564 additions and 280 deletions
@@ -19,7 +19,6 @@ use std::collections::HashMap;
use super::{LOG_TARGET, Result};
use futures::{StreamExt, task::Poll};
use log::warn;
use polkadot_primitives::v1::{
CollatorId, CoreIndex, CoreState, Hash, Id as ParaId, CandidateReceipt,
@@ -138,6 +137,7 @@ struct State {
/// or the relay-parent isn't in the active-leaves set, we ignore the message
/// as it must be invalid in that case - although this indicates a logic error
/// elsewhere in the node.
#[tracing::instrument(level = "trace", skip(ctx, state, pov), fields(subsystem = LOG_TARGET))]
async fn distribute_collation<Context>(
ctx: &mut Context,
state: &mut State,
@@ -152,10 +152,10 @@ where
// This collation is not in the active-leaves set.
if !state.view.contains(&relay_parent) {
warn!(
tracing::warn!(
target: LOG_TARGET,
"Distribute collation message parent {:?} is outside of our view",
relay_parent,
relay_parent = %relay_parent,
"distribute collation message parent is outside of our view",
);
return Ok(());
@@ -171,9 +171,11 @@ where
let (our_core, num_cores) = match determine_core(ctx, id, relay_parent).await? {
Some(core) => core,
None => {
warn!(
tracing::warn!(
target: LOG_TARGET,
"Looks like no core is assigned to {:?} at {:?}", id, relay_parent,
para_id = %id,
relay_parent = %relay_parent,
"looks like no core is assigned to {} at {}", id, relay_parent,
);
return Ok(());
}
@@ -183,9 +185,10 @@ where
let our_validators = match determine_our_validators(ctx, our_core, num_cores, relay_parent).await? {
Some(validators) => validators,
None => {
warn!(
tracing::warn!(
target: LOG_TARGET,
"There are no validators assigned to {:?} core", our_core,
core = ?our_core,
"there are no validators assigned to core",
);
return Ok(());
@@ -217,6 +220,7 @@ where
/// Get the Id of the Core that is assigned to the para being collated on if any
/// and the total number of cores.
#[tracing::instrument(level = "trace", skip(ctx), fields(subsystem = LOG_TARGET))]
async fn determine_core<Context>(
ctx: &mut Context,
para_id: ParaId,
@@ -241,6 +245,7 @@ where
/// Figure out a group of validators assigned to the para being collated on.
///
/// This returns validators for the current group and the next group.
#[tracing::instrument(level = "trace", skip(ctx), fields(subsystem = LOG_TARGET))]
async fn determine_our_validators<Context>(
ctx: &mut Context,
core_index: CoreIndex,
@@ -280,6 +285,7 @@ where
}
/// Issue a `Declare` collation message to a set of peers.
#[tracing::instrument(level = "trace", skip(ctx, state), fields(subsystem = LOG_TARGET))]
async fn declare<Context>(
ctx: &mut Context,
state: &mut State,
@@ -302,6 +308,7 @@ where
/// Issue a connection request to a set of validators and
/// revoke the previous connection request.
#[tracing::instrument(level = "trace", skip(ctx, state), fields(subsystem = LOG_TARGET))]
async fn connect_to_validators<Context>(
ctx: &mut Context,
relay_parent: Hash,
@@ -327,6 +334,7 @@ where
}
/// Advertise collation to a set of relay chain validators.
#[tracing::instrument(level = "trace", skip(ctx, state), fields(subsystem = LOG_TARGET))]
async fn advertise_collation<Context>(
ctx: &mut Context,
state: &mut State,
@@ -358,6 +366,7 @@ where
}
/// The main incoming message dispatching switch.
#[tracing::instrument(level = "trace", skip(ctx, state), fields(subsystem = LOG_TARGET))]
async fn process_msg<Context>(
ctx: &mut Context,
state: &mut State,
@@ -377,39 +386,39 @@ where
Some(id) if receipt.descriptor.para_id != id => {
// If the ParaId of a collation requested to be distributed does not match
// the one we expect, we ignore the message.
warn!(
tracing::warn!(
target: LOG_TARGET,
"DistributeCollation message for para {:?} while collating on {:?}",
receipt.descriptor.para_id,
id,
para_id = %receipt.descriptor.para_id,
collating_on = %id,
"DistributeCollation for unexpected para_id",
);
}
Some(id) => {
distribute_collation(ctx, state, id, receipt, pov).await?;
}
None => {
warn!(
tracing::warn!(
target: LOG_TARGET,
"DistributeCollation message for para {:?} while not collating on any",
receipt.descriptor.para_id,
para_id = %receipt.descriptor.para_id,
"DistributeCollation message while not collating on any",
);
}
}
}
FetchCollation(_, _, _, _) => {
warn!(
tracing::warn!(
target: LOG_TARGET,
"FetchCollation message is not expected on the collator side of the protocol",
);
}
ReportCollator(_) => {
warn!(
tracing::warn!(
target: LOG_TARGET,
"ReportCollator message is not expected on the collator side of the protocol",
);
}
NoteGoodCollation(_) => {
warn!(
tracing::warn!(
target: LOG_TARGET,
"NoteGoodCollation message is not expected on the collator side of the protocol",
);
@@ -420,9 +429,10 @@ where
state,
event,
).await {
warn!(
tracing::warn!(
target: LOG_TARGET,
"Failed to handle incoming network message: {:?}", e,
err = ?e,
"Failed to handle incoming network message",
);
}
},
@@ -432,6 +442,7 @@ where
}
/// Issue a response to a previously requested collation.
#[tracing::instrument(level = "trace", skip(ctx, state, pov), fields(subsystem = LOG_TARGET))]
async fn send_collation<Context>(
ctx: &mut Context,
state: &mut State,
@@ -462,6 +473,7 @@ where
}
/// A networking messages switch.
#[tracing::instrument(level = "trace", skip(ctx, state), fields(subsystem = LOG_TARGET))]
async fn handle_incoming_peer_message<Context>(
ctx: &mut Context,
state: &mut State,
@@ -475,13 +487,13 @@ where
match msg {
Declare(_) => {
warn!(
tracing::warn!(
target: LOG_TARGET,
"Declare message is not expected on the collator side of the protocol",
);
}
AdvertiseCollation(_, _) => {
warn!(
tracing::warn!(
target: LOG_TARGET,
"AdvertiseCollation message is not expected on the collator side of the protocol",
);
@@ -494,24 +506,25 @@ where
send_collation(ctx, state, request_id, origin, collation.0, collation.1).await?;
}
} else {
warn!(
tracing::warn!(
target: LOG_TARGET,
"Received a RequestCollation for {:?} while collating on {:?}",
para_id, our_para_id,
for_para_id = %para_id,
our_para_id = %our_para_id,
"received a RequestCollation for unexpected para_id",
);
}
}
None => {
warn!(
tracing::warn!(
target: LOG_TARGET,
"Received a RequestCollation for {:?} while not collating on any para",
para_id,
for_para_id = %para_id,
"received a RequestCollation while not collating on any para",
);
}
}
}
Collation(_, _, _) => {
warn!(
tracing::warn!(
target: LOG_TARGET,
"Collation message is not expected on the collator side of the protocol",
);
@@ -522,6 +535,7 @@ where
}
/// Our view has changed.
#[tracing::instrument(level = "trace", skip(ctx, state), fields(subsystem = LOG_TARGET))]
async fn handle_peer_view_change<Context>(
ctx: &mut Context,
state: &mut State,
@@ -549,6 +563,7 @@ where
/// A validator is connected.
///
/// `Declare` that we are a collator with a given `CollatorId`.
#[tracing::instrument(level = "trace", skip(ctx, state), fields(subsystem = LOG_TARGET))]
async fn handle_validator_connected<Context>(
ctx: &mut Context,
state: &mut State,
@@ -571,6 +586,7 @@ where
}
/// Bridge messages switch.
#[tracing::instrument(level = "trace", skip(ctx, state), fields(subsystem = LOG_TARGET))]
async fn handle_network_msg<Context>(
ctx: &mut Context,
state: &mut State,
@@ -605,6 +621,7 @@ where
}
/// Handles our view changes.
#[tracing::instrument(level = "trace", skip(state), fields(subsystem = LOG_TARGET))]
async fn handle_our_view_change(
state: &mut State,
view: View,
@@ -624,6 +641,7 @@ async fn handle_our_view_change(
}
/// The collator protocol collator side main loop.
#[tracing::instrument(skip(ctx, metrics), fields(subsystem = LOG_TARGET))]
pub(crate) async fn run<Context>(
mut ctx: Context,
our_id: CollatorId,
@@ -646,10 +664,10 @@ where
if let Some(mut request) = state.last_connection_request.take() {
while let Poll::Ready(Some((validator_id, peer_id))) = futures::poll!(request.next()) {
if let Err(err) = handle_validator_connected(&mut ctx, &mut state, peer_id, validator_id).await {
warn!(
tracing::warn!(
target: LOG_TARGET,
"Failed to declare our collator id: {:?}",
err,
err = ?err,
"Failed to declare our collator id",
);
}
}
@@ -661,7 +679,7 @@ where
match msg? {
Communication { msg } => {
if let Err(e) = process_msg(&mut ctx, &mut state, msg).await {
warn!(target: LOG_TARGET, "Failed to process message: {}", e);
tracing::warn!(target: LOG_TARGET, err = ?e, "Failed to process message");
}
},
Signal(ActiveLeaves(_update)) => {}
@@ -682,7 +700,6 @@ mod tests {
use assert_matches::assert_matches;
use futures::{executor, future, Future};
use log::trace;
use smallvec::smallvec;
use sp_core::crypto::Pair;
@@ -839,7 +856,7 @@ mod tests {
overseer: &mut test_helpers::TestSubsystemContextHandle<CollatorProtocolMessage>,
msg: CollatorProtocolMessage,
) {
trace!("Sending message:\n{:?}", &msg);
tracing::trace!(msg = ?msg, "sending message");
overseer
.send(FromOverseer::Communication { msg })
.timeout(TIMEOUT)
@@ -854,7 +871,7 @@ mod tests {
.await
.expect(&format!("{:?} is more than enough to receive messages", TIMEOUT));
trace!("Received message:\n{:?}", &msg);
tracing::trace!(msg = ?msg, "received message");
msg
}
@@ -863,7 +880,7 @@ mod tests {
overseer: &mut test_helpers::TestSubsystemContextHandle<CollatorProtocolMessage>,
timeout: Duration,
) -> Option<AllMessages> {
trace!("Waiting for message...");
tracing::trace!("waiting for message...");
overseer
.recv()
.timeout(timeout)
@@ -21,7 +21,6 @@
use std::time::Duration;
use futures::{channel::oneshot, FutureExt, TryFutureExt};
use log::trace;
use thiserror::Error;
use polkadot_subsystem::{
@@ -96,6 +95,7 @@ impl CollatorProtocolSubsystem {
}
}
#[tracing::instrument(skip(self, ctx), fields(subsystem = LOG_TARGET))]
async fn run<Context>(self, ctx: Context) -> Result<()>
where
Context: SubsystemContext<Message = CollatorProtocolMessage>,
@@ -135,13 +135,16 @@ where
}
/// Modify the reputation of a peer based on its behavior.
#[tracing::instrument(level = "trace", skip(ctx), fields(subsystem = LOG_TARGET))]
async fn modify_reputation<Context>(ctx: &mut Context, peer: PeerId, rep: Rep) -> Result<()>
where
Context: SubsystemContext<Message = CollatorProtocolMessage>,
{
trace!(
tracing::trace!(
target: LOG_TARGET,
"Reputation change of {:?} for peer {:?}", rep, peer,
rep = ?rep,
peer_id = %peer,
"reputation change for peer",
);
ctx.send_message(AllMessages::NetworkBridge(
@@ -25,7 +25,6 @@ use futures::{
future::BoxFuture,
stream::FuturesUnordered,
};
use log::{trace, warn};
use polkadot_primitives::v1::{
Id as ParaId, CandidateReceipt, CollatorId, Hash, PoV,
@@ -188,6 +187,7 @@ struct State {
}
/// Another subsystem has requested to fetch collations on a particular leaf for some para.
#[tracing::instrument(level = "trace", skip(ctx, state, tx), fields(subsystem = LOG_TARGET))]
async fn fetch_collation<Context>(
ctx: &mut Context,
state: &mut State,
@@ -206,9 +206,10 @@ where
if let Err(e) = tx.send((collation.1.clone(), collation.2.clone())) {
// We do not want this to be fatal because the receiving subsystem
// may have closed the results channel for some reason.
trace!(
tracing::trace!(
target: LOG_TARGET,
"Failed to send collation: {:?}", e,
err = ?e,
"Failed to send collation",
);
}
return Ok(());
@@ -238,6 +239,7 @@ where
}
/// Report a collator for some malicious actions.
#[tracing::instrument(level = "trace", skip(ctx, state), fields(subsystem = LOG_TARGET))]
async fn report_collator<Context>(
ctx: &mut Context,
state: &mut State,
@@ -259,6 +261,7 @@ where
}
/// Some other subsystem has reported a collator as a good one, bump reputation.
#[tracing::instrument(level = "trace", skip(ctx, state), fields(subsystem = LOG_TARGET))]
async fn note_good_collation<Context>(
ctx: &mut Context,
state: &mut State,
@@ -279,6 +282,7 @@ where
/// A peer's view has changed. A number of things should be done:
/// - Ongoing collation requests have to be cancelled.
/// - Advertisements by this peer that are no longer relevant have to be removed.
#[tracing::instrument(level = "trace", skip(state), fields(subsystem = LOG_TARGET))]
async fn handle_peer_view_change(
state: &mut State,
peer_id: PeerId,
@@ -320,6 +324,7 @@ async fn handle_peer_view_change(
/// - Cancel all ongoing requests
/// - Reply to interested parties if any
/// - Store collation.
#[tracing::instrument(level = "trace", skip(ctx, state, pov), fields(subsystem = LOG_TARGET))]
async fn received_collation<Context>(
ctx: &mut Context,
state: &mut State,
@@ -368,6 +373,7 @@ where
/// - Check if the requested collation is in our view.
/// - Update PerRequest records with the `result` field if necessary.
/// And as such invocations of this function may rely on that.
#[tracing::instrument(level = "trace", skip(ctx, state, result), fields(subsystem = LOG_TARGET))]
async fn request_collation<Context>(
ctx: &mut Context,
state: &mut State,
@@ -380,19 +386,23 @@ where
Context: SubsystemContext<Message = CollatorProtocolMessage>
{
if !state.view.contains(&relay_parent) {
trace!(
tracing::trace!(
target: LOG_TARGET,
"Collation by {} on {} on relay parent {} is no longer in view",
peer_id, para_id, relay_parent,
peer_id = %peer_id,
para_id = %para_id,
relay_parent = %relay_parent,
"collation is no longer in view",
);
return Ok(());
}
if state.requested_collations.contains_key(&(relay_parent, para_id.clone(), peer_id.clone())) {
trace!(
tracing::trace!(
target: LOG_TARGET,
"Collation by {} on {} on relay parent {} has already been requested",
peer_id, para_id, relay_parent,
peer_id = %peer_id,
para_id = %para_id,
relay_parent = %relay_parent,
"collation has already been requested",
);
return Ok(());
}
@@ -436,6 +446,7 @@ where
}
/// Notify `CandidateSelectionSubsystem` that a collation has been advertised.
#[tracing::instrument(level = "trace", skip(ctx), fields(subsystem = LOG_TARGET))]
async fn notify_candidate_selection<Context>(
ctx: &mut Context,
collator: CollatorId,
@@ -457,6 +468,7 @@ where
}
/// Networking message has been received.
#[tracing::instrument(level = "trace", skip(ctx, state), fields(subsystem = LOG_TARGET))]
async fn process_incoming_peer_message<Context>(
ctx: &mut Context,
state: &mut State,
@@ -495,6 +507,7 @@ where
/// A leaf has become inactive so we want to
/// - Cancel all ongoing collation requests that are on top of that leaf.
/// - Remove all stored collations relevant to that leaf.
#[tracing::instrument(level = "trace", skip(state), fields(subsystem = LOG_TARGET))]
async fn remove_relay_parent(
state: &mut State,
relay_parent: Hash,
@@ -520,6 +533,7 @@ async fn remove_relay_parent(
}
/// Our view has changed.
#[tracing::instrument(level = "trace", skip(state), fields(subsystem = LOG_TARGET))]
async fn handle_our_view_change(
state: &mut State,
view: View,
@@ -543,6 +557,7 @@ async fn handle_our_view_change(
}
/// A request has timed out.
#[tracing::instrument(level = "trace", skip(ctx, state), fields(subsystem = LOG_TARGET))]
async fn request_timed_out<Context>(
ctx: &mut Context,
state: &mut State,
@@ -568,6 +583,7 @@ where
}
/// Bridge event switch.
#[tracing::instrument(level = "trace", skip(ctx, state), fields(subsystem = LOG_TARGET))]
async fn handle_network_msg<Context>(
ctx: &mut Context,
state: &mut State,
@@ -601,6 +617,7 @@ where
}
/// The main message receiver switch.
#[tracing::instrument(level = "trace", skip(ctx, state), fields(subsystem = LOG_TARGET))]
async fn process_msg<Context>(
ctx: &mut Context,
msg: CollatorProtocolMessage,
@@ -613,13 +630,14 @@ where
match msg {
CollateOn(id) => {
warn!(
tracing::warn!(
target: LOG_TARGET,
"CollateOn({}) message is not expected on the validator side of the protocol", id,
para_id = %id,
"CollateOn message is not expected on the validator side of the protocol",
);
}
DistributeCollation(_, _) => {
warn!(
tracing::warn!(
target: LOG_TARGET,
"DistributeCollation message is not expected on the validator side of the protocol",
);
@@ -639,9 +657,10 @@ where
state,
event,
).await {
warn!(
tracing::warn!(
target: LOG_TARGET,
"Failed to handle incoming network message: {:?}", e,
err = ?e,
"Failed to handle incoming network message",
);
}
}
@@ -651,6 +670,7 @@ where
}
/// The main run loop.
#[tracing::instrument(skip(ctx, metrics), fields(subsystem = LOG_TARGET))]
pub(crate) async fn run<Context>(
mut ctx: Context,
request_timeout: Duration,
@@ -671,7 +691,7 @@ where
loop {
if let Poll::Ready(msg) = futures::poll!(ctx.recv()) {
let msg = msg?;
trace!(target: LOG_TARGET, "Received a message {:?}", msg);
tracing::trace!(target: LOG_TARGET, msg = ?msg, "received a message");
match msg {
Communication { msg } => process_msg(&mut ctx, msg, &mut state).await?,
@@ -687,7 +707,7 @@ where
// if the chain has not moved on yet.
match request {
CollationRequestResult::Timeout(id) => {
trace!(target: LOG_TARGET, "Request timed out {}", id);
tracing::trace!(target: LOG_TARGET, id, "request timed out");
request_timed_out(&mut ctx, &mut state, id).await?;
}
CollationRequestResult::Received(id) => {
@@ -784,7 +804,7 @@ mod tests {
overseer: &mut test_helpers::TestSubsystemContextHandle<CollatorProtocolMessage>,
msg: CollatorProtocolMessage,
) {
log::trace!("Sending message:\n{:?}", &msg);
tracing::trace!("Sending message:\n{:?}", &msg);
overseer
.send(FromOverseer::Communication { msg })
.timeout(TIMEOUT)
@@ -799,7 +819,7 @@ mod tests {
.await
.expect(&format!("{:?} is enough to receive messages.", TIMEOUT));
log::trace!("Received message:\n{:?}", &msg);
tracing::trace!("Received message:\n{:?}", &msg);
msg
}
@@ -808,7 +828,7 @@ mod tests {
overseer: &mut test_helpers::TestSubsystemContextHandle<CollatorProtocolMessage>,
timeout: Duration,
) -> Option<AllMessages> {
log::trace!("Waiting for message...");
tracing::trace!("Waiting for message...");
overseer
.recv()
.timeout(timeout)
@@ -826,7 +846,7 @@ mod tests {
} = test_harness;
let pair = CollatorPair::generate().0;
log::trace!("activating");
tracing::trace!("activating");
overseer_send(
&mut virtual_overseer,