Rework telemetry to replace the use of tracing with an object we pass around (#8143)

polkadot companion: paritytech/polkadot#2535
This commit is contained in:
Cecile Tonglet
2021-03-11 11:05:45 +01:00
committed by GitHub
parent 7aaba0c154
commit 8031b6eacb
55 changed files with 1028 additions and 838 deletions
+52 -42
View File
@@ -55,8 +55,8 @@ use wasm_timer::SystemTime;
use sc_telemetry::{
telemetry,
ConnectionMessage,
TelemetryConnectionNotifier,
TelemetrySpan,
Telemetry,
TelemetryHandle,
SUBSTRATE_INFO,
};
use sp_transaction_pool::MaintainedTransactionPool;
@@ -213,17 +213,17 @@ pub type TLightClientWithBackend<TBl, TRtApi, TExecDisp, TBackend> = Client<
>;
trait AsCryptoStoreRef {
fn keystore_ref(&self) -> Arc<dyn CryptoStore>;
fn sync_keystore_ref(&self) -> Arc<dyn SyncCryptoStore>;
fn keystore_ref(&self) -> Arc<dyn CryptoStore>;
fn sync_keystore_ref(&self) -> Arc<dyn SyncCryptoStore>;
}
impl<T> AsCryptoStoreRef for Arc<T> where T: CryptoStore + SyncCryptoStore + 'static {
fn keystore_ref(&self) -> Arc<dyn CryptoStore> {
self.clone()
}
fn sync_keystore_ref(&self) -> Arc<dyn SyncCryptoStore> {
self.clone()
}
fn keystore_ref(&self) -> Arc<dyn CryptoStore> {
self.clone()
}
fn sync_keystore_ref(&self) -> Arc<dyn SyncCryptoStore> {
self.clone()
}
}
/// Construct and hold different layers of Keystore wrappers
@@ -291,16 +291,18 @@ impl KeystoreContainer {
/// Creates a new full client for the given config.
pub fn new_full_client<TBl, TRtApi, TExecDisp>(
config: &Configuration,
telemetry: Option<TelemetryHandle>,
) -> Result<TFullClient<TBl, TRtApi, TExecDisp>, Error> where
TBl: BlockT,
TExecDisp: NativeExecutionDispatch + 'static,
{
new_full_parts(config).map(|parts| parts.0)
new_full_parts(config, telemetry).map(|parts| parts.0)
}
/// Create the initial parts of a full node.
pub fn new_full_parts<TBl, TRtApi, TExecDisp>(
config: &Configuration,
telemetry: Option<TelemetryHandle>,
) -> Result<TFullParts<TBl, TRtApi, TExecDisp>, Error> where
TBl: BlockT,
TExecDisp: NativeExecutionDispatch + 'static,
@@ -356,6 +358,7 @@ pub fn new_full_parts<TBl, TRtApi, TExecDisp>(
extensions,
Box::new(task_manager.spawn_handle()),
config.prometheus_config.as_ref().map(|config| config.registry.clone()),
telemetry,
ClientConfig {
offchain_worker_enabled : config.offchain_worker.enabled,
offchain_indexing_api: config.offchain_worker.indexing_enabled,
@@ -377,6 +380,7 @@ pub fn new_full_parts<TBl, TRtApi, TExecDisp>(
/// Create the initial parts of a light node.
pub fn new_light_parts<TBl, TRtApi, TExecDisp>(
config: &Configuration,
telemetry: Option<TelemetryHandle>,
) -> Result<TLightParts<TBl, TRtApi, TExecDisp>, Error> where
TBl: BlockT,
TExecDisp: NativeExecutionDispatch + 'static,
@@ -421,6 +425,7 @@ pub fn new_light_parts<TBl, TRtApi, TExecDisp>(
executor,
Box::new(task_manager.spawn_handle()),
config.prometheus_config.as_ref().map(|config| config.registry.clone()),
telemetry,
)?);
Ok((client, backend, keystore_container, task_manager, on_demand))
@@ -447,6 +452,7 @@ pub fn new_client<E, Block, RA>(
execution_extensions: ExecutionExtensions<Block>,
spawn_handle: Box<dyn SpawnNamed>,
prometheus_registry: Option<Registry>,
telemetry: Option<TelemetryHandle>,
config: ClientConfig,
) -> Result<
crate::client::Client<
@@ -470,6 +476,7 @@ pub fn new_client<E, Block, RA>(
bad_blocks,
execution_extensions,
prometheus_registry,
telemetry,
config,
)?)
}
@@ -501,10 +508,8 @@ pub struct SpawnTasksParams<'a, TBl: BlockT, TCl, TExPool, TRpc, Backend> {
pub network_status_sinks: NetworkStatusSinks<TBl>,
/// A Sender for RPC requests.
pub system_rpc_tx: TracingUnboundedSender<sc_rpc::system::Request<TBl>>,
/// Telemetry span.
///
/// This span needs to be entered **before** calling [`spawn_tasks()`].
pub telemetry_span: Option<TelemetrySpan>,
/// Telemetry instance for this node.
pub telemetry: Option<&'a mut Telemetry>,
}
/// Build a shared offchain workers instance.
@@ -541,13 +546,12 @@ pub fn build_offchain_workers<TBl, TCl>(
/// Spawn the tasks that are required to run a node.
pub fn spawn_tasks<TBl, TBackend, TExPool, TRpc, TCl>(
params: SpawnTasksParams<TBl, TCl, TExPool, TRpc, TBackend>,
) -> Result<(RpcHandlers, Option<TelemetryConnectionNotifier>), Error>
) -> Result<RpcHandlers, Error>
where
TCl: ProvideRuntimeApi<TBl> + HeaderMetadata<TBl, Error=sp_blockchain::Error> + Chain<TBl> +
BlockBackend<TBl> + BlockIdTo<TBl, Error=sp_blockchain::Error> + ProofProvider<TBl> +
HeaderBackend<TBl> + BlockchainEvents<TBl> + ExecutorProvider<TBl> + UsageProvider<TBl> +
StorageProvider<TBl, TBackend> + CallApiAt<TBl> +
Send + 'static,
StorageProvider<TBl, TBackend> + CallApiAt<TBl> + Send + 'static,
<TCl as ProvideRuntimeApi<TBl>>::Api:
sp_api::Metadata<TBl> +
sc_offchain::OffchainWorkerApi<TBl> +
@@ -573,7 +577,7 @@ pub fn spawn_tasks<TBl, TBackend, TExPool, TRpc, TCl>(
network,
network_status_sinks,
system_rpc_tx,
telemetry_span,
telemetry,
} = params;
let chain_info = client.usage_info().chain;
@@ -584,12 +588,16 @@ pub fn spawn_tasks<TBl, TBackend, TExPool, TRpc, TCl>(
config.dev_key_seed.clone().map(|s| vec![s]).unwrap_or_default(),
).map_err(|e| Error::Application(Box::new(e)))?;
let telemetry_connection_notifier = init_telemetry(
&mut config,
telemetry_span,
network.clone(),
client.clone(),
);
let telemetry = telemetry
.map(|telemetry| {
init_telemetry(
&mut config,
network.clone(),
client.clone(),
telemetry,
)
})
.transpose()?;
info!("📦 Highest known block at #{}", chain_info.best_number);
@@ -603,7 +611,11 @@ pub fn spawn_tasks<TBl, TBackend, TExPool, TRpc, TCl>(
spawn_handle.spawn(
"on-transaction-imported",
transaction_notifications(transaction_pool.clone(), network.clone()),
transaction_notifications(
transaction_pool.clone(),
network.clone(),
telemetry.clone(),
),
);
// Prometheus metrics.
@@ -611,7 +623,7 @@ pub fn spawn_tasks<TBl, TBackend, TExPool, TRpc, TCl>(
config.prometheus_config.clone()
{
// Set static metrics.
let metrics = MetricsService::with_prometheus(&registry, &config)?;
let metrics = MetricsService::with_prometheus(telemetry.clone(), &registry, &config)?;
spawn_handle.spawn(
"prometheus-endpoint",
prometheus_endpoint::init_prometheus(port, registry).map(drop)
@@ -619,7 +631,7 @@ pub fn spawn_tasks<TBl, TBackend, TExPool, TRpc, TCl>(
metrics
} else {
MetricsService::new()
MetricsService::new(telemetry.clone())
};
// Periodically updated metrics and telemetry updates.
@@ -659,12 +671,13 @@ pub fn spawn_tasks<TBl, TBackend, TExPool, TRpc, TCl>(
task_manager.keep_alive((config.base_path, rpc, rpc_handlers.clone()));
Ok((rpc_handlers, telemetry_connection_notifier))
Ok(rpc_handlers)
}
async fn transaction_notifications<TBl, TExPool>(
transaction_pool: Arc<TExPool>,
network: Arc<NetworkService<TBl, <TBl as BlockT>::Hash>>,
telemetry: Option<TelemetryHandle>,
)
where
TBl: BlockT,
@@ -676,9 +689,12 @@ async fn transaction_notifications<TBl, TExPool>(
.for_each(move |hash| {
network.propagate_transaction(hash);
let status = transaction_pool.status();
telemetry!(SUBSTRATE_INFO; "txpool.import";
telemetry!(
telemetry;
SUBSTRATE_INFO;
"txpool.import";
"ready" => status.ready,
"future" => status.future
"future" => status.future,
);
ready(())
})
@@ -687,12 +703,10 @@ async fn transaction_notifications<TBl, TExPool>(
fn init_telemetry<TBl: BlockT, TCl: BlockBackend<TBl>>(
config: &mut Configuration,
telemetry_span: Option<TelemetrySpan>,
network: Arc<NetworkService<TBl, <TBl as BlockT>::Hash>>,
client: Arc<TCl>,
) -> Option<TelemetryConnectionNotifier> {
let telemetry_span = telemetry_span?;
let endpoints = config.telemetry_endpoints.clone()?;
telemetry: &mut Telemetry,
) -> sc_telemetry::Result<TelemetryHandle> {
let genesis_hash = client.block_hash(Zero::zero()).ok().flatten().unwrap_or_default();
let connection_message = ConnectionMessage {
name: config.network.node_name.to_owned(),
@@ -708,13 +722,9 @@ fn init_telemetry<TBl: BlockT, TCl: BlockBackend<TBl>>(
network_id: network.local_peer_id().to_base58(),
};
config.telemetry_handle
.as_mut()
.map(|handle| handle.start_telemetry(
telemetry_span,
endpoints,
connection_message,
))
telemetry.start_telemetry(connection_message)?;
Ok(telemetry.handle())
}
fn gen_handler<TBl, TBackend, TExPool, TRpc, TCl>(
@@ -368,6 +368,7 @@ mod tests {
None,
Box::new(TaskExecutor::new()),
None,
None,
Default::default(),
).expect("Creates a client");
+26 -8
View File
@@ -35,7 +35,11 @@ use sp_core::{
};
#[cfg(feature="test-helpers")]
use sp_keystore::SyncCryptoStorePtr;
use sc_telemetry::{telemetry, SUBSTRATE_INFO};
use sc_telemetry::{
telemetry,
TelemetryHandle,
SUBSTRATE_INFO,
};
use sp_runtime::{
Justification, BuildStorage,
generic::{BlockId, SignedBlock, DigestItem},
@@ -115,6 +119,7 @@ pub struct Client<B, E, Block, RA> where Block: BlockT {
block_rules: BlockRules<Block>,
execution_extensions: ExecutionExtensions<Block>,
config: ClientConfig,
telemetry: Option<TelemetryHandle>,
_phantom: PhantomData<RA>,
}
@@ -152,6 +157,7 @@ pub fn new_in_mem<E, Block, S, RA>(
genesis_storage: &S,
keystore: Option<SyncCryptoStorePtr>,
prometheus_registry: Option<Registry>,
telemetry: Option<TelemetryHandle>,
spawn_handle: Box<dyn SpawnNamed>,
config: ClientConfig,
) -> sp_blockchain::Result<Client<
@@ -171,6 +177,7 @@ pub fn new_in_mem<E, Block, S, RA>(
keystore,
spawn_handle,
prometheus_registry,
telemetry,
config,
)
}
@@ -196,6 +203,7 @@ pub fn new_with_backend<B, E, Block, S, RA>(
keystore: Option<SyncCryptoStorePtr>,
spawn_handle: Box<dyn SpawnNamed>,
prometheus_registry: Option<Registry>,
telemetry: Option<TelemetryHandle>,
config: ClientConfig,
) -> sp_blockchain::Result<Client<B, LocalCallExecutor<B, E>, Block, RA>>
where
@@ -218,6 +226,7 @@ pub fn new_with_backend<B, E, Block, S, RA>(
Default::default(),
extensions,
prometheus_registry,
telemetry,
config,
)
}
@@ -298,6 +307,7 @@ impl<B, E, Block, RA> Client<B, E, Block, RA> where
bad_blocks: BadBlocks<Block>,
execution_extensions: ExecutionExtensions<Block>,
prometheus_registry: Option<Registry>,
telemetry: Option<TelemetryHandle>,
config: ClientConfig,
) -> sp_blockchain::Result<Self> {
if backend.blockchain().header(BlockId::Number(Zero::zero()))?.is_none() {
@@ -330,6 +340,7 @@ impl<B, E, Block, RA> Client<B, E, Block, RA> where
block_rules: BlockRules::new(fork_blocks, bad_blocks),
execution_extensions,
config,
telemetry,
_phantom: Default::default(),
})
}
@@ -672,7 +683,10 @@ impl<B, E, Block, RA> Client<B, E, Block, RA> where
if origin != BlockOrigin::NetworkInitialSync ||
rand::thread_rng().gen_bool(0.1)
{
telemetry!(SUBSTRATE_INFO; "block.import";
telemetry!(
self.telemetry;
SUBSTRATE_INFO;
"block.import";
"height" => height,
"best" => ?hash,
"origin" => ?origin
@@ -989,10 +1003,13 @@ impl<B, E, Block, RA> Client<B, E, Block, RA> where
let header = self.header(&BlockId::Hash(*last))?
.expect(
"Header already known to exist in DB because it is \
indicated in the tree route; qed"
indicated in the tree route; qed"
);
telemetry!(SUBSTRATE_INFO; "notify.finalized";
telemetry!(
self.telemetry;
SUBSTRATE_INFO;
"notify.finalized";
"height" => format!("{}", header.number()),
"best" => ?last,
);
@@ -1002,7 +1019,7 @@ impl<B, E, Block, RA> Client<B, E, Block, RA> where
let header = self.header(&BlockId::Hash(finalized_hash))?
.expect(
"Header already known to exist in DB because it is \
indicated in the tree route; qed"
indicated in the tree route; qed"
);
let notification = FinalityNotification {
@@ -1991,9 +2008,10 @@ impl<B, E, Block, RA> backend::AuxStore for &Client<B, E, Block, RA>
}
impl<BE, E, B, RA> sp_consensus::block_validation::Chain<B> for Client<BE, E, B, RA>
where BE: backend::Backend<B>,
E: CallExecutor<B>,
B: BlockT
where
BE: backend::Backend<B>,
E: CallExecutor<B>,
B: BlockT,
{
fn block_status(
&self,
+3 -1
View File
@@ -22,6 +22,7 @@ use std::sync::Arc;
use sc_executor::RuntimeInfo;
use sp_core::traits::{CodeExecutor, SpawnNamed};
use sc_telemetry::TelemetryHandle;
use sp_runtime::BuildStorage;
use sp_runtime::traits::{Block as BlockT, HashFor};
use sp_blockchain::Result as ClientResult;
@@ -31,7 +32,6 @@ use super::{call_executor::LocalCallExecutor, client::{Client, ClientConfig}};
use sc_client_api::light::Storage as BlockchainStorage;
use sc_light::{Backend, GenesisCallExecutor};
/// Create an instance of light client.
pub fn new_light<B, S, RA, E>(
backend: Arc<Backend<S, HashFor<B>>>,
@@ -39,6 +39,7 @@ pub fn new_light<B, S, RA, E>(
code_executor: E,
spawn_handle: Box<dyn SpawnNamed>,
prometheus_registry: Option<Registry>,
telemetry: Option<TelemetryHandle>,
) -> ClientResult<
Client<
Backend<S, HashFor<B>>,
@@ -70,6 +71,7 @@ pub fn new_light<B, S, RA, E>(
Default::default(),
Default::default(),
prometheus_registry,
telemetry,
ClientConfig::default(),
)
}
-5
View File
@@ -96,11 +96,6 @@ pub struct Configuration {
/// External WASM transport for the telemetry. If `Some`, when connection to a telemetry
/// endpoint, this transport will be tried in priority before all others.
pub telemetry_external_transport: Option<ExtTransport>,
/// Telemetry handle.
///
/// This is a handle to a `TelemetryWorker` instance. It is used to initialize the telemetry for
/// a substrate node.
pub telemetry_handle: Option<sc_telemetry::TelemetryHandle>,
/// The default number of 64KB pages to allocate for Wasm execution
pub default_heap_pages: Option<u64>,
/// Should offchain workers be executed.
+3
View File
@@ -46,6 +46,9 @@ pub enum Error {
#[error(transparent)]
Keystore(#[from] sc_keystore::Error),
#[error(transparent)]
Telemetry(#[from] sc_telemetry::Error),
#[error("Best chain selection strategy (SelectChain) is not provided.")]
SelectChainRequired,
+9 -2
View File
@@ -21,7 +21,7 @@ use std::{convert::TryFrom, time::SystemTime};
use crate::{NetworkStatus, NetworkState, NetworkStatusSinks, config::Configuration};
use futures_timer::Delay;
use prometheus_endpoint::{register, Gauge, U64, Registry, PrometheusError, Opts, GaugeVec};
use sc_telemetry::{telemetry, SUBSTRATE_INFO};
use sc_telemetry::{telemetry, TelemetryHandle, SUBSTRATE_INFO};
use sp_api::ProvideRuntimeApi;
use sp_runtime::traits::{NumberFor, Block, SaturatedConversion, UniqueSaturatedInto};
use sp_transaction_pool::{PoolStatus, MaintainedTransactionPool};
@@ -112,23 +112,26 @@ pub struct MetricsService {
last_update: Instant,
last_total_bytes_inbound: u64,
last_total_bytes_outbound: u64,
telemetry: Option<TelemetryHandle>,
}
impl MetricsService {
/// Creates a `MetricsService` that only sends information
/// to the telemetry.
pub fn new() -> Self {
pub fn new(telemetry: Option<TelemetryHandle>) -> Self {
MetricsService {
metrics: None,
last_total_bytes_inbound: 0,
last_total_bytes_outbound: 0,
last_update: Instant::now(),
telemetry,
}
}
/// Creates a `MetricsService` that sends metrics
/// to prometheus alongside the telemetry.
pub fn with_prometheus(
telemetry: Option<TelemetryHandle>,
registry: &Registry,
config: &Configuration,
) -> Result<Self, PrometheusError> {
@@ -149,6 +152,7 @@ impl MetricsService {
last_total_bytes_inbound: 0,
last_total_bytes_outbound: 0,
last_update: Instant::now(),
telemetry,
})
}
@@ -245,6 +249,7 @@ impl MetricsService {
// Update/send metrics that are always available.
telemetry!(
self.telemetry;
SUBSTRATE_INFO;
"system.interval";
"height" => best_number,
@@ -307,6 +312,7 @@ impl MetricsService {
};
telemetry!(
self.telemetry;
SUBSTRATE_INFO;
"system.interval";
"peers" => num_peers,
@@ -328,6 +334,7 @@ impl MetricsService {
// Send network state information, if any.
if let Some(net_state) = net_state {
telemetry!(
self.telemetry;
SUBSTRATE_INFO;
"system.network_state";
"state" => net_state,
@@ -20,14 +20,7 @@ use crate::config::TaskExecutor;
use crate::task_manager::TaskManager;
use futures::{future::FutureExt, pin_mut, select};
use parking_lot::Mutex;
use sc_telemetry::TelemetrySpan;
use std::{any::Any, env, sync::Arc, time::Duration};
use tracing::{event::Event, span::Id, subscriber::Subscriber};
use tracing_subscriber::{
layer::{Context, SubscriberExt},
registry::LookupSpan,
Layer,
};
use std::{any::Any, sync::Arc, time::Duration};
#[derive(Clone, Debug)]
struct DropTester(Arc<Mutex<usize>>);
@@ -317,94 +310,3 @@ fn ensure_task_manager_future_continues_when_childs_not_essential_task_fails() {
runtime.block_on(task_manager.clean_shutdown());
assert_eq!(drop_tester, 0);
}
struct TestLayer {
spans_found: Arc<Mutex<Option<Vec<Id>>>>,
}
impl<S> Layer<S> for TestLayer
where
S: Subscriber + for<'a> LookupSpan<'a>,
{
fn on_event(&self, _: &Event<'_>, ctx: Context<S>) {
let mut spans_found = self.spans_found.lock();
if spans_found.is_some() {
panic!("on_event called multiple times");
}
*spans_found = Some(ctx.scope().map(|x| x.id()).collect());
}
}
fn setup_subscriber() -> (
impl Subscriber + for<'a> LookupSpan<'a>,
Arc<Mutex<Option<Vec<Id>>>>,
) {
let spans_found = Arc::new(Mutex::new(Default::default()));
let layer = TestLayer {
spans_found: spans_found.clone(),
};
let subscriber = tracing_subscriber::fmt().finish().with(layer);
(subscriber, spans_found)
}
/// This is not an actual test, it is used by the `telemetry_span_is_forwarded_to_task` test.
/// The given test will call the test executable and only execute this one test that
/// test that the telemetry span and the prefix span are forwarded correctly. This needs to be done
/// in a separate process to avoid interfering with the other tests.
#[test]
fn subprocess_telemetry_span_is_forwarded_to_task() {
if env::var("SUBPROCESS_TEST").is_err() {
return;
}
let (subscriber, spans_found) = setup_subscriber();
tracing_log::LogTracer::init().unwrap();
let _sub_guard = tracing::subscriber::set_global_default(subscriber);
let mut runtime = tokio::runtime::Runtime::new().unwrap();
let prefix_span = tracing::info_span!("prefix");
let _enter_prefix_span = prefix_span.enter();
let telemetry_span = TelemetrySpan::new();
let _enter_telemetry_span = telemetry_span.enter();
let handle = runtime.handle().clone();
let task_executor = TaskExecutor::from(move |fut, _| handle.spawn(fut).map(|_| ()));
let task_manager = new_task_manager(task_executor);
let (sender, receiver) = futures::channel::oneshot::channel();
task_manager.spawn_handle().spawn(
"log-something",
async move {
log::info!("boo!");
sender.send(()).unwrap();
}
.boxed(),
);
runtime.block_on(receiver).unwrap();
runtime.block_on(task_manager.clean_shutdown());
let spans = spans_found.lock().take().unwrap();
assert_eq!(2, spans.len());
assert_eq!(spans[0], prefix_span.id().unwrap());
assert_eq!(spans[1], telemetry_span.span().id().unwrap());
}
#[test]
fn telemetry_span_is_forwarded_to_task() {
let executable = env::current_exe().unwrap();
let output = std::process::Command::new(executable)
.env("SUBPROCESS_TEST", "1")
.args(&["--nocapture", "subprocess_telemetry_span_is_forwarded_to_task"])
.output()
.unwrap();
println!("{}", String::from_utf8(output.stdout).unwrap());
eprintln!("{}", String::from_utf8(output.stderr).unwrap());
assert!(output.status.success());
}