Telemetry per node (#7463)

This commit is contained in:
Cecile Tonglet
2021-01-20 12:28:56 +01:00
committed by GitHub
parent 71ef82afbc
commit 970cc25cef
49 changed files with 2578 additions and 2009 deletions
+69 -86
View File
@@ -17,8 +17,7 @@
// along with this program. If not, see <https://www.gnu.org/licenses/>.
use crate::{
error::Error, DEFAULT_PROTOCOL_ID, MallocSizeOfWasm,
TelemetryConnectionSinks, RpcHandlers, NetworkStatusSinks,
error::Error, DEFAULT_PROTOCOL_ID, MallocSizeOfWasm, RpcHandlers, NetworkStatusSinks,
start_rpc_servers, build_network_future, TransactionPoolAdapter, TaskManager, SpawnTaskHandle,
metrics::MetricsService,
client::{light, Client, ClientConfig},
@@ -46,13 +45,19 @@ use sc_network::NetworkService;
use sc_network::block_request_handler::{self, BlockRequestHandler};
use sp_runtime::generic::BlockId;
use sp_runtime::traits::{
Block as BlockT, SaturatedConversion, HashFor, Zero, BlockIdTo,
Block as BlockT, HashFor, Zero, BlockIdTo,
};
use sp_api::{ProvideRuntimeApi, CallApiAt};
use sc_executor::{NativeExecutor, NativeExecutionDispatch, RuntimeInfo};
use std::sync::Arc;
use wasm_timer::SystemTime;
use sc_telemetry::{telemetry, SUBSTRATE_INFO};
use sc_telemetry::{
telemetry,
ConnectionMessage,
TelemetryConnectionNotifier,
TelemetrySpan,
SUBSTRATE_INFO,
};
use sp_transaction_pool::MaintainedTransactionPool;
use prometheus_endpoint::Registry;
use sc_client_db::{Backend, DatabaseSettings};
@@ -179,6 +184,7 @@ type TFullParts<TBl, TRtApi, TExecDisp> = (
Arc<TFullBackend<TBl>>,
KeystoreContainer,
TaskManager,
Option<TelemetrySpan>,
);
type TLightParts<TBl, TRtApi, TExecDisp> = (
@@ -187,6 +193,7 @@ type TLightParts<TBl, TRtApi, TExecDisp> = (
KeystoreContainer,
TaskManager,
Arc<OnDemand<TBl>>,
Option<TelemetrySpan>,
);
/// Light client backend type with a specific hash type.
@@ -301,9 +308,14 @@ pub fn new_full_parts<TBl, TRtApi, TExecDisp>(
{
let keystore_container = KeystoreContainer::new(&config.keystore)?;
let telemetry_span = if config.telemetry_endpoints.is_some() {
Some(TelemetrySpan::new())
} else {
None
};
let task_manager = {
let registry = config.prometheus_config.as_ref().map(|cfg| &cfg.registry);
TaskManager::new(config.task_executor.clone(), registry)?
TaskManager::new(config.task_executor.clone(), registry, telemetry_span.clone())?
};
let executor = NativeExecutor::<TExecDisp>::new(
@@ -359,20 +371,26 @@ pub fn new_full_parts<TBl, TRtApi, TExecDisp>(
backend,
keystore_container,
task_manager,
telemetry_span,
))
}
/// Create the initial parts of a light node.
pub fn new_light_parts<TBl, TRtApi, TExecDisp>(
config: &Configuration
config: &Configuration,
) -> Result<TLightParts<TBl, TRtApi, TExecDisp>, Error> where
TBl: BlockT,
TExecDisp: NativeExecutionDispatch + 'static,
{
let keystore_container = KeystoreContainer::new(&config.keystore)?;
let telemetry_span = if config.telemetry_endpoints.is_some() {
Some(TelemetrySpan::new())
} else {
None
};
let task_manager = {
let registry = config.prometheus_config.as_ref().map(|cfg| &cfg.registry);
TaskManager::new(config.task_executor.clone(), registry)?
TaskManager::new(config.task_executor.clone(), registry, telemetry_span.clone())?
};
let executor = NativeExecutor::<TExecDisp>::new(
@@ -411,7 +429,7 @@ pub fn new_light_parts<TBl, TRtApi, TExecDisp>(
config.prometheus_config.as_ref().map(|config| config.registry.clone()),
)?);
Ok((client, backend, keystore_container, task_manager, on_demand))
Ok((client, backend, keystore_container, task_manager, on_demand, telemetry_span))
}
/// Create an instance of db-backed client.
@@ -463,6 +481,8 @@ pub fn new_client<E, Block, RA>(
pub struct SpawnTasksParams<'a, TBl: BlockT, TCl, TExPool, TRpc, Backend> {
/// The service configuration.
pub config: Configuration,
/// Telemetry span, if any.
pub telemetry_span: Option<TelemetrySpan>,
/// A shared client returned by `new_full_parts`/`new_light_parts`.
pub client: Arc<TCl>,
/// A shared backend returned by `new_full_parts`/`new_light_parts`.
@@ -486,8 +506,6 @@ pub struct SpawnTasksParams<'a, TBl: BlockT, TCl, TExPool, TRpc, Backend> {
pub network_status_sinks: NetworkStatusSinks<TBl>,
/// A Sender for RPC requests.
pub system_rpc_tx: TracingUnboundedSender<sc_rpc::system::Request<TBl>>,
/// Shared Telemetry connection sinks,
pub telemetry_connection_sinks: TelemetryConnectionSinks,
}
/// Build a shared offchain workers instance.
@@ -534,7 +552,7 @@ pub fn build_offchain_workers<TBl, TBackend, TCl>(
/// Spawn the tasks that are required to run a node.
pub fn spawn_tasks<TBl, TBackend, TExPool, TRpc, TCl>(
params: SpawnTasksParams<TBl, TCl, TExPool, TRpc, TBackend>,
) -> Result<RpcHandlers, Error>
) -> Result<(RpcHandlers, Option<TelemetryConnectionNotifier>), Error>
where
TCl: ProvideRuntimeApi<TBl> + HeaderMetadata<TBl, Error=sp_blockchain::Error> + Chain<TBl> +
BlockBackend<TBl> + BlockIdTo<TBl, Error=sp_blockchain::Error> + ProofProvider<TBl> +
@@ -557,6 +575,7 @@ pub fn spawn_tasks<TBl, TBackend, TExPool, TRpc, TCl>(
let SpawnTasksParams {
mut config,
task_manager,
telemetry_span,
client,
on_demand,
backend,
@@ -567,7 +586,6 @@ pub fn spawn_tasks<TBl, TBackend, TExPool, TRpc, TCl>(
network,
network_status_sinks,
system_rpc_tx,
telemetry_connection_sinks,
} = params;
let chain_info = client.usage_info().chain;
@@ -578,13 +596,15 @@ pub fn spawn_tasks<TBl, TBackend, TExPool, TRpc, TCl>(
config.dev_key_seed.clone().map(|s| vec![s]).unwrap_or_default(),
)?;
let telemetry_connection_notifier = telemetry_span
.and_then(|span| init_telemetry(
&mut config,
span,
network.clone(),
client.clone(),
));
info!("📦 Highest known block at #{}", chain_info.best_number);
telemetry!(
SUBSTRATE_INFO;
"node.start";
"height" => chain_info.best_number.saturated_into::<u64>(),
"best" => ?chain_info.best_hash
);
let spawn_handle = task_manager.spawn_handle();
@@ -642,24 +662,6 @@ pub fn spawn_tasks<TBl, TBackend, TExPool, TRpc, TCl>(
sc_rpc_server::RpcMiddleware::new(rpc_metrics, "inbrowser")
).into()));
// Telemetry
let telemetry = config.telemetry_endpoints.clone().and_then(|endpoints| {
if endpoints.is_empty() {
// we don't want the telemetry to be initialized if telemetry_endpoints == Some([])
return None;
}
let genesis_hash = match client.block_hash(Zero::zero()) {
Ok(Some(hash)) => hash,
_ => Default::default(),
};
Some(build_telemetry(
&mut config, endpoints, telemetry_connection_sinks.clone(), network.clone(),
task_manager.spawn_handle(), genesis_hash,
))
});
// Spawn informant task
spawn_handle.spawn("informant", sc_informant::build(
client.clone(),
@@ -668,21 +670,22 @@ pub fn spawn_tasks<TBl, TBackend, TExPool, TRpc, TCl>(
config.informant_output_format,
));
task_manager.keep_alive((telemetry, config.base_path, rpc, rpc_handlers.clone()));
task_manager.keep_alive((config.base_path, rpc, rpc_handlers.clone()));
Ok(rpc_handlers)
Ok((rpc_handlers, telemetry_connection_notifier))
}
async fn transaction_notifications<TBl, TExPool>(
transaction_pool: Arc<TExPool>,
network: Arc<NetworkService<TBl, <TBl as BlockT>::Hash>>
network: Arc<NetworkService<TBl, <TBl as BlockT>::Hash>>,
)
where
TBl: BlockT,
TExPool: MaintainedTransactionPool<Block=TBl, Hash = <TBl as BlockT>::Hash>,
{
// transaction notifications
transaction_pool.import_notification_stream()
transaction_pool
.import_notification_stream()
.for_each(move |hash| {
network.propagate_transaction(hash);
let status = transaction_pool.status();
@@ -695,55 +698,35 @@ async fn transaction_notifications<TBl, TExPool>(
.await;
}
fn build_telemetry<TBl: BlockT>(
fn init_telemetry<TBl: BlockT, TCl: BlockBackend<TBl>>(
config: &mut Configuration,
endpoints: sc_telemetry::TelemetryEndpoints,
telemetry_connection_sinks: TelemetryConnectionSinks,
telemetry_span: TelemetrySpan,
network: Arc<NetworkService<TBl, <TBl as BlockT>::Hash>>,
spawn_handle: SpawnTaskHandle,
genesis_hash: <TBl as BlockT>::Hash,
) -> sc_telemetry::Telemetry {
let is_authority = config.role.is_authority();
let network_id = network.local_peer_id().to_base58();
let name = config.network.node_name.clone();
let impl_name = config.impl_name.clone();
let impl_version = config.impl_version.clone();
let chain_name = config.chain_spec.name().to_owned();
let telemetry = sc_telemetry::init_telemetry(sc_telemetry::TelemetryConfig {
endpoints,
wasm_external_transport: config.telemetry_external_transport.take(),
});
let startup_time = SystemTime::UNIX_EPOCH.elapsed()
.map(|dur| dur.as_millis())
.unwrap_or(0);
client: Arc<TCl>,
) -> Option<TelemetryConnectionNotifier> {
let endpoints = config.telemetry_endpoints()?.clone();
let genesis_hash = client.block_hash(Zero::zero()).ok().flatten().unwrap_or_default();
let connection_message = ConnectionMessage {
name: config.network.node_name.to_owned(),
implementation: config.impl_name.to_owned(),
version: config.impl_version.to_owned(),
config: String::new(),
chain: config.chain_spec.name().to_owned(),
genesis_hash: format!("{:?}", genesis_hash),
authority: config.role.is_authority(),
startup_time: SystemTime::UNIX_EPOCH.elapsed()
.map(|dur| dur.as_millis())
.unwrap_or(0).to_string(),
network_id: network.local_peer_id().to_base58(),
};
spawn_handle.spawn(
"telemetry-worker",
telemetry.clone()
.for_each(move |event| {
// Safe-guard in case we add more events in the future.
let sc_telemetry::TelemetryEvent::Connected = event;
telemetry!(SUBSTRATE_INFO; "system.connected";
"name" => name.clone(),
"implementation" => impl_name.clone(),
"version" => impl_version.clone(),
"config" => "",
"chain" => chain_name.clone(),
"genesis_hash" => ?genesis_hash,
"authority" => is_authority,
"startup_time" => startup_time,
"network_id" => network_id.clone()
);
telemetry_connection_sinks.0.lock().retain(|sink| {
sink.unbounded_send(()).is_ok()
});
ready(())
})
);
telemetry
config.telemetry_handle
.as_mut()
.map(|handle| handle.start_telemetry(
telemetry_span,
endpoints,
connection_message,
))
}
fn gen_handler<TBl, TBackend, TExPool, TRpc, TCl>(
+19 -1
View File
@@ -96,6 +96,11 @@ pub struct Configuration {
/// External WASM transport for the telemetry. If `Some`, when connection to a telemetry
/// endpoint, this transport will be tried in priority before all others.
pub telemetry_external_transport: Option<ExtTransport>,
/// Telemetry handle.
///
/// This is a handle to a `TelemetryWorker` instance. It is used to initialize the telemetry for
/// a substrate node.
pub telemetry_handle: Option<sc_telemetry::TelemetryHandle>,
/// The default number of 64KB pages to allocate for Wasm execution
pub default_heap_pages: Option<u64>,
/// Should offchain workers be executed.
@@ -198,9 +203,22 @@ impl Configuration {
}
/// Returns the prometheus metrics registry, if available.
pub fn prometheus_registry<'a>(&'a self) -> Option<&'a Registry> {
pub fn prometheus_registry(&self) -> Option<&Registry> {
self.prometheus_config.as_ref().map(|config| &config.registry)
}
/// Telemetry endpoints to use, available only when a telemetry handle is also present.
///
/// Yields `None` when there is no telemetry handle, when no endpoints are configured, or when
/// the configured endpoint list is empty.
pub(crate) fn telemetry_endpoints(&self) -> Option<&TelemetryEndpoints> {
    // No telemetry handle: report no endpoints at all.
    self.telemetry_handle.as_ref()?;

    // Don't initialise telemetry if `telemetry_endpoints` == Some([])
    self.telemetry_endpoints
        .as_ref()
        .filter(|endpoints| !endpoints.is_empty())
}
}
/// Available RPC methods.
+2 -16
View File
@@ -39,7 +39,6 @@ use std::net::SocketAddr;
use std::collections::HashMap;
use std::time::Duration;
use std::task::Poll;
use parking_lot::Mutex;
use futures::{Future, FutureExt, Stream, StreamExt, stream, compat::*};
use sc_network::{NetworkStatus, network_state::NetworkState, PeerId};
@@ -48,7 +47,7 @@ use codec::{Encode, Decode};
use sp_runtime::generic::BlockId;
use sp_runtime::traits::{Block as BlockT, Header as HeaderT};
use parity_util_mem::MallocSizeOf;
use sp_utils::{status_sinks, mpsc::{tracing_unbounded, TracingUnboundedReceiver, TracingUnboundedSender}};
use sp_utils::{status_sinks, mpsc::{tracing_unbounded, TracingUnboundedReceiver}};
pub use self::error::Error;
pub use self::builder::{
@@ -161,20 +160,7 @@ impl<Block: BlockT> NetworkStatusSinks<Block> {
}
/// Shared list of senders used to propagate "telemetry connection established" events.
#[derive(Default, Clone)]
pub struct TelemetryConnectionSinks(Arc<Mutex<Vec<TracingUnboundedSender<()>>>>);

impl TelemetryConnectionSinks {
    /// Registers a new sink and returns the receiving half of its channel.
    ///
    /// The channel is created under the name `mpsc_telemetry_on_connect`; its sending half is
    /// appended to the shared list of sinks.
    pub fn on_connect_stream(&self) -> TracingUnboundedReceiver<()> {
        let (sender, receiver) = tracing_unbounded("mpsc_telemetry_on_connect");
        {
            // Keep the lock only for the duration of the push.
            let mut sinks = self.0.lock();
            sinks.push(sender);
        }
        receiver
    }
}
/// An imcomplete set of chain components, but enough to run the chain ops subcommands.
/// An incomplete set of chain components, but enough to run the chain ops subcommands.
pub struct PartialComponents<Client, Backend, SelectChain, ImportQueue, TransactionPool, Other> {
/// A shared client instance.
pub client: Arc<Client>,
@@ -34,6 +34,7 @@ use prometheus_endpoint::{
use sp_utils::mpsc::{TracingUnboundedSender, TracingUnboundedReceiver, tracing_unbounded};
use tracing_futures::Instrument;
use crate::{config::{TaskExecutor, TaskType, JoinFuture}, Error};
use sc_telemetry::TelemetrySpan;
mod prometheus_future;
#[cfg(test)]
@@ -46,6 +47,7 @@ pub struct SpawnTaskHandle {
executor: TaskExecutor,
metrics: Option<Metrics>,
task_notifier: TracingUnboundedSender<JoinFuture>,
telemetry_span: Option<TelemetrySpan>,
}
impl SpawnTaskHandle {
@@ -89,7 +91,10 @@ impl SpawnTaskHandle {
metrics.tasks_ended.with_label_values(&[name, "finished"]).inc_by(0);
}
let telemetry_span = self.telemetry_span.clone();
let future = async move {
let _telemetry_entered = telemetry_span.as_ref().map(|x| x.enter());
if let Some(metrics) = metrics {
// Add some wrappers around `task`.
let task = {
@@ -228,14 +233,17 @@ pub struct TaskManager {
/// terminates and gracefully shutdown. Also ends the parent `future()` if a child's essential
/// task fails.
children: Vec<TaskManager>,
/// The telemetry span, if any, handed to every spawn handle so that spawned tasks enter it.
telemetry_span: Option<TelemetrySpan>,
}
impl TaskManager {
/// If a Prometheus registry is passed, it will be used to report statistics about the
/// service tasks.
/// If a Prometheus registry is passed, it will be used to report statistics about the
/// service tasks.
pub(super) fn new(
executor: TaskExecutor,
prometheus_registry: Option<&Registry>
prometheus_registry: Option<&Registry>,
telemetry_span: Option<TelemetrySpan>,
) -> Result<Self, PrometheusError> {
let (signal, on_exit) = exit_future::signal();
@@ -264,6 +272,7 @@ impl TaskManager {
task_notifier,
completion_future,
children: Vec::new(),
telemetry_span,
})
}
@@ -274,6 +283,7 @@ impl TaskManager {
executor: self.executor.clone(),
metrics: self.metrics.clone(),
task_notifier: self.task_notifier.clone(),
telemetry_span: self.telemetry_span.clone(),
}
}
@@ -81,13 +81,17 @@ async fn run_background_task_blocking(duration: Duration, _keep_alive: impl Any)
}
}
/// Test helper: builds a `TaskManager` on top of `task_executor` with neither a Prometheus
/// registry nor a telemetry span.
///
/// Panics if `TaskManager::new` returns an error.
fn new_task_manager(task_executor: TaskExecutor) -> TaskManager {
    let no_registry = None;
    let no_telemetry_span = None;
    TaskManager::new(task_executor, no_registry, no_telemetry_span).unwrap()
}
#[test]
fn ensure_tasks_are_awaited_on_shutdown() {
let mut runtime = tokio::runtime::Runtime::new().unwrap();
let handle = runtime.handle().clone();
let task_executor: TaskExecutor = (move |future, _| handle.spawn(future).map(|_| ())).into();
let task_manager = TaskManager::new(task_executor, None).unwrap();
let task_manager = new_task_manager(task_executor);
let spawn_handle = task_manager.spawn_handle();
let drop_tester = DropTester::new();
spawn_handle.spawn("task1", run_background_task(drop_tester.new_ref()));
@@ -106,7 +110,7 @@ fn ensure_keep_alive_during_shutdown() {
let handle = runtime.handle().clone();
let task_executor: TaskExecutor = (move |future, _| handle.spawn(future).map(|_| ())).into();
let mut task_manager = TaskManager::new(task_executor, None).unwrap();
let mut task_manager = new_task_manager(task_executor);
let spawn_handle = task_manager.spawn_handle();
let drop_tester = DropTester::new();
task_manager.keep_alive(drop_tester.new_ref());
@@ -125,7 +129,7 @@ fn ensure_blocking_futures_are_awaited_on_shutdown() {
let handle = runtime.handle().clone();
let task_executor: TaskExecutor = (move |future, _| handle.spawn(future).map(|_| ())).into();
let task_manager = TaskManager::new(task_executor, None).unwrap();
let task_manager = new_task_manager(task_executor);
let spawn_handle = task_manager.spawn_handle();
let drop_tester = DropTester::new();
spawn_handle.spawn(
@@ -150,7 +154,7 @@ fn ensure_no_task_can_be_spawn_after_terminate() {
let handle = runtime.handle().clone();
let task_executor: TaskExecutor = (move |future, _| handle.spawn(future).map(|_| ())).into();
let mut task_manager = TaskManager::new(task_executor, None).unwrap();
let mut task_manager = new_task_manager(task_executor);
let spawn_handle = task_manager.spawn_handle();
let drop_tester = DropTester::new();
spawn_handle.spawn("task1", run_background_task(drop_tester.new_ref()));
@@ -171,7 +175,7 @@ fn ensure_task_manager_future_ends_when_task_manager_terminated() {
let handle = runtime.handle().clone();
let task_executor: TaskExecutor = (move |future, _| handle.spawn(future).map(|_| ())).into();
let mut task_manager = TaskManager::new(task_executor, None).unwrap();
let mut task_manager = new_task_manager(task_executor);
let spawn_handle = task_manager.spawn_handle();
let drop_tester = DropTester::new();
spawn_handle.spawn("task1", run_background_task(drop_tester.new_ref()));
@@ -192,7 +196,7 @@ fn ensure_task_manager_future_ends_with_error_when_essential_task_fails() {
let handle = runtime.handle().clone();
let task_executor: TaskExecutor = (move |future, _| handle.spawn(future).map(|_| ())).into();
let mut task_manager = TaskManager::new(task_executor, None).unwrap();
let mut task_manager = new_task_manager(task_executor);
let spawn_handle = task_manager.spawn_handle();
let spawn_essential_handle = task_manager.spawn_essential_handle();
let drop_tester = DropTester::new();
@@ -215,10 +219,10 @@ fn ensure_children_tasks_ends_when_task_manager_terminated() {
let handle = runtime.handle().clone();
let task_executor: TaskExecutor = (move |future, _| handle.spawn(future).map(|_| ())).into();
let mut task_manager = TaskManager::new(task_executor.clone(), None).unwrap();
let child_1 = TaskManager::new(task_executor.clone(), None).unwrap();
let mut task_manager = new_task_manager(task_executor.clone());
let child_1 = new_task_manager(task_executor.clone());
let spawn_handle_child_1 = child_1.spawn_handle();
let child_2 = TaskManager::new(task_executor.clone(), None).unwrap();
let child_2 = new_task_manager(task_executor.clone());
let spawn_handle_child_2 = child_2.spawn_handle();
task_manager.add_child(child_1);
task_manager.add_child(child_2);
@@ -244,11 +248,11 @@ fn ensure_task_manager_future_ends_with_error_when_childs_essential_task_fails()
let handle = runtime.handle().clone();
let task_executor: TaskExecutor = (move |future, _| handle.spawn(future).map(|_| ())).into();
let mut task_manager = TaskManager::new(task_executor.clone(), None).unwrap();
let child_1 = TaskManager::new(task_executor.clone(), None).unwrap();
let mut task_manager = new_task_manager(task_executor.clone());
let child_1 = new_task_manager(task_executor.clone());
let spawn_handle_child_1 = child_1.spawn_handle();
let spawn_essential_handle_child_1 = child_1.spawn_essential_handle();
let child_2 = TaskManager::new(task_executor.clone(), None).unwrap();
let child_2 = new_task_manager(task_executor.clone());
let spawn_handle_child_2 = child_2.spawn_handle();
task_manager.add_child(child_1);
task_manager.add_child(child_2);
@@ -275,10 +279,10 @@ fn ensure_task_manager_future_continues_when_childs_not_essential_task_fails() {
let handle = runtime.handle().clone();
let task_executor: TaskExecutor = (move |future, _| handle.spawn(future).map(|_| ())).into();
let mut task_manager = TaskManager::new(task_executor.clone(), None).unwrap();
let child_1 = TaskManager::new(task_executor.clone(), None).unwrap();
let mut task_manager = new_task_manager(task_executor.clone());
let child_1 = new_task_manager(task_executor.clone());
let spawn_handle_child_1 = child_1.spawn_handle();
let child_2 = TaskManager::new(task_executor.clone(), None).unwrap();
let child_2 = new_task_manager(task_executor.clone());
let spawn_handle_child_2 = child_2.spawn_handle();
task_manager.add_child(child_1);
task_manager.add_child(child_2);