Add more metrics to prometheus (#5034)

* Add a few things

* Add finality_grandpa_round

* fix fg tests

* Nitpicks

* Nitpicks

* Fix name of prometheus crate
This commit is contained in:
Ashley
2020-03-03 11:36:58 +01:00
committed by GitHub
parent b27a820032
commit 883ddfc897
16 changed files with 135 additions and 24 deletions
+2
View File
@@ -6120,6 +6120,7 @@ dependencies = [
"sp-keyring",
"sp-runtime",
"sp-state-machine",
"substrate-prometheus-endpoint",
"substrate-test-runtime-client",
"tempfile",
"tokio 0.2.12",
@@ -6204,6 +6205,7 @@ dependencies = [
"sp-keyring",
"sp-runtime",
"sp-test-primitives",
"substrate-prometheus-endpoint",
"substrate-test-runtime",
"substrate-test-runtime-client",
"tempfile",
@@ -157,6 +157,7 @@ pub fn new_full(config: Configuration<GenesisConfig>)
on_exit: service.on_exit(),
telemetry_on_connect: Some(service.telemetry_on_connect_stream()),
voting_rule: grandpa::VotingRulesBuilder::default().build(),
prometheus_registry: service.prometheus_registry()
};
// the GRANDPA voter task is considered infallible, i.e.
+1
View File
@@ -228,6 +228,7 @@ macro_rules! new_full {
on_exit: service.on_exit(),
telemetry_on_connect: Some(service.telemetry_on_connect_stream()),
voting_rule: grandpa::VotingRulesBuilder::default().build(),
prometheus_registry: service.prometheus_registry(),
};
// the GRANDPA voter task is considered infallible, i.e.
@@ -35,6 +35,7 @@ sc-network = { version = "0.8.0-alpha.2", path = "../network" }
sc-network-gossip = { version = "0.8.0-alpha.2", path = "../network-gossip" }
sp-finality-tracker = { version = "2.0.0-alpha.2", path = "../../primitives/finality-tracker" }
sp-finality-grandpa = { version = "2.0.0-alpha.2", path = "../../primitives/finality-grandpa" }
prometheus-endpoint = { package = "substrate-prometheus-endpoint", path = "../../utils/prometheus", version = "0.8.0-alpha.2" }
sc-block-builder = { version = "0.8.0-alpha.2", path = "../block-builder" }
finality-grandpa = { version = "0.11.1", features = ["derive-codec"] }
pin-project = "0.4.6"
@@ -58,6 +58,7 @@ use crate::justification::GrandpaJustification;
use crate::until_imported::UntilVoteTargetImported;
use crate::voting_rule::VotingRule;
use sp_finality_grandpa::{AuthorityId, AuthoritySignature, SetId, RoundNumber};
use prometheus_endpoint::{Gauge, U64, register, PrometheusError};
type HistoricalVotes<Block> = finality_grandpa::HistoricalVotes<
<Block as BlockT>::Hash,
@@ -372,6 +373,24 @@ impl<Block: BlockT> SharedVoterSetState<Block> {
}
}
/// Prometheus metrics for GRANDPA.
// `Clone` is derived because the environment holding these metrics is itself
// cloned when a new voter environment is built (see `metrics: self.env.metrics.clone()`
// in the voter-work restart path); presumably all clones report to the same
// underlying registered gauge — TODO confirm against prometheus-endpoint docs.
#[derive(Clone)]
pub(crate) struct Metrics {
// Highest completed GRANDPA round; updated from the voter set state
// whenever `Live { completed_rounds, .. }` is observed.
finality_grandpa_round: Gauge<U64>,
}
impl Metrics {
/// Registers the GRANDPA metrics with the given Prometheus `registry`.
///
/// Returns a `PrometheusError` if registration fails (for instance when a
/// metric with the same name was already registered on this registry).
pub(crate) fn register(registry: &prometheus_endpoint::Registry) -> Result<Self, PrometheusError> {
Ok(Self {
finality_grandpa_round: register(
Gauge::new("finality_grandpa_round", "Highest completed GRANDPA round.")?,
registry
)?,
})
}
}
/// The environment we run GRANDPA in.
pub(crate) struct Environment<Backend, Block: BlockT, C, N: NetworkT<Block>, SC, VR> {
pub(crate) client: Arc<C>,
@@ -384,6 +403,7 @@ pub(crate) struct Environment<Backend, Block: BlockT, C, N: NetworkT<Block>, SC,
pub(crate) set_id: SetId,
pub(crate) voter_set_state: SharedVoterSetState<Block>,
pub(crate) voting_rule: VR,
pub(crate) metrics: Option<Metrics>,
pub(crate) _phantom: PhantomData<Backend>,
}
@@ -397,6 +417,17 @@ impl<Backend, Block: BlockT, C, N: NetworkT<Block>, SC, VR> Environment<Backend,
self.voter_set_state.with(|voter_set_state| {
if let Some(set_state) = f(&voter_set_state)? {
*voter_set_state = set_state;
if let Some(metrics) = self.metrics.as_ref() {
if let VoterSetState::Live { completed_rounds, .. } = voter_set_state {
let highest = completed_rounds.rounds.iter()
.map(|round| round.number)
.max()
.expect("There is always one completed round (genesis); qed");
metrics.finality_grandpa_round.set(highest);
}
}
}
Ok(())
})
+11 -1
View File
@@ -104,7 +104,7 @@ pub use voting_rule::{
};
use aux_schema::PersistentData;
use environment::{Environment, VoterSetState};
use environment::{Environment, VoterSetState, Metrics};
use import::GrandpaBlockImport;
use until_imported::UntilGlobalMessageBlocksImported;
use communication::{NetworkBridge, Network as NetworkT};
@@ -551,6 +551,8 @@ pub struct GrandpaParams<Block: BlockT, C, N, SC, VR, X> {
pub telemetry_on_connect: Option<futures::channel::mpsc::UnboundedReceiver<()>>,
/// A voting rule used to potentially restrict target votes.
pub voting_rule: VR,
/// The prometheus metrics registry.
pub prometheus_registry: Option<prometheus_endpoint::Registry>,
}
/// Run a GRANDPA voter as a task. Provide configuration and a link to a
@@ -576,6 +578,7 @@ pub fn run_grandpa_voter<Block: BlockT, BE: 'static, C, N, SC, VR, X>(
on_exit,
telemetry_on_connect,
voting_rule,
prometheus_registry,
} = grandpa_params;
// NOTE: we have recently removed `run_grandpa_observer` from the public
@@ -634,6 +637,7 @@ pub fn run_grandpa_voter<Block: BlockT, BE: 'static, C, N, SC, VR, X>(
voting_rule,
persistent_data,
voter_commands_rx,
prometheus_registry,
);
let voter_work = voter_work
@@ -673,6 +677,7 @@ where
voting_rule: VR,
persistent_data: PersistentData<Block>,
voter_commands_rx: mpsc::UnboundedReceiver<VoterCommand<Block::Hash, NumberFor<Block>>>,
prometheus_registry: Option<prometheus_endpoint::Registry>,
) -> Self {
let voters = persistent_data.authority_set.current_authorities();
@@ -687,6 +692,10 @@ where
authority_set: persistent_data.authority_set.clone(),
consensus_changes: persistent_data.consensus_changes.clone(),
voter_set_state: persistent_data.set_state.clone(),
metrics: prometheus_registry.map(|registry| {
Metrics::register(&registry)
.expect("Other metrics would have failed to register before these; qed")
}),
_phantom: PhantomData,
});
@@ -807,6 +816,7 @@ where
consensus_changes: self.env.consensus_changes.clone(),
network: self.env.network.clone(),
voting_rule: self.env.voting_rule.clone(),
metrics: self.env.metrics.clone(),
_phantom: PhantomData,
});
@@ -447,6 +447,7 @@ fn run_to_completion_with<F>(
on_exit: Exit,
telemetry_on_connect: None,
voting_rule: (),
prometheus_registry: None,
};
let voter = run_grandpa_voter(grandpa_params).expect("all in order with client and network");
@@ -578,6 +579,7 @@ fn finalize_3_voters_1_full_observer() {
on_exit: Exit,
telemetry_on_connect: None,
voting_rule: (),
prometheus_registry: None,
};
voters.push(run_grandpa_voter(grandpa_params).expect("all in order with client and network"));
@@ -741,6 +743,7 @@ fn transition_3_voters_twice_1_full_observer() {
on_exit: Exit,
telemetry_on_connect: None,
voting_rule: (),
prometheus_registry: None,
};
let voter = run_grandpa_voter(grandpa_params).expect("all in order with client and network");
@@ -1166,6 +1169,7 @@ fn voter_persists_its_votes() {
on_exit: Exit,
telemetry_on_connect: None,
voting_rule: VotingRulesBuilder::default().build(),
prometheus_registry: None,
};
let voter = run_grandpa_voter(grandpa_params)
@@ -1511,6 +1515,7 @@ fn voter_catches_up_to_latest_round_when_behind() {
on_exit: Exit,
telemetry_on_connect: None,
voting_rule: (),
prometheus_registry: None,
};
Box::pin(run_grandpa_voter(grandpa_params).expect("all in order with client and network"))
@@ -1642,6 +1647,7 @@ fn grandpa_environment_respects_voting_rules() {
voters: Arc::new(authority_set.current_authorities()),
network,
voting_rule,
metrics: None,
_phantom: PhantomData,
}
};
+1
View File
@@ -52,6 +52,7 @@ sp-consensus = { version = "0.8.0-alpha.2", path = "../../primitives/consensus/c
sp-consensus-babe = { version = "0.8.0-alpha.2", path = "../../primitives/consensus/babe" }
sp-core = { version = "2.0.0-alpha.2", path = "../../primitives/core" }
sp-runtime = { version = "2.0.0-alpha.2", path = "../../primitives/runtime" }
prometheus-endpoint = { package = "substrate-prometheus-endpoint", version = "0.8.0-alpha.2", path = "../../utils/prometheus" }
thiserror = "1"
unsigned-varint = { version = "0.3.1", features = ["futures", "futures-codec"] }
void = "1.0.2"
+4
View File
@@ -41,6 +41,7 @@ use core::{fmt, iter};
use std::{future::Future, pin::Pin};
use std::{error::Error, fs, io::{self, Write}, net::Ipv4Addr, path::{Path, PathBuf}, sync::Arc};
use zeroize::Zeroize;
use prometheus_endpoint::Registry;
/// Network initialization parameters.
pub struct Params<B: BlockT, H: ExHashT> {
@@ -90,6 +91,9 @@ pub struct Params<B: BlockT, H: ExHashT> {
/// Type to check incoming block announcements.
pub block_announce_validator: Box<dyn BlockAnnounceValidator<B> + Send>,
/// Registry for recording prometheus metrics to.
pub metrics_registry: Option<Registry>,
}
bitflags! {
+3
View File
@@ -45,6 +45,8 @@ pub enum Error {
/// The second peer id that was found for the bootnode.
second_id: PeerId,
},
/// Prometheus metrics error.
Prometheus(prometheus_endpoint::PrometheusError)
}
// Make `Debug` use the `Display` implementation.
@@ -60,6 +62,7 @@ impl std::error::Error for Error {
Error::Io(ref err) => Some(err),
Error::Client(ref err) => Some(err),
Error::DuplicateBootnode { .. } => None,
Error::Prometheus(ref err) => Some(err),
}
}
}
+38 -3
View File
@@ -39,6 +39,7 @@ use libp2p::swarm::{NetworkBehaviour, SwarmBuilder, SwarmEvent};
use parking_lot::Mutex;
use sc_peerset::PeersetHandle;
use sp_runtime::{traits::{Block as BlockT, NumberFor}, ConsensusEngineId};
use prometheus_endpoint::{Registry, Gauge, U64, register, PrometheusError};
use crate::{behaviour::{Behaviour, BehaviourOut}, config::{parse_str_addr, parse_addr}};
use crate::{transport, config::NonReservedPeerMode, ReputationChange};
@@ -294,6 +295,10 @@ impl<B: BlockT + 'static, H: ExHashT> NetworkWorker<B, H> {
from_worker,
light_client_rqs: params.on_demand.and_then(|od| od.extract_receiver()),
event_streams: Vec::new(),
metrics: match params.metrics_registry {
Some(registry) => Some(Metrics::register(&registry)?),
None => None
}
})
}
@@ -727,6 +732,26 @@ pub struct NetworkWorker<B: BlockT + 'static, H: ExHashT> {
light_client_rqs: Option<mpsc::UnboundedReceiver<RequestData<B>>>,
/// Senders for events that happen on the network.
event_streams: Vec<mpsc::UnboundedSender<Event>>,
/// Prometheus network metrics.
metrics: Option<Metrics>
}
/// Prometheus metrics reported by the network worker.
// Both gauges are refreshed once per `NetworkWorker::poll` iteration,
// right after the shared `num_connected` / `is_major_syncing` atomics
// are updated.
struct Metrics {
// 1 while the node is performing a major (initial/catch-up) sync, else 0.
is_major_syncing: Gauge<U64>,
// Current number of connected peers, as reported by the user protocol.
peers_count: Gauge<U64>,
}
impl Metrics {
/// Registers the network metrics with the given Prometheus `registry`.
///
/// Returns a `PrometheusError` if any individual registration fails
/// (e.g. duplicate metric name on the registry).
fn register(registry: &Registry) -> Result<Self, PrometheusError> {
Ok(Self {
is_major_syncing: register(Gauge::new(
"is_major_syncing", "Whether the node is performing a major sync or not.",
)?, registry)?,
peers_count: register(Gauge::new(
"peers_count", "Number of network gossip peers",
)?, registry)?,
})
}
}
impl<B: BlockT + 'static, H: ExHashT> Future for NetworkWorker<B, H> {
@@ -818,16 +843,26 @@ impl<B: BlockT + 'static, H: ExHashT> Future for NetworkWorker<B, H> {
};
}
let num_connected_peers = this.network_service.user_protocol_mut().num_connected_peers();
// Update the variables shared with the `NetworkService`.
this.num_connected.store(this.network_service.user_protocol_mut().num_connected_peers(), Ordering::Relaxed);
this.num_connected.store(num_connected_peers, Ordering::Relaxed);
{
let external_addresses = Swarm::<B, H>::external_addresses(&this.network_service).cloned().collect();
*this.external_addresses.lock() = external_addresses;
}
this.is_major_syncing.store(match this.network_service.user_protocol_mut().sync_state() {
let is_major_syncing = match this.network_service.user_protocol_mut().sync_state() {
SyncState::Idle => false,
SyncState::Downloading => true,
}, Ordering::Relaxed);
};
this.is_major_syncing.store(is_major_syncing, Ordering::Relaxed);
if let Some(metrics) = this.metrics.as_ref() {
metrics.is_major_syncing.set(is_major_syncing as u64);
metrics.peers_count.set(num_connected_peers as u64);
}
Poll::Pending
}
+4 -2
View File
@@ -640,7 +640,8 @@ pub trait TestNetFactory: Sized {
transaction_pool: Arc::new(EmptyTransactionPool),
protocol_id: ProtocolId::from(&b"test-protocol-name"[..]),
import_queue,
block_announce_validator: Box::new(DefaultBlockAnnounceValidator::new(client.clone()))
block_announce_validator: Box::new(DefaultBlockAnnounceValidator::new(client.clone())),
metrics_registry: None,
}).unwrap();
self.mut_peers(|peers| {
@@ -713,7 +714,8 @@ pub trait TestNetFactory: Sized {
transaction_pool: Arc::new(EmptyTransactionPool),
protocol_id: ProtocolId::from(&b"test-protocol-name"[..]),
import_queue,
block_announce_validator: Box::new(DefaultBlockAnnounceValidator::new(client.clone()))
block_announce_validator: Box::new(DefaultBlockAnnounceValidator::new(client.clone())),
metrics_registry: None,
}).unwrap();
self.mut_peers(|peers| {
+1 -1
View File
@@ -56,7 +56,7 @@ sc-rpc = { version = "2.0.0-alpha.2", path = "../rpc" }
sc-telemetry = { version = "2.0.0-alpha.2", path = "../telemetry" }
sc-offchain = { version = "2.0.0-alpha.2", path = "../offchain" }
parity-multiaddr = { package = "parity-multiaddr", version = "0.5.0" }
substrate-prometheus-endpoint = { path = "../../utils/prometheus" , version = "0.8.0-alpha.2"}
prometheus-endpoint = { package = "substrate-prometheus-endpoint", path = "../../utils/prometheus" , version = "0.8.0-alpha.2"}
sc-tracing = { version = "2.0.0-alpha.2", path = "../tracing" }
tracing = "0.1.10"
parity-util-mem = { version = "0.5.1", default-features = false, features = ["primitive-types"] }
+21 -15
View File
@@ -53,15 +53,15 @@ use sysinfo::{get_current_pid, ProcessExt, System, SystemExt};
use sc_telemetry::{telemetry, SUBSTRATE_INFO};
use sp_transaction_pool::{MaintainedTransactionPool, ChainEvent};
use sp_blockchain;
use substrate_prometheus_endpoint::{register, Gauge, U64, F64, Registry, PrometheusError, Opts, GaugeVec};
use prometheus_endpoint::{register, Gauge, U64, F64, Registry, PrometheusError, Opts, GaugeVec};
struct ServiceMetrics {
block_height_number: GaugeVec<U64>,
peers_count: Gauge<U64>,
ready_transactions_number: Gauge<U64>,
memory_usage_bytes: Gauge<U64>,
cpu_usage_percentage: Gauge<F64>,
network_per_sec_bytes: GaugeVec<U64>,
node_roles: Gauge<U64>,
}
impl ServiceMetrics {
@@ -71,9 +71,6 @@ impl ServiceMetrics {
Opts::new("block_height_number", "Height of the chain"),
&["status"]
)?, registry)?,
peers_count: register(Gauge::new(
"peers_count", "Number of network gossip peers",
)?, registry)?,
ready_transactions_number: register(Gauge::new(
"ready_transactions_number", "Number of transactions in the ready queue",
)?, registry)?,
@@ -87,6 +84,10 @@ impl ServiceMetrics {
Opts::new("network_per_sec_bytes", "Networking bytes per second"),
&["direction"]
)?, registry)?,
node_roles: register(Gauge::new(
"node_roles", "The roles the node is running as",
)?, registry)?,
})
}
}
@@ -887,6 +888,14 @@ ServiceBuilder<
let block_announce_validator =
Box::new(sp_consensus::block_validation::DefaultBlockAnnounceValidator::new(client.clone()));
let prometheus_registry_and_port = match config.prometheus_port {
Some(port) => match prometheus_registry {
Some(registry) => Some((registry, port)),
None => Some((Registry::new_custom(Some("substrate".into()), None)?, port))
},
None => None
};
let network_params = sc_network::config::Params {
roles: config.roles,
executor: {
@@ -906,6 +915,7 @@ ServiceBuilder<
import_queue,
protocol_id,
block_announce_validator,
metrics_registry: prometheus_registry_and_port.as_ref().map(|(r, _)| r.clone())
};
let has_bootnodes = !network_params.network_config.boot_nodes.is_empty();
@@ -1020,17 +1030,14 @@ ServiceBuilder<
));
}
// Prometheus endpoint and metrics
let metrics = if let Some(port) = config.prometheus_port {
let registry = match prometheus_registry {
Some(registry) => registry,
None => Registry::new_custom(Some("substrate".into()), None)?
};
// Prometheus metrics
let metrics = if let Some((registry, port)) = prometheus_registry_and_port.clone() {
let metrics = ServiceMetrics::register(&registry)?;
metrics.node_roles.set(u64::from(config.roles.bits()));
let future = select(
substrate_prometheus_endpoint::init_prometheus(port, registry).boxed(),
prometheus_endpoint::init_prometheus(port, registry).boxed(),
exit.clone()
).map(drop);
@@ -1043,7 +1050,6 @@ ServiceBuilder<
} else {
None
};
// Periodically notify the telemetry.
let transaction_pool_ = transaction_pool.clone();
let client_ = client.clone();
@@ -1102,7 +1108,6 @@ ServiceBuilder<
metrics.memory_usage_bytes.set(memory);
metrics.cpu_usage_percentage.set(f64::from(cpu_usage));
metrics.ready_transactions_number.set(txpool_status.ready as u64);
metrics.peers_count.set(num_peers as u64);
metrics.network_per_sec_bytes.with_label_values(&["download"]).set(net_status.average_download_per_sec);
metrics.network_per_sec_bytes.with_label_values(&["upload"]).set(net_status.average_upload_per_sec);
@@ -1305,6 +1310,7 @@ ServiceBuilder<
_telemetry_on_connect_sinks: telemetry_connection_sinks.clone(),
keystore,
marker: PhantomData::<TBl>,
prometheus_registry: prometheus_registry_and_port.map(|(r, _)| r)
})
}
}
+2 -2
View File
@@ -53,8 +53,8 @@ impl<'a> From<&'a str> for Error {
}
}
impl From<substrate_prometheus_endpoint::PrometheusError> for Error {
fn from(e: substrate_prometheus_endpoint::PrometheusError) -> Self {
impl From<prometheus_endpoint::PrometheusError> for Error {
fn from(e: prometheus_endpoint::PrometheusError) -> Self {
Error::Other(format!("Prometheus error: {}", e))
}
}
+8
View File
@@ -114,6 +114,7 @@ pub struct Service<TBl, TCl, TSc, TNetStatus, TNet, TTxPool, TOc> {
_offchain_workers: Option<Arc<TOc>>,
keystore: sc_keystore::KeyStorePtr,
marker: PhantomData<TBl>,
prometheus_registry: Option<prometheus_endpoint::Registry>,
}
impl<TBl, TCl, TSc, TNetStatus, TNet, TTxPool, TOc> Unpin for Service<TBl, TCl, TSc, TNetStatus, TNet, TTxPool, TOc> {}
@@ -225,6 +226,9 @@ pub trait AbstractService: 'static + Future<Output = Result<(), Error>> +
/// Get a handle to a future that will resolve on exit.
fn on_exit(&self) -> ::exit_future::Exit;
/// Get the prometheus metrics registry, if available.
fn prometheus_registry(&self) -> Option<prometheus_endpoint::Registry>;
}
impl<TBl, TBackend, TExec, TRtApi, TSc, TExPool, TOc> AbstractService for
@@ -328,6 +332,10 @@ where
fn on_exit(&self) -> exit_future::Exit {
self.exit.clone()
}
fn prometheus_registry(&self) -> Option<prometheus_endpoint::Registry> {
self.prometheus_registry.clone()
}
}
impl<TBl, TCl, TSc, TNetStatus, TNet, TTxPool, TOc> Future for