initial prometheus metrics (#1536)

* service-new: cosmetic changes

* overseer: draft of prometheus metrics

* metrics: update active_leaves metrics

* metrics: extract into functions

* metrics: resolve XXX

* metrics: it's ugly, but it works

* Bump Substrate

* metrics: move a bunch of code around

* Bump Substrate again

* metrics: fix a warning

* fix a warning in runtime

* metrics: statements signed

* metrics: statements impl RegisterMetrics

* metrics: refactor Metrics trait

* metrics: add Metrics assoc type to JobTrait

* metrics: move Metrics trait to util

* metrics: fix overseer

* metrics: fix backing

* metrics: fix candidate validation

* metrics: derive Default

* metrics: docs

* metrics: add stubs for other subsystems

* metrics: add more stubs and fix compilation

* metrics: fix doctest

* metrics: move to subsystem

* metrics: fix candidate validation

* metrics: bitfield signing

* metrics: av store

* metrics: chain API

* metrics: runtime API

* metrics: stub for avad

* metrics: candidates seconded

* metrics: ok I gave up

* metrics: provisioner

* metrics: remove a clone by requiring Metrics: Sync

* metrics: YAGNI

* metrics: remove another TODO

* metrics: for later

* metrics: add parachain_ prefix

* metrics: s/signed_statement/signed_statements

* utils: add a comment for job metrics

* metrics: address review comments

* metrics: oops

* metrics: make sure to save files before commit 😅

* use _total suffix for requests metrics

Co-authored-by: Max Inden <mail@max-inden.de>

* metrics: add tests for overseer

* update Cargo.lock

* overseer: add a test for CollationGeneration

* collation-generation: impl metrics

* collation-generation: use kebab-case for name

* collation-generation: add a constructor

Co-authored-by: Gav Wood <gavin@parity.io>
Co-authored-by: Ashley Ruglys <ashley.ruglys@gmail.com>
Co-authored-by: Max Inden <mail@max-inden.de>
This commit is contained in:
Andronik Ordian
2020-08-18 11:18:54 +02:00
committed by GitHub
parent ae37a00c17
commit e7ead40255
20 changed files with 742 additions and 106 deletions
+38 -15
View File
@@ -24,6 +24,7 @@ use polkadot_node_subsystem::{
errors::{ChainApiError, RuntimeApiError},
messages::{AllMessages, RuntimeApiMessage, RuntimeApiRequest, RuntimeApiSender},
FromOverseer, SpawnedSubsystem, Subsystem, SubsystemContext, SubsystemError, SubsystemResult,
metrics,
};
use futures::{
channel::{mpsc, oneshot},
@@ -63,11 +64,13 @@ pub mod reexports {
};
}
/// Duration a job will wait after sending a stop signal before hard-aborting.
pub const JOB_GRACEFUL_STOP_DURATION: Duration = Duration::from_secs(1);
/// Capacity of channels to and from individual jobs
pub const JOB_CHANNEL_CAPACITY: usize = 64;
/// Utility errors
#[derive(Debug, derive_more::From)]
pub enum Error {
@@ -446,6 +449,12 @@ pub trait JobTrait: Unpin {
///
/// If no extra information is needed, it is perfectly acceptable to set it to `()`.
type RunArgs: 'static + Send;
/// Subsystem-specific Prometheus metrics.
///
/// Jobs spawned by one subsystem should share the same
/// instance of metrics (use `.clone()`).
/// The `delegated_subsystem!` macro should take care of this.
type Metrics: 'static + metrics::Metrics + Send;
/// Name of the job, i.e. `CandidateBackingJob`
const NAME: &'static str;
@@ -454,6 +463,7 @@ pub trait JobTrait: Unpin {
fn run(
parent: Hash,
run_args: Self::RunArgs,
metrics: Self::Metrics,
receiver: mpsc::Receiver<Self::ToJob>,
sender: mpsc::Sender<Self::FromJob>,
) -> Pin<Box<dyn Future<Output = Result<(), Self::Error>> + Send>>;
@@ -532,7 +542,7 @@ impl<Spawner: SpawnNamed, Job: 'static + JobTrait> Jobs<Spawner, Job> {
}
/// Spawn a new job for this `parent_hash`, with whatever args are appropriate.
fn spawn_job(&mut self, parent_hash: Hash, run_args: Job::RunArgs) -> Result<(), Error> {
fn spawn_job(&mut self, parent_hash: Hash, run_args: Job::RunArgs, metrics: Job::Metrics) -> Result<(), Error> {
let (to_job_tx, to_job_rx) = mpsc::channel(JOB_CHANNEL_CAPACITY);
let (from_job_tx, from_job_rx) = mpsc::channel(JOB_CHANNEL_CAPACITY);
let (finished_tx, finished) = oneshot::channel();
@@ -541,7 +551,7 @@ impl<Spawner: SpawnNamed, Job: 'static + JobTrait> Jobs<Spawner, Job> {
let err_tx = self.errors.clone();
let (future, abort_handle) = future::abortable(async move {
if let Err(e) = Job::run(parent_hash, run_args, to_job_rx, from_job_tx).await {
if let Err(e) = Job::run(parent_hash, run_args, metrics, to_job_rx, from_job_tx).await {
log::error!(
"{}({}) finished with an error {:?}",
Job::NAME,
@@ -648,6 +658,7 @@ where
pub struct JobManager<Spawner, Context, Job: JobTrait> {
spawner: Spawner,
run_args: Job::RunArgs,
metrics: Job::Metrics,
context: std::marker::PhantomData<Context>,
job: std::marker::PhantomData<Job>,
errors: Option<mpsc::Sender<(Option<Hash>, JobsError<Job::Error>)>>,
@@ -662,10 +673,11 @@ where
Job::ToJob: TryFrom<AllMessages> + TryFrom<<Context as SubsystemContext>::Message> + Sync,
{
/// Creates a new `Subsystem`.
pub fn new(spawner: Spawner, run_args: Job::RunArgs) -> Self {
pub fn new(spawner: Spawner, run_args: Job::RunArgs, metrics: Job::Metrics) -> Self {
Self {
spawner,
run_args,
metrics,
context: std::marker::PhantomData,
job: std::marker::PhantomData,
errors: None,
@@ -703,6 +715,7 @@ where
pub async fn run(
mut ctx: Context,
run_args: Job::RunArgs,
metrics: Job::Metrics,
spawner: Spawner,
mut err_tx: Option<mpsc::Sender<(Option<Hash>, JobsError<Job::Error>)>>,
) {
@@ -714,7 +727,7 @@ where
loop {
select! {
incoming = ctx.recv().fuse() => if Self::handle_incoming(incoming, &mut jobs, &run_args, &mut err_tx).await { break },
incoming = ctx.recv().fuse() => if Self::handle_incoming(incoming, &mut jobs, &run_args, &metrics, &mut err_tx).await { break },
outgoing = jobs.next().fuse() => if Self::handle_outgoing(outgoing, &mut ctx, &mut err_tx).await { break },
complete => break,
}
@@ -741,6 +754,7 @@ where
incoming: SubsystemResult<FromOverseer<Context::Message>>,
jobs: &mut Jobs<Spawner, Job>,
run_args: &Job::RunArgs,
metrics: &Job::Metrics,
err_tx: &mut Option<mpsc::Sender<(Option<Hash>, JobsError<Job::Error>)>>,
) -> bool {
use polkadot_node_subsystem::ActiveLeavesUpdate;
@@ -753,7 +767,8 @@ where
deactivated,
}))) => {
for hash in activated {
if let Err(e) = jobs.spawn_job(hash, run_args.clone()) {
let metrics = metrics.clone();
if let Err(e) = jobs.spawn_job(hash, run_args.clone(), metrics) {
log::error!("Failed to spawn a job: {:?}", e);
Self::fwd_err(Some(hash), e.into(), err_tx).await;
return true;
@@ -849,14 +864,18 @@ where
Job: 'static + JobTrait + Send,
Job::RunArgs: Clone + Sync,
Job::ToJob: TryFrom<AllMessages> + Sync,
Job::Metrics: Sync,
{
type Metrics = Job::Metrics;
fn start(self, ctx: Context) -> SpawnedSubsystem {
let spawner = self.spawner.clone();
let run_args = self.run_args.clone();
let metrics = self.metrics.clone();
let errors = self.errors;
let future = Box::pin(async move {
Self::run(ctx, run_args, spawner, errors).await;
Self::run(ctx, run_args, metrics, spawner, errors).await;
});
SpawnedSubsystem {
@@ -901,11 +920,11 @@ where
/// ```
#[macro_export]
macro_rules! delegated_subsystem {
($job:ident($run_args:ty) <- $to_job:ty as $subsystem:ident) => {
delegated_subsystem!($job($run_args) <- $to_job as $subsystem; stringify!($subsystem));
($job:ident($run_args:ty, $metrics:ty) <- $to_job:ty as $subsystem:ident) => {
delegated_subsystem!($job($run_args, $metrics) <- $to_job as $subsystem; stringify!($subsystem));
};
($job:ident($run_args:ty) <- $to_job:ty as $subsystem:ident; $subsystem_name:expr) => {
($job:ident($run_args:ty, $metrics:ty) <- $to_job:ty as $subsystem:ident; $subsystem_name:expr) => {
#[doc = "Manager type for the "]
#[doc = $subsystem_name]
type Manager<Spawner, Context> = $crate::JobManager<Spawner, Context, $job>;
@@ -924,15 +943,15 @@ macro_rules! delegated_subsystem {
{
#[doc = "Creates a new "]
#[doc = $subsystem_name]
pub fn new(spawner: Spawner, run_args: $run_args) -> Self {
pub fn new(spawner: Spawner, run_args: $run_args, metrics: $metrics) -> Self {
$subsystem {
manager: $crate::JobManager::new(spawner, run_args)
manager: $crate::JobManager::new(spawner, run_args, metrics)
}
}
/// Run this subsystem
pub async fn run(ctx: Context, run_args: $run_args, spawner: Spawner) {
<Manager<Spawner, Context>>::run(ctx, run_args, spawner, None).await
pub async fn run(ctx: Context, run_args: $run_args, metrics: $metrics, spawner: Spawner) {
<Manager<Spawner, Context>>::run(ctx, run_args, metrics, spawner, None).await
}
}
@@ -942,6 +961,8 @@ macro_rules! delegated_subsystem {
Context: $crate::reexports::SubsystemContext,
<Context as $crate::reexports::SubsystemContext>::Message: Into<$to_job>,
{
type Metrics = $metrics;
fn start(self, ctx: Context) -> $crate::reexports::SpawnedSubsystem {
self.manager.start(ctx)
}
@@ -1061,6 +1082,7 @@ mod tests {
// RunArgs get cloned so that each job gets its own owned copy. If you need cheap clones, wrap
// it in an Arc. Within a testing context, that efficiency is less important.
type RunArgs = HashMap<Hash, Vec<FromJob>>;
type Metrics = ();
const NAME: &'static str = "FakeCandidateSelectionJob";
@@ -1070,6 +1092,7 @@ mod tests {
fn run(
parent: Hash,
mut run_args: Self::RunArgs,
_metrics: Self::Metrics,
receiver: mpsc::Receiver<ToJob>,
mut sender: mpsc::Sender<FromJob>,
) -> Pin<Box<dyn Future<Output = Result<(), Self::Error>> + Send>> {
@@ -1121,7 +1144,7 @@ mod tests {
let (context, overseer_handle) = make_subsystem_context(pool.clone());
let (err_tx, err_rx) = mpsc::channel(16);
let subsystem = FakeCandidateSelectionSubsystem::run(context, run_args, pool, Some(err_tx));
let subsystem = FakeCandidateSelectionSubsystem::run(context, run_args, (), pool, Some(err_tx));
let test_future = test(overseer_handle, err_rx);
let timeout = Delay::new(Duration::from_secs(2));
@@ -1196,7 +1219,7 @@ mod tests {
let (context, _) = make_subsystem_context::<CandidateSelectionMessage, _>(pool.clone());
let SpawnedSubsystem { name, .. } =
FakeCandidateSelectionSubsystem::new(pool, HashMap::new()).start(context);
FakeCandidateSelectionSubsystem::new(pool, HashMap::new(), ()).start(context);
assert_eq!(name, "FakeCandidateSelection");
}
}