// This file is part of Bizinikiwi. // Copyright (C) Parity Technologies (UK) Ltd. // SPDX-License-Identifier: GPL-3.0-or-later WITH Classpath-exception-2.0 // This program is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation, either version 3 of the License, or // (at your option) any later version. // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // You should have received a copy of the GNU General Public License // along with this program. If not, see . //! Bizinikiwi service tasks management module. use crate::{config::TaskType, Error}; use exit_future::Signal; use futures::{ future::{pending, select, try_join_all, BoxFuture, Either}, Future, FutureExt, StreamExt, }; use parking_lot::Mutex; use pezsc_utils::mpsc::{tracing_unbounded, TracingUnboundedReceiver, TracingUnboundedSender}; use prometheus_endpoint::{ exponential_buckets, register, CounterVec, HistogramOpts, HistogramVec, Opts, PrometheusError, Registry, U64, }; use std::{ collections::{hash_map::Entry, HashMap}, panic, pin::Pin, result::Result, sync::Arc, }; use tokio::runtime::Handle; use tracing_futures::Instrument; mod prometheus_future; #[cfg(test)] mod tests; /// Default task group name. pub const DEFAULT_GROUP_NAME: &str = "default"; /// The name of a group a task belongs to. /// /// This name is passed belong-side the task name to the prometheus metrics and can be used /// to group tasks. pub enum GroupName { /// Sets the group name to `default`. Default, /// Use the specifically given name as group name. Specific(&'static str), } impl From> for GroupName { fn from(name: Option<&'static str>) -> Self { match name { Some(name) => Self::Specific(name), None => Self::Default, } } } impl From<&'static str> for GroupName { fn from(name: &'static str) -> Self { Self::Specific(name) } } /// An handle for spawning tasks in the service. #[derive(Clone)] pub struct SpawnTaskHandle { on_exit: exit_future::Exit, tokio_handle: Handle, metrics: Option, task_registry: TaskRegistry, } impl SpawnTaskHandle { /// Spawns the given task with the given name and a group name. /// If group is not specified `DEFAULT_GROUP_NAME` will be used. /// /// Note that the `name` is a `&'static str`. The reason for this choice is that /// statistics about this task are getting reported to the Prometheus endpoint (if enabled), and /// that therefore the set of possible task names must be bounded. /// /// In other words, it would be a bad idea for someone to do for example /// `spawn(format!("{:?}", some_public_key))`. pub fn spawn( &self, name: &'static str, group: impl Into, task: impl Future + Send + 'static, ) { self.spawn_inner(name, group, task, TaskType::Async) } /// Spawns the blocking task with the given name. See also `spawn`. pub fn spawn_blocking( &self, name: &'static str, group: impl Into, task: impl Future + Send + 'static, ) { self.spawn_inner(name, group, task, TaskType::Blocking) } /// Helper function that implements the spawning logic. See `spawn` and `spawn_blocking`. fn spawn_inner( &self, name: &'static str, group: impl Into, task: impl Future + Send + 'static, task_type: TaskType, ) { let on_exit = self.on_exit.clone(); let metrics = self.metrics.clone(); let registry = self.task_registry.clone(); let group = match group.into() { GroupName::Specific(var) => var, // If no group is specified use default. GroupName::Default => DEFAULT_GROUP_NAME, }; let task_type_label = match task_type { TaskType::Blocking => "blocking", TaskType::Async => "async", }; // Note that we increase the started counter here and not within the future. This way, // we could properly visualize on Prometheus situations where the spawning doesn't work. if let Some(metrics) = &self.metrics { metrics.tasks_spawned.with_label_values(&[name, group, task_type_label]).inc(); // We do a dummy increase in order for the task to show up in metrics. metrics .tasks_ended .with_label_values(&[name, "finished", group, task_type_label]) .inc_by(0); } let future = async move { // Register the task and keep the "token" alive until the task is ended. Then this // "token" will unregister this task. let _registry_token = registry.register_task(name, group); if let Some(metrics) = metrics { // Add some wrappers around `task`. let task = { let poll_duration = metrics.poll_duration.with_label_values(&[name, group, task_type_label]); let poll_start = metrics.poll_start.with_label_values(&[name, group, task_type_label]); let inner = prometheus_future::with_poll_durations(poll_duration, poll_start, task); // The logic of `AssertUnwindSafe` here is ok considering that we throw // away the `Future` after it has panicked. panic::AssertUnwindSafe(inner).catch_unwind() }; futures::pin_mut!(task); match select(on_exit, task).await { Either::Right((Err(payload), _)) => { metrics .tasks_ended .with_label_values(&[name, "panic", group, task_type_label]) .inc(); panic::resume_unwind(payload) }, Either::Right((Ok(()), _)) => { metrics .tasks_ended .with_label_values(&[name, "finished", group, task_type_label]) .inc(); }, Either::Left(((), _)) => { // The `on_exit` has triggered. metrics .tasks_ended .with_label_values(&[name, "interrupted", group, task_type_label]) .inc(); }, } } else { futures::pin_mut!(task); let _ = select(on_exit, task).await; } } .in_current_span(); match task_type { TaskType::Async => { self.tokio_handle.spawn(future); }, TaskType::Blocking => { let handle = self.tokio_handle.clone(); self.tokio_handle.spawn_blocking(move || { handle.block_on(future); }); }, } } } impl pezsp_core::traits::SpawnNamed for SpawnTaskHandle { fn spawn_blocking( &self, name: &'static str, group: Option<&'static str>, future: BoxFuture<'static, ()>, ) { self.spawn_inner(name, group, future, TaskType::Blocking) } fn spawn( &self, name: &'static str, group: Option<&'static str>, future: BoxFuture<'static, ()>, ) { self.spawn_inner(name, group, future, TaskType::Async) } } /// A wrapper over `SpawnTaskHandle` that will notify a receiver whenever any /// task spawned through it fails. The service should be on the receiver side /// and will shut itself down whenever it receives any message, i.e. an /// essential task has failed. #[derive(Clone)] pub struct SpawnEssentialTaskHandle { essential_failed_tx: TracingUnboundedSender<()>, inner: SpawnTaskHandle, } impl SpawnEssentialTaskHandle { /// Creates a new `SpawnEssentialTaskHandle`. pub fn new( essential_failed_tx: TracingUnboundedSender<()>, spawn_task_handle: SpawnTaskHandle, ) -> SpawnEssentialTaskHandle { SpawnEssentialTaskHandle { essential_failed_tx, inner: spawn_task_handle } } /// Spawns the given task with the given name. /// /// See also [`SpawnTaskHandle::spawn`]. pub fn spawn( &self, name: &'static str, group: impl Into, task: impl Future + Send + 'static, ) { self.spawn_inner(name, group, task, TaskType::Async) } /// Spawns the blocking task with the given name. /// /// See also [`SpawnTaskHandle::spawn_blocking`]. pub fn spawn_blocking( &self, name: &'static str, group: impl Into, task: impl Future + Send + 'static, ) { self.spawn_inner(name, group, task, TaskType::Blocking) } fn spawn_inner( &self, name: &'static str, group: impl Into, task: impl Future + Send + 'static, task_type: TaskType, ) { let essential_failed = self.essential_failed_tx.clone(); let essential_task = std::panic::AssertUnwindSafe(task).catch_unwind().map(move |_| { log::error!("Essential task `{}` failed. Shutting down service.", name); let _ = essential_failed.close(); }); let _ = self.inner.spawn_inner(name, group, essential_task, task_type); } } impl pezsp_core::traits::SpawnEssentialNamed for SpawnEssentialTaskHandle { fn spawn_essential_blocking( &self, name: &'static str, group: Option<&'static str>, future: BoxFuture<'static, ()>, ) { self.spawn_blocking(name, group, future); } fn spawn_essential( &self, name: &'static str, group: Option<&'static str>, future: BoxFuture<'static, ()>, ) { self.spawn(name, group, future); } } /// Helper struct to manage background/async tasks in Service. pub struct TaskManager { /// A future that resolves when the service has exited, this is useful to /// make sure any internally spawned futures stop when the service does. on_exit: exit_future::Exit, /// A signal that makes the exit future above resolve, fired on drop. _signal: Signal, /// Tokio runtime handle that is used to spawn futures. tokio_handle: Handle, /// Prometheus metric where to report the polling times. metrics: Option, /// Send a signal when a spawned essential task has concluded. The next time /// the service future is polled it should complete with an error. essential_failed_tx: TracingUnboundedSender<()>, /// A receiver for spawned essential-tasks concluding. essential_failed_rx: TracingUnboundedReceiver<()>, /// Things to keep alive until the task manager is dropped. keep_alive: Box, /// A list of other `TaskManager`'s to terminate and gracefully shutdown when the parent /// terminates and gracefully shutdown. Also ends the parent `future()` if a child's essential /// task fails. children: Vec, /// The registry of all running tasks. task_registry: TaskRegistry, } impl TaskManager { /// If a Prometheus registry is passed, it will be used to report statistics about the /// service tasks. pub fn new( tokio_handle: Handle, prometheus_registry: Option<&Registry>, ) -> Result { let (signal, on_exit) = exit_future::signal(); // A side-channel for essential tasks to communicate shutdown. let (essential_failed_tx, essential_failed_rx) = tracing_unbounded("mpsc_essential_tasks", 100); let metrics = prometheus_registry.map(Metrics::register).transpose()?; Ok(Self { on_exit, _signal: signal, tokio_handle, metrics, essential_failed_tx, essential_failed_rx, keep_alive: Box::new(()), children: Vec::new(), task_registry: Default::default(), }) } /// Get a handle for spawning tasks. pub fn spawn_handle(&self) -> SpawnTaskHandle { SpawnTaskHandle { on_exit: self.on_exit.clone(), tokio_handle: self.tokio_handle.clone(), metrics: self.metrics.clone(), task_registry: self.task_registry.clone(), } } /// Get a handle for spawning essential tasks. pub fn spawn_essential_handle(&self) -> SpawnEssentialTaskHandle { SpawnEssentialTaskHandle::new(self.essential_failed_tx.clone(), self.spawn_handle()) } /// Return a future that will end with success if the signal to terminate was sent /// (`self.terminate()`) or with an error if an essential task fails. /// /// # Warning /// /// This function will not wait until the end of the remaining task. pub fn future<'a>( &'a mut self, ) -> Pin> + Send + 'a>> { Box::pin(async move { let mut t1 = self.essential_failed_rx.next().fuse(); let mut t2 = self.on_exit.clone().fuse(); let mut t3 = try_join_all( self.children .iter_mut() .map(|x| x.future()) // Never end this future if there is no error because if there is no children, // it must not stop .chain(std::iter::once(pending().boxed())), ) .fuse(); futures::select! { _ = t1 => Err(Error::Other("Essential task failed.".into())), _ = t2 => Ok(()), res = t3 => Err(res.map(|_| ()).expect_err("this future never ends; qed")), } }) } /// Set what the task manager should keep alive, can be called multiple times. pub fn keep_alive(&mut self, to_keep_alive: T) { // allows this fn to safely called multiple times. use std::mem; let old = mem::replace(&mut self.keep_alive, Box::new(())); self.keep_alive = Box::new((to_keep_alive, old)); } /// Register another TaskManager to terminate and gracefully shutdown when the parent /// terminates and gracefully shutdown. Also ends the parent `future()` if a child's essential /// task fails. (But don't end the parent if a child's normal task fails.) pub fn add_child(&mut self, child: TaskManager) { self.children.push(child); } /// Consume `self` and return the [`TaskRegistry`]. /// /// This [`TaskRegistry`] can be used to check for still running tasks after this task manager /// was dropped. pub fn into_task_registry(self) -> TaskRegistry { self.task_registry } } #[derive(Clone)] struct Metrics { // This list is ordered alphabetically poll_duration: HistogramVec, poll_start: CounterVec, tasks_spawned: CounterVec, tasks_ended: CounterVec, } impl Metrics { fn register(registry: &Registry) -> Result { Ok(Self { poll_duration: register(HistogramVec::new( HistogramOpts { common_opts: Opts::new( "bizinikiwi_tasks_polling_duration", "Duration in seconds of each invocation of Future::poll" ), buckets: exponential_buckets(0.001, 4.0, 9) .expect("function parameters are constant and always valid; qed"), }, &["task_name", "task_group", "kind"] )?, registry)?, poll_start: register(CounterVec::new( Opts::new( "bizinikiwi_tasks_polling_started_total", "Total number of times we started invoking Future::poll" ), &["task_name", "task_group", "kind"] )?, registry)?, tasks_spawned: register(CounterVec::new( Opts::new( "bizinikiwi_tasks_spawned_total", "Total number of tasks that have been spawned on the Service" ), &["task_name", "task_group", "kind"] )?, registry)?, tasks_ended: register(CounterVec::new( Opts::new( "bizinikiwi_tasks_ended_total", "Total number of tasks for which Future::poll has returned Ready(()) or panicked" ), &["task_name", "reason", "task_group", "kind"] )?, registry)?, }) } } /// Ensures that a [`Task`] is unregistered when this object is dropped. struct UnregisterOnDrop { task: Task, registry: TaskRegistry, } impl Drop for UnregisterOnDrop { fn drop(&mut self) { let mut tasks = self.registry.tasks.lock(); if let Entry::Occupied(mut entry) = (*tasks).entry(self.task.clone()) { *entry.get_mut() -= 1; if *entry.get() == 0 { entry.remove(); } } } } /// Represents a running async task in the [`TaskManager`]. /// /// As a task is identified by a name and a group, it is totally valid that there exists multiple /// tasks with the same name and group. #[derive(Clone, Hash, Eq, PartialEq)] pub struct Task { /// The name of the task. pub name: &'static str, /// The group this task is associated to. pub group: &'static str, } impl Task { /// Returns if the `group` is the [`DEFAULT_GROUP_NAME`]. pub fn is_default_group(&self) -> bool { self.group == DEFAULT_GROUP_NAME } } /// Keeps track of all running [`Task`]s in [`TaskManager`]. #[derive(Clone, Default)] pub struct TaskRegistry { tasks: Arc>>, } impl TaskRegistry { /// Register a task with the given `name` and `group`. /// /// Returns [`UnregisterOnDrop`] that ensures that the task is unregistered when this value is /// dropped. fn register_task(&self, name: &'static str, group: &'static str) -> UnregisterOnDrop { let task = Task { name, group }; { let mut tasks = self.tasks.lock(); *(*tasks).entry(task.clone()).or_default() += 1; } UnregisterOnDrop { task, registry: self.clone() } } /// Returns the running tasks. /// /// As a task is only identified by its `name` and `group`, there can be duplicate tasks. The /// number per task represents the concurrently running tasks with the same identifier. pub fn running_tasks(&self) -> HashMap { (*self.tasks.lock()).clone() } }