Changed how relay loops are started (#840)

* slightly changed relay loop initialization

* git mv

* clippy

* more clippy

* loop_run -> run_loop

* review and clippy

* clippy
This commit is contained in:
Svyatoslav Nikolsky
2021-03-24 11:46:30 +03:00
committed by Bastian Köcher
parent 8d122b03f1
commit a17c7eb80c
21 changed files with 379 additions and 348 deletions
+2
View File
@@ -16,6 +16,8 @@
//! Utilities used by different relays.
pub use relay_loop::relay_loop;
use backoff::{backoff::Backoff, ExponentialBackoff};
use futures::future::FutureExt;
use std::time::Duration;
+23 -120
View File
@@ -14,12 +14,13 @@
// You should have received a copy of the GNU General Public License
// along with Parity Bridges Common. If not, see <http://www.gnu.org/licenses/>.
pub use global::GlobalMetrics;
pub use substrate_prometheus_endpoint::{register, Counter, CounterVec, Gauge, GaugeVec, Opts, Registry, F64, U64};
use async_std::sync::{Arc, Mutex};
use std::net::SocketAddr;
use substrate_prometheus_endpoint::init_prometheus;
use sysinfo::{ProcessExt, RefreshKind, System, SystemExt};
use async_trait::async_trait;
use std::time::Duration;
mod global;
/// Prometheus endpoint MetricsParams.
#[derive(Debug, Clone)]
@@ -31,62 +32,32 @@ pub struct MetricsParams {
}
/// Metrics API.
pub trait Metrics {
pub trait Metrics: Clone + Send + Sync + 'static {
/// Register metrics in the registry.
fn register(&self, registry: &Registry) -> Result<(), String>;
}
/// Global Prometheus metrics.
#[derive(Debug, Clone)]
pub struct GlobalMetrics {
system: Arc<Mutex<System>>,
system_average_load: GaugeVec<F64>,
process_cpu_usage_percentage: Gauge<F64>,
process_memory_usage_bytes: Gauge<U64>,
}
/// Standalone metrics API.
///
/// Metrics of this kind know how to update themselves, so we may just spawn and forget the
/// asynchronous self-update task.
#[async_trait]
pub trait StandaloneMetrics: Metrics {
/// Update metric values.
async fn update(&self);
/// Start Prometheus endpoint with given metrics registry.
pub fn start(
prefix: String,
params: Option<MetricsParams>,
global_metrics: &GlobalMetrics,
extra_metrics: &impl Metrics,
) {
let params = match params {
Some(params) => params,
None => return,
};
/// Metrics update interval.
fn update_interval(&self) -> Duration;
assert!(!prefix.is_empty(), "Metrics prefix can not be empty");
let do_start = move || {
let prometheus_socket_addr = SocketAddr::new(
params
.host
.parse()
.map_err(|err| format!("Invalid Prometheus host {}: {}", params.host, err))?,
params.port,
);
let metrics_registry =
Registry::new_custom(Some(prefix), None).expect("only fails if prefix is empty; prefix is not empty; qed");
global_metrics.register(&metrics_registry)?;
extra_metrics.register(&metrics_registry)?;
/// Spawn the self update task that will keep update metric value at given intervals.
fn spawn(self) {
async_std::task::spawn(async move {
init_prometheus(prometheus_socket_addr, metrics_registry)
.await
.map_err(|err| format!("Error starting Prometheus endpoint: {}", err))
let update_interval = self.update_interval();
loop {
self.update().await;
async_std::task::sleep(update_interval).await;
}
});
Ok(())
};
let result: Result<(), String> = do_start();
if let Err(err) = result {
log::warn!(
target: "bridge",
"Failed to expose metrics: {}",
err,
);
}
}
@@ -98,71 +69,3 @@ impl Default for MetricsParams {
}
}
}
impl Metrics for GlobalMetrics {
fn register(&self, registry: &Registry) -> Result<(), String> {
register(self.system_average_load.clone(), registry).map_err(|e| e.to_string())?;
register(self.process_cpu_usage_percentage.clone(), registry).map_err(|e| e.to_string())?;
register(self.process_memory_usage_bytes.clone(), registry).map_err(|e| e.to_string())?;
Ok(())
}
}
impl Default for GlobalMetrics {
fn default() -> Self {
GlobalMetrics {
system: Arc::new(Mutex::new(System::new_with_specifics(RefreshKind::everything()))),
system_average_load: GaugeVec::new(Opts::new("system_average_load", "System load average"), &["over"])
.expect("metric is static and thus valid; qed"),
process_cpu_usage_percentage: Gauge::new("process_cpu_usage_percentage", "Process CPU usage")
.expect("metric is static and thus valid; qed"),
process_memory_usage_bytes: Gauge::new(
"process_memory_usage_bytes",
"Process memory (resident set size) usage",
)
.expect("metric is static and thus valid; qed"),
}
}
}
impl GlobalMetrics {
/// Update metrics.
pub async fn update(&self) {
// update system-wide metrics
let mut system = self.system.lock().await;
let load = system.get_load_average();
self.system_average_load.with_label_values(&["1min"]).set(load.one);
self.system_average_load.with_label_values(&["5min"]).set(load.five);
self.system_average_load.with_label_values(&["15min"]).set(load.fifteen);
// update process-related metrics
let pid = sysinfo::get_current_pid().expect(
"only fails where pid is unavailable (os=unknown || arch=wasm32);\
relay is not supposed to run in such MetricsParamss;\
qed",
);
let is_process_refreshed = system.refresh_process(pid);
match (is_process_refreshed, system.get_process(pid)) {
(true, Some(process_info)) => {
let cpu_usage = process_info.cpu_usage() as f64;
let memory_usage = process_info.memory() * 1024;
log::trace!(
target: "bridge-metrics",
"Refreshed process metrics: CPU={}, memory={}",
cpu_usage,
memory_usage,
);
self.process_cpu_usage_percentage
.set(if cpu_usage.is_finite() { cpu_usage } else { 0f64 });
self.process_memory_usage_bytes.set(memory_usage);
}
_ => {
log::warn!(
target: "bridge",
"Failed to refresh process information. Metrics may show obsolete values",
);
}
}
}
}
+109
View File
@@ -0,0 +1,109 @@
// Copyright 2019-2021 Parity Technologies (UK) Ltd.
// This file is part of Parity Bridges Common.
// Parity Bridges Common is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Parity Bridges Common is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Parity Bridges Common. If not, see <http://www.gnu.org/licenses/>.
//! Global system-wide Prometheus metrics exposed by relays.
use crate::metrics::{Metrics, StandaloneMetrics};
use async_std::sync::{Arc, Mutex};
use async_trait::async_trait;
use std::time::Duration;
use substrate_prometheus_endpoint::{register, Gauge, GaugeVec, Opts, Registry, F64, U64};
use sysinfo::{ProcessExt, RefreshKind, System, SystemExt};
/// Global metrics update interval.
const UPDATE_INTERVAL: Duration = Duration::from_secs(10);
/// Global Prometheus metrics.
#[derive(Debug, Clone)]
pub struct GlobalMetrics {
system: Arc<Mutex<System>>,
system_average_load: GaugeVec<F64>,
process_cpu_usage_percentage: Gauge<F64>,
process_memory_usage_bytes: Gauge<U64>,
}
impl Metrics for GlobalMetrics {
fn register(&self, registry: &Registry) -> Result<(), String> {
register(self.system_average_load.clone(), registry).map_err(|e| e.to_string())?;
register(self.process_cpu_usage_percentage.clone(), registry).map_err(|e| e.to_string())?;
register(self.process_memory_usage_bytes.clone(), registry).map_err(|e| e.to_string())?;
Ok(())
}
}
#[async_trait]
impl StandaloneMetrics for GlobalMetrics {
async fn update(&self) {
// update system-wide metrics
let mut system = self.system.lock().await;
let load = system.get_load_average();
self.system_average_load.with_label_values(&["1min"]).set(load.one);
self.system_average_load.with_label_values(&["5min"]).set(load.five);
self.system_average_load.with_label_values(&["15min"]).set(load.fifteen);
// update process-related metrics
let pid = sysinfo::get_current_pid().expect(
"only fails where pid is unavailable (os=unknown || arch=wasm32);\
relay is not supposed to run in such MetricsParamss;\
qed",
);
let is_process_refreshed = system.refresh_process(pid);
match (is_process_refreshed, system.get_process(pid)) {
(true, Some(process_info)) => {
let cpu_usage = process_info.cpu_usage() as f64;
let memory_usage = process_info.memory() * 1024;
log::trace!(
target: "bridge-metrics",
"Refreshed process metrics: CPU={}, memory={}",
cpu_usage,
memory_usage,
);
self.process_cpu_usage_percentage
.set(if cpu_usage.is_finite() { cpu_usage } else { 0f64 });
self.process_memory_usage_bytes.set(memory_usage);
}
_ => {
log::warn!(
target: "bridge",
"Failed to refresh process information. Metrics may show obsolete values",
);
}
}
}
fn update_interval(&self) -> Duration {
UPDATE_INTERVAL
}
}
impl Default for GlobalMetrics {
fn default() -> Self {
GlobalMetrics {
system: Arc::new(Mutex::new(System::new_with_specifics(RefreshKind::everything()))),
system_average_load: GaugeVec::new(Opts::new("system_average_load", "System load average"), &["over"])
.expect("metric is static and thus valid; qed"),
process_cpu_usage_percentage: Gauge::new("process_cpu_usage_percentage", "Process CPU usage")
.expect("metric is static and thus valid; qed"),
process_memory_usage_bytes: Gauge::new(
"process_memory_usage_bytes",
"Process memory (resident set size) usage",
)
.expect("metric is static and thus valid; qed"),
}
}
}
+162 -56
View File
@@ -14,10 +14,12 @@
// You should have received a copy of the GNU General Public License
// along with Parity Bridges Common. If not, see <http://www.gnu.org/licenses/>.
use crate::metrics::{Metrics, MetricsParams, StandaloneMetrics};
use crate::{FailedClient, MaybeConnectionError};
use async_trait::async_trait;
use std::{fmt::Debug, future::Future, time::Duration};
use std::{fmt::Debug, future::Future, net::SocketAddr, time::Duration};
use substrate_prometheus_endpoint::{init_prometheus, Registry};
/// Default pause between reconnect attempts.
pub const RECONNECT_DELAY: Duration = Duration::from_secs(10);
@@ -32,60 +34,164 @@ pub trait Client: Clone + Send + Sync {
async fn reconnect(&mut self) -> Result<(), Self::Error>;
}
/// Run relay loop.
///
/// This function represents an outer loop, which in turn calls provided `loop_run` function to do
/// actual job. When `loop_run` returns, this outer loop reconnects to failed client (source,
/// target or both) and calls `loop_run` again.
pub async fn run<SC: Client, TC: Client, R, F>(
reconnect_delay: Duration,
mut source_client: SC,
mut target_client: TC,
loop_run: R,
) where
R: Fn(SC, TC) -> F,
F: Future<Output = Result<(), FailedClient>>,
{
loop {
let result = loop_run(source_client.clone(), target_client.clone()).await;
match result {
Ok(()) => break,
Err(failed_client) => loop {
async_std::task::sleep(reconnect_delay).await;
if failed_client == FailedClient::Both || failed_client == FailedClient::Source {
match source_client.reconnect().await {
Ok(()) => (),
Err(error) => {
log::warn!(
target: "bridge",
"Failed to reconnect to source client. Going to retry in {}s: {:?}",
reconnect_delay.as_secs(),
error,
);
continue;
}
}
}
if failed_client == FailedClient::Both || failed_client == FailedClient::Target {
match target_client.reconnect().await {
Ok(()) => (),
Err(error) => {
log::warn!(
target: "bridge",
"Failed to reconnect to target client. Going to retry in {}s: {:?}",
reconnect_delay.as_secs(),
error,
);
continue;
}
}
}
break;
},
}
log::debug!(target: "bridge", "Restarting relay loop");
/// Returns generic loop that may be customized and started.
pub fn relay_loop<SC, TC>(source_client: SC, target_client: TC) -> Loop<SC, TC, ()> {
Loop {
source_client,
target_client,
loop_metric: None,
}
}
/// Generic relay loop.
pub struct Loop<SC, TC, LM> {
source_client: SC,
target_client: TC,
loop_metric: Option<LM>,
}
/// Relay loop metrics builder.
pub struct LoopMetrics<SC, TC, LM> {
relay_loop: Loop<SC, TC, ()>,
registry: Registry,
loop_metric: Option<LM>,
}
impl<SC, TC, LM> Loop<SC, TC, LM> {
/// Start building loop metrics using given prefix.
///
/// Panics if `prefix` is empty.
pub fn with_metrics(self, prefix: String) -> LoopMetrics<SC, TC, ()> {
assert!(!prefix.is_empty(), "Metrics prefix can not be empty");
LoopMetrics {
relay_loop: Loop {
source_client: self.source_client,
target_client: self.target_client,
loop_metric: None,
},
registry: Registry::new_custom(Some(prefix), None)
.expect("only fails if prefix is empty; prefix is not empty; qed"),
loop_metric: None,
}
}
/// Run relay loop.
///
/// This function represents an outer loop, which in turn calls provided `run_loop` function to do
/// actual job. When `run_loop` returns, this outer loop reconnects to failed client (source,
/// target or both) and calls `run_loop` again.
pub async fn run<R, F>(mut self, run_loop: R) -> Result<(), String>
where
R: Fn(SC, TC, Option<LM>) -> F,
F: Future<Output = Result<(), FailedClient>>,
SC: Client,
TC: Client,
LM: Clone,
{
loop {
let result = run_loop(
self.source_client.clone(),
self.target_client.clone(),
self.loop_metric.clone(),
)
.await;
match result {
Ok(()) => break,
Err(failed_client) => loop {
async_std::task::sleep(RECONNECT_DELAY).await;
if failed_client == FailedClient::Both || failed_client == FailedClient::Source {
match self.source_client.reconnect().await {
Ok(()) => (),
Err(error) => {
log::warn!(
target: "bridge",
"Failed to reconnect to source client. Going to retry in {}s: {:?}",
RECONNECT_DELAY.as_secs(),
error,
);
continue;
}
}
}
if failed_client == FailedClient::Both || failed_client == FailedClient::Target {
match self.target_client.reconnect().await {
Ok(()) => (),
Err(error) => {
log::warn!(
target: "bridge",
"Failed to reconnect to target client. Going to retry in {}s: {:?}",
RECONNECT_DELAY.as_secs(),
error,
);
continue;
}
}
}
break;
},
}
log::debug!(target: "bridge", "Restarting relay loop");
}
Ok(())
}
}
impl<SC, TC, LM> LoopMetrics<SC, TC, LM> {
/// Add relay loop metrics.
///
/// Loop metrics will be passed to the loop callback.
pub fn loop_metric<NewLM: Metrics>(self, loop_metric: NewLM) -> Result<LoopMetrics<SC, TC, NewLM>, String> {
loop_metric.register(&self.registry)?;
Ok(LoopMetrics {
relay_loop: self.relay_loop,
registry: self.registry,
loop_metric: Some(loop_metric),
})
}
/// Add standalone metrics.
pub fn standalone_metric<M: StandaloneMetrics>(self, standalone_metrics: M) -> Result<Self, String> {
standalone_metrics.register(&self.registry)?;
standalone_metrics.spawn();
Ok(self)
}
/// Expose metrics using given params.
///
/// If `params` is `None`, metrics are not exposed.
pub async fn expose(self, params: Option<MetricsParams>) -> Result<Loop<SC, TC, LM>, String> {
if let Some(params) = params {
let socket_addr = SocketAddr::new(
params.host.parse().map_err(|err| {
format!(
"Invalid host {} is used to expose Prometheus metrics: {}",
params.host, err,
)
})?,
params.port,
);
let registry = self.registry;
async_std::task::spawn(async move {
let result = init_prometheus(socket_addr, registry).await;
log::trace!(
target: "bridge-metrics",
"Prometheus endpoint has exited with result: {:?}",
result,
);
});
}
Ok(Loop {
source_client: self.relay_loop.source_client,
target_client: self.relay_loop.target_client,
loop_metric: self.loop_metric,
})
}
}