mirror of
https://github.com/pezkuwichain/pezkuwi-subxt.git
synced 2026-06-13 05:51:02 +00:00
Changed how relay loops are started (#840)
* slightly changed relay loop initialization * git mv * clippy * more clippy * loop_run -> run_loop * review and clippy * clippy
This commit is contained in:
committed by
Bastian Köcher
parent
8d122b03f1
commit
a17c7eb80c
@@ -16,6 +16,8 @@
|
||||
|
||||
//! Utilities used by different relays.
|
||||
|
||||
pub use relay_loop::relay_loop;
|
||||
|
||||
use backoff::{backoff::Backoff, ExponentialBackoff};
|
||||
use futures::future::FutureExt;
|
||||
use std::time::Duration;
|
||||
|
||||
@@ -14,12 +14,13 @@
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with Parity Bridges Common. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
pub use global::GlobalMetrics;
|
||||
pub use substrate_prometheus_endpoint::{register, Counter, CounterVec, Gauge, GaugeVec, Opts, Registry, F64, U64};
|
||||
|
||||
use async_std::sync::{Arc, Mutex};
|
||||
use std::net::SocketAddr;
|
||||
use substrate_prometheus_endpoint::init_prometheus;
|
||||
use sysinfo::{ProcessExt, RefreshKind, System, SystemExt};
|
||||
use async_trait::async_trait;
|
||||
use std::time::Duration;
|
||||
|
||||
mod global;
|
||||
|
||||
/// Prometheus endpoint MetricsParams.
|
||||
#[derive(Debug, Clone)]
|
||||
@@ -31,62 +32,32 @@ pub struct MetricsParams {
|
||||
}
|
||||
|
||||
/// Metrics API.
|
||||
pub trait Metrics {
|
||||
pub trait Metrics: Clone + Send + Sync + 'static {
|
||||
/// Register metrics in the registry.
|
||||
fn register(&self, registry: &Registry) -> Result<(), String>;
|
||||
}
|
||||
|
||||
/// Global Prometheus metrics.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct GlobalMetrics {
|
||||
system: Arc<Mutex<System>>,
|
||||
system_average_load: GaugeVec<F64>,
|
||||
process_cpu_usage_percentage: Gauge<F64>,
|
||||
process_memory_usage_bytes: Gauge<U64>,
|
||||
}
|
||||
/// Standalone metrics API.
|
||||
///
|
||||
/// Metrics of this kind know how to update themselves, so we may just spawn and forget the
|
||||
/// asynchronous self-update task.
|
||||
#[async_trait]
|
||||
pub trait StandaloneMetrics: Metrics {
|
||||
/// Update metric values.
|
||||
async fn update(&self);
|
||||
|
||||
/// Start Prometheus endpoint with given metrics registry.
|
||||
pub fn start(
|
||||
prefix: String,
|
||||
params: Option<MetricsParams>,
|
||||
global_metrics: &GlobalMetrics,
|
||||
extra_metrics: &impl Metrics,
|
||||
) {
|
||||
let params = match params {
|
||||
Some(params) => params,
|
||||
None => return,
|
||||
};
|
||||
/// Metrics update interval.
|
||||
fn update_interval(&self) -> Duration;
|
||||
|
||||
assert!(!prefix.is_empty(), "Metrics prefix can not be empty");
|
||||
|
||||
let do_start = move || {
|
||||
let prometheus_socket_addr = SocketAddr::new(
|
||||
params
|
||||
.host
|
||||
.parse()
|
||||
.map_err(|err| format!("Invalid Prometheus host {}: {}", params.host, err))?,
|
||||
params.port,
|
||||
);
|
||||
let metrics_registry =
|
||||
Registry::new_custom(Some(prefix), None).expect("only fails if prefix is empty; prefix is not empty; qed");
|
||||
global_metrics.register(&metrics_registry)?;
|
||||
extra_metrics.register(&metrics_registry)?;
|
||||
/// Spawn the self update task that will keep update metric value at given intervals.
|
||||
fn spawn(self) {
|
||||
async_std::task::spawn(async move {
|
||||
init_prometheus(prometheus_socket_addr, metrics_registry)
|
||||
.await
|
||||
.map_err(|err| format!("Error starting Prometheus endpoint: {}", err))
|
||||
let update_interval = self.update_interval();
|
||||
loop {
|
||||
self.update().await;
|
||||
async_std::task::sleep(update_interval).await;
|
||||
}
|
||||
});
|
||||
|
||||
Ok(())
|
||||
};
|
||||
|
||||
let result: Result<(), String> = do_start();
|
||||
if let Err(err) = result {
|
||||
log::warn!(
|
||||
target: "bridge",
|
||||
"Failed to expose metrics: {}",
|
||||
err,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -98,71 +69,3 @@ impl Default for MetricsParams {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Metrics for GlobalMetrics {
|
||||
fn register(&self, registry: &Registry) -> Result<(), String> {
|
||||
register(self.system_average_load.clone(), registry).map_err(|e| e.to_string())?;
|
||||
register(self.process_cpu_usage_percentage.clone(), registry).map_err(|e| e.to_string())?;
|
||||
register(self.process_memory_usage_bytes.clone(), registry).map_err(|e| e.to_string())?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for GlobalMetrics {
|
||||
fn default() -> Self {
|
||||
GlobalMetrics {
|
||||
system: Arc::new(Mutex::new(System::new_with_specifics(RefreshKind::everything()))),
|
||||
system_average_load: GaugeVec::new(Opts::new("system_average_load", "System load average"), &["over"])
|
||||
.expect("metric is static and thus valid; qed"),
|
||||
process_cpu_usage_percentage: Gauge::new("process_cpu_usage_percentage", "Process CPU usage")
|
||||
.expect("metric is static and thus valid; qed"),
|
||||
process_memory_usage_bytes: Gauge::new(
|
||||
"process_memory_usage_bytes",
|
||||
"Process memory (resident set size) usage",
|
||||
)
|
||||
.expect("metric is static and thus valid; qed"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl GlobalMetrics {
|
||||
/// Update metrics.
|
||||
pub async fn update(&self) {
|
||||
// update system-wide metrics
|
||||
let mut system = self.system.lock().await;
|
||||
let load = system.get_load_average();
|
||||
self.system_average_load.with_label_values(&["1min"]).set(load.one);
|
||||
self.system_average_load.with_label_values(&["5min"]).set(load.five);
|
||||
self.system_average_load.with_label_values(&["15min"]).set(load.fifteen);
|
||||
|
||||
// update process-related metrics
|
||||
let pid = sysinfo::get_current_pid().expect(
|
||||
"only fails where pid is unavailable (os=unknown || arch=wasm32);\
|
||||
relay is not supposed to run in such MetricsParamss;\
|
||||
qed",
|
||||
);
|
||||
let is_process_refreshed = system.refresh_process(pid);
|
||||
match (is_process_refreshed, system.get_process(pid)) {
|
||||
(true, Some(process_info)) => {
|
||||
let cpu_usage = process_info.cpu_usage() as f64;
|
||||
let memory_usage = process_info.memory() * 1024;
|
||||
log::trace!(
|
||||
target: "bridge-metrics",
|
||||
"Refreshed process metrics: CPU={}, memory={}",
|
||||
cpu_usage,
|
||||
memory_usage,
|
||||
);
|
||||
|
||||
self.process_cpu_usage_percentage
|
||||
.set(if cpu_usage.is_finite() { cpu_usage } else { 0f64 });
|
||||
self.process_memory_usage_bytes.set(memory_usage);
|
||||
}
|
||||
_ => {
|
||||
log::warn!(
|
||||
target: "bridge",
|
||||
"Failed to refresh process information. Metrics may show obsolete values",
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,109 @@
|
||||
// Copyright 2019-2021 Parity Technologies (UK) Ltd.
|
||||
// This file is part of Parity Bridges Common.
|
||||
|
||||
// Parity Bridges Common is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
|
||||
// Parity Bridges Common is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with Parity Bridges Common. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
//! Global system-wide Prometheus metrics exposed by relays.
|
||||
|
||||
use crate::metrics::{Metrics, StandaloneMetrics};
|
||||
|
||||
use async_std::sync::{Arc, Mutex};
|
||||
use async_trait::async_trait;
|
||||
use std::time::Duration;
|
||||
use substrate_prometheus_endpoint::{register, Gauge, GaugeVec, Opts, Registry, F64, U64};
|
||||
use sysinfo::{ProcessExt, RefreshKind, System, SystemExt};
|
||||
|
||||
/// Global metrics update interval.
|
||||
const UPDATE_INTERVAL: Duration = Duration::from_secs(10);
|
||||
|
||||
/// Global Prometheus metrics.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct GlobalMetrics {
|
||||
system: Arc<Mutex<System>>,
|
||||
system_average_load: GaugeVec<F64>,
|
||||
process_cpu_usage_percentage: Gauge<F64>,
|
||||
process_memory_usage_bytes: Gauge<U64>,
|
||||
}
|
||||
|
||||
impl Metrics for GlobalMetrics {
|
||||
fn register(&self, registry: &Registry) -> Result<(), String> {
|
||||
register(self.system_average_load.clone(), registry).map_err(|e| e.to_string())?;
|
||||
register(self.process_cpu_usage_percentage.clone(), registry).map_err(|e| e.to_string())?;
|
||||
register(self.process_memory_usage_bytes.clone(), registry).map_err(|e| e.to_string())?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl StandaloneMetrics for GlobalMetrics {
|
||||
async fn update(&self) {
|
||||
// update system-wide metrics
|
||||
let mut system = self.system.lock().await;
|
||||
let load = system.get_load_average();
|
||||
self.system_average_load.with_label_values(&["1min"]).set(load.one);
|
||||
self.system_average_load.with_label_values(&["5min"]).set(load.five);
|
||||
self.system_average_load.with_label_values(&["15min"]).set(load.fifteen);
|
||||
|
||||
// update process-related metrics
|
||||
let pid = sysinfo::get_current_pid().expect(
|
||||
"only fails where pid is unavailable (os=unknown || arch=wasm32);\
|
||||
relay is not supposed to run in such MetricsParamss;\
|
||||
qed",
|
||||
);
|
||||
let is_process_refreshed = system.refresh_process(pid);
|
||||
match (is_process_refreshed, system.get_process(pid)) {
|
||||
(true, Some(process_info)) => {
|
||||
let cpu_usage = process_info.cpu_usage() as f64;
|
||||
let memory_usage = process_info.memory() * 1024;
|
||||
log::trace!(
|
||||
target: "bridge-metrics",
|
||||
"Refreshed process metrics: CPU={}, memory={}",
|
||||
cpu_usage,
|
||||
memory_usage,
|
||||
);
|
||||
|
||||
self.process_cpu_usage_percentage
|
||||
.set(if cpu_usage.is_finite() { cpu_usage } else { 0f64 });
|
||||
self.process_memory_usage_bytes.set(memory_usage);
|
||||
}
|
||||
_ => {
|
||||
log::warn!(
|
||||
target: "bridge",
|
||||
"Failed to refresh process information. Metrics may show obsolete values",
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn update_interval(&self) -> Duration {
|
||||
UPDATE_INTERVAL
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for GlobalMetrics {
|
||||
fn default() -> Self {
|
||||
GlobalMetrics {
|
||||
system: Arc::new(Mutex::new(System::new_with_specifics(RefreshKind::everything()))),
|
||||
system_average_load: GaugeVec::new(Opts::new("system_average_load", "System load average"), &["over"])
|
||||
.expect("metric is static and thus valid; qed"),
|
||||
process_cpu_usage_percentage: Gauge::new("process_cpu_usage_percentage", "Process CPU usage")
|
||||
.expect("metric is static and thus valid; qed"),
|
||||
process_memory_usage_bytes: Gauge::new(
|
||||
"process_memory_usage_bytes",
|
||||
"Process memory (resident set size) usage",
|
||||
)
|
||||
.expect("metric is static and thus valid; qed"),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -14,10 +14,12 @@
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with Parity Bridges Common. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
use crate::metrics::{Metrics, MetricsParams, StandaloneMetrics};
|
||||
use crate::{FailedClient, MaybeConnectionError};
|
||||
|
||||
use async_trait::async_trait;
|
||||
use std::{fmt::Debug, future::Future, time::Duration};
|
||||
use std::{fmt::Debug, future::Future, net::SocketAddr, time::Duration};
|
||||
use substrate_prometheus_endpoint::{init_prometheus, Registry};
|
||||
|
||||
/// Default pause between reconnect attempts.
|
||||
pub const RECONNECT_DELAY: Duration = Duration::from_secs(10);
|
||||
@@ -32,60 +34,164 @@ pub trait Client: Clone + Send + Sync {
|
||||
async fn reconnect(&mut self) -> Result<(), Self::Error>;
|
||||
}
|
||||
|
||||
/// Run relay loop.
|
||||
///
|
||||
/// This function represents an outer loop, which in turn calls provided `loop_run` function to do
|
||||
/// actual job. When `loop_run` returns, this outer loop reconnects to failed client (source,
|
||||
/// target or both) and calls `loop_run` again.
|
||||
pub async fn run<SC: Client, TC: Client, R, F>(
|
||||
reconnect_delay: Duration,
|
||||
mut source_client: SC,
|
||||
mut target_client: TC,
|
||||
loop_run: R,
|
||||
) where
|
||||
R: Fn(SC, TC) -> F,
|
||||
F: Future<Output = Result<(), FailedClient>>,
|
||||
{
|
||||
loop {
|
||||
let result = loop_run(source_client.clone(), target_client.clone()).await;
|
||||
|
||||
match result {
|
||||
Ok(()) => break,
|
||||
Err(failed_client) => loop {
|
||||
async_std::task::sleep(reconnect_delay).await;
|
||||
if failed_client == FailedClient::Both || failed_client == FailedClient::Source {
|
||||
match source_client.reconnect().await {
|
||||
Ok(()) => (),
|
||||
Err(error) => {
|
||||
log::warn!(
|
||||
target: "bridge",
|
||||
"Failed to reconnect to source client. Going to retry in {}s: {:?}",
|
||||
reconnect_delay.as_secs(),
|
||||
error,
|
||||
);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
if failed_client == FailedClient::Both || failed_client == FailedClient::Target {
|
||||
match target_client.reconnect().await {
|
||||
Ok(()) => (),
|
||||
Err(error) => {
|
||||
log::warn!(
|
||||
target: "bridge",
|
||||
"Failed to reconnect to target client. Going to retry in {}s: {:?}",
|
||||
reconnect_delay.as_secs(),
|
||||
error,
|
||||
);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
break;
|
||||
},
|
||||
}
|
||||
|
||||
log::debug!(target: "bridge", "Restarting relay loop");
|
||||
/// Returns generic loop that may be customized and started.
|
||||
pub fn relay_loop<SC, TC>(source_client: SC, target_client: TC) -> Loop<SC, TC, ()> {
|
||||
Loop {
|
||||
source_client,
|
||||
target_client,
|
||||
loop_metric: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Generic relay loop.
|
||||
pub struct Loop<SC, TC, LM> {
|
||||
source_client: SC,
|
||||
target_client: TC,
|
||||
loop_metric: Option<LM>,
|
||||
}
|
||||
|
||||
/// Relay loop metrics builder.
|
||||
pub struct LoopMetrics<SC, TC, LM> {
|
||||
relay_loop: Loop<SC, TC, ()>,
|
||||
registry: Registry,
|
||||
loop_metric: Option<LM>,
|
||||
}
|
||||
|
||||
impl<SC, TC, LM> Loop<SC, TC, LM> {
|
||||
/// Start building loop metrics using given prefix.
|
||||
///
|
||||
/// Panics if `prefix` is empty.
|
||||
pub fn with_metrics(self, prefix: String) -> LoopMetrics<SC, TC, ()> {
|
||||
assert!(!prefix.is_empty(), "Metrics prefix can not be empty");
|
||||
|
||||
LoopMetrics {
|
||||
relay_loop: Loop {
|
||||
source_client: self.source_client,
|
||||
target_client: self.target_client,
|
||||
loop_metric: None,
|
||||
},
|
||||
registry: Registry::new_custom(Some(prefix), None)
|
||||
.expect("only fails if prefix is empty; prefix is not empty; qed"),
|
||||
loop_metric: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Run relay loop.
|
||||
///
|
||||
/// This function represents an outer loop, which in turn calls provided `run_loop` function to do
|
||||
/// actual job. When `run_loop` returns, this outer loop reconnects to failed client (source,
|
||||
/// target or both) and calls `run_loop` again.
|
||||
pub async fn run<R, F>(mut self, run_loop: R) -> Result<(), String>
|
||||
where
|
||||
R: Fn(SC, TC, Option<LM>) -> F,
|
||||
F: Future<Output = Result<(), FailedClient>>,
|
||||
SC: Client,
|
||||
TC: Client,
|
||||
LM: Clone,
|
||||
{
|
||||
loop {
|
||||
let result = run_loop(
|
||||
self.source_client.clone(),
|
||||
self.target_client.clone(),
|
||||
self.loop_metric.clone(),
|
||||
)
|
||||
.await;
|
||||
|
||||
match result {
|
||||
Ok(()) => break,
|
||||
Err(failed_client) => loop {
|
||||
async_std::task::sleep(RECONNECT_DELAY).await;
|
||||
if failed_client == FailedClient::Both || failed_client == FailedClient::Source {
|
||||
match self.source_client.reconnect().await {
|
||||
Ok(()) => (),
|
||||
Err(error) => {
|
||||
log::warn!(
|
||||
target: "bridge",
|
||||
"Failed to reconnect to source client. Going to retry in {}s: {:?}",
|
||||
RECONNECT_DELAY.as_secs(),
|
||||
error,
|
||||
);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
if failed_client == FailedClient::Both || failed_client == FailedClient::Target {
|
||||
match self.target_client.reconnect().await {
|
||||
Ok(()) => (),
|
||||
Err(error) => {
|
||||
log::warn!(
|
||||
target: "bridge",
|
||||
"Failed to reconnect to target client. Going to retry in {}s: {:?}",
|
||||
RECONNECT_DELAY.as_secs(),
|
||||
error,
|
||||
);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
break;
|
||||
},
|
||||
}
|
||||
|
||||
log::debug!(target: "bridge", "Restarting relay loop");
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl<SC, TC, LM> LoopMetrics<SC, TC, LM> {
|
||||
/// Add relay loop metrics.
|
||||
///
|
||||
/// Loop metrics will be passed to the loop callback.
|
||||
pub fn loop_metric<NewLM: Metrics>(self, loop_metric: NewLM) -> Result<LoopMetrics<SC, TC, NewLM>, String> {
|
||||
loop_metric.register(&self.registry)?;
|
||||
|
||||
Ok(LoopMetrics {
|
||||
relay_loop: self.relay_loop,
|
||||
registry: self.registry,
|
||||
loop_metric: Some(loop_metric),
|
||||
})
|
||||
}
|
||||
|
||||
/// Add standalone metrics.
|
||||
pub fn standalone_metric<M: StandaloneMetrics>(self, standalone_metrics: M) -> Result<Self, String> {
|
||||
standalone_metrics.register(&self.registry)?;
|
||||
standalone_metrics.spawn();
|
||||
Ok(self)
|
||||
}
|
||||
|
||||
/// Expose metrics using given params.
|
||||
///
|
||||
/// If `params` is `None`, metrics are not exposed.
|
||||
pub async fn expose(self, params: Option<MetricsParams>) -> Result<Loop<SC, TC, LM>, String> {
|
||||
if let Some(params) = params {
|
||||
let socket_addr = SocketAddr::new(
|
||||
params.host.parse().map_err(|err| {
|
||||
format!(
|
||||
"Invalid host {} is used to expose Prometheus metrics: {}",
|
||||
params.host, err,
|
||||
)
|
||||
})?,
|
||||
params.port,
|
||||
);
|
||||
|
||||
let registry = self.registry;
|
||||
async_std::task::spawn(async move {
|
||||
let result = init_prometheus(socket_addr, registry).await;
|
||||
log::trace!(
|
||||
target: "bridge-metrics",
|
||||
"Prometheus endpoint has exited with result: {:?}",
|
||||
result,
|
||||
);
|
||||
});
|
||||
}
|
||||
|
||||
Ok(Loop {
|
||||
source_client: self.relay_loop.source_client,
|
||||
target_client: self.relay_loop.target_client,
|
||||
loop_metric: self.loop_metric,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user