Alert on frequent network errors (#7410)

* Introduce is_frequent util

* Add dirty warn_if_frequent! implementation

* Add freq

* Fix order in condition

* Update

* Update docs

* Fix

* Remove old impl

* Fix errors

* Add wif to av-distr

* Add wif to col prot

* Rename

* Add wif to state-distr

* Address review comments

* Change Freq implementation

* Remove the zero division check

* Make rate explicit

* Fix typo

* Update rate constant

* Introduce explicit rates

* Update docs

* Split errors freq

* Downgrade coarsetime
This commit is contained in:
Andrei Eres
2023-07-17 14:05:57 +02:00
committed by GitHub
parent dd7d2f924b
commit 174f23d1cc
12 changed files with 268 additions and 15 deletions
@@ -260,6 +260,8 @@ impl RunningTask {
let mut succeeded = false;
let mut count: u32 = 0;
let mut span = self.span.child("run-fetch-chunk-task").with_relay_parent(self.relay_parent);
let mut network_error_freq = gum::Freq::new();
let mut canceled_freq = gum::Freq::new();
// Try validators in reverse order:
while let Some(validator) = self.group.pop() {
// Report retries:
@@ -272,7 +274,10 @@ impl RunningTask {
.with_chunk_index(self.request.index.0)
.with_stage(jaeger::Stage::AvailabilityDistribution);
// Send request:
let resp = match self.do_request(&validator).await {
let resp = match self
.do_request(&validator, &mut network_error_freq, &mut canceled_freq)
.await
{
Ok(resp) => resp,
Err(TaskError::ShuttingDown) => {
gum::info!(
@@ -342,6 +347,8 @@ impl RunningTask {
async fn do_request(
&mut self,
validator: &AuthorityDiscoveryId,
network_error_freq: &mut gum::Freq,
canceled_freq: &mut gum::Freq,
) -> std::result::Result<ChunkFetchingResponse, TaskError> {
gum::trace!(
target: LOG_TARGET,
@@ -386,7 +393,9 @@ impl RunningTask {
Err(TaskError::PeerError)
},
Err(RequestError::NetworkError(err)) => {
gum::debug!(
gum::warn_if_frequent!(
freq: network_error_freq,
max_rate: gum::Times::PerHour(100),
target: LOG_TARGET,
origin = ?validator,
relay_parent = ?self.relay_parent,
@@ -400,7 +409,9 @@ impl RunningTask {
Err(TaskError::PeerError)
},
Err(RequestError::Canceled(oneshot::Canceled)) => {
gum::debug!(
gum::warn_if_frequent!(
freq: canceled_freq,
max_rate: gum::Times::PerHour(100),
target: LOG_TARGET,
origin = ?validator,
relay_parent = ?self.relay_parent,