Alert on frequent network errors (#7410)

* Introduce is_frequent util

* Add dirty warn_if_frequent! implementation

* Add freq

* Fix order in condition

* Update

* Update docs

* Fix

* Remove old impl

* Fix errors

* Add wif to av-distr

* Add wif to col prot

* Rename

* Add wif to state-distr

* Address review comments

* Change Freq implementation

* Remove the zero division check

* Make rate explicit

* Fix typo

* Update rate constant

* Introduce explicit rates

* Update docs

* Split errors freq

* Downgrade coarsetime
This commit is contained in:
Andrei Eres
2023-07-17 14:05:57 +02:00
committed by GitHub
parent dd7d2f924b
commit 174f23d1cc
12 changed files with 268 additions and 15 deletions
@@ -1270,6 +1270,9 @@ async fn run_inner<Context>(
let check_collations_stream = tick_stream(CHECK_COLLATIONS_POLL);
futures::pin_mut!(check_collations_stream);
let mut network_error_freq = gum::Freq::new();
let mut canceled_freq = gum::Freq::new();
loop {
select! {
_ = reputation_delay => {
@@ -1312,6 +1315,8 @@ async fn run_inner<Context>(
&mut state.requested_collations,
&state.metrics,
&state.span_per_relay_parent,
&mut network_error_freq,
&mut canceled_freq,
).await;
for (peer_id, rep) in reputation_changes {
@@ -1328,14 +1333,22 @@ async fn poll_requests(
requested_collations: &mut HashMap<PendingCollation, PerRequest>,
metrics: &Metrics,
span_per_relay_parent: &HashMap<Hash, PerLeafSpan>,
network_error_freq: &mut gum::Freq,
canceled_freq: &mut gum::Freq,
) -> Vec<(PeerId, Rep)> {
let mut retained_requested = HashSet::new();
let mut reputation_changes = Vec::new();
for (pending_collation, per_req) in requested_collations.iter_mut() {
// Despite the await, this won't block on the response itself.
let result =
poll_collation_response(metrics, span_per_relay_parent, pending_collation, per_req)
.await;
let result = poll_collation_response(
metrics,
span_per_relay_parent,
pending_collation,
per_req,
network_error_freq,
canceled_freq,
)
.await;
if !result.is_ready() {
retained_requested.insert(pending_collation.clone());
@@ -1479,6 +1492,8 @@ async fn poll_collation_response(
spans: &HashMap<Hash, PerLeafSpan>,
pending_collation: &PendingCollation,
per_req: &mut PerRequest,
network_error_freq: &mut gum::Freq,
canceled_freq: &mut gum::Freq,
) -> CollationFetchResult {
if never!(per_req.from_collator.is_terminated()) {
gum::error!(
@@ -1522,7 +1537,9 @@ async fn poll_collation_response(
CollationFetchResult::Error(None)
},
Err(RequestError::NetworkError(err)) => {
gum::debug!(
gum::warn_if_frequent!(
freq: network_error_freq,
max_rate: gum::Times::PerHour(100),
target: LOG_TARGET,
hash = ?pending_collation.relay_parent,
para_id = ?pending_collation.para_id,
@@ -1537,7 +1554,9 @@ async fn poll_collation_response(
CollationFetchResult::Error(Some(COST_NETWORK_ERROR))
},
Err(RequestError::Canceled(err)) => {
gum::debug!(
gum::warn_if_frequent!(
freq: canceled_freq,
max_rate: gum::Times::PerHour(100),
target: LOG_TARGET,
hash = ?pending_collation.relay_parent,
para_id = ?pending_collation.para_id,