diff --git a/substrate/.maintain/monitoring/alerting-rules/alerting-rule-tests.yaml b/substrate/.maintain/monitoring/alerting-rules/alerting-rule-tests.yaml index 5b0daba3d8..40a489bd09 100644 --- a/substrate/.maintain/monitoring/alerting-rules/alerting-rule-tests.yaml +++ b/substrate/.maintain/monitoring/alerting-rules/alerting-rule-tests.yaml @@ -49,10 +49,10 @@ tests: ###################################################################### - eval_time: 6m - alertname: LowNumberOfNewBlocks + alertname: BlockProductionSlow exp_alerts: - eval_time: 7m - alertname: LowNumberOfNewBlocks + alertname: BlockProductionSlow exp_alerts: - exp_labels: severity: warning @@ -61,11 +61,12 @@ tests: job: polkadot status: best exp_annotations: - message: "Less than one new block per minute on instance - polkadot-abcdef01234-abcdef." + message: "Best block on instance + polkadot-abcdef01234-abcdef increases by less than 1 per + minute for more than 3 minutes." - eval_time: 14m - alertname: LowNumberOfNewBlocks + alertname: BlockProductionSlow exp_alerts: - exp_labels: severity: warning @@ -74,8 +75,9 @@ tests: job: polkadot status: best exp_annotations: - message: "Less than one new block per minute on instance - polkadot-abcdef01234-abcdef." + message: "Best block on instance + polkadot-abcdef01234-abcdef increases by less than 1 per + minute for more than 3 minutes." - exp_labels: severity: critical pod: polkadot-abcdef01234-abcdef @@ -83,8 +85,9 @@ tests: job: polkadot status: best exp_annotations: - message: "Less than one new block per minute on instance - polkadot-abcdef01234-abcdef." + message: "Best block on instance + polkadot-abcdef01234-abcdef increases by less than 1 per + minute for more than 10 minutes." ###################################################################### # Block finalization @@ -105,7 +108,7 @@ tests: exp_annotations: message: "Finalized block on instance polkadot-abcdef01234-abcdef increases by less than 1 per - minute." + minute for more than 3 minutes." - eval_time: 14m alertname: BlockFinalizationSlow @@ -119,7 +122,7 @@ tests: exp_annotations: message: "Finalized block on instance polkadot-abcdef01234-abcdef increases by less than 1 per - minute." + minute for more than 3 minutes." - exp_labels: severity: critical pod: polkadot-abcdef01234-abcdef @@ -129,7 +132,7 @@ tests: exp_annotations: message: "Finalized block on instance polkadot-abcdef01234-abcdef increases by less than 1 per - minute." + minute for more than 10 minutes." ###################################################################### # Transaction queue @@ -155,7 +158,7 @@ tests: exp_annotations: message: "The transaction pool size on node polkadot-abcdef01234-abcdef has been monotonically - increasing for the last 10 minutes." + increasing for more than 10 minutes." - eval_time: 43m alertname: TransactionQueueSizeIncreasing # Number of validations scheduled is growing twice as fast as the @@ -170,7 +173,7 @@ tests: exp_annotations: message: "The transaction pool size on node polkadot-abcdef01234-abcdef has been monotonically - increasing for the last 10 minutes." + increasing for more than 10 minutes." - exp_labels: severity: critical pod: polkadot-abcdef01234-abcdef @@ -179,7 +182,7 @@ tests: exp_annotations: message: "The transaction pool size on node polkadot-abcdef01234-abcdef has been monotonically - increasing for the last 30 minutes." + increasing for more than 30 minutes." - eval_time: 49m alertname: TransactionQueueSizeHigh # After minute 43 the number of validations scheduled jumps up @@ -193,18 +196,18 @@ tests: job: polkadot exp_annotations: message: "The transaction pool size on node - polkadot-abcdef01234-abcdef has been above 10_000 for the - last 5 minutes." + polkadot-abcdef01234-abcdef has been above 10_000 for more + than 5 minutes." ###################################################################### # Networking ###################################################################### - eval_time: 3m # Values: 3 2 2 - alertname: LowNumberOfPeers + alertname: NumberOfPeersLow exp_alerts: - eval_time: 4m # Values: 2 2 2 - alertname: LowNumberOfPeers + alertname: NumberOfPeersLow exp_alerts: - exp_labels: severity: warning @@ -216,7 +219,7 @@ tests: than 3 peers for more than 3 minutes" - eval_time: 16m # Values: 3 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 - alertname: LowNumberOfPeers + alertname: NumberOfPeersLow exp_alerts: - exp_labels: severity: warning diff --git a/substrate/.maintain/monitoring/alerting-rules/alerting-rules.yaml b/substrate/.maintain/monitoring/alerting-rules/alerting-rules.yaml index 7f36fedb4b..3dde038d88 100644 --- a/substrate/.maintain/monitoring/alerting-rules/alerting-rules.yaml +++ b/substrate/.maintain/monitoring/alerting-rules/alerting-rules.yaml @@ -6,18 +6,18 @@ groups: # Block production ############################################################################## - - alert: LowNumberOfNewBlocks + - alert: BlockProductionSlow annotations: - message: 'Less than one new block per minute on instance {{ - $labels.instance }}.' + message: 'Best block on instance {{ $labels.instance }} increases by + less than 1 per minute for more than 3 minutes.' expr: increase(polkadot_block_height{status="best"}[1m]) < 1 for: 3m labels: severity: warning - - alert: LowNumberOfNewBlocks + - alert: BlockProductionSlow annotations: - message: 'Less than one new block per minute on instance {{ - $labels.instance }}.' + message: 'Best block on instance {{ $labels.instance }} increases by + less than 1 per minute for more than 10 minutes.' expr: increase(polkadot_block_height{status="best"}[1m]) < 1 for: 10m labels: @@ -34,7 +34,7 @@ groups: severity: warning annotations: message: 'Finalized block on instance {{ $labels.instance }} increases by - less than 1 per minute.' + less than 1 per minute for more than 3 minutes.' - alert: BlockFinalizationSlow expr: increase(polkadot_block_height{status="finalized"}[1m]) < 1 for: 10m @@ -42,7 +42,7 @@ groups: severity: critical annotations: message: 'Finalized block on instance {{ $labels.instance }} increases by - less than 1 per minute.' + less than 1 per minute for more than 10 minutes.' - alert: BlockFinalizationLaggingBehind # Under the assumption of an average block production of 6 seconds, # "best" and "finalized" being more than 10 blocks apart would imply @@ -54,7 +54,7 @@ groups: severity: critical annotations: message: "Block finalization on instance {{ $labels.instance }} is behind - block production by {{ $value }} for more than 8m" + block production by {{ $value }} for more than 8 minutes." ############################################################################## # Transaction queue @@ -68,7 +68,7 @@ groups: severity: warning annotations: message: 'The transaction pool size on node {{ $labels.instance }} has - been monotonically increasing for the last 10 minutes.' + been monotonically increasing for more than 10 minutes.' - alert: TransactionQueueSizeIncreasing expr: 'increase(polkadot_sub_txpool_validations_scheduled[5m]) - increase(polkadot_sub_txpool_validations_finished[5m]) > 0' @@ -77,7 +77,7 @@ groups: severity: critical annotations: message: 'The transaction pool size on node {{ $labels.instance }} has - been monotonically increasing for the last 30 minutes.' + been monotonically increasing for more than 30 minutes.' - alert: TransactionQueueSizeHigh expr: 'polkadot_sub_txpool_validations_scheduled - polkadot_sub_txpool_validations_finished > 10000' @@ -86,13 +86,13 @@ groups: severity: critical annotations: message: 'The transaction pool size on node {{ $labels.instance }} has - been above 10_000 for the last 5 minutes.' + been above 10_000 for more than 5 minutes.' ############################################################################## # Networking ############################################################################## - - alert: LowNumberOfPeers + - alert: NumberOfPeersLow expr: polkadot_sub_libp2p_peers_count < 3 for: 3m labels: @@ -100,7 +100,7 @@ groups: annotations: message: 'The node {{ $labels.instance }} has less than 3 peers for more than 3 minutes' - - alert: LowNumberOfPeers + - alert: NumberOfPeersLow expr: polkadot_sub_libp2p_peers_count < 3 for: 15m labels: @@ -113,7 +113,7 @@ groups: # System ############################################################################## - - alert: HighNumberOfFileDescriptors + - alert: NumberOfFileDescriptorsHigh expr: 'node_filefd_allocated{domain=~"kusama|polkadot"} > 10000' for: 3m labels: @@ -126,7 +126,7 @@ groups: # Others ############################################################################## - - alert: AuthorityDiscoveryHighDiscoveryFailure + - alert: AuthorityDiscoveryDiscoveryFailureHigh expr: 'polkadot_authority_discovery_handle_value_found_event_failure / ignoring(name) polkadot_authority_discovery_dht_event_received{name="value_found"} > 0.5' @@ -134,5 +134,6 @@ groups: labels: severity: warning annotations: - message: "Authority discovery on node {{ $labels.instance }} fails to - process more than 50 % of the values found on the DHT." + message: 'Authority discovery on node {{ $labels.instance }} fails to + process more than 50 % of the values found on the DHT for more than 2 + hours.'