mirror of
https://github.com/pezkuwichain/pezkuwi-subxt.git
synced 2026-05-30 05:51:02 +00:00
.maintain/monitoring: Normalize alerting rules (#7232)
* .maintain/monitoring: Normalize alerting rules
  - Start alert names with their component and end with the describing adjective.
  - Describe alert duration in `message` with `for more than` across all alerts.
* .maintain/monitoring: Fix alert tests
This commit is contained in:
@@ -49,10 +49,10 @@ tests:
|
||||
######################################################################
|
||||
|
||||
- eval_time: 6m
|
||||
alertname: LowNumberOfNewBlocks
|
||||
alertname: BlockProductionSlow
|
||||
exp_alerts:
|
||||
- eval_time: 7m
|
||||
alertname: LowNumberOfNewBlocks
|
||||
alertname: BlockProductionSlow
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
severity: warning
|
||||
@@ -61,11 +61,12 @@ tests:
|
||||
job: polkadot
|
||||
status: best
|
||||
exp_annotations:
|
||||
message: "Less than one new block per minute on instance
|
||||
polkadot-abcdef01234-abcdef."
|
||||
message: "Best block on instance
|
||||
polkadot-abcdef01234-abcdef increases by less than 1 per
|
||||
minute for more than 3 minutes."
|
||||
|
||||
- eval_time: 14m
|
||||
alertname: LowNumberOfNewBlocks
|
||||
alertname: BlockProductionSlow
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
severity: warning
|
||||
@@ -74,8 +75,9 @@ tests:
|
||||
job: polkadot
|
||||
status: best
|
||||
exp_annotations:
|
||||
message: "Less than one new block per minute on instance
|
||||
polkadot-abcdef01234-abcdef."
|
||||
message: "Best block on instance
|
||||
polkadot-abcdef01234-abcdef increases by less than 1 per
|
||||
minute for more than 3 minutes."
|
||||
- exp_labels:
|
||||
severity: critical
|
||||
pod: polkadot-abcdef01234-abcdef
|
||||
@@ -83,8 +85,9 @@ tests:
|
||||
job: polkadot
|
||||
status: best
|
||||
exp_annotations:
|
||||
message: "Less than one new block per minute on instance
|
||||
polkadot-abcdef01234-abcdef."
|
||||
message: "Best block on instance
|
||||
polkadot-abcdef01234-abcdef increases by less than 1 per
|
||||
minute for more than 10 minutes."
|
||||
|
||||
######################################################################
|
||||
# Block finalization
|
||||
@@ -105,7 +108,7 @@ tests:
|
||||
exp_annotations:
|
||||
message: "Finalized block on instance
|
||||
polkadot-abcdef01234-abcdef increases by less than 1 per
|
||||
minute."
|
||||
minute for more than 3 minutes."
|
||||
|
||||
- eval_time: 14m
|
||||
alertname: BlockFinalizationSlow
|
||||
@@ -119,7 +122,7 @@ tests:
|
||||
exp_annotations:
|
||||
message: "Finalized block on instance
|
||||
polkadot-abcdef01234-abcdef increases by less than 1 per
|
||||
minute."
|
||||
minute for more than 3 minutes."
|
||||
- exp_labels:
|
||||
severity: critical
|
||||
pod: polkadot-abcdef01234-abcdef
|
||||
@@ -129,7 +132,7 @@ tests:
|
||||
exp_annotations:
|
||||
message: "Finalized block on instance
|
||||
polkadot-abcdef01234-abcdef increases by less than 1 per
|
||||
minute."
|
||||
minute for more than 10 minutes."
|
||||
|
||||
######################################################################
|
||||
# Transaction queue
|
||||
@@ -155,7 +158,7 @@ tests:
|
||||
exp_annotations:
|
||||
message: "The transaction pool size on node
|
||||
polkadot-abcdef01234-abcdef has been monotonically
|
||||
increasing for the last 10 minutes."
|
||||
increasing for more than 10 minutes."
|
||||
- eval_time: 43m
|
||||
alertname: TransactionQueueSizeIncreasing
|
||||
# Number of validations scheduled is growing twice as fast as the
|
||||
@@ -170,7 +173,7 @@ tests:
|
||||
exp_annotations:
|
||||
message: "The transaction pool size on node
|
||||
polkadot-abcdef01234-abcdef has been monotonically
|
||||
increasing for the last 10 minutes."
|
||||
increasing for more than 10 minutes."
|
||||
- exp_labels:
|
||||
severity: critical
|
||||
pod: polkadot-abcdef01234-abcdef
|
||||
@@ -179,7 +182,7 @@ tests:
|
||||
exp_annotations:
|
||||
message: "The transaction pool size on node
|
||||
polkadot-abcdef01234-abcdef has been monotonically
|
||||
increasing for the last 30 minutes."
|
||||
increasing for more than 30 minutes."
|
||||
- eval_time: 49m
|
||||
alertname: TransactionQueueSizeHigh
|
||||
# After minute 43 the number of validations scheduled jumps up
|
||||
@@ -193,18 +196,18 @@ tests:
|
||||
job: polkadot
|
||||
exp_annotations:
|
||||
message: "The transaction pool size on node
|
||||
polkadot-abcdef01234-abcdef has been above 10_000 for the
|
||||
last 5 minutes."
|
||||
polkadot-abcdef01234-abcdef has been above 10_000 for more
|
||||
than 5 minutes."
|
||||
|
||||
######################################################################
|
||||
# Networking
|
||||
######################################################################
|
||||
|
||||
- eval_time: 3m # Values: 3 2 2
|
||||
alertname: LowNumberOfPeers
|
||||
alertname: NumberOfPeersLow
|
||||
exp_alerts:
|
||||
- eval_time: 4m # Values: 2 2 2
|
||||
alertname: LowNumberOfPeers
|
||||
alertname: NumberOfPeersLow
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
severity: warning
|
||||
@@ -216,7 +219,7 @@ tests:
|
||||
than 3 peers for more than 3 minutes"
|
||||
|
||||
- eval_time: 16m # Values: 3 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1
|
||||
alertname: LowNumberOfPeers
|
||||
alertname: NumberOfPeersLow
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
severity: warning
|
||||
|
||||
@@ -6,18 +6,18 @@ groups:
|
||||
# Block production
|
||||
##############################################################################
|
||||
|
||||
- alert: LowNumberOfNewBlocks
|
||||
- alert: BlockProductionSlow
|
||||
annotations:
|
||||
message: 'Less than one new block per minute on instance {{
|
||||
$labels.instance }}.'
|
||||
message: 'Best block on instance {{ $labels.instance }} increases by
|
||||
less than 1 per minute for more than 3 minutes.'
|
||||
expr: increase(polkadot_block_height{status="best"}[1m]) < 1
|
||||
for: 3m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: LowNumberOfNewBlocks
|
||||
- alert: BlockProductionSlow
|
||||
annotations:
|
||||
message: 'Less than one new block per minute on instance {{
|
||||
$labels.instance }}.'
|
||||
message: 'Best block on instance {{ $labels.instance }} increases by
|
||||
less than 1 per minute for more than 10 minutes.'
|
||||
expr: increase(polkadot_block_height{status="best"}[1m]) < 1
|
||||
for: 10m
|
||||
labels:
|
||||
@@ -34,7 +34,7 @@ groups:
|
||||
severity: warning
|
||||
annotations:
|
||||
message: 'Finalized block on instance {{ $labels.instance }} increases by
|
||||
less than 1 per minute.'
|
||||
less than 1 per minute for more than 3 minutes.'
|
||||
- alert: BlockFinalizationSlow
|
||||
expr: increase(polkadot_block_height{status="finalized"}[1m]) < 1
|
||||
for: 10m
|
||||
@@ -42,7 +42,7 @@ groups:
|
||||
severity: critical
|
||||
annotations:
|
||||
message: 'Finalized block on instance {{ $labels.instance }} increases by
|
||||
less than 1 per minute.'
|
||||
less than 1 per minute for more than 10 minutes.'
|
||||
- alert: BlockFinalizationLaggingBehind
|
||||
# Under the assumption of an average block production of 6 seconds,
|
||||
# "best" and "finalized" being more than 10 blocks apart would imply
|
||||
@@ -54,7 +54,7 @@ groups:
|
||||
severity: critical
|
||||
annotations:
|
||||
message: "Block finalization on instance {{ $labels.instance }} is behind
|
||||
block production by {{ $value }} for more than 8m"
|
||||
block production by {{ $value }} for more than 8 minutes."
|
||||
|
||||
##############################################################################
|
||||
# Transaction queue
|
||||
@@ -68,7 +68,7 @@ groups:
|
||||
severity: warning
|
||||
annotations:
|
||||
message: 'The transaction pool size on node {{ $labels.instance }} has
|
||||
been monotonically increasing for the last 10 minutes.'
|
||||
been monotonically increasing for more than 10 minutes.'
|
||||
- alert: TransactionQueueSizeIncreasing
|
||||
expr: 'increase(polkadot_sub_txpool_validations_scheduled[5m]) -
|
||||
increase(polkadot_sub_txpool_validations_finished[5m]) > 0'
|
||||
@@ -77,7 +77,7 @@ groups:
|
||||
severity: critical
|
||||
annotations:
|
||||
message: 'The transaction pool size on node {{ $labels.instance }} has
|
||||
been monotonically increasing for the last 30 minutes.'
|
||||
been monotonically increasing for more than 30 minutes.'
|
||||
- alert: TransactionQueueSizeHigh
|
||||
expr: 'polkadot_sub_txpool_validations_scheduled -
|
||||
polkadot_sub_txpool_validations_finished > 10000'
|
||||
@@ -86,13 +86,13 @@ groups:
|
||||
severity: critical
|
||||
annotations:
|
||||
message: 'The transaction pool size on node {{ $labels.instance }} has
|
||||
been above 10_000 for the last 5 minutes.'
|
||||
been above 10_000 for more than 5 minutes.'
|
||||
|
||||
##############################################################################
|
||||
# Networking
|
||||
##############################################################################
|
||||
|
||||
- alert: LowNumberOfPeers
|
||||
- alert: NumberOfPeersLow
|
||||
expr: polkadot_sub_libp2p_peers_count < 3
|
||||
for: 3m
|
||||
labels:
|
||||
@@ -100,7 +100,7 @@ groups:
|
||||
annotations:
|
||||
message: 'The node {{ $labels.instance }} has less than 3 peers for more
|
||||
than 3 minutes'
|
||||
- alert: LowNumberOfPeers
|
||||
- alert: NumberOfPeersLow
|
||||
expr: polkadot_sub_libp2p_peers_count < 3
|
||||
for: 15m
|
||||
labels:
|
||||
@@ -113,7 +113,7 @@ groups:
|
||||
# System
|
||||
##############################################################################
|
||||
|
||||
- alert: HighNumberOfFileDescriptors
|
||||
- alert: NumberOfFileDescriptorsHigh
|
||||
expr: 'node_filefd_allocated{domain=~"kusama|polkadot"} > 10000'
|
||||
for: 3m
|
||||
labels:
|
||||
@@ -126,7 +126,7 @@ groups:
|
||||
# Others
|
||||
##############################################################################
|
||||
|
||||
- alert: AuthorityDiscoveryHighDiscoveryFailure
|
||||
- alert: AuthorityDiscoveryDiscoveryFailureHigh
|
||||
expr: 'polkadot_authority_discovery_handle_value_found_event_failure /
|
||||
ignoring(name)
|
||||
polkadot_authority_discovery_dht_event_received{name="value_found"} > 0.5'
|
||||
@@ -134,5 +134,6 @@ groups:
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
message: "Authority discovery on node {{ $labels.instance }} fails to
|
||||
process more than 50 % of the values found on the DHT."
|
||||
message: 'Authority discovery on node {{ $labels.instance }} fails to
|
||||
process more than 50 % of the values found on the DHT for more than 2
|
||||
hours.'
|
||||
|
||||
Reference in New Issue
Block a user