mirror of
https://github.com/pezkuwichain/pezkuwi-subxt.git
synced 2026-06-14 04:01:10 +00:00
Substrate alerts rules update (#10642)
* .maintain/monitoring: Update substrate prometheus alert rules
  * match the `substrate_` metrics prefix in alerts instead of `polkadot_`, following changes in #9543
  * remove the filtering on polkadot|kusama domain for NumberOfFileDescriptorsHigh alert
* .maintain/monitoring: Update substrate Grafana dashboards
  * match the `substrate_` metrics prefix instead of `polkadot_` in dashboards, following changes in #9543
* .maintain/monitoring: make the NumberOfFileDescriptorsHigh alert only apply for metrics tagged with 'chain'
This commit is contained in:
@@ -6,39 +6,39 @@ evaluation_interval: 1m
|
||||
tests:
|
||||
- interval: 1m
|
||||
input_series:
|
||||
- series: 'polkadot_sub_libp2p_peers_count{
|
||||
job="polkadot",
|
||||
pod="polkadot-abcdef01234-abcdef",
|
||||
instance="polkadot-abcdef01234-abcdef",
|
||||
- series: 'substrate_sub_libp2p_peers_count{
|
||||
job="substrate",
|
||||
pod="substrate-abcdef01234-abcdef",
|
||||
instance="substrate-abcdef01234-abcdef",
|
||||
}'
|
||||
values: '3 2+0x4 1+0x9' # 3 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1
|
||||
|
||||
- series: 'polkadot_sub_txpool_validations_scheduled{
|
||||
job="polkadot",
|
||||
pod="polkadot-abcdef01234-abcdef",
|
||||
instance="polkadot-abcdef01234-abcdef",
|
||||
- series: 'substrate_sub_txpool_validations_scheduled{
|
||||
job="substrate",
|
||||
pod="substrate-abcdef01234-abcdef",
|
||||
instance="substrate-abcdef01234-abcdef",
|
||||
}'
|
||||
values: '11+1x10 22+2x30 10043x5'
|
||||
|
||||
- series: 'polkadot_sub_txpool_validations_finished{
|
||||
job="polkadot",
|
||||
pod="polkadot-abcdef01234-abcdef",
|
||||
instance="polkadot-abcdef01234-abcdef",
|
||||
- series: 'substrate_sub_txpool_validations_finished{
|
||||
job="substrate",
|
||||
pod="substrate-abcdef01234-abcdef",
|
||||
instance="substrate-abcdef01234-abcdef",
|
||||
}'
|
||||
values: '0+1x42 42x5'
|
||||
|
||||
- series: 'polkadot_block_height{
|
||||
status="best", job="polkadot",
|
||||
pod="polkadot-abcdef01234-abcdef",
|
||||
instance="polkadot-abcdef01234-abcdef",
|
||||
- series: 'substrate_block_height{
|
||||
status="best", job="substrate",
|
||||
pod="substrate-abcdef01234-abcdef",
|
||||
instance="substrate-abcdef01234-abcdef",
|
||||
}'
|
||||
values: '1+1x3 4+0x13' # 1 2 3 4 4 4 4 4 4 4 4 4 ...
|
||||
|
||||
- series: 'polkadot_block_height{
|
||||
- series: 'substrate_block_height{
|
||||
status="finalized",
|
||||
job="polkadot",
|
||||
pod="polkadot-abcdef01234-abcdef",
|
||||
instance="polkadot-abcdef01234-abcdef",
|
||||
job="substrate",
|
||||
pod="substrate-abcdef01234-abcdef",
|
||||
instance="substrate-abcdef01234-abcdef",
|
||||
}'
|
||||
values: '1+1x3 4+0x13' # 1 2 3 4 4 4 4 4 4 4 4 4 ...
|
||||
|
||||
@@ -56,13 +56,13 @@ tests:
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
severity: warning
|
||||
pod: polkadot-abcdef01234-abcdef
|
||||
instance: polkadot-abcdef01234-abcdef
|
||||
job: polkadot
|
||||
pod: substrate-abcdef01234-abcdef
|
||||
instance: substrate-abcdef01234-abcdef
|
||||
job: substrate
|
||||
status: best
|
||||
exp_annotations:
|
||||
message: "Best block on instance
|
||||
polkadot-abcdef01234-abcdef increases by less than 1 per
|
||||
substrate-abcdef01234-abcdef increases by less than 1 per
|
||||
minute for more than 3 minutes."
|
||||
|
||||
- eval_time: 14m
|
||||
@@ -70,23 +70,23 @@ tests:
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
severity: warning
|
||||
pod: polkadot-abcdef01234-abcdef
|
||||
instance: polkadot-abcdef01234-abcdef
|
||||
job: polkadot
|
||||
pod: substrate-abcdef01234-abcdef
|
||||
instance: substrate-abcdef01234-abcdef
|
||||
job: substrate
|
||||
status: best
|
||||
exp_annotations:
|
||||
message: "Best block on instance
|
||||
polkadot-abcdef01234-abcdef increases by less than 1 per
|
||||
substrate-abcdef01234-abcdef increases by less than 1 per
|
||||
minute for more than 3 minutes."
|
||||
- exp_labels:
|
||||
severity: critical
|
||||
pod: polkadot-abcdef01234-abcdef
|
||||
instance: polkadot-abcdef01234-abcdef
|
||||
job: polkadot
|
||||
pod: substrate-abcdef01234-abcdef
|
||||
instance: substrate-abcdef01234-abcdef
|
||||
job: substrate
|
||||
status: best
|
||||
exp_annotations:
|
||||
message: "Best block on instance
|
||||
polkadot-abcdef01234-abcdef increases by less than 1 per
|
||||
substrate-abcdef01234-abcdef increases by less than 1 per
|
||||
minute for more than 10 minutes."
|
||||
|
||||
######################################################################
|
||||
@@ -101,13 +101,13 @@ tests:
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
severity: warning
|
||||
pod: polkadot-abcdef01234-abcdef
|
||||
instance: polkadot-abcdef01234-abcdef
|
||||
job: polkadot
|
||||
pod: substrate-abcdef01234-abcdef
|
||||
instance: substrate-abcdef01234-abcdef
|
||||
job: substrate
|
||||
status: finalized
|
||||
exp_annotations:
|
||||
message: "Finalized block on instance
|
||||
polkadot-abcdef01234-abcdef increases by less than 1 per
|
||||
substrate-abcdef01234-abcdef increases by less than 1 per
|
||||
minute for more than 3 minutes."
|
||||
|
||||
- eval_time: 14m
|
||||
@@ -115,23 +115,23 @@ tests:
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
severity: warning
|
||||
pod: polkadot-abcdef01234-abcdef
|
||||
instance: polkadot-abcdef01234-abcdef
|
||||
job: polkadot
|
||||
pod: substrate-abcdef01234-abcdef
|
||||
instance: substrate-abcdef01234-abcdef
|
||||
job: substrate
|
||||
status: finalized
|
||||
exp_annotations:
|
||||
message: "Finalized block on instance
|
||||
polkadot-abcdef01234-abcdef increases by less than 1 per
|
||||
substrate-abcdef01234-abcdef increases by less than 1 per
|
||||
minute for more than 3 minutes."
|
||||
- exp_labels:
|
||||
severity: critical
|
||||
pod: polkadot-abcdef01234-abcdef
|
||||
instance: polkadot-abcdef01234-abcdef
|
||||
job: polkadot
|
||||
pod: substrate-abcdef01234-abcdef
|
||||
instance: substrate-abcdef01234-abcdef
|
||||
job: substrate
|
||||
status: finalized
|
||||
exp_annotations:
|
||||
message: "Finalized block on instance
|
||||
polkadot-abcdef01234-abcdef increases by less than 1 per
|
||||
substrate-abcdef01234-abcdef increases by less than 1 per
|
||||
minute for more than 10 minutes."
|
||||
|
||||
######################################################################
|
||||
@@ -152,12 +152,12 @@ tests:
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
severity: warning
|
||||
pod: polkadot-abcdef01234-abcdef
|
||||
instance: polkadot-abcdef01234-abcdef
|
||||
job: polkadot
|
||||
pod: substrate-abcdef01234-abcdef
|
||||
instance: substrate-abcdef01234-abcdef
|
||||
job: substrate
|
||||
exp_annotations:
|
||||
message: "The transaction pool size on node
|
||||
polkadot-abcdef01234-abcdef has been monotonically
|
||||
substrate-abcdef01234-abcdef has been monotonically
|
||||
increasing for more than 10 minutes."
|
||||
- eval_time: 43m
|
||||
alertname: TransactionQueueSizeIncreasing
|
||||
@@ -167,21 +167,21 @@ tests:
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
severity: warning
|
||||
pod: polkadot-abcdef01234-abcdef
|
||||
instance: polkadot-abcdef01234-abcdef
|
||||
job: polkadot
|
||||
pod: substrate-abcdef01234-abcdef
|
||||
instance: substrate-abcdef01234-abcdef
|
||||
job: substrate
|
||||
exp_annotations:
|
||||
message: "The transaction pool size on node
|
||||
polkadot-abcdef01234-abcdef has been monotonically
|
||||
substrate-abcdef01234-abcdef has been monotonically
|
||||
increasing for more than 10 minutes."
|
||||
- exp_labels:
|
||||
severity: warning
|
||||
pod: polkadot-abcdef01234-abcdef
|
||||
instance: polkadot-abcdef01234-abcdef
|
||||
job: polkadot
|
||||
pod: substrate-abcdef01234-abcdef
|
||||
instance: substrate-abcdef01234-abcdef
|
||||
job: substrate
|
||||
exp_annotations:
|
||||
message: "The transaction pool size on node
|
||||
polkadot-abcdef01234-abcdef has been monotonically
|
||||
substrate-abcdef01234-abcdef has been monotonically
|
||||
increasing for more than 30 minutes."
|
||||
- eval_time: 49m
|
||||
alertname: TransactionQueueSizeHigh
|
||||
@@ -191,12 +191,12 @@ tests:
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
severity: warning
|
||||
pod: polkadot-abcdef01234-abcdef
|
||||
instance: polkadot-abcdef01234-abcdef
|
||||
job: polkadot
|
||||
pod: substrate-abcdef01234-abcdef
|
||||
instance: substrate-abcdef01234-abcdef
|
||||
job: substrate
|
||||
exp_annotations:
|
||||
message: "The transaction pool size on node
|
||||
polkadot-abcdef01234-abcdef has been above 10_000 for more
|
||||
substrate-abcdef01234-abcdef has been above 10_000 for more
|
||||
than 5 minutes."
|
||||
|
||||
######################################################################
|
||||
@@ -211,11 +211,11 @@ tests:
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
severity: warning
|
||||
pod: polkadot-abcdef01234-abcdef
|
||||
instance: polkadot-abcdef01234-abcdef
|
||||
job: polkadot
|
||||
pod: substrate-abcdef01234-abcdef
|
||||
instance: substrate-abcdef01234-abcdef
|
||||
job: substrate
|
||||
exp_annotations:
|
||||
message: "The node polkadot-abcdef01234-abcdef has less
|
||||
message: "The node substrate-abcdef01234-abcdef has less
|
||||
than 3 peers for more than 3 minutes"
|
||||
|
||||
- eval_time: 16m # Values: 3 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1
|
||||
@@ -223,17 +223,17 @@ tests:
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
severity: warning
|
||||
pod: polkadot-abcdef01234-abcdef
|
||||
instance: polkadot-abcdef01234-abcdef
|
||||
job: polkadot
|
||||
pod: substrate-abcdef01234-abcdef
|
||||
instance: substrate-abcdef01234-abcdef
|
||||
job: substrate
|
||||
exp_annotations:
|
||||
message: "The node polkadot-abcdef01234-abcdef has less
|
||||
message: "The node substrate-abcdef01234-abcdef has less
|
||||
than 3 peers for more than 3 minutes"
|
||||
- exp_labels:
|
||||
severity: critical
|
||||
pod: polkadot-abcdef01234-abcdef
|
||||
instance: polkadot-abcdef01234-abcdef
|
||||
job: polkadot
|
||||
pod: substrate-abcdef01234-abcdef
|
||||
instance: substrate-abcdef01234-abcdef
|
||||
job: substrate
|
||||
exp_annotations:
|
||||
message: "The node polkadot-abcdef01234-abcdef has less
|
||||
message: "The node substrate-abcdef01234-abcdef has less
|
||||
than 3 peers for more than 15 minutes"
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
groups:
|
||||
- name: polkadot.rules
|
||||
- name: substrate.rules
|
||||
rules:
|
||||
|
||||
##############################################################################
|
||||
@@ -10,7 +10,7 @@ groups:
|
||||
annotations:
|
||||
message: 'Best block on instance {{ $labels.instance }} increases by
|
||||
less than 1 per minute for more than 3 minutes.'
|
||||
expr: increase(polkadot_block_height{status="best"}[1m]) < 1
|
||||
expr: increase(substrate_block_height{status="best"}[1m]) < 1
|
||||
for: 3m
|
||||
labels:
|
||||
severity: warning
|
||||
@@ -18,7 +18,7 @@ groups:
|
||||
annotations:
|
||||
message: 'Best block on instance {{ $labels.instance }} increases by
|
||||
less than 1 per minute for more than 10 minutes.'
|
||||
expr: increase(polkadot_block_height{status="best"}[1m]) < 1
|
||||
expr: increase(substrate_block_height{status="best"}[1m]) < 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
@@ -28,7 +28,7 @@ groups:
|
||||
##############################################################################
|
||||
|
||||
- alert: BlockFinalizationSlow
|
||||
expr: increase(polkadot_block_height{status="finalized"}[1m]) < 1
|
||||
expr: increase(substrate_block_height{status="finalized"}[1m]) < 1
|
||||
for: 3m
|
||||
labels:
|
||||
severity: warning
|
||||
@@ -36,7 +36,7 @@ groups:
|
||||
message: 'Finalized block on instance {{ $labels.instance }} increases by
|
||||
less than 1 per minute for more than 3 minutes.'
|
||||
- alert: BlockFinalizationSlow
|
||||
expr: increase(polkadot_block_height{status="finalized"}[1m]) < 1
|
||||
expr: increase(substrate_block_height{status="finalized"}[1m]) < 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
@@ -47,8 +47,8 @@ groups:
|
||||
# Under the assumption of an average block production of 6 seconds,
|
||||
# "best" and "finalized" being more than 10 blocks apart would imply
|
||||
# more than a 1 minute delay between block production and finalization.
|
||||
expr: '(polkadot_block_height{status="best"} - ignoring(status)
|
||||
polkadot_block_height{status="finalized"}) > 10'
|
||||
expr: '(substrate_block_height{status="best"} - ignoring(status)
|
||||
substrate_block_height{status="finalized"}) > 10'
|
||||
for: 8m
|
||||
labels:
|
||||
severity: critical
|
||||
@@ -61,8 +61,8 @@ groups:
|
||||
##############################################################################
|
||||
|
||||
- alert: TransactionQueueSizeIncreasing
|
||||
expr: 'increase(polkadot_sub_txpool_validations_scheduled[5m]) -
|
||||
increase(polkadot_sub_txpool_validations_finished[5m]) > 0'
|
||||
expr: 'increase(substrate_sub_txpool_validations_scheduled[5m]) -
|
||||
increase(substrate_sub_txpool_validations_finished[5m]) > 0'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
@@ -70,8 +70,8 @@ groups:
|
||||
message: 'The transaction pool size on node {{ $labels.instance }} has
|
||||
been monotonically increasing for more than 10 minutes.'
|
||||
- alert: TransactionQueueSizeIncreasing
|
||||
expr: 'increase(polkadot_sub_txpool_validations_scheduled[5m]) -
|
||||
increase(polkadot_sub_txpool_validations_finished[5m]) > 0'
|
||||
expr: 'increase(substrate_sub_txpool_validations_scheduled[5m]) -
|
||||
increase(substrate_sub_txpool_validations_finished[5m]) > 0'
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
@@ -79,8 +79,8 @@ groups:
|
||||
message: 'The transaction pool size on node {{ $labels.instance }} has
|
||||
been monotonically increasing for more than 30 minutes.'
|
||||
- alert: TransactionQueueSizeHigh
|
||||
expr: 'polkadot_sub_txpool_validations_scheduled -
|
||||
polkadot_sub_txpool_validations_finished > 10000'
|
||||
expr: 'substrate_sub_txpool_validations_scheduled -
|
||||
substrate_sub_txpool_validations_finished > 10000'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
@@ -93,7 +93,7 @@ groups:
|
||||
##############################################################################
|
||||
|
||||
- alert: NumberOfPeersLow
|
||||
expr: polkadot_sub_libp2p_peers_count < 3
|
||||
expr: substrate_sub_libp2p_peers_count < 3
|
||||
for: 3m
|
||||
labels:
|
||||
severity: warning
|
||||
@@ -101,7 +101,7 @@ groups:
|
||||
message: 'The node {{ $labels.instance }} has less than 3 peers for more
|
||||
than 3 minutes'
|
||||
- alert: NumberOfPeersLow
|
||||
expr: polkadot_sub_libp2p_peers_count < 3
|
||||
expr: substrate_sub_libp2p_peers_count < 3
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
@@ -109,7 +109,7 @@ groups:
|
||||
message: 'The node {{ $labels.instance }} has less than 3 peers for more
|
||||
than 15 minutes'
|
||||
- alert: NoIncomingConnection
|
||||
expr: increase(polkadot_sub_libp2p_incoming_connections_total[20m]) == 0
|
||||
expr: increase(substrate_sub_libp2p_incoming_connections_total[20m]) == 0
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
@@ -121,7 +121,7 @@ groups:
|
||||
##############################################################################
|
||||
|
||||
- alert: NumberOfFileDescriptorsHigh
|
||||
expr: 'node_filefd_allocated{domain=~"kusama|polkadot"} > 10000'
|
||||
expr: 'node_filefd_allocated{chain!=""} > 10000'
|
||||
for: 3m
|
||||
labels:
|
||||
severity: warning
|
||||
@@ -134,9 +134,9 @@ groups:
|
||||
##############################################################################
|
||||
|
||||
- alert: AuthorityDiscoveryDiscoveryFailureHigh
|
||||
expr: 'polkadot_authority_discovery_handle_value_found_event_failure /
|
||||
expr: 'substrate_authority_discovery_handle_value_found_event_failure /
|
||||
ignoring(name)
|
||||
polkadot_authority_discovery_dht_event_received{name="value_found"} > 0.5'
|
||||
substrate_authority_discovery_dht_event_received{name="value_found"} > 0.5'
|
||||
for: 2h
|
||||
labels:
|
||||
severity: warning
|
||||
@@ -147,9 +147,9 @@ groups:
|
||||
|
||||
- alert: UnboundedChannelPersistentlyLarge
|
||||
expr: '(
|
||||
(polkadot_unbounded_channel_len{action = "send"} -
|
||||
ignoring(action) polkadot_unbounded_channel_len{action = "received"})
|
||||
or on(instance) polkadot_unbounded_channel_len{action = "send"}
|
||||
(substrate_unbounded_channel_len{action = "send"} -
|
||||
ignoring(action) substrate_unbounded_channel_len{action = "received"})
|
||||
or on(instance) substrate_unbounded_channel_len{action = "send"}
|
||||
) >= 200'
|
||||
for: 5m
|
||||
labels:
|
||||
@@ -160,9 +160,9 @@ groups:
|
||||
|
||||
- alert: UnboundedChannelVeryLarge
|
||||
expr: '(
|
||||
(polkadot_unbounded_channel_len{action = "send"} -
|
||||
ignoring(action) polkadot_unbounded_channel_len{action = "received"})
|
||||
or on(instance) polkadot_unbounded_channel_len{action = "send"}
|
||||
(substrate_unbounded_channel_len{action = "send"} -
|
||||
ignoring(action) substrate_unbounded_channel_len{action = "received"})
|
||||
or on(instance) substrate_unbounded_channel_len{action = "send"}
|
||||
) > 15000'
|
||||
labels:
|
||||
severity: warning
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
"name": "VAR_METRIC_NAMESPACE",
|
||||
"type": "constant",
|
||||
"label": "Prefix of the metrics",
|
||||
"value": "polkadot",
|
||||
"value": "substrate",
|
||||
"description": ""
|
||||
}
|
||||
],
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
"name": "VAR_METRIC_NAMESPACE",
|
||||
"type": "constant",
|
||||
"label": "Prefix of the metrics",
|
||||
"value": "polkadot",
|
||||
"value": "substrate",
|
||||
"description": ""
|
||||
}
|
||||
],
|
||||
|
||||
Reference in New Issue
Block a user