mirror of
https://github.com/pezkuwichain/pezkuwi-subxt.git
synced 2026-06-15 22:01:04 +00:00
.maintain/monitoring: Add alerting rule tests (#6343)
* .maintain/monitoring: Add alerting rule tests
* .maintain/monitoring/alerting-rules/alerting-rules.yaml: Break lines
* .gitlab-ci.yml: Add promtool rule testing step
This commit is contained in:
@@ -367,6 +367,7 @@ test-prometheus-alerting-rules:
|
|||||||
- curl -L https://github.com/prometheus/prometheus/releases/download/v2.19.0/prometheus-2.19.0.linux-amd64.tar.gz --output prometheus.tar.gz
|
- curl -L https://github.com/prometheus/prometheus/releases/download/v2.19.0/prometheus-2.19.0.linux-amd64.tar.gz --output prometheus.tar.gz
|
||||||
- tar -xzf prometheus.tar.gz
|
- tar -xzf prometheus.tar.gz
|
||||||
- ./prometheus-*/promtool check rules .maintain/monitoring/alerting-rules/alerting-rules.yaml
|
- ./prometheus-*/promtool check rules .maintain/monitoring/alerting-rules/alerting-rules.yaml
|
||||||
|
- cat .maintain/monitoring/alerting-rules/alerting-rules.yaml | ./prometheus-*/promtool test rules .maintain/monitoring/alerting-rules/alerting-rule-tests.yaml
|
||||||
|
|
||||||
#### stage: build
|
#### stage: build
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,239 @@
|
|||||||
|
rule_files:
|
||||||
|
- /dev/stdin
|
||||||
|
|
||||||
|
evaluation_interval: 1m
|
||||||
|
|
||||||
|
tests:
|
||||||
|
- interval: 1m
|
||||||
|
input_series:
|
||||||
|
- series: 'polkadot_sub_libp2p_peers_count{
|
||||||
|
job="polkadot",
|
||||||
|
pod="polkadot-abcdef01234-abcdef",
|
||||||
|
instance="polkadot-abcdef01234-abcdef",
|
||||||
|
}'
|
||||||
|
values: '3 2+0x4 1+0x9' # 3 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1
|
||||||
|
|
||||||
|
- series: 'polkadot_sub_txpool_validations_scheduled{
|
||||||
|
job="polkadot",
|
||||||
|
pod="polkadot-abcdef01234-abcdef",
|
||||||
|
instance="polkadot-abcdef01234-abcdef",
|
||||||
|
}'
|
||||||
|
values: '10+1x30' # 10 11 12 13 .. 40
|
||||||
|
|
||||||
|
- series: 'polkadot_sub_txpool_validations_finished{
|
||||||
|
job="polkadot",
|
||||||
|
pod="polkadot-abcdef01234-abcdef",
|
||||||
|
instance="polkadot-abcdef01234-abcdef",
|
||||||
|
}'
|
||||||
|
values: '0x30' # 0 0 0 0 .. 0
|
||||||
|
|
||||||
|
- series: 'polkadot_block_height{
|
||||||
|
status="best", job="polkadot",
|
||||||
|
pod="polkadot-abcdef01234-abcdef",
|
||||||
|
instance="polkadot-abcdef01234-abcdef",
|
||||||
|
}'
|
||||||
|
values: '1+1x3 4+0x13' # 1 2 3 4 4 4 4 4 4 4 4 4 ...
|
||||||
|
|
||||||
|
- series: 'polkadot_block_height{
|
||||||
|
status="finalized",
|
||||||
|
job="polkadot",
|
||||||
|
pod="polkadot-abcdef01234-abcdef",
|
||||||
|
instance="polkadot-abcdef01234-abcdef",
|
||||||
|
}'
|
||||||
|
values: '1+1x3 4+0x13' # 1 2 3 4 4 4 4 4 4 4 4 4 ...
|
||||||
|
|
||||||
|
- series: 'polkadot_cpu_usage_percentage{
|
||||||
|
job="polkadot",
|
||||||
|
pod="polkadot-abcdef01234-abcdef",
|
||||||
|
instance="polkadot-abcdef01234-abcdef",
|
||||||
|
}'
|
||||||
|
values: '0+20x5 100+0x5' # 0 20 40 60 80 100 100 100 100 100 100 100
|
||||||
|
|
||||||
|
alert_rule_test:
|
||||||
|
|
||||||
|
######################################################################
|
||||||
|
# Resource usage
|
||||||
|
######################################################################
|
||||||
|
|
||||||
|
- eval_time: 9m
|
||||||
|
alertname: HighCPUUsage
|
||||||
|
exp_alerts:
|
||||||
|
- eval_time: 10m
|
||||||
|
alertname: HighCPUUsage
|
||||||
|
exp_alerts:
|
||||||
|
- exp_labels:
|
||||||
|
severity: warning
|
||||||
|
pod: polkadot-abcdef01234-abcdef
|
||||||
|
instance: polkadot-abcdef01234-abcdef
|
||||||
|
job: polkadot
|
||||||
|
exp_annotations:
|
||||||
|
message: "The node polkadot-abcdef01234-abcdef has a CPU
|
||||||
|
usage higher than 100% for more than 5 minutes"
|
||||||
|
|
||||||
|
######################################################################
|
||||||
|
# Block production
|
||||||
|
######################################################################
|
||||||
|
|
||||||
|
- eval_time: 6m
|
||||||
|
alertname: LowNumberOfNewBlocks
|
||||||
|
exp_alerts:
|
||||||
|
- eval_time: 7m
|
||||||
|
alertname: LowNumberOfNewBlocks
|
||||||
|
exp_alerts:
|
||||||
|
- exp_labels:
|
||||||
|
severity: warning
|
||||||
|
pod: polkadot-abcdef01234-abcdef
|
||||||
|
instance: polkadot-abcdef01234-abcdef
|
||||||
|
job: polkadot
|
||||||
|
status: best
|
||||||
|
exp_annotations:
|
||||||
|
message: "Less than one new block per minute on instance
|
||||||
|
polkadot-abcdef01234-abcdef."
|
||||||
|
|
||||||
|
- eval_time: 14m
|
||||||
|
alertname: LowNumberOfNewBlocks
|
||||||
|
exp_alerts:
|
||||||
|
- exp_labels:
|
||||||
|
severity: warning
|
||||||
|
pod: polkadot-abcdef01234-abcdef
|
||||||
|
instance: polkadot-abcdef01234-abcdef
|
||||||
|
job: polkadot
|
||||||
|
status: best
|
||||||
|
exp_annotations:
|
||||||
|
message: "Less than one new block per minute on instance
|
||||||
|
polkadot-abcdef01234-abcdef."
|
||||||
|
- exp_labels:
|
||||||
|
severity: critical
|
||||||
|
pod: polkadot-abcdef01234-abcdef
|
||||||
|
instance: polkadot-abcdef01234-abcdef
|
||||||
|
job: polkadot
|
||||||
|
status: best
|
||||||
|
exp_annotations:
|
||||||
|
message: "Less than one new block per minute on instance
|
||||||
|
polkadot-abcdef01234-abcdef."
|
||||||
|
|
||||||
|
######################################################################
|
||||||
|
# Block finalization
|
||||||
|
######################################################################
|
||||||
|
|
||||||
|
- eval_time: 6m
|
||||||
|
alertname: BlockFinalizationSlow
|
||||||
|
exp_alerts:
|
||||||
|
- eval_time: 7m
|
||||||
|
alertname: BlockFinalizationSlow
|
||||||
|
exp_alerts:
|
||||||
|
- exp_labels:
|
||||||
|
severity: warning
|
||||||
|
pod: polkadot-abcdef01234-abcdef
|
||||||
|
instance: polkadot-abcdef01234-abcdef
|
||||||
|
job: polkadot
|
||||||
|
status: finalized
|
||||||
|
exp_annotations:
|
||||||
|
message: "Finalized block on instance
|
||||||
|
polkadot-abcdef01234-abcdef increases by less than 1 per
|
||||||
|
minute."
|
||||||
|
|
||||||
|
- eval_time: 14m
|
||||||
|
alertname: BlockFinalizationSlow
|
||||||
|
exp_alerts:
|
||||||
|
- exp_labels:
|
||||||
|
severity: warning
|
||||||
|
pod: polkadot-abcdef01234-abcdef
|
||||||
|
instance: polkadot-abcdef01234-abcdef
|
||||||
|
job: polkadot
|
||||||
|
status: finalized
|
||||||
|
exp_annotations:
|
||||||
|
message: "Finalized block on instance
|
||||||
|
polkadot-abcdef01234-abcdef increases by less than 1 per
|
||||||
|
minute."
|
||||||
|
- exp_labels:
|
||||||
|
severity: critical
|
||||||
|
pod: polkadot-abcdef01234-abcdef
|
||||||
|
instance: polkadot-abcdef01234-abcdef
|
||||||
|
job: polkadot
|
||||||
|
status: finalized
|
||||||
|
exp_annotations:
|
||||||
|
message: "Finalized block on instance
|
||||||
|
polkadot-abcdef01234-abcdef increases by less than 1 per
|
||||||
|
minute."
|
||||||
|
|
||||||
|
######################################################################
|
||||||
|
# Transaction queue
|
||||||
|
######################################################################
|
||||||
|
|
||||||
|
- eval_time: 10m
|
||||||
|
alertname: TransactionQueueSize
|
||||||
|
exp_alerts:
|
||||||
|
- eval_time: 11m
|
||||||
|
alertname: TransactionQueueSize
|
||||||
|
exp_alerts:
|
||||||
|
- exp_labels:
|
||||||
|
severity: warning
|
||||||
|
pod: polkadot-abcdef01234-abcdef
|
||||||
|
instance: polkadot-abcdef01234-abcdef
|
||||||
|
job: polkadot
|
||||||
|
exp_annotations:
|
||||||
|
message: "The node polkadot-abcdef01234-abcdef has more
|
||||||
|
than 10 transactions in the queue for more than 10
|
||||||
|
minutes"
|
||||||
|
|
||||||
|
- eval_time: 31m
|
||||||
|
alertname: TransactionQueueSize
|
||||||
|
exp_alerts:
|
||||||
|
- exp_labels:
|
||||||
|
severity: warning
|
||||||
|
pod: polkadot-abcdef01234-abcdef
|
||||||
|
instance: polkadot-abcdef01234-abcdef
|
||||||
|
job: polkadot
|
||||||
|
exp_annotations:
|
||||||
|
message: "The node polkadot-abcdef01234-abcdef has more
|
||||||
|
than 10 transactions in the queue for more than 10
|
||||||
|
minutes"
|
||||||
|
- exp_labels:
|
||||||
|
severity: critical
|
||||||
|
pod: polkadot-abcdef01234-abcdef
|
||||||
|
instance: polkadot-abcdef01234-abcdef
|
||||||
|
job: polkadot
|
||||||
|
exp_annotations:
|
||||||
|
message: "The node polkadot-abcdef01234-abcdef has more
|
||||||
|
than 10 transactions in the queue for more than 30
|
||||||
|
minutes"
|
||||||
|
|
||||||
|
######################################################################
|
||||||
|
# Networking
|
||||||
|
######################################################################
|
||||||
|
|
||||||
|
- eval_time: 3m # Values: 3 2 2
|
||||||
|
alertname: LowNumberOfPeers
|
||||||
|
exp_alerts:
|
||||||
|
- eval_time: 4m # Values: 2 2 2
|
||||||
|
alertname: LowNumberOfPeers
|
||||||
|
exp_alerts:
|
||||||
|
- exp_labels:
|
||||||
|
severity: warning
|
||||||
|
pod: polkadot-abcdef01234-abcdef
|
||||||
|
instance: polkadot-abcdef01234-abcdef
|
||||||
|
job: polkadot
|
||||||
|
exp_annotations:
|
||||||
|
message: "The node polkadot-abcdef01234-abcdef has less
|
||||||
|
than 3 peers for more than 3 minutes"
|
||||||
|
|
||||||
|
- eval_time: 16m # Values: 3 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1
|
||||||
|
alertname: LowNumberOfPeers
|
||||||
|
exp_alerts:
|
||||||
|
- exp_labels:
|
||||||
|
severity: warning
|
||||||
|
pod: polkadot-abcdef01234-abcdef
|
||||||
|
instance: polkadot-abcdef01234-abcdef
|
||||||
|
job: polkadot
|
||||||
|
exp_annotations:
|
||||||
|
message: "The node polkadot-abcdef01234-abcdef has less
|
||||||
|
than 3 peers for more than 3 minutes"
|
||||||
|
- exp_labels:
|
||||||
|
severity: critical
|
||||||
|
pod: polkadot-abcdef01234-abcdef
|
||||||
|
instance: polkadot-abcdef01234-abcdef
|
||||||
|
job: polkadot
|
||||||
|
exp_annotations:
|
||||||
|
message: "The node polkadot-abcdef01234-abcdef has less
|
||||||
|
than 3 peers for more than 15 minutes"
|
||||||
@@ -12,7 +12,8 @@ groups:
|
|||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
message: 'The node {{ $labels.instance }} has a CPU usage higher than 100% for more than 5 minutes'
|
message: 'The node {{ $labels.instance }} has a CPU usage higher than 100%
|
||||||
|
for more than 5 minutes'
|
||||||
|
|
||||||
##############################################################################
|
##############################################################################
|
||||||
# Block production
|
# Block production
|
||||||
@@ -20,14 +21,16 @@ groups:
|
|||||||
|
|
||||||
- alert: LowNumberOfNewBlocks
|
- alert: LowNumberOfNewBlocks
|
||||||
annotations:
|
annotations:
|
||||||
message: 'Less than one new block per minute on instance {{ $labels.instance }}.'
|
message: 'Less than one new block per minute on instance {{
|
||||||
|
$labels.instance }}.'
|
||||||
expr: increase(polkadot_block_height{status="best"}[1m]) < 1
|
expr: increase(polkadot_block_height{status="best"}[1m]) < 1
|
||||||
for: 3m
|
for: 3m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: LowNumberOfNewBlocks
|
- alert: LowNumberOfNewBlocks
|
||||||
annotations:
|
annotations:
|
||||||
message: 'Less than one new block per minute on instance {{ $labels.instance }}.'
|
message: 'Less than one new block per minute on instance {{
|
||||||
|
$labels.instance }}.'
|
||||||
expr: increase(polkadot_block_height{status="best"}[1m]) < 1
|
expr: increase(polkadot_block_height{status="best"}[1m]) < 1
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
@@ -43,43 +46,51 @@ groups:
|
|||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
message: 'Finalized block on instance {{ $labels.instance }} increases by less than 1 per minute.'
|
message: 'Finalized block on instance {{ $labels.instance }} increases by
|
||||||
|
less than 1 per minute.'
|
||||||
- alert: BlockFinalizationSlow
|
- alert: BlockFinalizationSlow
|
||||||
expr: increase(polkadot_block_height{status="finalized"}[1m]) < 1
|
expr: increase(polkadot_block_height{status="finalized"}[1m]) < 1
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
message: 'Finalized block on instance {{ $labels.instance }} increases by less than 1 per minute.'
|
message: 'Finalized block on instance {{ $labels.instance }} increases by
|
||||||
|
less than 1 per minute.'
|
||||||
- alert: BlockFinalizationLaggingBehind
|
- alert: BlockFinalizationLaggingBehind
|
||||||
# Under the assumption of an average block production of 6 seconds,
|
# Under the assumption of an average block production of 6 seconds,
|
||||||
# "best" and "finalized" being more than 10 blocks apart would imply
|
# "best" and "finalized" being more than 10 blocks apart would imply
|
||||||
# more than a 1 minute delay between block production and finalization.
|
# more than a 1 minute delay between block production and finalization.
|
||||||
expr: (polkadot_block_height_number{status="best"} - ignoring(status) polkadot_block_height_number{status="finalized"}) > 10
|
expr: '(polkadot_block_height_number{status="best"} - ignoring(status)
|
||||||
|
polkadot_block_height_number{status="finalized"}) > 10'
|
||||||
for: 8m
|
for: 8m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
message: "Block finalization on instance {{ $labels.instance }} is behind block production by {{ $value }} for more than 8m"
|
message: "Block finalization on instance {{ $labels.instance }} is behind
|
||||||
|
block production by {{ $value }} for more than 8m"
|
||||||
|
|
||||||
##############################################################################
|
##############################################################################
|
||||||
# Transaction queue
|
# Transaction queue
|
||||||
##############################################################################
|
##############################################################################
|
||||||
|
|
||||||
- alert: TransactionQueueSize
|
- alert: TransactionQueueSize
|
||||||
expr: polkadot_sub_txpool_validations_scheduled - polkadot_sub_txpool_validations_finished > 10
|
expr: 'polkadot_sub_txpool_validations_scheduled -
|
||||||
|
polkadot_sub_txpool_validations_finished > 10'
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
message: 'The node {{ $labels.instance }} has more than 10 transactions in the queue for more than 10 minutes'
|
message: 'The node {{ $labels.instance }} has more than 10 transactions in
|
||||||
|
the queue for more than 10 minutes'
|
||||||
- alert: TransactionQueueSize
|
- alert: TransactionQueueSize
|
||||||
expr: polkadot_sub_txpool_validations_scheduled - polkadot_sub_txpool_validations_finished > 10
|
expr: 'polkadot_sub_txpool_validations_scheduled -
|
||||||
|
polkadot_sub_txpool_validations_finished > 10'
|
||||||
for: 30m
|
for: 30m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
message: 'The node {{ $labels.instance }} has more than 10 transactions in the queue for more than 30 minutes'
|
message: 'The node {{ $labels.instance }} has more than 10 transactions in
|
||||||
|
the queue for more than 30 minutes'
|
||||||
|
|
||||||
##############################################################################
|
##############################################################################
|
||||||
# Networking
|
# Networking
|
||||||
@@ -91,23 +102,28 @@ groups:
|
|||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
message: 'The node {{ $labels.instance }} has less than 3 peers for more than 3 minutes'
|
message: 'The node {{ $labels.instance }} has less than 3 peers for more
|
||||||
|
than 3 minutes'
|
||||||
- alert: LowNumberOfPeers
|
- alert: LowNumberOfPeers
|
||||||
expr: polkadot_sub_libp2p_peers_count < 3
|
expr: polkadot_sub_libp2p_peers_count < 3
|
||||||
for: 15m
|
for: 15m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
message: 'The node {{ $labels.instance }} has less than 3 peers for more than 15 minutes'
|
message: 'The node {{ $labels.instance }} has less than 3 peers for more
|
||||||
|
than 15 minutes'
|
||||||
|
|
||||||
##############################################################################
|
##############################################################################
|
||||||
# Others
|
# Others
|
||||||
##############################################################################
|
##############################################################################
|
||||||
|
|
||||||
- alert: AuthorityDiscoveryHighDiscoveryFailure
|
- alert: AuthorityDiscoveryHighDiscoveryFailure
|
||||||
expr: polkadot_authority_discovery_handle_value_found_event_failure / ignoring(name) polkadot_authority_discovery_dht_event_received{name="value_found"} > 0.5
|
expr: 'polkadot_authority_discovery_handle_value_found_event_failure /
|
||||||
|
ignoring(name)
|
||||||
|
polkadot_authority_discovery_dht_event_received{name="value_found"} > 0.5'
|
||||||
for: 2h
|
for: 2h
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
message: "Authority discovery on node {{ $labels.instance }} fails to process more than 50 % of the values found on the DHT."
|
message: "Authority discovery on node {{ $labels.instance }} fails to
|
||||||
|
process more than 50 % of the values found on the DHT."
|
||||||
|
|||||||
Reference in New Issue
Block a user