# Listing metadata: 240 lines, 10 KiB, YAML
# Unit tests for the Substrate Prometheus alerting rules.
#
# The rules under test are read from standard input (`/dev/stdin`), so the
# alerting-rules file has to be piped into the test runner. The layout
# (`rule_files`, `evaluation_interval`, `tests`, `input_series`,
# `alert_rule_test`) follows the Prometheus rule unit-test file format.
rule_files:
  - /dev/stdin

evaluation_interval: 1m

tests:
  - interval: 1m
    # Synthetic samples for a single node; every series carries the same
    # `job`, `pod` and `instance` labels so the expected alerts below can
    # match on them.
    input_series:
      - series: 'substrate_sub_libp2p_peers_count{
          job="substrate",
          pod="substrate-abcdef01234-abcdef",
          instance="substrate-abcdef01234-abcdef",
        }'
        values: '3 2+0x4 1+0x9'  # 3 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1

      - series: 'substrate_sub_txpool_validations_scheduled{
          job="substrate",
          pod="substrate-abcdef01234-abcdef",
          instance="substrate-abcdef01234-abcdef",
        }'
        values: '11+1x10 22+2x30 10043x5'

      - series: 'substrate_sub_txpool_validations_finished{
          job="substrate",
          pod="substrate-abcdef01234-abcdef",
          instance="substrate-abcdef01234-abcdef",
        }'
        values: '0+1x42 42x5'

      - series: 'substrate_block_height{
          status="best",
          job="substrate",
          pod="substrate-abcdef01234-abcdef",
          instance="substrate-abcdef01234-abcdef",
        }'
        values: '1+1x3 4+0x13'  # 1 2 3 4 4 4 4 4 4 4 4 4 ...

      - series: 'substrate_block_height{
          status="finalized",
          job="substrate",
          pod="substrate-abcdef01234-abcdef",
          instance="substrate-abcdef01234-abcdef",
        }'
        values: '1+1x3 4+0x13'  # 1 2 3 4 4 4 4 4 4 4 4 4 ...

    # Each entry evaluates the rules at `eval_time` and lists exactly the
    # alerts named `alertname` that are expected to be firing at that time.
    # An empty list means no alert is expected.
    alert_rule_test:

      ######################################################################
      # Block production
      ######################################################################

      - eval_time: 6m
        alertname: BlockProductionSlow
        exp_alerts: []
      - eval_time: 7m
        alertname: BlockProductionSlow
        exp_alerts:
          - exp_labels:
              severity: warning
              pod: substrate-abcdef01234-abcdef
              instance: substrate-abcdef01234-abcdef
              job: substrate
              status: best
            exp_annotations:
              message: "Best block on instance
                substrate-abcdef01234-abcdef increases by less than 1 per
                minute for more than 3 minutes."

      - eval_time: 14m
        alertname: BlockProductionSlow
        exp_alerts:
          - exp_labels:
              severity: warning
              pod: substrate-abcdef01234-abcdef
              instance: substrate-abcdef01234-abcdef
              job: substrate
              status: best
            exp_annotations:
              message: "Best block on instance
                substrate-abcdef01234-abcdef increases by less than 1 per
                minute for more than 3 minutes."
          - exp_labels:
              severity: critical
              pod: substrate-abcdef01234-abcdef
              instance: substrate-abcdef01234-abcdef
              job: substrate
              status: best
            exp_annotations:
              message: "Best block on instance
                substrate-abcdef01234-abcdef increases by less than 1 per
                minute for more than 10 minutes."

      ######################################################################
      # Block finalization
      ######################################################################

      - eval_time: 6m
        alertname: BlockFinalizationSlow
        exp_alerts: []
      - eval_time: 7m
        alertname: BlockFinalizationSlow
        exp_alerts:
          - exp_labels:
              severity: warning
              pod: substrate-abcdef01234-abcdef
              instance: substrate-abcdef01234-abcdef
              job: substrate
              status: finalized
            exp_annotations:
              message: "Finalized block on instance
                substrate-abcdef01234-abcdef increases by less than 1 per
                minute for more than 3 minutes."

      - eval_time: 14m
        alertname: BlockFinalizationSlow
        exp_alerts:
          - exp_labels:
              severity: warning
              pod: substrate-abcdef01234-abcdef
              instance: substrate-abcdef01234-abcdef
              job: substrate
              status: finalized
            exp_annotations:
              message: "Finalized block on instance
                substrate-abcdef01234-abcdef increases by less than 1 per
                minute for more than 3 minutes."
          - exp_labels:
              severity: critical
              pod: substrate-abcdef01234-abcdef
              instance: substrate-abcdef01234-abcdef
              job: substrate
              status: finalized
            exp_annotations:
              message: "Finalized block on instance
                substrate-abcdef01234-abcdef increases by less than 1 per
                minute for more than 10 minutes."

      ######################################################################
      # Transaction queue
      ######################################################################

      - eval_time: 11m
        alertname: TransactionQueueSizeIncreasing
        # Number of validations scheduled and finished both grow at a rate
        # of 1 in the first 10 minutes, thereby the queue is not increasing
        # in size, thus don't expect an alert.
        exp_alerts: []
      - eval_time: 22m
        alertname: TransactionQueueSizeIncreasing
        # Number of validations scheduled is growing twice as fast as the
        # number of validations finished after minute 10. Thus expect
        # warning alert after 20 minutes.
        exp_alerts:
          - exp_labels:
              severity: warning
              pod: substrate-abcdef01234-abcdef
              instance: substrate-abcdef01234-abcdef
              job: substrate
            exp_annotations:
              message: "The transaction pool size on node
                substrate-abcdef01234-abcdef has been monotonically
                increasing for more than 10 minutes."
      - eval_time: 43m
        alertname: TransactionQueueSizeIncreasing
        # Number of validations scheduled is growing twice as fast as the
        # number of validations finished after minute 10. Thus expect both
        # the 10-minute and the 30-minute alert after 40 minutes. Both
        # expected alerts carry severity `warning`.
        # NOTE(review): confirm that the 30-minute rule is meant to be
        # `warning` rather than `critical`.
        exp_alerts:
          - exp_labels:
              severity: warning
              pod: substrate-abcdef01234-abcdef
              instance: substrate-abcdef01234-abcdef
              job: substrate
            exp_annotations:
              message: "The transaction pool size on node
                substrate-abcdef01234-abcdef has been monotonically
                increasing for more than 10 minutes."
          - exp_labels:
              severity: warning
              pod: substrate-abcdef01234-abcdef
              instance: substrate-abcdef01234-abcdef
              job: substrate
            exp_annotations:
              message: "The transaction pool size on node
                substrate-abcdef01234-abcdef has been monotonically
                increasing for more than 30 minutes."
      - eval_time: 49m
        alertname: TransactionQueueSizeHigh
        # After minute 43 the number of validations scheduled jumps up
        # drastically while the number of validations finished stays the
        # same. Thus expect an alert.
        exp_alerts:
          - exp_labels:
              severity: warning
              pod: substrate-abcdef01234-abcdef
              instance: substrate-abcdef01234-abcdef
              job: substrate
            exp_annotations:
              message: "The transaction pool size on node
                substrate-abcdef01234-abcdef has been above 10_000 for more
                than 5 minutes."

      ######################################################################
      # Networking
      ######################################################################

      - eval_time: 3m  # Values: 3 2 2
        alertname: NumberOfPeersLow
        exp_alerts: []
      - eval_time: 4m  # Values: 2 2 2
        alertname: NumberOfPeersLow
        exp_alerts:
          - exp_labels:
              severity: warning
              pod: substrate-abcdef01234-abcdef
              instance: substrate-abcdef01234-abcdef
              job: substrate
            exp_annotations:
              message: "The node substrate-abcdef01234-abcdef has less
                than 3 peers for more than 3 minutes"

      - eval_time: 16m  # Values: 3 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1
        alertname: NumberOfPeersLow
        exp_alerts:
          - exp_labels:
              severity: warning
              pod: substrate-abcdef01234-abcdef
              instance: substrate-abcdef01234-abcdef
              job: substrate
            exp_annotations:
              message: "The node substrate-abcdef01234-abcdef has less
                than 3 peers for more than 3 minutes"
          - exp_labels:
              severity: critical
              pod: substrate-abcdef01234-abcdef
              instance: substrate-abcdef01234-abcdef
              job: substrate
            exp_annotations:
              message: "The node substrate-abcdef01234-abcdef has less
                than 3 peers for more than 15 minutes"