Files
pezkuwi-subxt/substrate/.maintain/monitoring/alerting-rules/alerting-rule-tests.yaml
T
Max Inden fe9c01fc68 .maintain/monitoring/alerting-rules: Remove HighCPUUsage alert (#6648)
The `HighCPUUsage` alert is based on the `cpu_usage_percentage` metric.
Instead of exposing the overall CPU usage in percent, the metric exposes
the per core usage summed over all cores.

This commit removes the alert for two reasons:

1. Substrate itself does not expose the core count and thus one cannot
alert based on the `cpu_usage_percentage` metric.

2. Alerting based on CPU usage is generic and not specific to Substrate
or Blockchains. Thus any CPU usage alert suffices.
2020-07-17 07:43:57 +00:00

237 lines
9.8 KiB
YAML

# Unit tests for the alerting rules, written in the Prometheus rule
# unit-test format (`rule_files` / `tests` / `alert_rule_test`).
#
# The rules under test are read from /dev/stdin, so the rule file has to
# be piped into the test runner.

rule_files:
  - /dev/stdin

evaluation_interval: 1m

tests:
  - interval: 1m
    # Synthetic samples, one per `interval`, in expanding notation:
    # 'a+bxn' yields a, a+b, ..., a+n*b and 'axn' repeats a (n+1 samples).
    # Multi-line quoted scalars are folded by YAML into a single line.
    input_series:
      - series: 'polkadot_sub_libp2p_peers_count{
          job="polkadot",
          pod="polkadot-abcdef01234-abcdef",
          instance="polkadot-abcdef01234-abcdef",
          }'
        values: '3 2+0x4 1+0x9'  # 3 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1
      - series: 'polkadot_sub_txpool_validations_scheduled{
          job="polkadot",
          pod="polkadot-abcdef01234-abcdef",
          instance="polkadot-abcdef01234-abcdef",
          }'
        # 11 12 ... 21, then 22 24 ... 82, then 10043 held constant.
        values: '11+1x10 22+2x30 10043x5'
      - series: 'polkadot_sub_txpool_validations_finished{
          job="polkadot",
          pod="polkadot-abcdef01234-abcdef",
          instance="polkadot-abcdef01234-abcdef",
          }'
        # 0 1 ... 42, then 42 held constant.
        values: '0+1x42 42x5'
      - series: 'polkadot_block_height{
          status="best",
          job="polkadot",
          pod="polkadot-abcdef01234-abcdef",
          instance="polkadot-abcdef01234-abcdef",
          }'
        values: '1+1x3 4+0x13'  # 1 2 3 4 4 4 4 4 4 4 4 4 ...
      - series: 'polkadot_block_height{
          status="finalized",
          job="polkadot",
          pod="polkadot-abcdef01234-abcdef",
          instance="polkadot-abcdef01234-abcdef",
          }'
        values: '1+1x3 4+0x13'  # 1 2 3 4 4 4 4 4 4 4 4 4 ...

    # Each entry evaluates the rules at `eval_time` and lists every alert
    # named `alertname` that must be firing at that moment; an empty list
    # means that no such alert may fire.
    alert_rule_test:
      ######################################################################
      # Block production
      ######################################################################
      # The best block height stops increasing after minute 3.
      - eval_time: 6m
        alertname: LowNumberOfNewBlocks
        exp_alerts: []
      - eval_time: 7m
        alertname: LowNumberOfNewBlocks
        exp_alerts:
          - exp_labels:
              severity: warning
              pod: polkadot-abcdef01234-abcdef
              instance: polkadot-abcdef01234-abcdef
              job: polkadot
              status: best
            exp_annotations:
              message: "Less than one new block per minute on instance
                polkadot-abcdef01234-abcdef."
      - eval_time: 14m
        alertname: LowNumberOfNewBlocks
        exp_alerts:
          - exp_labels:
              severity: warning
              pod: polkadot-abcdef01234-abcdef
              instance: polkadot-abcdef01234-abcdef
              job: polkadot
              status: best
            exp_annotations:
              message: "Less than one new block per minute on instance
                polkadot-abcdef01234-abcdef."
          - exp_labels:
              severity: critical
              pod: polkadot-abcdef01234-abcdef
              instance: polkadot-abcdef01234-abcdef
              job: polkadot
              status: best
            exp_annotations:
              message: "Less than one new block per minute on instance
                polkadot-abcdef01234-abcdef."

      ######################################################################
      # Block finalization
      ######################################################################
      # The finalized block height stops increasing after minute 3.
      - eval_time: 6m
        alertname: BlockFinalizationSlow
        exp_alerts: []
      - eval_time: 7m
        alertname: BlockFinalizationSlow
        exp_alerts:
          - exp_labels:
              severity: warning
              pod: polkadot-abcdef01234-abcdef
              instance: polkadot-abcdef01234-abcdef
              job: polkadot
              status: finalized
            exp_annotations:
              message: "Finalized block on instance
                polkadot-abcdef01234-abcdef increases by less than 1 per
                minute."
      - eval_time: 14m
        alertname: BlockFinalizationSlow
        exp_alerts:
          - exp_labels:
              severity: warning
              pod: polkadot-abcdef01234-abcdef
              instance: polkadot-abcdef01234-abcdef
              job: polkadot
              status: finalized
            exp_annotations:
              message: "Finalized block on instance
                polkadot-abcdef01234-abcdef increases by less than 1 per
                minute."
          - exp_labels:
              severity: critical
              pod: polkadot-abcdef01234-abcdef
              instance: polkadot-abcdef01234-abcdef
              job: polkadot
              status: finalized
            exp_annotations:
              message: "Finalized block on instance
                polkadot-abcdef01234-abcdef increases by less than 1 per
                minute."

      ######################################################################
      # Transaction queue
      ######################################################################
      - eval_time: 11m
        alertname: TransactionQueueSizeIncreasing
        # Number of validations scheduled and finished both grow at a rate
        # of 1 in the first 10 minutes, thereby the queue is not increasing
        # in size, thus don't expect an alert.
        exp_alerts: []
      - eval_time: 22m
        alertname: TransactionQueueSizeIncreasing
        # Number of validations scheduled is growing twice as fast as the
        # number of validations finished after minute 10. Thus expect
        # warning alert after 20 minutes.
        exp_alerts:
          - exp_labels:
              severity: warning
              pod: polkadot-abcdef01234-abcdef
              instance: polkadot-abcdef01234-abcdef
              job: polkadot
            exp_annotations:
              message: "The transaction pool size on node
                polkadot-abcdef01234-abcdef has been monotonically
                increasing for the last 10 minutes."
      - eval_time: 43m
        alertname: TransactionQueueSizeIncreasing
        # Number of validations scheduled is growing twice as fast as the
        # number of validations finished after minute 10. Thus expect
        # both warning and critical alert after 40 minutes.
        exp_alerts:
          - exp_labels:
              severity: warning
              pod: polkadot-abcdef01234-abcdef
              instance: polkadot-abcdef01234-abcdef
              job: polkadot
            exp_annotations:
              message: "The transaction pool size on node
                polkadot-abcdef01234-abcdef has been monotonically
                increasing for the last 10 minutes."
          - exp_labels:
              severity: critical
              pod: polkadot-abcdef01234-abcdef
              instance: polkadot-abcdef01234-abcdef
              job: polkadot
            exp_annotations:
              message: "The transaction pool size on node
                polkadot-abcdef01234-abcdef has been monotonically
                increasing for the last 30 minutes."
      - eval_time: 49m
        alertname: TransactionQueueSizeHigh
        # After minute 43 the number of validations scheduled jumps up
        # drastically while the number of validations finished stays the
        # same. Thus expect an alert.
        exp_alerts:
          - exp_labels:
              severity: critical
              pod: polkadot-abcdef01234-abcdef
              instance: polkadot-abcdef01234-abcdef
              job: polkadot
            exp_annotations:
              message: "The transaction pool size on node
                polkadot-abcdef01234-abcdef has been above 10_000 for the
                last 5 minutes."

      ######################################################################
      # Networking
      ######################################################################
      - eval_time: 3m  # Values: 3 2 2
        alertname: LowNumberOfPeers
        exp_alerts: []
      - eval_time: 4m  # Values: 2 2 2
        alertname: LowNumberOfPeers
        exp_alerts:
          - exp_labels:
              severity: warning
              pod: polkadot-abcdef01234-abcdef
              instance: polkadot-abcdef01234-abcdef
              job: polkadot
            exp_annotations:
              message: "The node polkadot-abcdef01234-abcdef has less
                than 3 peers for more than 3 minutes"
      - eval_time: 16m  # Values: 3 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1
        alertname: LowNumberOfPeers
        exp_alerts:
          - exp_labels:
              severity: warning
              pod: polkadot-abcdef01234-abcdef
              instance: polkadot-abcdef01234-abcdef
              job: polkadot
            exp_annotations:
              message: "The node polkadot-abcdef01234-abcdef has less
                than 3 peers for more than 3 minutes"
          - exp_labels:
              severity: critical
              pod: polkadot-abcdef01234-abcdef
              instance: polkadot-abcdef01234-abcdef
              job: polkadot
            exp_annotations:
              message: "The node polkadot-abcdef01234-abcdef has less
                than 3 peers for more than 15 minutes"