mirror of
https://github.com/pezkuwichain/pezkuwi-subxt.git
synced 2026-05-30 09:21:04 +00:00
fe9c01fc68
The `HighCPUUsage` alert is based on the `cpu_usage_percentage` metric. Instead of exposing the overall CPU usage in percent, the metric exposes the per-core usage summed over all cores. This commit removes the alert for two reasons: 1. Substrate itself does not expose the core count, and thus one cannot alert based on the `cpu_usage_percentage` metric. 2. Alerting based on CPU usage is generic and not specific to Substrate or blockchains. Thus any generic CPU usage alert suffices.
237 lines
9.8 KiB
YAML
237 lines
9.8 KiB
YAML
---
# Unit tests for the node alerting rules, written in the Prometheus rule
# unit-test format (`rule_files` / `tests` / `alert_rule_test`).
#
# Each `values` string uses the expanding notation `a+bxn`, i.e. the start
# value `a` followed by `n` further samples, each increased by `b`.

rule_files:
  # The rules under test are read from standard input.
  - /dev/stdin

evaluation_interval: 1m

tests:
  - interval: 1m
    input_series:
      # Peer count: drops below 3 after the first sample, then down to 1.
      - series: 'polkadot_sub_libp2p_peers_count{
          job="polkadot",
          pod="polkadot-abcdef01234-abcdef",
          instance="polkadot-abcdef01234-abcdef",
          }'
        values: '3 2+0x4 1+0x9'  # 3 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1

      # Validations scheduled: +1 per minute, then +2 per minute, then a
      # jump to a constant value above 10_000.
      - series: 'polkadot_sub_txpool_validations_scheduled{
          job="polkadot",
          pod="polkadot-abcdef01234-abcdef",
          instance="polkadot-abcdef01234-abcdef",
          }'
        values: '11+1x10 22+2x30 10043x5'

      # Validations finished: +1 per minute, then constant.
      - series: 'polkadot_sub_txpool_validations_finished{
          job="polkadot",
          pod="polkadot-abcdef01234-abcdef",
          instance="polkadot-abcdef01234-abcdef",
          }'
        values: '0+1x42 42x5'

      # Best block height: increases for three minutes, then stalls.
      - series: 'polkadot_block_height{
          status="best",
          job="polkadot",
          pod="polkadot-abcdef01234-abcdef",
          instance="polkadot-abcdef01234-abcdef",
          }'
        values: '1+1x3 4+0x13'  # 1 2 3 4 4 4 4 4 4 4 4 4 ...

      # Finalized block height: increases for three minutes, then stalls.
      - series: 'polkadot_block_height{
          status="finalized",
          job="polkadot",
          pod="polkadot-abcdef01234-abcdef",
          instance="polkadot-abcdef01234-abcdef",
          }'
        values: '1+1x3 4+0x13'  # 1 2 3 4 4 4 4 4 4 4 4 4 ...

    alert_rule_test:

      ######################################################################
      # Block production
      ######################################################################

      # No alert is expected at this point, hence the explicit empty list.
      - eval_time: 6m
        alertname: LowNumberOfNewBlocks
        exp_alerts: []
      - eval_time: 7m
        alertname: LowNumberOfNewBlocks
        exp_alerts:
          - exp_labels:
              severity: warning
              pod: polkadot-abcdef01234-abcdef
              instance: polkadot-abcdef01234-abcdef
              job: polkadot
              status: best
            exp_annotations:
              message: "Less than one new block per minute on instance
                polkadot-abcdef01234-abcdef."

      # Both the warning and the critical alert are expected here.
      - eval_time: 14m
        alertname: LowNumberOfNewBlocks
        exp_alerts:
          - exp_labels:
              severity: warning
              pod: polkadot-abcdef01234-abcdef
              instance: polkadot-abcdef01234-abcdef
              job: polkadot
              status: best
            exp_annotations:
              message: "Less than one new block per minute on instance
                polkadot-abcdef01234-abcdef."
          - exp_labels:
              severity: critical
              pod: polkadot-abcdef01234-abcdef
              instance: polkadot-abcdef01234-abcdef
              job: polkadot
              status: best
            exp_annotations:
              message: "Less than one new block per minute on instance
                polkadot-abcdef01234-abcdef."

      ######################################################################
      # Block finalization
      ######################################################################

      # No alert is expected at this point, hence the explicit empty list.
      - eval_time: 6m
        alertname: BlockFinalizationSlow
        exp_alerts: []
      - eval_time: 7m
        alertname: BlockFinalizationSlow
        exp_alerts:
          - exp_labels:
              severity: warning
              pod: polkadot-abcdef01234-abcdef
              instance: polkadot-abcdef01234-abcdef
              job: polkadot
              status: finalized
            exp_annotations:
              message: "Finalized block on instance
                polkadot-abcdef01234-abcdef increases by less than 1 per
                minute."

      # Both the warning and the critical alert are expected here.
      - eval_time: 14m
        alertname: BlockFinalizationSlow
        exp_alerts:
          - exp_labels:
              severity: warning
              pod: polkadot-abcdef01234-abcdef
              instance: polkadot-abcdef01234-abcdef
              job: polkadot
              status: finalized
            exp_annotations:
              message: "Finalized block on instance
                polkadot-abcdef01234-abcdef increases by less than 1 per
                minute."
          - exp_labels:
              severity: critical
              pod: polkadot-abcdef01234-abcdef
              instance: polkadot-abcdef01234-abcdef
              job: polkadot
              status: finalized
            exp_annotations:
              message: "Finalized block on instance
                polkadot-abcdef01234-abcdef increases by less than 1 per
                minute."

      ######################################################################
      # Transaction queue
      ######################################################################

      - eval_time: 11m
        alertname: TransactionQueueSizeIncreasing
        # Number of validations scheduled and finished both grow at a rate
        # of 1 in the first 10 minutes, thereby the queue is not increasing
        # in size, thus don't expect an alert.
        exp_alerts: []
      - eval_time: 22m
        alertname: TransactionQueueSizeIncreasing
        # Number of validations scheduled is growing twice as fast as the
        # number of validations finished after minute 10. Thus expect
        # warning alert after 20 minutes.
        exp_alerts:
          - exp_labels:
              severity: warning
              pod: polkadot-abcdef01234-abcdef
              instance: polkadot-abcdef01234-abcdef
              job: polkadot
            exp_annotations:
              message: "The transaction pool size on node
                polkadot-abcdef01234-abcdef has been monotonically
                increasing for the last 10 minutes."
      - eval_time: 43m
        alertname: TransactionQueueSizeIncreasing
        # Number of validations scheduled is growing twice as fast as the
        # number of validations finished after minute 10. Thus expect
        # both warning and critical alert after 40 minutes.
        exp_alerts:
          - exp_labels:
              severity: warning
              pod: polkadot-abcdef01234-abcdef
              instance: polkadot-abcdef01234-abcdef
              job: polkadot
            exp_annotations:
              message: "The transaction pool size on node
                polkadot-abcdef01234-abcdef has been monotonically
                increasing for the last 10 minutes."
          - exp_labels:
              severity: critical
              pod: polkadot-abcdef01234-abcdef
              instance: polkadot-abcdef01234-abcdef
              job: polkadot
            exp_annotations:
              message: "The transaction pool size on node
                polkadot-abcdef01234-abcdef has been monotonically
                increasing for the last 30 minutes."
      - eval_time: 49m
        alertname: TransactionQueueSizeHigh
        # After minute 43 the number of validations scheduled jumps up
        # drastically while the number of validations finished stays the
        # same. Thus expect an alert.
        exp_alerts:
          - exp_labels:
              severity: critical
              pod: polkadot-abcdef01234-abcdef
              instance: polkadot-abcdef01234-abcdef
              job: polkadot
            exp_annotations:
              message: "The transaction pool size on node
                polkadot-abcdef01234-abcdef has been above 10_000 for the
                last 5 minutes."

      ######################################################################
      # Networking
      ######################################################################

      - eval_time: 3m  # Values: 3 2 2
        alertname: LowNumberOfPeers
        exp_alerts: []
      - eval_time: 4m  # Values: 2 2 2
        alertname: LowNumberOfPeers
        exp_alerts:
          - exp_labels:
              severity: warning
              pod: polkadot-abcdef01234-abcdef
              instance: polkadot-abcdef01234-abcdef
              job: polkadot
            exp_annotations:
              message: "The node polkadot-abcdef01234-abcdef has less
                than 3 peers for more than 3 minutes"

      - eval_time: 16m  # Values: 3 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1
        alertname: LowNumberOfPeers
        exp_alerts:
          - exp_labels:
              severity: warning
              pod: polkadot-abcdef01234-abcdef
              instance: polkadot-abcdef01234-abcdef
              job: polkadot
            exp_annotations:
              message: "The node polkadot-abcdef01234-abcdef has less
                than 3 peers for more than 3 minutes"
          - exp_labels:
              severity: critical
              pod: polkadot-abcdef01234-abcdef
              instance: polkadot-abcdef01234-abcdef
              job: polkadot
            exp_annotations:
              message: "The node polkadot-abcdef01234-abcdef has less
                than 3 peers for more than 15 minutes"