mirror of
https://github.com/pezkuwichain/pezkuwi-subxt.git
synced 2026-05-30 07:01:03 +00:00
fe9c01fc68
The `HighCPUUsage` alert is based on the `cpu_usage_percentage` metric. Instead of exposing the overall CPU usage in percent, the metric exposes the per-core usage summed over all cores. This commit removes the alert for two reasons: 1. Substrate itself does not expose the core count, and thus one cannot alert based on the `cpu_usage_percentage` metric. 2. Alerting based on CPU usage is generic and not specific to Substrate or blockchains. Thus any CPU usage alert suffices.
126 lines
4.5 KiB
YAML
126 lines
4.5 KiB
YAML
groups:
|
|
- name: polkadot.rules
|
|
rules:
|
|
|
|
##############################################################################
|
|
# Block production
|
|
##############################################################################
|
|
|
|
- alert: LowNumberOfNewBlocks
  # Warning tier: the "best" block height grew by less than 1 over the
  # trailing minute, and that condition held for 3 minutes.
  expr: 'increase(polkadot_block_height{status="best"}[1m]) < 1'
  for: 3m
  labels:
    severity: warning
  annotations:
    message: >-
      Less than one new block per minute on instance
      {{ $labels.instance }}.
|
|
- alert: LowNumberOfNewBlocks
  # Critical tier: same expression as the warning-severity rule of the same
  # name, but the condition must hold for 10 minutes before firing.
  expr: 'increase(polkadot_block_height{status="best"}[1m]) < 1'
  for: 10m
  labels:
    severity: critical
  annotations:
    message: >-
      Less than one new block per minute on instance
      {{ $labels.instance }}.
|
|
|
|
##############################################################################
|
|
# Block finalization
|
|
##############################################################################
|
|
|
|
- alert: BlockFinalizationSlow
  # Warning tier: the "finalized" block height grew by less than 1 over the
  # trailing minute, and that condition held for 3 minutes.
  expr: 'increase(polkadot_block_height{status="finalized"}[1m]) < 1'
  for: 3m
  labels:
    severity: warning
  annotations:
    message: >-
      Finalized block on instance {{ $labels.instance }}
      increases by less than 1 per minute.
|
|
- alert: BlockFinalizationSlow
  # Critical tier: same expression as the warning-severity rule of the same
  # name, but the condition must hold for 10 minutes before firing.
  expr: 'increase(polkadot_block_height{status="finalized"}[1m]) < 1'
  for: 10m
  labels:
    severity: critical
  annotations:
    message: >-
      Finalized block on instance {{ $labels.instance }}
      increases by less than 1 per minute.
|
|
- alert: BlockFinalizationLaggingBehind
  # Fires when the "best" block height exceeds the "finalized" block height
  # by more than 10 for 8 minutes.
  #
  # Under the assumption of an average block production of 6 seconds,
  # "best" and "finalized" being more than 10 blocks apart would imply
  # more than a 1 minute delay between block production and finalization.
  #
  # Both operands read `polkadot_block_height`, the metric name queried by
  # the LowNumberOfNewBlocks and BlockFinalizationSlow rules in this group
  # (this rule previously queried `polkadot_block_height_number`, which no
  # other rule here references). `ignoring(status)` lets the two series be
  # matched even though their `status` label differs.
  expr: >-
    (polkadot_block_height{status="best"} - ignoring(status)
    polkadot_block_height{status="finalized"}) > 10
  for: 8m
  labels:
    severity: critical
  annotations:
    message: >-
      Block finalization on instance {{ $labels.instance }} is behind
      block production by {{ $value }} for more than 8m
|
|
|
|
##############################################################################
|
|
# Transaction queue
|
|
##############################################################################
|
|
|
|
- alert: TransactionQueueSizeIncreasing
  # Warning tier: over a 5-minute window, more validations were scheduled
  # than finished, and that condition held for 10 minutes.
  expr: >-
    increase(polkadot_sub_txpool_validations_scheduled[5m]) -
    increase(polkadot_sub_txpool_validations_finished[5m]) > 0
  for: 10m
  labels:
    severity: warning
  annotations:
    message: >-
      The transaction pool size on node {{ $labels.instance }}
      has been monotonically increasing for the last 10 minutes.
|
|
- alert: TransactionQueueSizeIncreasing
  # Critical tier: same expression as the warning-severity rule of the same
  # name, but the condition must hold for 30 minutes before firing.
  expr: >-
    increase(polkadot_sub_txpool_validations_scheduled[5m]) -
    increase(polkadot_sub_txpool_validations_finished[5m]) > 0
  for: 30m
  labels:
    severity: critical
  annotations:
    message: >-
      The transaction pool size on node {{ $labels.instance }}
      has been monotonically increasing for the last 30 minutes.
|
|
- alert: TransactionQueueSizeHigh
  # Fires when scheduled validations exceed finished validations by more
  # than 10000 for 5 minutes.
  expr: >-
    polkadot_sub_txpool_validations_scheduled -
    polkadot_sub_txpool_validations_finished > 10000
  for: 5m
  labels:
    severity: critical
  annotations:
    message: >-
      The transaction pool size on node {{ $labels.instance }}
      has been above 10_000 for the last 5 minutes.
|
|
|
|
##############################################################################
|
|
# Networking
|
|
##############################################################################
|
|
|
|
- alert: LowNumberOfPeers
  # Warning tier: the peer-count gauge stayed below 3 for 3 minutes.
  expr: 'polkadot_sub_libp2p_peers_count < 3'
  for: 3m
  labels:
    severity: warning
  annotations:
    message: >-
      The node {{ $labels.instance }} has less than 3 peers
      for more than 3 minutes
|
|
- alert: LowNumberOfPeers
  # Critical tier: same expression as the warning-severity rule of the same
  # name, but the condition must hold for 15 minutes before firing.
  expr: 'polkadot_sub_libp2p_peers_count < 3'
  for: 15m
  labels:
    severity: critical
  annotations:
    message: >-
      The node {{ $labels.instance }} has less than 3 peers
      for more than 15 minutes
|
|
|
|
##############################################################################
|
|
# Others
|
|
##############################################################################
|
|
|
|
- alert: AuthorityDiscoveryHighDiscoveryFailure
  # Fires when the ratio of value-found handling failures to received
  # "value_found" DHT events stays above 0.5 for 2 hours.
  # `ignoring(name)` drops the `name` label when pairing the two series,
  # since only the denominator is selected by it.
  expr: >-
    polkadot_authority_discovery_handle_value_found_event_failure /
    ignoring(name)
    polkadot_authority_discovery_dht_event_received{name="value_found"}
    > 0.5
  for: 2h
  labels:
    severity: warning
  annotations:
    message: >-
      Authority discovery on node {{ $labels.instance }} fails
      to process more than 50 % of the values found on the DHT.
|