mirror of
https://github.com/pezkuwichain/pezkuwi-subxt.git
synced 2026-05-01 05:27:56 +00:00
.maintain/monitoring: Add an initial set of Prometheus alerting rules (#6095)
Create a place to collaborate on Prometheus alerting rules for Substrate, starting with a basic set of rules covering:

- Resource usage
- Block production
- Block finalization
- Transaction queue
- Networking
- ... Others
This commit is contained in:
@@ -0,0 +1,113 @@
|
||||
groups:
|
||||
- name: polkadot.rules
|
||||
rules:
|
||||
|
||||
##############################################################################
|
||||
# Resource usage
|
||||
##############################################################################
|
||||
|
||||
# Warning: polkadot_cpu_usage_percentage has stayed at or above 100 for
# 5 minutes.
- alert: HighCPUUsage
  annotations:
    message: "The node {{ $labels.instance }} has a CPU usage higher than 100% for more than 5 minutes"
  labels:
    severity: warning
  for: 5m
  expr: 'polkadot_cpu_usage_percentage >= 100'
|
||||
|
||||
##############################################################################
|
||||
# Block production
|
||||
##############################################################################
|
||||
|
||||
# Warning tier: the 1m increase of polkadot_block_height{status="best"} has
# been below 1 for 3 minutes.
- alert: LowNumberOfNewBlocks
  expr: 'increase(polkadot_block_height{status="best"}[1m]) < 1'
  for: 3m
  labels:
    severity: warning
  annotations:
    message: "Less than one new block per minute on instance {{ $labels.instance }}."
|
||||
# Critical tier: same condition as the warning rule of the same name, but it
# must hold for 10 minutes before firing.
- alert: LowNumberOfNewBlocks
  expr: 'increase(polkadot_block_height{status="best"}[1m]) < 1'
  for: 10m
  labels:
    severity: critical
  annotations:
    message: "Less than one new block per minute on instance {{ $labels.instance }}."
|
||||
|
||||
##############################################################################
|
||||
# Block finalization
|
||||
##############################################################################
|
||||
|
||||
# Warning tier: the 1m increase of polkadot_block_height{status="finalized"}
# has been below 1 for 3 minutes.
- alert: BlockFinalizationSlow
  annotations:
    message: "Finalized block on instance {{ $labels.instance }} increases by less than 1 per minute."
  labels:
    severity: warning
  for: 3m
  expr: 'increase(polkadot_block_height{status="finalized"}[1m]) < 1'
|
||||
# Critical tier: same condition as the warning rule of the same name, but it
# must hold for 10 minutes before firing.
- alert: BlockFinalizationSlow
  annotations:
    message: "Finalized block on instance {{ $labels.instance }} increases by less than 1 per minute."
  labels:
    severity: critical
  for: 10m
  expr: 'increase(polkadot_block_height{status="finalized"}[1m]) < 1'
|
||||
# Critical: the "best" block height has been more than 10 blocks ahead of the
# "finalized" block height for 8 minutes.
#
# Under the assumption of an average block production of 6 seconds,
# "best" and "finalized" being more than 10 blocks apart would imply
# more than a 1 minute delay between block production and finalization.
#
# The expression uses `polkadot_block_height`, the same metric name queried by
# the LowNumberOfNewBlocks and BlockFinalizationSlow rules in this group.
# `ignoring(status)` lets the two series match even though their `status`
# labels differ.
- alert: BlockFinalizationLaggingBehind
  expr: '(polkadot_block_height{status="best"} - ignoring(status) polkadot_block_height{status="finalized"}) > 10'
  for: 8m
  labels:
    severity: critical
  annotations:
    message: "Block finalization on instance {{ $labels.instance }} is behind block production by {{ $value }} for more than 8m"
|
||||
|
||||
##############################################################################
|
||||
# Transaction queue
|
||||
##############################################################################
|
||||
|
||||
# Warning tier: polkadot_sub_txpool_validations_scheduled minus
# polkadot_sub_txpool_validations_finished has stayed above 10 for 10 minutes.
- alert: TransactionQueueSize
  annotations:
    message: "The node {{ $labels.instance }} has more than 10 transactions in the queue for more than 10 minutes"
  labels:
    severity: warning
  for: 10m
  expr: 'polkadot_sub_txpool_validations_scheduled - polkadot_sub_txpool_validations_finished > 10'
|
||||
# Critical tier: same condition as the warning rule of the same name, but it
# must hold for 30 minutes before firing.
- alert: TransactionQueueSize
  annotations:
    message: "The node {{ $labels.instance }} has more than 10 transactions in the queue for more than 30 minutes"
  labels:
    severity: critical
  for: 30m
  expr: 'polkadot_sub_txpool_validations_scheduled - polkadot_sub_txpool_validations_finished > 10'
|
||||
|
||||
##############################################################################
|
||||
# Networking
|
||||
##############################################################################
|
||||
|
||||
# Warning tier: polkadot_sub_libp2p_peers_count has been below 3 for
# 3 minutes.
- alert: LowNumberOfPeers
  annotations:
    message: "The node {{ $labels.instance }} has less than 3 peers for more than 3 minutes"
  labels:
    severity: warning
  for: 3m
  expr: 'polkadot_sub_libp2p_peers_count < 3'
|
||||
# Critical tier: same condition as the warning rule of the same name, but it
# must hold for 15 minutes before firing.
- alert: LowNumberOfPeers
  annotations:
    message: "The node {{ $labels.instance }} has less than 3 peers for more than 15 minutes"
  labels:
    severity: critical
  for: 15m
  expr: 'polkadot_sub_libp2p_peers_count < 3'
|
||||
|
||||
##############################################################################
|
||||
# Others
|
||||
##############################################################################
|
||||
|
||||
# Warning: the ratio of
# polkadot_authority_discovery_handle_value_found_event_failure to
# polkadot_authority_discovery_dht_event_received{name="value_found"} has been
# above 0.5 for 2 hours. `ignoring(name)` lets the two series match even
# though only the denominator is selected by a `name` label.
- alert: AuthorityDiscoveryHighDiscoveryFailure
  annotations:
    message: 'Authority discovery on node {{ $labels.instance }} fails to process more than 50 % of the values found on the DHT.'
  labels:
    severity: warning
  for: 2h
  expr: 'polkadot_authority_discovery_handle_value_found_event_failure / ignoring(name) polkadot_authority_discovery_dht_event_received{name="value_found"} > 0.5'
|
||||
Reference in New Issue
Block a user