diff --git a/substrate/.maintain/monitoring/alerting-rules/alerting-rules.yaml b/substrate/.maintain/monitoring/alerting-rules/alerting-rules.yaml new file mode 100644 index 0000000000..cb5b3c271d --- /dev/null +++ b/substrate/.maintain/monitoring/alerting-rules/alerting-rules.yaml @@ -0,0 +1,113 @@ +groups: +- name: polkadot.rules + rules: + + ############################################################################## + # Resource usage + ############################################################################## + + - alert: HighCPUUsage + expr: polkadot_cpu_usage_percentage >= 100 + for: 5m + labels: + severity: warning + annotations: + message: 'The node {{ $labels.instance }} has a CPU usage higher than 100% for more than 5 minutes' + + ############################################################################## + # Block production + ############################################################################## + + - alert: LowNumberOfNewBlocks + annotations: + message: 'Less than one new block per minute on instance {{ $labels.instance }}.' + expr: increase(polkadot_block_height{status="best"}[1m]) < 1 + for: 3m + labels: + severity: warning + - alert: LowNumberOfNewBlocks + annotations: + message: 'Less than one new block per minute on instance {{ $labels.instance }}.' + expr: increase(polkadot_block_height{status="best"}[1m]) < 1 + for: 10m + labels: + severity: critical + + ############################################################################## + # Block finalization + ############################################################################## + + - alert: BlockFinalizationSlow + expr: increase(polkadot_block_height{status="finalized"}[1m]) < 1 + for: 3m + labels: + severity: warning + annotations: + message: 'Finalized block on instance {{ $labels.instance }} increases by less than 1 per minute.' + - alert: BlockFinalizationSlow + expr: increase(polkadot_block_height{status="finalized"}[1m]) < 1 + for: 10m + labels: + severity: critical + annotations: + message: 'Finalized block on instance {{ $labels.instance }} increases by less than 1 per minute.' + - alert: BlockFinalizationLaggingBehind + # Under the assumption of an average block production of 6 seconds, + # "best" and "finalized" being more than 10 blocks apart would imply + # more than a 1 minute delay between block production and finalization. + expr: (polkadot_block_height_number{status="best"} - ignoring(status) polkadot_block_height_number{status="finalized"}) > 10 + for: 8m + labels: + severity: critical + annotations: + message: "Block finalization on instance {{ $labels.instance }} is behind block production by {{ $value }} for more than 8m" + + ############################################################################## + # Transaction queue + ############################################################################## + + - alert: TransactionQueueSize + expr: polkadot_sub_txpool_validations_scheduled - polkadot_sub_txpool_validations_finished > 10 + for: 10m + labels: + severity: warning + annotations: + message: 'The node {{ $labels.instance }} has more than 10 transactions in the queue for more than 10 minutes' + - alert: TransactionQueueSize + expr: polkadot_sub_txpool_validations_scheduled - polkadot_sub_txpool_validations_finished > 10 + for: 30m + labels: + severity: critical + annotations: + message: 'The node {{ $labels.instance }} has more than 10 transactions in the queue for more than 30 minutes' + + ############################################################################## + # Networking + ############################################################################## + + - alert: LowNumberOfPeers + expr: polkadot_sub_libp2p_peers_count < 3 + for: 3m + labels: + severity: warning + annotations: + message: 'The node {{ $labels.instance }} has less than 3 peers for more than 3 minutes' + - alert: LowNumberOfPeers + expr: polkadot_sub_libp2p_peers_count < 3 + for: 15m + labels: + severity: critical + annotations: + message: 'The node {{ $labels.instance }} has less than 3 peers for more than 15 minutes' + + ############################################################################## + # Others + ############################################################################## + + - alert: AuthorityDiscoveryHighDiscoveryFailure + expr: polkadot_authority_discovery_handle_value_found_event_failure / ignoring(name) polkadot_authority_discovery_dht_event_received{name="value_found"} > 0.5 + for: 2h + labels: + severity: warning + annotations: + message: "Authority discovery on node {{ $labels.instance }} fails to process more than 50 % of the values found on the DHT."