---
# Prometheus alerting rules for Substrate-based nodes.
#
# Conventions used throughout this file:
#   * Alerts sharing a name form escalation tiers: the same expression is
#     evaluated with a longer `for:` duration and a higher severity.
#   * Long expressions and messages use folded scalars (`>-`); line breaks
#     inside them are folded into single spaces when the file is loaded.
groups:
  - name: substrate.rules
    rules:
      ######################################################################
      # Block production
      ######################################################################

      # The "best" block height grew by less than 1 within the last minute.
      - alert: BlockProductionSlow
        expr: 'increase(substrate_block_height{status="best"}[1m]) < 1'
        for: 3m
        labels:
          severity: warning
        annotations:
          message: >-
            Best block on instance {{ $labels.instance }} increases by less
            than 1 per minute for more than 3 minutes.

      # Escalation tier of the rule above: same condition held for 10 minutes.
      - alert: BlockProductionSlow
        expr: 'increase(substrate_block_height{status="best"}[1m]) < 1'
        for: 10m
        labels:
          severity: critical
        annotations:
          message: >-
            Best block on instance {{ $labels.instance }} increases by less
            than 1 per minute for more than 10 minutes.

      ######################################################################
      # Block finalization
      ######################################################################

      # The "finalized" block height grew by less than 1 within the last
      # minute.
      - alert: BlockFinalizationSlow
        expr: 'increase(substrate_block_height{status="finalized"}[1m]) < 1'
        for: 3m
        labels:
          severity: warning
        annotations:
          message: >-
            Finalized block on instance {{ $labels.instance }} increases by
            less than 1 per minute for more than 3 minutes.

      # Escalation tier of the rule above: same condition held for 10 minutes.
      - alert: BlockFinalizationSlow
        expr: 'increase(substrate_block_height{status="finalized"}[1m]) < 1'
        for: 10m
        labels:
          severity: critical
        annotations:
          message: >-
            Finalized block on instance {{ $labels.instance }} increases by
            less than 1 per minute for more than 10 minutes.

      # Under the assumption of an average block production of 6 seconds,
      # "best" and "finalized" being more than 10 blocks apart would imply
      # more than a 1 minute delay between block production and finalization.
      # `ignoring(status)` lets the two series match although their `status`
      # label differs.
      - alert: BlockFinalizationLaggingBehind
        expr: >-
          (substrate_block_height{status="best"}
          - ignoring(status) substrate_block_height{status="finalized"})
          > 10
        for: 8m
        labels:
          severity: critical
        annotations:
          message: >-
            Block finalization on instance {{ $labels.instance }} is behind
            block production by {{ $value }} for more than 8 minutes.

      ######################################################################
      # Transaction queue
      ######################################################################

      # More validations were scheduled than finished over the last 5 minutes.
      - alert: TransactionQueueSizeIncreasing
        expr: >-
          increase(substrate_sub_txpool_validations_scheduled[5m])
          - increase(substrate_sub_txpool_validations_finished[5m])
          > 0
        for: 10m
        labels:
          severity: warning
        annotations:
          message: >-
            The transaction pool size on node {{ $labels.instance }} has been
            monotonically increasing for more than 10 minutes.

      # Escalation tier of the rule above. The severity must differ from the
      # 10m tier: with identical alert name and labels the two rules would
      # yield indistinguishable alerts, so this tier could never be routed
      # separately. `critical` matches the other two-tier alerts in this
      # file.
      - alert: TransactionQueueSizeIncreasing
        expr: >-
          increase(substrate_sub_txpool_validations_scheduled[5m])
          - increase(substrate_sub_txpool_validations_finished[5m])
          > 0
        for: 30m
        labels:
          severity: critical
        annotations:
          message: >-
            The transaction pool size on node {{ $labels.instance }} has been
            monotonically increasing for more than 30 minutes.

      # Scheduled minus finished validations exceeds 10000.
      - alert: TransactionQueueSizeHigh
        expr: >-
          substrate_sub_txpool_validations_scheduled
          - substrate_sub_txpool_validations_finished
          > 10000
        for: 5m
        labels:
          severity: warning
        annotations:
          message: >-
            The transaction pool size on node {{ $labels.instance }} has been
            above 10_000 for more than 5 minutes.

      ######################################################################
      # Networking
      ######################################################################

      - alert: NumberOfPeersLow
        expr: 'substrate_sub_libp2p_peers_count < 3'
        for: 3m
        labels:
          severity: warning
        annotations:
          message: >-
            The node {{ $labels.instance }} has less than 3 peers for more
            than 3 minutes

      # Escalation tier of the rule above: same condition held for 15 minutes.
      - alert: NumberOfPeersLow
        expr: 'substrate_sub_libp2p_peers_count < 3'
        for: 15m
        labels:
          severity: critical
        annotations:
          message: >-
            The node {{ $labels.instance }} has less than 3 peers for more
            than 15 minutes

      # No `for:` clause here: the 20 minute look-back is already part of the
      # range selector, so the alert fires as soon as the expression is true.
      - alert: NoIncomingConnection
        expr: >-
          increase(substrate_sub_libp2p_incoming_connections_total[20m]) == 0
        labels:
          severity: warning
        annotations:
          message: >-
            The node {{ $labels.instance }} has not received any new incoming
            TCP connection in the past 20 minutes. Is it connected to the
            Internet?

      ######################################################################
      # System
      ######################################################################

      # Only series carrying a non-empty `chain` label are considered.
      - alert: NumberOfFileDescriptorsHigh
        expr: 'node_filefd_allocated{chain!=""} > 10000'
        for: 3m
        labels:
          severity: warning
        annotations:
          message: >-
            The node {{ $labels.instance }} has more than 10_000 file
            descriptors allocated for more than 3 minutes

      ######################################################################
      # Others
      ######################################################################

      # Ratio of failed "value found" handling to received "value_found" DHT
      # events; `ignoring(name)` lets the series match although only the
      # right-hand side is selected by a `name` label.
      - alert: AuthorityDiscoveryDiscoveryFailureHigh
        expr: >-
          substrate_authority_discovery_handle_value_found_event_failure
          / ignoring(name)
          substrate_authority_discovery_dht_event_received{name="value_found"}
          > 0.5
        for: 2h
        labels:
          severity: warning
        annotations:
          message: >-
            Authority discovery on node {{ $labels.instance }} fails to
            process more than 50 % of the values found on the DHT for more
            than 2 hours.

      # NOTE(review): the threshold is `>= 200` while the message says "more
      # than 200" - confirm which one is intended.
      - alert: UnboundedChannelPersistentlyLarge
        expr: 'substrate_unbounded_channel_size >= 200'
        for: 5m
        labels:
          severity: warning
        annotations:
          message: >-
            Channel {{ $labels.entity }} on node {{ $labels.instance }}
            contains more than 200 items for more than 5 minutes. Node might
            be frozen.

      # Items sent minus items received. The `or on(instance)` branch falls
      # back to the plain "send" counter for instances that yield no result
      # on the left-hand side. No `for:` clause: fires on first evaluation
      # above the threshold.
      # NOTE(review): this rule reads `substrate_unbounded_channel_len` while
      # the rule above reads `substrate_unbounded_channel_size` - confirm
      # that both metrics are actually exported.
      - alert: UnboundedChannelVeryLarge
        expr: >-
          (
          (substrate_unbounded_channel_len{action = "send"}
          - ignoring(action) substrate_unbounded_channel_len{action = "received"})
          or on(instance) substrate_unbounded_channel_len{action = "send"}
          ) > 15000
        labels:
          severity: warning
        annotations:
          message: >-
            Channel {{ $labels.entity }} on node {{ $labels.instance }}
            contains more than 15000 items.