diff --git a/substrate/.maintain/monitoring/alerting-rules/alerting-rule-tests.yaml b/substrate/.maintain/monitoring/alerting-rules/alerting-rule-tests.yaml index 069cfaf977..288750be3c 100644 --- a/substrate/.maintain/monitoring/alerting-rules/alerting-rule-tests.yaml +++ b/substrate/.maintain/monitoring/alerting-rules/alerting-rule-tests.yaml @@ -18,14 +18,14 @@ tests: pod="polkadot-abcdef01234-abcdef", instance="polkadot-abcdef01234-abcdef", }' - values: '10+1x30' # 10 11 12 13 .. 40 + values: '11+1x10 22+2x30 10043x5' - series: 'polkadot_sub_txpool_validations_finished{ job="polkadot", pod="polkadot-abcdef01234-abcdef", instance="polkadot-abcdef01234-abcdef", }' - values: '0x30' # 0 0 0 0 .. 0 + values: '0+1x42 42x5' - series: 'polkadot_block_height{ status="best", job="polkadot", @@ -161,11 +161,17 @@ tests: # Transaction queue ###################################################################### - - eval_time: 10m - alertname: TransactionQueueSize - exp_alerts: - eval_time: 11m - alertname: TransactionQueueSize + alertname: TransactionQueueSizeIncreasing + # Number of validations scheduled and finished both grow at a rate + # of 1 in the first 10 minutes, so the queue is not increasing in + # size. Thus don't expect an alert. + exp_alerts: + - eval_time: 22m + alertname: TransactionQueueSizeIncreasing + # Number of validations scheduled is growing twice as fast as the + # number of validations finished after minute 10. Thus expect + # a warning alert after 20 minutes. exp_alerts: - exp_labels: severity: warning @@ -173,12 +179,14 @@ tests: instance: polkadot-abcdef01234-abcdef job: polkadot exp_annotations: - message: "The node polkadot-abcdef01234-abcdef has more - than 10 transactions in the queue for more than 10 - minutes" - - - eval_time: 31m - alertname: TransactionQueueSize + message: "The transaction pool size on node + polkadot-abcdef01234-abcdef has been monotonically + increasing for the last 10 minutes."
+ - eval_time: 43m + alertname: TransactionQueueSizeIncreasing + # Number of validations scheduled is growing twice as fast as the + # number of validations finished after minute 10. Thus expect + # both a warning and a critical alert after 40 minutes. exp_alerts: - exp_labels: severity: warning @@ -186,18 +194,33 @@ tests: instance: polkadot-abcdef01234-abcdef job: polkadot exp_annotations: - message: "The node polkadot-abcdef01234-abcdef has more - than 10 transactions in the queue for more than 10 - minutes" + message: "The transaction pool size on node + polkadot-abcdef01234-abcdef has been monotonically + increasing for the last 10 minutes." - exp_labels: severity: critical pod: polkadot-abcdef01234-abcdef instance: polkadot-abcdef01234-abcdef job: polkadot exp_annotations: - message: "The node polkadot-abcdef01234-abcdef has more - than 10 transactions in the queue for more than 30 - minutes" + message: "The transaction pool size on node + polkadot-abcdef01234-abcdef has been monotonically + increasing for the last 30 minutes." + - eval_time: 49m + alertname: TransactionQueueSizeHigh + # From minute 42 on (assuming a 1m sample interval) the number of + # validations scheduled jumps to 10043 while the number finished + # stays at 42. Thus expect a critical alert. + exp_alerts: + - exp_labels: + severity: critical + pod: polkadot-abcdef01234-abcdef + instance: polkadot-abcdef01234-abcdef + job: polkadot + exp_annotations: + message: "The transaction pool size on node + polkadot-abcdef01234-abcdef has been above 10_000 for the + last 5 minutes."
###################################################################### # Networking diff --git a/substrate/.maintain/monitoring/alerting-rules/alerting-rules.yaml b/substrate/.maintain/monitoring/alerting-rules/alerting-rules.yaml index 06d204f7af..2ed3889a2c 100644 --- a/substrate/.maintain/monitoring/alerting-rules/alerting-rules.yaml +++ b/substrate/.maintain/monitoring/alerting-rules/alerting-rules.yaml @@ -73,24 +73,33 @@ groups: # Transaction queue ############################################################################## - - alert: TransactionQueueSize - expr: 'polkadot_sub_txpool_validations_scheduled - - polkadot_sub_txpool_validations_finished > 10' + - alert: TransactionQueueSizeIncreasing + expr: 'increase(polkadot_sub_txpool_validations_scheduled[5m]) - + increase(polkadot_sub_txpool_validations_finished[5m]) > 0' for: 10m labels: severity: warning annotations: - message: 'The node {{ $labels.instance }} has more than 10 transactions in - the queue for more than 10 minutes' - - alert: TransactionQueueSize - expr: 'polkadot_sub_txpool_validations_scheduled - - polkadot_sub_txpool_validations_finished > 10' + message: 'The transaction pool size on node {{ $labels.instance }} has + been monotonically increasing for the last 10 minutes.' + - alert: TransactionQueueSizeIncreasing + expr: 'increase(polkadot_sub_txpool_validations_scheduled[5m]) - + increase(polkadot_sub_txpool_validations_finished[5m]) > 0' for: 30m labels: severity: critical annotations: - message: 'The node {{ $labels.instance }} has more than 10 transactions in - the queue for more than 30 minutes' + message: 'The transaction pool size on node {{ $labels.instance }} has + been monotonically increasing for the last 30 minutes.' 
+ - alert: TransactionQueueSizeHigh + expr: 'polkadot_sub_txpool_validations_scheduled - + polkadot_sub_txpool_validations_finished > 10000' + for: 5m + labels: + severity: critical + annotations: + message: 'The transaction pool size on node {{ $labels.instance }} has + been above 10_000 for the last 5 minutes.' ############################################################################## # Networking