.maintain/monitoring/alerting-rules: Adjust transaction queue size alert (#6426)

The transaction queue size alert has been firing with a constant 10
transactions in the queue. While this may be problematic, those 10
transactions are not necessarily the same ones across scrape intervals.

Instead of alerting with a size above 10, alert based on two things:

1. Monotonically increasing queue size

2. Upper limit queue size reached
This commit is contained in:
Max Inden
2020-07-01 10:31:56 +02:00
committed by GitHub
parent d73de3bed7
commit 585ea531a3
2 changed files with 60 additions and 28 deletions
@@ -18,14 +18,14 @@ tests:
pod="polkadot-abcdef01234-abcdef", pod="polkadot-abcdef01234-abcdef",
instance="polkadot-abcdef01234-abcdef", instance="polkadot-abcdef01234-abcdef",
}' }'
values: '10+1x30' # 10 11 12 13 .. 40 values: '11+1x10 22+2x30 10043x5'
- series: 'polkadot_sub_txpool_validations_finished{ - series: 'polkadot_sub_txpool_validations_finished{
job="polkadot", job="polkadot",
pod="polkadot-abcdef01234-abcdef", pod="polkadot-abcdef01234-abcdef",
instance="polkadot-abcdef01234-abcdef", instance="polkadot-abcdef01234-abcdef",
}' }'
values: '0x30' # 0 0 0 0 .. 0 values: '0+1x42 42x5'
- series: 'polkadot_block_height{ - series: 'polkadot_block_height{
status="best", job="polkadot", status="best", job="polkadot",
@@ -161,11 +161,17 @@ tests:
# Transaction queue # Transaction queue
###################################################################### ######################################################################
- eval_time: 10m
alertname: TransactionQueueSize
exp_alerts:
- eval_time: 11m - eval_time: 11m
alertname: TransactionQueueSize alertname: TransactionQueueSizeIncreasing
# Number of validations scheduled and finished both grow at a rate
# of 1 in the first 10 minutes, thereby the queue is not increasing
# in size, thus don't expect an alert.
exp_alerts:
- eval_time: 22m
alertname: TransactionQueueSizeIncreasing
# Number of validations scheduled is growing twice as fast as the
# number of validations finished after minute 10. Thus expect a
# warning alert after 20 minutes.
exp_alerts: exp_alerts:
- exp_labels: - exp_labels:
severity: warning severity: warning
@@ -173,12 +179,14 @@ tests:
instance: polkadot-abcdef01234-abcdef instance: polkadot-abcdef01234-abcdef
job: polkadot job: polkadot
exp_annotations: exp_annotations:
message: "The node polkadot-abcdef01234-abcdef has more message: "The transaction pool size on node
than 10 transactions in the queue for more than 10 polkadot-abcdef01234-abcdef has been monotonically
minutes" increasing for the last 10 minutes."
- eval_time: 43m
- eval_time: 31m alertname: TransactionQueueSizeIncreasing
alertname: TransactionQueueSize # Number of validations scheduled is growing twice as fast as the
# number of validations finished after minute 10. Thus expect
# both warning and critical alert after 40 minutes.
exp_alerts: exp_alerts:
- exp_labels: - exp_labels:
severity: warning severity: warning
@@ -186,18 +194,33 @@ tests:
instance: polkadot-abcdef01234-abcdef instance: polkadot-abcdef01234-abcdef
job: polkadot job: polkadot
exp_annotations: exp_annotations:
message: "The node polkadot-abcdef01234-abcdef has more message: "The transaction pool size on node
than 10 transactions in the queue for more than 10 polkadot-abcdef01234-abcdef has been monotonically
minutes" increasing for the last 10 minutes."
- exp_labels: - exp_labels:
severity: critical severity: critical
pod: polkadot-abcdef01234-abcdef pod: polkadot-abcdef01234-abcdef
instance: polkadot-abcdef01234-abcdef instance: polkadot-abcdef01234-abcdef
job: polkadot job: polkadot
exp_annotations: exp_annotations:
message: "The node polkadot-abcdef01234-abcdef has more message: "The transaction pool size on node
than 10 transactions in the queue for more than 30 polkadot-abcdef01234-abcdef has been monotonically
minutes" increasing for the last 30 minutes."
- eval_time: 49m
alertname: TransactionQueueSizeHigh
# After minute 43 the number of validations scheduled jumps up
# drastically while the number of validations finished stays the
# same. Thus expect an alert.
exp_alerts:
- exp_labels:
severity: critical
pod: polkadot-abcdef01234-abcdef
instance: polkadot-abcdef01234-abcdef
job: polkadot
exp_annotations:
message: "The transaction pool size on node
polkadot-abcdef01234-abcdef has been above 10_000 for the
last 5 minutes."
###################################################################### ######################################################################
# Networking # Networking
@@ -73,24 +73,33 @@ groups:
# Transaction queue # Transaction queue
############################################################################## ##############################################################################
- alert: TransactionQueueSize - alert: TransactionQueueSizeIncreasing
expr: 'polkadot_sub_txpool_validations_scheduled - expr: 'increase(polkadot_sub_txpool_validations_scheduled[5m]) -
polkadot_sub_txpool_validations_finished > 10' increase(polkadot_sub_txpool_validations_finished[5m]) > 0'
for: 10m for: 10m
labels: labels:
severity: warning severity: warning
annotations: annotations:
message: 'The node {{ $labels.instance }} has more than 10 transactions in message: 'The transaction pool size on node {{ $labels.instance }} has
the queue for more than 10 minutes' been monotonically increasing for the last 10 minutes.'
- alert: TransactionQueueSize - alert: TransactionQueueSizeIncreasing
expr: 'polkadot_sub_txpool_validations_scheduled - expr: 'increase(polkadot_sub_txpool_validations_scheduled[5m]) -
polkadot_sub_txpool_validations_finished > 10' increase(polkadot_sub_txpool_validations_finished[5m]) > 0'
for: 30m for: 30m
labels: labels:
severity: critical severity: critical
annotations: annotations:
message: 'The node {{ $labels.instance }} has more than 10 transactions in message: 'The transaction pool size on node {{ $labels.instance }} has
the queue for more than 30 minutes' been monotonically increasing for the last 30 minutes.'
- alert: TransactionQueueSizeHigh
expr: 'polkadot_sub_txpool_validations_scheduled -
polkadot_sub_txpool_validations_finished > 10000'
for: 5m
labels:
severity: critical
annotations:
message: 'The transaction pool size on node {{ $labels.instance }} has
been above 10_000 for the last 5 minutes.'
############################################################################## ##############################################################################
# Networking # Networking