.maintain/monitoring/alerting-rules: Adjust transaction queue size alert (#6426)

The transaction queue size alert has been firing with a constant 10
transactions in the queue. While possibly problematic, those 10 transactions
need not be the same ones across scrape intervals.

Instead of alerting with a size above 10, alert based on two things:

1. Monotonically increasing queue size

2. Upper limit queue size reached
This commit is contained in:
Max Inden
2020-07-01 10:31:56 +02:00
committed by GitHub
parent d73de3bed7
commit 585ea531a3
2 changed files with 60 additions and 28 deletions
@@ -18,14 +18,14 @@ tests:
pod="polkadot-abcdef01234-abcdef",
instance="polkadot-abcdef01234-abcdef",
}'
values: '10+1x30' # 10 11 12 13 .. 40
values: '11+1x10 22+2x30 10043x5'
- series: 'polkadot_sub_txpool_validations_finished{
job="polkadot",
pod="polkadot-abcdef01234-abcdef",
instance="polkadot-abcdef01234-abcdef",
}'
values: '0x30' # 0 0 0 0 .. 0
values: '0+1x42 42x5'
- series: 'polkadot_block_height{
status="best", job="polkadot",
@@ -161,11 +161,17 @@ tests:
# Transaction queue
######################################################################
- eval_time: 10m
alertname: TransactionQueueSize
exp_alerts:
- eval_time: 11m
alertname: TransactionQueueSize
alertname: TransactionQueueSizeIncreasing
# The numbers of validations scheduled and finished both grow at a rate
# of 1 during the first 10 minutes, so the queue is not increasing in
# size. Thus don't expect an alert.
exp_alerts:
- eval_time: 22m
alertname: TransactionQueueSizeIncreasing
# Number of validations scheduled is growing twice as fast as the
# number of validations finished after minute 10. Thus expect a
# warning alert after 20 minutes.
exp_alerts:
- exp_labels:
severity: warning
@@ -173,12 +179,14 @@ tests:
instance: polkadot-abcdef01234-abcdef
job: polkadot
exp_annotations:
message: "The node polkadot-abcdef01234-abcdef has more
than 10 transactions in the queue for more than 10
minutes"
- eval_time: 31m
alertname: TransactionQueueSize
message: "The transaction pool size on node
polkadot-abcdef01234-abcdef has been monotonically
increasing for the last 10 minutes."
- eval_time: 43m
alertname: TransactionQueueSizeIncreasing
# Number of validations scheduled is growing twice as fast as the
# number of validations finished after minute 10. Thus expect
# both warning and critical alerts after 40 minutes.
exp_alerts:
- exp_labels:
severity: warning
@@ -186,18 +194,33 @@ tests:
instance: polkadot-abcdef01234-abcdef
job: polkadot
exp_annotations:
message: "The node polkadot-abcdef01234-abcdef has more
than 10 transactions in the queue for more than 10
minutes"
message: "The transaction pool size on node
polkadot-abcdef01234-abcdef has been monotonically
increasing for the last 10 minutes."
- exp_labels:
severity: critical
pod: polkadot-abcdef01234-abcdef
instance: polkadot-abcdef01234-abcdef
job: polkadot
exp_annotations:
message: "The node polkadot-abcdef01234-abcdef has more
than 10 transactions in the queue for more than 30
minutes"
message: "The transaction pool size on node
polkadot-abcdef01234-abcdef has been monotonically
increasing for the last 30 minutes."
- eval_time: 49m
alertname: TransactionQueueSizeHigh
# After minute 43 the number of validations scheduled jumps up
# drastically while the number of validations finished stays the
# same. Thus expect an alert.
exp_alerts:
- exp_labels:
severity: critical
pod: polkadot-abcdef01234-abcdef
instance: polkadot-abcdef01234-abcdef
job: polkadot
exp_annotations:
message: "The transaction pool size on node
polkadot-abcdef01234-abcdef has been above 10_000 for the
last 5 minutes."
######################################################################
# Networking
@@ -73,24 +73,33 @@ groups:
# Transaction queue
##############################################################################
- alert: TransactionQueueSize
expr: 'polkadot_sub_txpool_validations_scheduled -
polkadot_sub_txpool_validations_finished > 10'
- alert: TransactionQueueSizeIncreasing
expr: 'increase(polkadot_sub_txpool_validations_scheduled[5m]) -
increase(polkadot_sub_txpool_validations_finished[5m]) > 0'
for: 10m
labels:
severity: warning
annotations:
message: 'The node {{ $labels.instance }} has more than 10 transactions in
the queue for more than 10 minutes'
- alert: TransactionQueueSize
expr: 'polkadot_sub_txpool_validations_scheduled -
polkadot_sub_txpool_validations_finished > 10'
message: 'The transaction pool size on node {{ $labels.instance }} has
been monotonically increasing for the last 10 minutes.'
- alert: TransactionQueueSizeIncreasing
expr: 'increase(polkadot_sub_txpool_validations_scheduled[5m]) -
increase(polkadot_sub_txpool_validations_finished[5m]) > 0'
for: 30m
labels:
severity: critical
annotations:
message: 'The node {{ $labels.instance }} has more than 10 transactions in
the queue for more than 30 minutes'
message: 'The transaction pool size on node {{ $labels.instance }} has
been monotonically increasing for the last 30 minutes.'
- alert: TransactionQueueSizeHigh
expr: 'polkadot_sub_txpool_validations_scheduled -
polkadot_sub_txpool_validations_finished > 10000'
for: 5m
labels:
severity: critical
annotations:
message: 'The transaction pool size on node {{ $labels.instance }} has
been above 10_000 for the last 5 minutes.'
##############################################################################
# Networking