mirror of
https://github.com/pezkuwichain/pezkuwi-subxt.git
synced 2026-05-30 03:31:05 +00:00
.maintain/monitoring/alerting-rules: Adjust transaction queue size alert (#6426)
The transaction queue size alert has been firing with a constant 10 transactions in the queue. While that may be problematic, those 10 transactions are not necessarily the same ones across scrape intervals. Instead of alerting on a queue size above 10, alert on two conditions: (1) the queue size is monotonically increasing; (2) the queue size has reached an upper limit.
This commit is contained in:
@@ -18,14 +18,14 @@ tests:
|
|||||||
pod="polkadot-abcdef01234-abcdef",
|
pod="polkadot-abcdef01234-abcdef",
|
||||||
instance="polkadot-abcdef01234-abcdef",
|
instance="polkadot-abcdef01234-abcdef",
|
||||||
}'
|
}'
|
||||||
values: '10+1x30' # 10 11 12 13 .. 40
|
values: '11+1x10 22+2x30 10043x5'
|
||||||
|
|
||||||
- series: 'polkadot_sub_txpool_validations_finished{
|
- series: 'polkadot_sub_txpool_validations_finished{
|
||||||
job="polkadot",
|
job="polkadot",
|
||||||
pod="polkadot-abcdef01234-abcdef",
|
pod="polkadot-abcdef01234-abcdef",
|
||||||
instance="polkadot-abcdef01234-abcdef",
|
instance="polkadot-abcdef01234-abcdef",
|
||||||
}'
|
}'
|
||||||
values: '0x30' # 0 0 0 0 .. 0
|
values: '0+1x42 42x5'
|
||||||
|
|
||||||
- series: 'polkadot_block_height{
|
- series: 'polkadot_block_height{
|
||||||
status="best", job="polkadot",
|
status="best", job="polkadot",
|
||||||
@@ -161,11 +161,17 @@ tests:
|
|||||||
# Transaction queue
|
# Transaction queue
|
||||||
######################################################################
|
######################################################################
|
||||||
|
|
||||||
- eval_time: 10m
|
|
||||||
alertname: TransactionQueueSize
|
|
||||||
exp_alerts:
|
|
||||||
- eval_time: 11m
|
- eval_time: 11m
|
||||||
alertname: TransactionQueueSize
|
alertname: TransactionQueueSizeIncreasing
|
||||||
|
# Number of validations scheduled and finished both grow at a rate
|
||||||
|
# of 1 in the first 10 minutes, thereby the queue is not increasing
|
||||||
|
# in size, thus don't expect an alert.
|
||||||
|
exp_alerts:
|
||||||
|
- eval_time: 22m
|
||||||
|
alertname: TransactionQueueSizeIncreasing
|
||||||
|
# Number of validations scheduled is growing twice as fast as the
|
||||||
|
# number of validations finished after minute 10. Thus expect
|
||||||
|
# warning alert after 20 minutes.
|
||||||
exp_alerts:
|
exp_alerts:
|
||||||
- exp_labels:
|
- exp_labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
@@ -173,12 +179,14 @@ tests:
|
|||||||
instance: polkadot-abcdef01234-abcdef
|
instance: polkadot-abcdef01234-abcdef
|
||||||
job: polkadot
|
job: polkadot
|
||||||
exp_annotations:
|
exp_annotations:
|
||||||
message: "The node polkadot-abcdef01234-abcdef has more
|
message: "The transaction pool size on node
|
||||||
than 10 transactions in the queue for more than 10
|
polkadot-abcdef01234-abcdef has been monotonically
|
||||||
minutes"
|
increasing for the last 10 minutes."
|
||||||
|
- eval_time: 43m
|
||||||
- eval_time: 31m
|
alertname: TransactionQueueSizeIncreasing
|
||||||
alertname: TransactionQueueSize
|
# Number of validations scheduled is growing twice as fast as the
|
||||||
|
# number of validations finished after minute 10. Thus expect
|
||||||
|
# both warning and critical alert after 40 minutes.
|
||||||
exp_alerts:
|
exp_alerts:
|
||||||
- exp_labels:
|
- exp_labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
@@ -186,18 +194,33 @@ tests:
|
|||||||
instance: polkadot-abcdef01234-abcdef
|
instance: polkadot-abcdef01234-abcdef
|
||||||
job: polkadot
|
job: polkadot
|
||||||
exp_annotations:
|
exp_annotations:
|
||||||
message: "The node polkadot-abcdef01234-abcdef has more
|
message: "The transaction pool size on node
|
||||||
than 10 transactions in the queue for more than 10
|
polkadot-abcdef01234-abcdef has been monotonically
|
||||||
minutes"
|
increasing for the last 10 minutes."
|
||||||
- exp_labels:
|
- exp_labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
pod: polkadot-abcdef01234-abcdef
|
pod: polkadot-abcdef01234-abcdef
|
||||||
instance: polkadot-abcdef01234-abcdef
|
instance: polkadot-abcdef01234-abcdef
|
||||||
job: polkadot
|
job: polkadot
|
||||||
exp_annotations:
|
exp_annotations:
|
||||||
message: "The node polkadot-abcdef01234-abcdef has more
|
message: "The transaction pool size on node
|
||||||
than 10 transactions in the queue for more than 30
|
polkadot-abcdef01234-abcdef has been monotonically
|
||||||
minutes"
|
increasing for the last 30 minutes."
|
||||||
|
- eval_time: 49m
|
||||||
|
alertname: TransactionQueueSizeHigh
|
||||||
|
# After minute 43 the number of validations scheduled jumps up
|
||||||
|
# drastically while the number of validations finished stays the
|
||||||
|
# same. Thus expect an alert.
|
||||||
|
exp_alerts:
|
||||||
|
- exp_labels:
|
||||||
|
severity: critical
|
||||||
|
pod: polkadot-abcdef01234-abcdef
|
||||||
|
instance: polkadot-abcdef01234-abcdef
|
||||||
|
job: polkadot
|
||||||
|
exp_annotations:
|
||||||
|
message: "The transaction pool size on node
|
||||||
|
polkadot-abcdef01234-abcdef has been above 10_000 for the
|
||||||
|
last 5 minutes."
|
||||||
|
|
||||||
######################################################################
|
######################################################################
|
||||||
# Networking
|
# Networking
|
||||||
|
|||||||
@@ -73,24 +73,33 @@ groups:
|
|||||||
# Transaction queue
|
# Transaction queue
|
||||||
##############################################################################
|
##############################################################################
|
||||||
|
|
||||||
- alert: TransactionQueueSize
|
- alert: TransactionQueueSizeIncreasing
|
||||||
expr: 'polkadot_sub_txpool_validations_scheduled -
|
expr: 'increase(polkadot_sub_txpool_validations_scheduled[5m]) -
|
||||||
polkadot_sub_txpool_validations_finished > 10'
|
increase(polkadot_sub_txpool_validations_finished[5m]) > 0'
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
message: 'The node {{ $labels.instance }} has more than 10 transactions in
|
message: 'The transaction pool size on node {{ $labels.instance }} has
|
||||||
the queue for more than 10 minutes'
|
been monotonically increasing for the last 10 minutes.'
|
||||||
- alert: TransactionQueueSize
|
- alert: TransactionQueueSizeIncreasing
|
||||||
expr: 'polkadot_sub_txpool_validations_scheduled -
|
expr: 'increase(polkadot_sub_txpool_validations_scheduled[5m]) -
|
||||||
polkadot_sub_txpool_validations_finished > 10'
|
increase(polkadot_sub_txpool_validations_finished[5m]) > 0'
|
||||||
for: 30m
|
for: 30m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
message: 'The node {{ $labels.instance }} has more than 10 transactions in
|
message: 'The transaction pool size on node {{ $labels.instance }} has
|
||||||
the queue for more than 30 minutes'
|
been monotonically increasing for the last 30 minutes.'
|
||||||
|
- alert: TransactionQueueSizeHigh
|
||||||
|
expr: 'polkadot_sub_txpool_validations_scheduled -
|
||||||
|
polkadot_sub_txpool_validations_finished > 10000'
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
message: 'The transaction pool size on node {{ $labels.instance }} has
|
||||||
|
been above 10_000 for the last 5 minutes.'
|
||||||
|
|
||||||
##############################################################################
|
##############################################################################
|
||||||
# Networking
|
# Networking
|
||||||
|
|||||||
Reference in New Issue
Block a user