From 0ff724c93917f9f3f790d773758cf03f7946326a Mon Sep 17 00:00:00 2001
From: Max Inden
Date: Mon, 5 Oct 2020 10:40:24 +0200
Subject: [PATCH] .maintain/monitoring: Add alert when continuous task ends (#7250)

* .maintain/monitoring: Add alert when continuous task ends

Through the `polkadot_tasks_ended_total` Prometheus metric one can tell
when a task ended. Use this metric to alert when specific
known-to-be-continuous tasks end on a node.

* .maintain/monitoring: Don't hard-code task names
---
 .../monitoring/alerting-rules/alerting-rules.yaml | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/substrate/.maintain/monitoring/alerting-rules/alerting-rules.yaml b/substrate/.maintain/monitoring/alerting-rules/alerting-rules.yaml
index 3dde038d88..16a27c06d3 100644
--- a/substrate/.maintain/monitoring/alerting-rules/alerting-rules.yaml
+++ b/substrate/.maintain/monitoring/alerting-rules/alerting-rules.yaml
@@ -126,6 +126,20 @@ groups:
   # Others
   ##############################################################################
 
+  # Fires when a task that was spawned exactly once has also ended exactly
+  # once on the same node, i.e. a presumably continuous task has stopped.
+  # group_left() keeps the remaining labels of the spawned series on the
+  # alert instead of reducing them to instance and task_name.
+  - alert: ContinuousTaskEnded
+    expr: '(polkadot_tasks_spawned_total == 1) - on(instance, task_name)
+      group_left() (polkadot_tasks_ended_total == 1)'
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      message: 'Continuous task {{ $labels.task_name }} on node
+        {{ $labels.instance }} ended unexpectedly.'
+
   - alert: AuthorityDiscoveryDiscoveryFailureHigh
     expr: 'polkadot_authority_discovery_handle_value_found_event_failure /
       ignoring(name)