Skip to content

Commit 0c84691

Browse files
authored
Merge pull request grafana#387 from stevesg/am-notifications-rules
Add recording rules for Alertmanager dashboard
2 parents 41a44af + 826af4d commit 0c84691

File tree

3 files changed

+86
-19
lines changed

3 files changed

+86
-19
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@
6262
* [ENHANCEMENT] Add support for Azure storage in Alertmanager configuration. #381
6363
* [ENHANCEMENT] Add support for running Alertmanager in sharding mode. #394
6464
* [ENHANCEMENT] Allow customizing PromQL engine settings via `queryEngineConfig`. #399
65+
* [ENHANCEMENT] Add recording rules to improve responsiveness of Alertmanager dashboard. #387
6566
* [BUGFIX] Fixed `CortexIngesterHasNotShippedBlocks` alert false positive in case an ingester instance had ingested samples in the past, then no traffic was received for a long period and then it started receiving samples again. #308
6667
* [BUGFIX] Alertmanager: fixed `--alertmanager.cluster.peers` CLI flag passed to alertmanager when HA is enabled. #329
6768
* [BUGFIX] Fixed `CortexInconsistentRuntimeConfig` metric. #335

cortex-mixin/dashboards/alertmanager.libsonnet

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,11 @@ local utils = import 'mixin-utils/utils.libsonnet';
1111
})
1212
.addPanel(
1313
$.panel('Total Alerts') +
14-
$.statPanel('sum(cortex_alertmanager_alerts{%s})' % $.jobMatcher('alertmanager'), format='short')
14+
$.statPanel('sum(cluster_job_%s:cortex_alertmanager_alerts:sum{%s})' % [$._config.per_instance_label, $.jobMatcher('alertmanager')], format='short')
1515
)
1616
.addPanel(
1717
$.panel('Total Silences') +
18-
$.statPanel('sum(cortex_alertmanager_silences{%s})' % $.jobMatcher('alertmanager'), format='short')
18+
$.statPanel('sum(cluster_job_%s:cortex_alertmanager_silences:sum{%s})' % [$._config.per_instance_label, $.jobMatcher('alertmanager')], format='short')
1919
)
2020
.addPanel(
2121
$.panel('Tenants') +
@@ -29,11 +29,11 @@ local utils = import 'mixin-utils/utils.libsonnet';
2929
$.queryPanel(
3030
[
3131
|||
32-
sum(rate(cortex_alertmanager_alerts_received_total{%s}[$__rate_interval]))
32+
sum(cluster_job:cortex_alertmanager_alerts_received_total:rate5m{%s})
3333
-
34-
sum(rate(cortex_alertmanager_alerts_invalid_total{%s}[$__rate_interval]))
34+
sum(cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m{%s})
3535
||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')],
36-
'sum(rate(cortex_alertmanager_alerts_invalid_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'),
36+
'sum(cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m{%s})' % $.jobMatcher('alertmanager'),
3737
],
3838
['success', 'failed']
3939
)
@@ -46,11 +46,11 @@ local utils = import 'mixin-utils/utils.libsonnet';
4646
$.queryPanel(
4747
[
4848
|||
49-
sum(rate(cortex_alertmanager_notifications_total{%s}[$__rate_interval]))
49+
sum(cluster_job_integration:cortex_alertmanager_notifications_total:rate5m{%s})
5050
-
51-
sum(rate(cortex_alertmanager_notifications_failed_total{%s}[$__rate_interval]))
51+
sum(cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m{%s})
5252
||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')],
53-
'sum(rate(cortex_alertmanager_notifications_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'),
53+
'sum(cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m{%s})' % $.jobMatcher('alertmanager'),
5454
],
5555
['success', 'failed']
5656
)
@@ -61,13 +61,13 @@ local utils = import 'mixin-utils/utils.libsonnet';
6161
[
6262
|||
6363
(
64-
sum(rate(cortex_alertmanager_notifications_total{%s}[$__rate_interval])) by(integration)
64+
sum(cluster_job_integration:cortex_alertmanager_notifications_total:rate5m{%s}) by(integration)
6565
-
66-
sum(rate(cortex_alertmanager_notifications_failed_total{%s}[$__rate_interval])) by(integration)
66+
sum(cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m{%s}) by(integration)
6767
) > 0
6868
or on () vector(0)
6969
||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')],
70-
'sum(rate(cortex_alertmanager_notifications_failed_total{%s}[$__rate_interval])) by(integration)' % $.jobMatcher('alertmanager'),
70+
'sum(cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m{%s}) by(integration)' % $.jobMatcher('alertmanager'),
7171
],
7272
['success - {{ integration }}', 'failed - {{ integration }}']
7373
)
@@ -104,15 +104,15 @@ local utils = import 'mixin-utils/utils.libsonnet';
104104
.addPanel(
105105
$.panel('Per %s Alerts' % $._config.per_instance_label) +
106106
$.queryPanel(
107-
'sum by(%s) (cortex_alertmanager_alerts{%s})' % [$._config.per_instance_label, $.jobMatcher('alertmanager')],
107+
'sum by(%s) (cluster_job_%s:cortex_alertmanager_alerts:sum{%s})' % [$._config.per_instance_label, $._config.per_instance_label, $.jobMatcher('alertmanager')],
108108
'{{%s}}' % $._config.per_instance_label
109109
) +
110110
$.stack
111111
)
112112
.addPanel(
113113
$.panel('Per %s Silences' % $._config.per_instance_label) +
114114
$.queryPanel(
115-
'sum by(%s) (cortex_alertmanager_silences{%s})' % [$._config.per_instance_label, $.jobMatcher('alertmanager')],
115+
'sum by(%s) (cluster_job_%s:cortex_alertmanager_silences:sum{%s})' % [$._config.per_instance_label, $._config.per_instance_label, $.jobMatcher('alertmanager')],
116116
'{{%s}}' % $._config.per_instance_label
117117
) +
118118
$.stack
@@ -205,11 +205,11 @@ local utils = import 'mixin-utils/utils.libsonnet';
205205
$.queryPanel(
206206
[
207207
|||
208-
sum(rate(cortex_alertmanager_state_replication_total{%s}[$__rate_interval]))
208+
sum(cluster_job:cortex_alertmanager_state_replication_total:rate5m{%s})
209209
-
210-
sum(rate(cortex_alertmanager_state_replication_failed_total{%s}[$__rate_interval]))
210+
sum(cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m{%s})
211211
||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')],
212-
'sum(rate(cortex_alertmanager_state_replication_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'),
212+
'sum(cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m{%s})' % $.jobMatcher('alertmanager'),
213213
],
214214
['success', 'failed']
215215
)
@@ -219,11 +219,11 @@ local utils = import 'mixin-utils/utils.libsonnet';
219219
$.queryPanel(
220220
[
221221
|||
222-
sum(rate(cortex_alertmanager_partial_state_merges_total{%s}[$__rate_interval]))
222+
sum(cluster_job:cortex_alertmanager_partial_state_merges_total:rate5m{%s})
223223
-
224-
sum(rate(cortex_alertmanager_partial_state_merges_failed_total{%s}[$__rate_interval]))
224+
sum(cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m{%s})
225225
||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')],
226-
'sum(rate(cortex_alertmanager_partial_state_merges_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'),
226+
'sum(cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m{%s})' % $.jobMatcher('alertmanager'),
227227
],
228228
['success', 'failed']
229229
)

cortex-mixin/recording_rules.libsonnet

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -366,6 +366,72 @@ local utils = import 'mixin-utils/utils.libsonnet';
366366
},
367367
],
368368
},
369+
{
370+
name: 'cortex_alertmanager_rules',
371+
rules: [
372+
// Aggregations of per-user Alertmanager metrics used in dashboards.
373+
{
374+
record: 'cluster_job_%s:cortex_alertmanager_alerts:sum' % $._config.per_instance_label,
375+
expr: |||
376+
sum by (cluster, job, %s) (cortex_alertmanager_alerts)
377+
||| % $._config.per_instance_label,
378+
},
379+
{
380+
record: 'cluster_job_%s:cortex_alertmanager_silences:sum' % $._config.per_instance_label,
381+
expr: |||
382+
sum by (cluster, job, %s) (cortex_alertmanager_silences)
383+
||| % $._config.per_instance_label,
384+
},
385+
{
386+
record: 'cluster_job:cortex_alertmanager_alerts_received_total:rate5m',
387+
expr: |||
388+
sum by (cluster, job) (rate(cortex_alertmanager_alerts_received_total[5m]))
389+
|||,
390+
},
391+
{
392+
record: 'cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m',
393+
expr: |||
394+
sum by (cluster, job) (rate(cortex_alertmanager_alerts_invalid_total[5m]))
395+
|||,
396+
},
397+
{
398+
record: 'cluster_job_integration:cortex_alertmanager_notifications_total:rate5m',
399+
expr: |||
400+
sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_total[5m]))
401+
|||,
402+
},
403+
{
404+
record: 'cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m',
405+
expr: |||
406+
sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_failed_total[5m]))
407+
|||,
408+
},
409+
{
410+
record: 'cluster_job:cortex_alertmanager_state_replication_total:rate5m',
411+
expr: |||
412+
sum by (cluster, job) (rate(cortex_alertmanager_state_replication_total[5m]))
413+
|||,
414+
},
415+
{
416+
record: 'cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m',
417+
expr: |||
418+
sum by (cluster, job) (rate(cortex_alertmanager_state_replication_failed_total[5m]))
419+
|||,
420+
},
421+
{
422+
record: 'cluster_job:cortex_alertmanager_partial_state_merges_total:rate5m',
423+
expr: |||
424+
sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_total[5m]))
425+
|||,
426+
},
427+
{
428+
record: 'cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m',
429+
expr: |||
430+
sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_failed_total[5m]))
431+
|||,
432+
},
433+
],
434+
},
369435
],
370436
},
371437
}

0 commit comments

Comments
 (0)