Skip to content

Commit 02c8224

Browse files
committed
Add recording rules for speeding up Alertmanager dashboard.
With large numbers of tenants, the queries for some panels on this dashboard can become quite slow, as the metrics exposed are per-tenant.
1 parent 41a44af commit 02c8224

File tree

2 files changed

+80
-19
lines changed

2 files changed

+80
-19
lines changed

cortex-mixin/dashboards/alertmanager.libsonnet

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,11 @@ local utils = import 'mixin-utils/utils.libsonnet';
1111
})
1212
.addPanel(
1313
$.panel('Total Alerts') +
14-
$.statPanel('sum(cortex_alertmanager_alerts{%s})' % $.jobMatcher('alertmanager'), format='short')
14+
$.statPanel('sum(cluster_job_%s:cortex_alertmanager_alerts:sum{%s})' % [$._config.per_instance_label, $.jobMatcher('alertmanager')], format='short')
1515
)
1616
.addPanel(
1717
$.panel('Total Silences') +
18-
$.statPanel('sum(cortex_alertmanager_silences{%s})' % $.jobMatcher('alertmanager'), format='short')
18+
$.statPanel('sum(cluster_job_%s:cortex_alertmanager_silences:sum{%s})' % [$._config.per_instance_label, $.jobMatcher('alertmanager')], format='short')
1919
)
2020
.addPanel(
2121
$.panel('Tenants') +
@@ -29,11 +29,11 @@ local utils = import 'mixin-utils/utils.libsonnet';
2929
$.queryPanel(
3030
[
3131
|||
32-
sum(rate(cortex_alertmanager_alerts_received_total{%s}[$__rate_interval]))
32+
sum(cluster_job:cortex_alertmanager_alerts_received_total:rate5m{%s})
3333
-
34-
sum(rate(cortex_alertmanager_alerts_invalid_total{%s}[$__rate_interval]))
34+
sum(cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m{%s})
3535
||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')],
36-
'sum(rate(cortex_alertmanager_alerts_invalid_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'),
36+
'sum(cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m{%s})' % $.jobMatcher('alertmanager'),
3737
],
3838
['success', 'failed']
3939
)
@@ -46,11 +46,11 @@ local utils = import 'mixin-utils/utils.libsonnet';
4646
$.queryPanel(
4747
[
4848
|||
49-
sum(rate(cortex_alertmanager_notifications_total{%s}[$__rate_interval]))
49+
sum(cluster_job_integration:cortex_alertmanager_notifications_total:rate5m{%s})
5050
-
51-
sum(rate(cortex_alertmanager_notifications_failed_total{%s}[$__rate_interval]))
51+
sum(cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m{%s})
5252
||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')],
53-
'sum(rate(cortex_alertmanager_notifications_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'),
53+
'sum(cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m{%s})' % $.jobMatcher('alertmanager'),
5454
],
5555
['success', 'failed']
5656
)
@@ -61,13 +61,13 @@ local utils = import 'mixin-utils/utils.libsonnet';
6161
[
6262
|||
6363
(
64-
sum(rate(cortex_alertmanager_notifications_total{%s}[$__rate_interval])) by(integration)
64+
sum(cluster_job_integration:cortex_alertmanager_notifications_total:rate5m{%s}) by(integration)
6565
-
66-
sum(rate(cortex_alertmanager_notifications_failed_total{%s}[$__rate_interval])) by(integration)
66+
sum(cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m{%s}) by(integration)
6767
) > 0
6868
or on () vector(0)
6969
||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')],
70-
'sum(rate(cortex_alertmanager_notifications_failed_total{%s}[$__rate_interval])) by(integration)' % $.jobMatcher('alertmanager'),
70+
'sum(cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m{%s}) by(integration)' % $.jobMatcher('alertmanager'),
7171
],
7272
['success - {{ integration }}', 'failed - {{ integration }}']
7373
)
@@ -104,15 +104,15 @@ local utils = import 'mixin-utils/utils.libsonnet';
104104
.addPanel(
105105
$.panel('Per %s Alerts' % $._config.per_instance_label) +
106106
$.queryPanel(
107-
'sum by(%s) (cortex_alertmanager_alerts{%s})' % [$._config.per_instance_label, $.jobMatcher('alertmanager')],
107+
'sum by(%s) (cluster_job_%s:cortex_alertmanager_alerts:sum{%s})' % [$._config.per_instance_label, $._config.per_instance_label, $.jobMatcher('alertmanager')],
108108
'{{%s}}' % $._config.per_instance_label
109109
) +
110110
$.stack
111111
)
112112
.addPanel(
113113
$.panel('Per %s Silences' % $._config.per_instance_label) +
114114
$.queryPanel(
115-
'sum by(%s) (cortex_alertmanager_silences{%s})' % [$._config.per_instance_label, $.jobMatcher('alertmanager')],
115+
'sum by(%s) (cluster_job_%s:cortex_alertmanager_silences:sum{%s})' % [$._config.per_instance_label, $._config.per_instance_label, $.jobMatcher('alertmanager')],
116116
'{{%s}}' % $._config.per_instance_label
117117
) +
118118
$.stack
@@ -205,11 +205,11 @@ local utils = import 'mixin-utils/utils.libsonnet';
205205
$.queryPanel(
// Queries read the pre-aggregated recording rules; the label matcher must follow the complete recorded series name (…:rate5m{…}).
206206
[
207207
|||
208-
sum(rate(cortex_alertmanager_state_replication_total{%s}[$__rate_interval]))
208+
sum(cluster_job:cortex_alertmanager_state_replication_total:rate5m{%s})
209209
-
210-
sum(rate(cortex_alertmanager_state_replication_failed_total{%s}[$__rate_interval]))
210+
sum(cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m{%s})
211211
||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')],
212-
'sum(rate(cortex_alertmanager_state_replication_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'),
212+
'sum(cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m{%s})' % $.jobMatcher('alertmanager'),
213213
],
214214
['success', 'failed']
215215
)
@@ -219,11 +219,11 @@ local utils = import 'mixin-utils/utils.libsonnet';
219219
$.queryPanel(
// Queries read the pre-aggregated recording rules; the label matcher must follow the complete recorded series name (…:rate5m{…}).
220220
[
221221
|||
222-
sum(rate(cortex_alertmanager_partial_state_merges_total{%s}[$__rate_interval]))
222+
sum(cluster_job:cortex_alertmanager_partial_state_merges_total:rate5m{%s})
223223
-
224-
sum(rate(cortex_alertmanager_partial_state_merges_failed_total{%s}[$__rate_interval]))
224+
sum(cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m{%s})
225225
||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')],
226-
'sum(rate(cortex_alertmanager_partial_state_merges_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'),
226+
'sum(cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m{%s})' % $.jobMatcher('alertmanager'),
227227
],
228228
['success', 'failed']
229229
)

cortex-mixin/recording_rules.libsonnet

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -364,6 +364,67 @@ local utils = import 'mixin-utils/utils.libsonnet';
364364
)
365365
|||,
366366
},
367+
// Aggregations of per-user Alertmanager metrics used in dashboards.
368+
{
369+
record: 'cluster_job_%s:cortex_alertmanager_alerts:sum' % $._config.per_instance_label,
370+
expr: |||
371+
sum by (cluster, job, %s) (cortex_alertmanager_alerts)
372+
||| % $._config.per_instance_label,
373+
},
374+
{
375+
record: 'cluster_job_%s:cortex_alertmanager_silences:sum' % $._config.per_instance_label,
376+
expr: |||
377+
sum by (cluster, job, %s) (cortex_alertmanager_silences)
378+
||| % $._config.per_instance_label,
379+
},
380+
{
381+
record: 'cluster_job:cortex_alertmanager_alerts_received_total:rate5m',
382+
expr: |||
383+
sum by (cluster, job) (rate(cortex_alertmanager_alerts_received_total[5m]))
384+
|||,
385+
},
386+
{
387+
record: 'cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m',
388+
expr: |||
389+
sum by (cluster, job) (rate(cortex_alertmanager_alerts_invalid_total[5m]))
390+
|||,
391+
},
392+
{
393+
record: 'cluster_job_integration:cortex_alertmanager_notifications_total:rate5m',
394+
expr: |||
395+
sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_total[5m]))
396+
|||,
397+
},
398+
{
399+
record: 'cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m',
400+
expr: |||
401+
sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_failed_total[5m]))
402+
|||,
403+
},
404+
{
405+
record: 'cluster_job:cortex_alertmanager_state_replication_total:rate5m',
406+
expr: |||
407+
sum by (cluster, job) (rate(cortex_alertmanager_state_replication_total[5m]))
408+
|||,
409+
},
410+
{
411+
record: 'cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m',
412+
expr: |||
413+
sum by (cluster, job) (rate(cortex_alertmanager_state_replication_failed_total[5m]))
414+
|||,
415+
},
416+
{
417+
// Backs the Alertmanager dashboard's partial-state-merges panel, which queries this exact series name.
record: 'cluster_job:cortex_alertmanager_partial_state_merges_total:rate5m',
418+
expr: |||
419+
sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_total[5m]))
420+
|||,
421+
},
422+
{
423+
// Failed-merge counterpart, subtracted from the total to plot successes on the same panel.
record: 'cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m',
424+
expr: |||
425+
sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_failed_total[5m]))
426+
|||,
427+
},
367428
],
368429
},
369430
],

0 commit comments

Comments
 (0)