Skip to content

Commit 1d0d032

Browse files
author
Tyler Reid
committed
Merge remote-tracking branch 'upstream/main' into distributor-inflight-push-alerts
2 parents a091553 + b8901a9 commit 1d0d032

File tree

5 files changed

+53
-46
lines changed

5 files changed

+53
-46
lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,9 @@
6767
* [ENHANCEMENT] Add recording rules to improve responsiveness of Alertmanager dashboard. #387
6868
* [ENHANCEMENT] Add `CortexRolloutStuck` alert. #405
6969
* [ENHANCEMENT] Added `CortexKVStoreFailure` alert. #406
70+
* [ENHANCEMENT] Use configured `ruler` jobname for ruler dashboard panels. #409
71+
* [ENHANCEMENT] Add ability to override `datasource` for generated dashboards. #407
72+
* [ENHANCEMENT] Use configured `alertmanager` jobname for alertmanager dashboard panels. #411
7073
* [ENHANCEMENT] Added `CortexDistributorReachingInflightPushRequestLimit` alert. #408
7174
* [BUGFIX] Fixed `CortexIngesterHasNotShippedBlocks` alert false positive in case an ingester instance had ingested samples in the past, then no traffic was received for a long period and then it started receiving samples again. #308
7275
* [BUGFIX] Alertmanager: fixed `--alertmanager.cluster.peers` CLI flag passed to alertmanager when HA is enabled. #329

cortex-mixin/config.libsonnet

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
store_gateway: '(store-gateway|cortex$)',
3838
gateway: '(gateway|cortex-gw|cortex-gw-internal)',
3939
compactor: 'compactor.*', // Match also custom compactor deployments.
40+
alertmanager: 'alertmanager',
4041
},
4142

4243
// Grouping labels, to uniquely identify and group by {jobs, clusters}
@@ -68,5 +69,8 @@
6869

6970
// The routes to exclude from alerts.
7071
alert_excluded_routes: [],
72+
73+
// Name of the datasource that the generated dashboards should attach to.
74+
dashboard_datasource: 'default',
7175
},
7276
}

cortex-mixin/dashboards/alertmanager.libsonnet

Lines changed: 27 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -11,15 +11,15 @@ local utils = import 'mixin-utils/utils.libsonnet';
1111
})
1212
.addPanel(
1313
$.panel('Total Alerts') +
14-
$.statPanel('sum(cluster_job_%s:cortex_alertmanager_alerts:sum{%s})' % [$._config.per_instance_label, $.jobMatcher('alertmanager')], format='short')
14+
$.statPanel('sum(cluster_job_%s:cortex_alertmanager_alerts:sum{%s})' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.alertmanager)], format='short')
1515
)
1616
.addPanel(
1717
$.panel('Total Silences') +
18-
$.statPanel('sum(cluster_job_%s:cortex_alertmanager_silences:sum{%s})' % [$._config.per_instance_label, $.jobMatcher('alertmanager')], format='short')
18+
$.statPanel('sum(cluster_job_%s:cortex_alertmanager_silences:sum{%s})' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.alertmanager)], format='short')
1919
)
2020
.addPanel(
2121
$.panel('Tenants') +
22-
$.statPanel('max(cortex_alertmanager_tenants_discovered{%s})' % $.jobMatcher('alertmanager'), format='short')
22+
$.statPanel('max(cortex_alertmanager_tenants_discovered{%s})' % $.jobMatcher($._config.job_names.alertmanager), format='short')
2323
)
2424
)
2525
.addRow(
@@ -32,8 +32,8 @@ local utils = import 'mixin-utils/utils.libsonnet';
3232
sum(cluster_job:cortex_alertmanager_alerts_received_total:rate5m{%s})
3333
-
3434
sum(cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m{%s})
35-
||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')],
36-
'sum(cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m{%s})' % $.jobMatcher('alertmanager'),
35+
||| % [$.jobMatcher($._config.job_names.alertmanager), $.jobMatcher($._config.job_names.alertmanager)],
36+
'sum(cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m{%s})' % $.jobMatcher($._config.job_names.alertmanager),
3737
],
3838
['success', 'failed']
3939
)
@@ -49,8 +49,8 @@ local utils = import 'mixin-utils/utils.libsonnet';
4949
sum(cluster_job_integration:cortex_alertmanager_notifications_total:rate5m{%s})
5050
-
5151
sum(cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m{%s})
52-
||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')],
53-
'sum(cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m{%s})' % $.jobMatcher('alertmanager'),
52+
||| % [$.jobMatcher($._config.job_names.alertmanager), $.jobMatcher($._config.job_names.alertmanager)],
53+
'sum(cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m{%s})' % $.jobMatcher($._config.job_names.alertmanager),
5454
],
5555
['success', 'failed']
5656
)
@@ -66,15 +66,15 @@ local utils = import 'mixin-utils/utils.libsonnet';
6666
sum(cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m{%s}) by(integration)
6767
) > 0
6868
or on () vector(0)
69-
||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')],
70-
'sum(cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m{%s}) by(integration)' % $.jobMatcher('alertmanager'),
69+
||| % [$.jobMatcher($._config.job_names.alertmanager), $.jobMatcher($._config.job_names.alertmanager)],
70+
'sum(cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m{%s}) by(integration)' % $.jobMatcher($._config.job_names.alertmanager),
7171
],
7272
['success - {{ integration }}', 'failed - {{ integration }}']
7373
)
7474
)
7575
.addPanel(
7676
$.panel('Latency') +
77-
$.latencyPanel('cortex_alertmanager_notification_latency_seconds', '{%s}' % $.jobMatcher('alertmanager'))
77+
$.latencyPanel('cortex_alertmanager_notification_latency_seconds', '{%s}' % $.jobMatcher($._config.job_names.alertmanager))
7878
)
7979
)
8080
.addRow(
@@ -96,23 +96,23 @@ local utils = import 'mixin-utils/utils.libsonnet';
9696
.addPanel(
9797
$.panel('Per %s Tenants' % $._config.per_instance_label) +
9898
$.queryPanel(
99-
'max by(%s) (cortex_alertmanager_tenants_owned{%s})' % [$._config.per_instance_label, $.jobMatcher('alertmanager')],
99+
'max by(%s) (cortex_alertmanager_tenants_owned{%s})' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.alertmanager)],
100100
'{{%s}}' % $._config.per_instance_label
101101
) +
102102
$.stack
103103
)
104104
.addPanel(
105105
$.panel('Per %s Alerts' % $._config.per_instance_label) +
106106
$.queryPanel(
107-
'sum by(%s) (cluster_job_%s:cortex_alertmanager_alerts:sum{%s})' % [$._config.per_instance_label, $._config.per_instance_label, $.jobMatcher('alertmanager')],
107+
'sum by(%s) (cluster_job_%s:cortex_alertmanager_alerts:sum{%s})' % [$._config.per_instance_label, $._config.per_instance_label, $.jobMatcher($._config.job_names.alertmanager)],
108108
'{{%s}}' % $._config.per_instance_label
109109
) +
110110
$.stack
111111
)
112112
.addPanel(
113113
$.panel('Per %s Silences' % $._config.per_instance_label) +
114114
$.queryPanel(
115-
'sum by(%s) (cluster_job_%s:cortex_alertmanager_silences:sum{%s})' % [$._config.per_instance_label, $._config.per_instance_label, $.jobMatcher('alertmanager')],
115+
'sum by(%s) (cluster_job_%s:cortex_alertmanager_silences:sum{%s})' % [$._config.per_instance_label, $._config.per_instance_label, $.jobMatcher($._config.job_names.alertmanager)],
116116
'{{%s}}' % $._config.per_instance_label
117117
) +
118118
$.stack
@@ -128,23 +128,23 @@ local utils = import 'mixin-utils/utils.libsonnet';
128128
sum(rate(cortex_alertmanager_sync_configs_total{%s}[$__rate_interval]))
129129
-
130130
sum(rate(cortex_alertmanager_sync_configs_failed_total{%s}[$__rate_interval]))
131-
||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')],
132-
'sum(rate(cortex_alertmanager_sync_configs_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'),
131+
||| % [$.jobMatcher($._config.job_names.alertmanager), $.jobMatcher($._config.job_names.alertmanager)],
132+
'sum(rate(cortex_alertmanager_sync_configs_failed_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.alertmanager),
133133
],
134134
['success', 'failed']
135135
)
136136
)
137137
.addPanel(
138138
$.panel('Syncs/sec (By Reason)') +
139139
$.queryPanel(
140-
'sum by(reason) (rate(cortex_alertmanager_sync_configs_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'),
140+
'sum by(reason) (rate(cortex_alertmanager_sync_configs_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.alertmanager),
141141
'{{reason}}'
142142
)
143143
)
144144
.addPanel(
145145
$.panel('Ring Check Errors/sec') +
146146
$.queryPanel(
147-
'sum (rate(cortex_alertmanager_ring_check_errors_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'),
147+
'sum (rate(cortex_alertmanager_ring_check_errors_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.alertmanager),
148148
'errors'
149149
)
150150
)
@@ -154,7 +154,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
154154
.addPanel(
155155
$.panel('Initial syncs /sec') +
156156
$.queryPanel(
157-
'sum by(outcome) (rate(cortex_alertmanager_state_initial_sync_completed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'),
157+
'sum by(outcome) (rate(cortex_alertmanager_state_initial_sync_completed_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.alertmanager),
158158
'{{outcome}}'
159159
) + {
160160
targets: [
@@ -167,7 +167,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
167167
)
168168
.addPanel(
169169
$.panel('Initial sync duration') +
170-
$.latencyPanel('cortex_alertmanager_state_initial_sync_duration_seconds', '{%s}' % $.jobMatcher('alertmanager')) + {
170+
$.latencyPanel('cortex_alertmanager_state_initial_sync_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.alertmanager)) + {
171171
targets: [
172172
target {
173173
interval: '1m',
@@ -184,8 +184,8 @@ local utils = import 'mixin-utils/utils.libsonnet';
184184
sum(rate(cortex_alertmanager_state_fetch_replica_state_total{%s}[$__rate_interval]))
185185
-
186186
sum(rate(cortex_alertmanager_state_fetch_replica_state_failed_total{%s}[$__rate_interval]))
187-
||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')],
188-
'sum(rate(cortex_alertmanager_state_fetch_replica_state_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'),
187+
||| % [$.jobMatcher($._config.job_names.alertmanager), $.jobMatcher($._config.job_names.alertmanager)],
188+
'sum(rate(cortex_alertmanager_state_fetch_replica_state_failed_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.alertmanager),
189189
],
190190
['success', 'failed']
191191
) + {
@@ -208,8 +208,8 @@ local utils = import 'mixin-utils/utils.libsonnet';
208208
sum(cluster_job:cortex_alertmanager_state_replication_total:rate5m{%s})
209209
-
210210
sum(cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m{%s})
211-
||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')],
212-
'sum(cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m{%s})' % $.jobMatcher('alertmanager'),
211+
||| % [$.jobMatcher($._config.job_names.alertmanager), $.jobMatcher($._config.job_names.alertmanager)],
212+
'sum(cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m{%s})' % $.jobMatcher($._config.job_names.alertmanager),
213213
],
214214
['success', 'failed']
215215
)
@@ -222,8 +222,8 @@ local utils = import 'mixin-utils/utils.libsonnet';
222222
sum(cluster_job:cortex_alertmanager_partial_state_merges_total:rate5m{%s})
223223
-
224224
sum(cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m{%s})
225-
||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')],
226-
'sum(cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m{%s})' % $.jobMatcher('alertmanager'),
225+
||| % [$.jobMatcher($._config.job_names.alertmanager), $.jobMatcher($._config.job_names.alertmanager)],
226+
'sum(cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m{%s})' % $.jobMatcher($._config.job_names.alertmanager),
227227
],
228228
['success', 'failed']
229229
)
@@ -236,8 +236,8 @@ local utils = import 'mixin-utils/utils.libsonnet';
236236
sum(rate(cortex_alertmanager_state_persist_total{%s}[$__rate_interval]))
237237
-
238238
sum(rate(cortex_alertmanager_state_persist_failed_total{%s}[$__rate_interval]))
239-
||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')],
240-
'sum(rate(cortex_alertmanager_state_persist_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'),
239+
||| % [$.jobMatcher($._config.job_names.alertmanager), $.jobMatcher($._config.job_names.alertmanager)],
240+
'sum(rate(cortex_alertmanager_state_persist_failed_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.alertmanager),
241241
],
242242
['success', 'failed']
243243
)

cortex-mixin/dashboards/dashboard-utils.libsonnet

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
88
// - default tags,
99
// - some links that propagate the selected cluster.
1010
dashboard(title)::
11-
super.dashboard(title) + {
11+
super.dashboard(title=title, datasource=$._config.dashboard_datasource) + {
1212
addRowIf(condition, row)::
1313
if condition
1414
then self.addRow(row)

cortex-mixin/dashboards/ruler.libsonnet

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -68,19 +68,19 @@ local utils = import 'mixin-utils/utils.libsonnet';
6868
})
6969
.addPanel(
7070
$.panel('Active Configurations') +
71-
$.statPanel('sum(cortex_ruler_managers_total{%s})' % $.jobMatcher('ruler'), format='short')
71+
$.statPanel('sum(cortex_ruler_managers_total{%s})' % $.jobMatcher($._config.job_names.ruler), format='short')
7272
)
7373
.addPanel(
7474
$.panel('Total Rules') +
75-
$.statPanel('sum(cortex_prometheus_rule_group_rules{%s})' % $.jobMatcher('ruler'), format='short')
75+
$.statPanel('sum(cortex_prometheus_rule_group_rules{%s})' % $.jobMatcher($._config.job_names.ruler), format='short')
7676
)
7777
.addPanel(
7878
$.panel('Read from Ingesters - QPS') +
79-
$.statPanel('sum(rate(cortex_ingester_client_request_duration_seconds_count{%s, operation="/cortex.Ingester/QueryStream"}[5m]))' % $.jobMatcher('ruler'), format='reqps')
79+
$.statPanel('sum(rate(cortex_ingester_client_request_duration_seconds_count{%s, operation="/cortex.Ingester/QueryStream"}[5m]))' % $.jobMatcher($._config.job_names.ruler), format='reqps')
8080
)
8181
.addPanel(
8282
$.panel('Write to Ingesters - QPS') +
83-
$.statPanel('sum(rate(cortex_ingester_client_request_duration_seconds_count{%s, operation="/cortex.Ingester/Push"}[5m]))' % $.jobMatcher('ruler'), format='reqps')
83+
$.statPanel('sum(rate(cortex_ingester_client_request_duration_seconds_count{%s, operation="/cortex.Ingester/Push"}[5m]))' % $.jobMatcher($._config.job_names.ruler), format='reqps')
8484
)
8585
)
8686
.addRow(
@@ -89,16 +89,16 @@ local utils = import 'mixin-utils/utils.libsonnet';
8989
$.panel('EPS') +
9090
$.queryPanel(
9191
[
92-
$.rulerQueries.ruleEvaluations.success % [$.jobMatcher('ruler'), $.jobMatcher('ruler')],
93-
$.rulerQueries.ruleEvaluations.failure % $.jobMatcher('ruler'),
92+
$.rulerQueries.ruleEvaluations.success % [$.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)],
93+
$.rulerQueries.ruleEvaluations.failure % $.jobMatcher($._config.job_names.ruler),
9494
],
9595
['success', 'failed'],
9696
),
9797
)
9898
.addPanel(
9999
$.panel('Latency') +
100100
$.queryPanel(
101-
$.rulerQueries.ruleEvaluations.latency % [$.jobMatcher('ruler'), $.jobMatcher('ruler')],
101+
$.rulerQueries.ruleEvaluations.latency % [$.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)],
102102
'average'
103103
),
104104
)
@@ -126,22 +126,22 @@ local utils = import 'mixin-utils/utils.libsonnet';
126126
$.row('Writes (Ingesters)')
127127
.addPanel(
128128
$.panel('QPS') +
129-
$.qpsPanel('cortex_ingester_client_request_duration_seconds_count{%s, operation="/cortex.Ingester/Push"}' % $.jobMatcher('ruler'))
129+
$.qpsPanel('cortex_ingester_client_request_duration_seconds_count{%s, operation="/cortex.Ingester/Push"}' % $.jobMatcher($._config.job_names.ruler))
130130
)
131131
.addPanel(
132132
$.panel('Latency') +
133-
$.latencyPanel('cortex_ingester_client_request_duration_seconds', '{%s, operation="/cortex.Ingester/Push"}' % $.jobMatcher('ruler'))
133+
$.latencyPanel('cortex_ingester_client_request_duration_seconds', '{%s, operation="/cortex.Ingester/Push"}' % $.jobMatcher($._config.job_names.ruler))
134134
)
135135
)
136136
.addRow(
137137
$.row('Reads (Ingesters)')
138138
.addPanel(
139139
$.panel('QPS') +
140-
$.qpsPanel('cortex_ingester_client_request_duration_seconds_count{%s, operation="/cortex.Ingester/QueryStream"}' % $.jobMatcher('ruler'))
140+
$.qpsPanel('cortex_ingester_client_request_duration_seconds_count{%s, operation="/cortex.Ingester/QueryStream"}' % $.jobMatcher($._config.job_names.ruler))
141141
)
142142
.addPanel(
143143
$.panel('Latency') +
144-
$.latencyPanel('cortex_ingester_client_request_duration_seconds', '{%s, operation="/cortex.Ingester/QueryStream"}' % $.jobMatcher('ruler'))
144+
$.latencyPanel('cortex_ingester_client_request_duration_seconds', '{%s, operation="/cortex.Ingester/QueryStream"}' % $.jobMatcher($._config.job_names.ruler))
145145
)
146146
)
147147
.addRowIf(
@@ -208,34 +208,34 @@ local utils = import 'mixin-utils/utils.libsonnet';
208208
$.row('Notifications')
209209
.addPanel(
210210
$.panel('Delivery Errors') +
211-
$.queryPanel($.rulerQueries.notifications.failure % [$.jobMatcher('ruler'), $.jobMatcher('ruler')], '{{ user }}')
211+
$.queryPanel($.rulerQueries.notifications.failure % [$.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)], '{{ user }}')
212212
)
213213
.addPanel(
214214
$.panel('Queue Length') +
215-
$.queryPanel($.rulerQueries.notifications.queue % [$.jobMatcher('ruler'), $.jobMatcher('ruler')], '{{ user }}')
215+
$.queryPanel($.rulerQueries.notifications.queue % [$.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)], '{{ user }}')
216216
)
217217
.addPanel(
218218
$.panel('Dropped') +
219-
$.queryPanel($.rulerQueries.notifications.dropped % $.jobMatcher('ruler'), '{{ user }}')
219+
$.queryPanel($.rulerQueries.notifications.dropped % $.jobMatcher($._config.job_names.ruler), '{{ user }}')
220220
)
221221
)
222222
.addRow(
223223
($.row('Group Evaluations') + { collapse: true })
224224
.addPanel(
225225
$.panel('Missed Iterations') +
226-
$.queryPanel($.rulerQueries.groupEvaluations.missedIterations % $.jobMatcher('ruler'), '{{ user }}'),
226+
$.queryPanel($.rulerQueries.groupEvaluations.missedIterations % $.jobMatcher($._config.job_names.ruler), '{{ user }}'),
227227
)
228228
.addPanel(
229229
$.panel('Latency') +
230230
$.queryPanel(
231-
$.rulerQueries.groupEvaluations.latency % [$.jobMatcher('ruler'), $.jobMatcher('ruler')],
231+
$.rulerQueries.groupEvaluations.latency % [$.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)],
232232
'{{ user }}'
233233
),
234234
)
235235
.addPanel(
236236
$.panel('Failures') +
237237
$.queryPanel(
238-
$.rulerQueries.perUserPerGroupEvaluations.failure % [$.jobMatcher('ruler')], '{{ rule_group }}'
238+
$.rulerQueries.perUserPerGroupEvaluations.failure % [$.jobMatcher($._config.job_names.ruler)], '{{ rule_group }}'
239239
)
240240
)
241241
)
@@ -244,7 +244,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
244244
.addPanel(
245245
$.panel('Latency') +
246246
$.queryPanel(
247-
$.rulerQueries.perUserPerGroupEvaluations.latency % [$.jobMatcher('ruler'), $.jobMatcher('ruler')],
247+
$.rulerQueries.perUserPerGroupEvaluations.latency % [$.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)],
248248
'{{ user }}'
249249
)
250250
)

0 commit comments

Comments
 (0)