Skip to content
This repository was archived by the owner on Apr 28, 2025. It is now read-only.

Commit ee591ee

Browse files
authored
Merge pull request #313 from stevesg/alertmanager-sharding
Extend Alertmanager dashboard with currently unused metrics.
2 parents a337270 + 629d288 commit ee591ee

File tree

2 files changed

+141
-0
lines changed

2 files changed

+141
-0
lines changed

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,12 @@
3232
* [ENHANCEMENT] Ruler dashboard: added "Per route p99 latency" panel in the "Configuration API" row. #353
3333
* [ENHANCEMENT] Increased the `for` duration of the `CortexIngesterReachingSeriesLimit` warning alert to 3h. #362
3434
* [ENHANCEMENT] Added a new tier (`medium_small_user`) so we have another tier between 100K and 1Mil active series. #364
35+
* [ENHANCEMENT] Extend Alertmanager dashboard: #313
36+
* "Tenants" stat panel - shows number of discovered tenant configurations.
37+
* "Replication" row - information about the replication of tenants/alerts/silences over instances.
38+
* "Tenant Configuration Sync" row - information about the configuration sync procedure.
39+
* "Sharding Initial State Sync" row - information about the initial state sync procedure when sharding is enabled.
40+
* "Sharding Runtime State Sync" row - information about various state operations which occur when sharding is enabled (replication, fetch, merge, persist).
3541
* [BUGFIX] Fixed `CortexIngesterHasNotShippedBlocks` alert false positive in case an ingester instance had ingested samples in the past, then no traffic was received for a long period and then it started receiving samples again. #308
3642
* [BUGFIX] Alertmanager: fixed `--alertmanager.cluster.peers` CLI flag passed to alertmanager when HA is enabled. #329
3743
* [BUGFIX] Fixed `CortexInconsistentRuntimeConfig` metric. #335

cortex-mixin/dashboards/alertmanager.libsonnet

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,10 @@ local utils = import 'mixin-utils/utils.libsonnet';
1717
$.panel('Total Silences') +
1818
$.statPanel('sum(cortex_alertmanager_silences{%s})' % $.jobMatcher('alertmanager'), format='short')
1919
)
20+
.addPanel(
21+
$.panel('Tenants') +
22+
$.statPanel('max(cortex_alertmanager_tenants_discovered{%s})' % $.jobMatcher('alertmanager'), format='short')
23+
)
2024
)
2125
.addRow(
2226
$.row('Alerts Received')
@@ -86,5 +90,136 @@ local utils = import 'mixin-utils/utils.libsonnet';
8690
)
8791
.addRows(
8892
$.getObjectStoreRows('Alertmanager Configuration Object Store (Alertmanager accesses)', 'alertmanager-storage')
93+
)
94+
.addRow(
95+
$.row('Replication')
96+
.addPanel(
97+
$.panel('Per %s Tenants' % $._config.per_instance_label) +
98+
$.queryPanel(
99+
'max by(%s) (cortex_alertmanager_tenants_owned{%s})' % [$._config.per_instance_label, $.jobMatcher('alertmanager')],
100+
'{{%s}}' % $._config.per_instance_label
101+
) +
102+
$.stack
103+
)
104+
.addPanel(
105+
$.panel('Per %s Alerts' % $._config.per_instance_label) +
106+
$.queryPanel(
107+
'sum by(%s) (cortex_alertmanager_alerts{%s})' % [$._config.per_instance_label, $.jobMatcher('alertmanager')],
108+
'{{%s}}' % $._config.per_instance_label
109+
) +
110+
$.stack
111+
)
112+
.addPanel(
113+
$.panel('Per %s Silences' % $._config.per_instance_label) +
114+
$.queryPanel(
115+
'sum by(%s) (cortex_alertmanager_silences{%s})' % [$._config.per_instance_label, $.jobMatcher('alertmanager')],
116+
'{{%s}}' % $._config.per_instance_label
117+
) +
118+
$.stack
119+
)
120+
)
121+
.addRow(
122+
$.row('Tenant Configuration Sync')
123+
.addPanel(
124+
$.panel('Syncs/sec') +
125+
$.queryPanel(
126+
[
127+
|||
128+
sum(rate(cortex_alertmanager_sync_configs_total{%s}[$__rate_interval]))
129+
-
130+
sum(rate(cortex_alertmanager_sync_configs_failed_total{%s}[$__rate_interval]))
131+
||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')],
132+
'sum(rate(cortex_alertmanager_sync_configs_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'),
133+
],
134+
['success', 'failed']
135+
)
136+
)
137+
.addPanel(
138+
$.panel('Syncs/sec (By Reason)') +
139+
$.queryPanel(
140+
'sum by(reason) (rate(cortex_alertmanager_sync_configs_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'),
141+
'{{reason}}'
142+
)
143+
)
144+
.addPanel(
145+
$.panel('Ring Check Errors/sec') +
146+
$.queryPanel(
147+
'sum (rate(cortex_alertmanager_ring_check_errors_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'),
148+
'errors'
149+
)
150+
)
151+
)
152+
.addRow(
153+
$.row('Sharding Initial State Sync')
154+
.addPanel(
155+
$.panel('Initial syncs/sec') +
156+
$.queryPanel(
157+
'sum by(outcome) (rate(cortex_alertmanager_state_initial_sync_completed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'),
158+
'{{outcome}}'
159+
)
160+
)
161+
.addPanel(
162+
$.panel('Initial sync duration') +
163+
$.latencyPanel('cortex_alertmanager_state_initial_sync_duration_seconds', '{%s}' % $.jobMatcher('alertmanager'))
164+
)
165+
.addPanel(
166+
$.panel('Fetch state from other alertmanagers /sec') +
167+
$.queryPanel(
168+
[
169+
|||
170+
sum(rate(cortex_alertmanager_state_fetch_replica_state_total{%s}[$__rate_interval]))
171+
-
172+
sum(rate(cortex_alertmanager_state_fetch_replica_state_failed_total{%s}[$__rate_interval]))
173+
||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')],
174+
'sum(rate(cortex_alertmanager_state_fetch_replica_state_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'),
175+
],
176+
['success', 'failed']
177+
)
178+
)
179+
)
180+
.addRow(
181+
$.row('Sharding Runtime State Sync')
182+
.addPanel(
183+
$.panel('Replicate state to other alertmanagers /sec') +
184+
$.queryPanel(
185+
[
186+
|||
187+
sum(rate(cortex_alertmanager_state_replication_total{%s}[$__rate_interval]))
188+
-
189+
sum(rate(cortex_alertmanager_state_replication_failed_total{%s}[$__rate_interval]))
190+
||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')],
191+
'sum(rate(cortex_alertmanager_state_replication_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'),
192+
],
193+
['success', 'failed']
194+
)
195+
)
196+
.addPanel(
197+
$.panel('Merge state from other alertmanagers /sec') +
198+
$.queryPanel(
199+
[
200+
|||
201+
sum(rate(cortex_alertmanager_partial_state_merges_total{%s}[$__rate_interval]))
202+
-
203+
sum(rate(cortex_alertmanager_partial_state_merges_failed_total{%s}[$__rate_interval]))
204+
||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')],
205+
'sum(rate(cortex_alertmanager_partial_state_merges_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'),
206+
],
207+
['success', 'failed']
208+
)
209+
)
210+
.addPanel(
211+
$.panel('Persist state to remote storage /sec') +
212+
$.queryPanel(
213+
[
214+
|||
215+
sum(rate(cortex_alertmanager_state_persist_total{%s}[$__rate_interval]))
216+
-
217+
sum(rate(cortex_alertmanager_state_persist_failed_total{%s}[$__rate_interval]))
218+
||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')],
219+
'sum(rate(cortex_alertmanager_state_persist_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'),
220+
],
221+
['success', 'failed']
222+
)
223+
)
89224
),
90225
}

0 commit comments

Comments
 (0)