@@ -17,6 +17,10 @@ local utils = import 'mixin-utils/utils.libsonnet';
17
17
$.panel('Total Silences' ) +
18
18
$.statPanel('sum(cortex_alertmanager_silences{%s})' % $.jobMatcher('alertmanager' ), format='short' )
19
19
)
20
.addPanel(
  // Stat panel showing the maximum of cortex_alertmanager_tenants_discovered
  // over the series selected by the alertmanager job matcher.
  local selector = $.jobMatcher('alertmanager');
  local tenantsQuery = 'max(cortex_alertmanager_tenants_discovered{%s})' % selector;

  $.panel('Tenants') + $.statPanel(tenantsQuery, format='short')
)
20
24
)
21
25
.addRow(
22
26
$.row('Alerts Received' )
@@ -86,5 +90,136 @@ local utils = import 'mixin-utils/utils.libsonnet';
86
90
)
87
91
.addRows(
  // Rows come from the shared getObjectStoreRows helper; the first argument is
  // the row title and the second is 'alertmanager-storage'.
  local storeTitle = 'Alertmanager Configuration Object Store (Alertmanager accesses)';

  $.getObjectStoreRows(storeTitle, 'alertmanager-storage')
)
94
.addRow(
  // "Replication" row: three stacked panels, each grouping one alertmanager
  // metric by the configured per-instance label.
  local instanceLabel = $._config.per_instance_label;
  local selector = $.jobMatcher('alertmanager');

  // Builds one stacked panel.
  //   titleFormat: panel title with a single %s for the per-instance label.
  //   queryFormat: query with two %s, filled with the per-instance label and
  //                the job selector, in that order.
  local perInstancePanel(titleFormat, queryFormat) =
    $.panel(titleFormat % instanceLabel) +
    $.queryPanel(
      queryFormat % [instanceLabel, selector],
      '{{%s}}' % instanceLabel
    ) +
    $.stack;

  $.row('Replication')
  .addPanel(
    perInstancePanel(
      'Per %s Tenants',
      'max by(%s) (cortex_alertmanager_tenants_owned{%s})'
    )
  )
  .addPanel(
    perInstancePanel(
      'Per %s Alerts',
      'sum by(%s) (cortex_alertmanager_alerts{%s})'
    )
  )
  .addPanel(
    perInstancePanel(
      'Per %s Silences',
      'sum by(%s) (cortex_alertmanager_silences{%s})'
    )
  )
)
121
.addRow(
  // "Tenant Configuration Sync" row: sync rate split into success/failed,
  // sync rate grouped by reason, and the ring check error rate.
  local selector = $.jobMatcher('alertmanager');

  local failedSyncs =
    'sum(rate(cortex_alertmanager_sync_configs_failed_total{%s}[$__rate_interval]))' % selector;

  // Successful syncs are derived as total syncs minus failed syncs.
  local successfulSyncs = |||
    sum(rate(cortex_alertmanager_sync_configs_total{%s}[$__rate_interval]))
    -
    sum(rate(cortex_alertmanager_sync_configs_failed_total{%s}[$__rate_interval]))
  ||| % [selector, selector];

  local syncsByReason =
    'sum by(reason) (rate(cortex_alertmanager_sync_configs_total{%s}[$__rate_interval]))' % selector;

  local ringCheckErrors =
    'sum (rate(cortex_alertmanager_ring_check_errors_total{%s}[$__rate_interval]))' % selector;

  $.row('Tenant Configuration Sync')
  .addPanel(
    $.panel('Syncs/sec') +
    $.queryPanel([successfulSyncs, failedSyncs], ['success', 'failed'])
  )
  .addPanel(
    $.panel('Syncs/sec (By Reason)') +
    $.queryPanel(syncsByReason, '{{reason}}')
  )
  .addPanel(
    $.panel('Ring Check Errors/sec') +
    $.queryPanel(ringCheckErrors, 'errors')
  )
)
152
.addRow(
  // "Sharding Initial State Sync" row: completed initial syncs grouped by
  // outcome, initial sync duration, and replica state fetches split into
  // success/failed.
  local selector = $.jobMatcher('alertmanager');

  local completedByOutcome =
    'sum by(outcome) (rate(cortex_alertmanager_state_initial_sync_completed_total{%s}[$__rate_interval]))' % selector;

  local failedFetches =
    'sum(rate(cortex_alertmanager_state_fetch_replica_state_failed_total{%s}[$__rate_interval]))' % selector;

  // Successful fetches are derived as total fetches minus failed fetches.
  local successfulFetches = |||
    sum(rate(cortex_alertmanager_state_fetch_replica_state_total{%s}[$__rate_interval]))
    -
    sum(rate(cortex_alertmanager_state_fetch_replica_state_failed_total{%s}[$__rate_interval]))
  ||| % [selector, selector];

  $.row('Sharding Initial State Sync')
  .addPanel(
    $.panel('Initial syncs/sec') +
    $.queryPanel(completedByOutcome, '{{outcome}}')
  )
  .addPanel(
    $.panel('Initial sync duration') +
    $.latencyPanel('cortex_alertmanager_state_initial_sync_duration_seconds', '{%s}' % selector)
  )
  .addPanel(
    $.panel('Fetch state from other alertmanagers /sec') +
    $.queryPanel([successfulFetches, failedFetches], ['success', 'failed'])
  )
)
180
.addRow(
  // "Sharding Runtime State Sync" row: three panels, each plotting a
  // success series (total minus failed) next to the failed series.
  local selector = $.jobMatcher('alertmanager');

  // Builds one success/failed panel.
  //   title:              panel title.
  //   successQueryFormat: query with two %s, both filled with the job selector.
  //   failedQueryFormat:  query with one %s, filled with the job selector.
  local successFailedPanel(title, successQueryFormat, failedQueryFormat) =
    $.panel(title) +
    $.queryPanel(
      [
        successQueryFormat % [selector, selector],
        failedQueryFormat % selector,
      ],
      ['success', 'failed']
    );

  $.row('Sharding Runtime State Sync')
  .addPanel(
    successFailedPanel(
      'Replicate state to other alertmanagers /sec',
      |||
        sum(rate(cortex_alertmanager_state_replication_total{%s}[$__rate_interval]))
        -
        sum(rate(cortex_alertmanager_state_replication_failed_total{%s}[$__rate_interval]))
      |||,
      'sum(rate(cortex_alertmanager_state_replication_failed_total{%s}[$__rate_interval]))'
    )
  )
  .addPanel(
    successFailedPanel(
      'Merge state from other alertmanagers /sec',
      |||
        sum(rate(cortex_alertmanager_partial_state_merges_total{%s}[$__rate_interval]))
        -
        sum(rate(cortex_alertmanager_partial_state_merges_failed_total{%s}[$__rate_interval]))
      |||,
      'sum(rate(cortex_alertmanager_partial_state_merges_failed_total{%s}[$__rate_interval]))'
    )
  )
  .addPanel(
    successFailedPanel(
      'Persist state to remote storage /sec',
      |||
        sum(rate(cortex_alertmanager_state_persist_total{%s}[$__rate_interval]))
        -
        sum(rate(cortex_alertmanager_state_persist_failed_total{%s}[$__rate_interval]))
      |||,
      'sum(rate(cortex_alertmanager_state_persist_failed_total{%s}[$__rate_interval]))'
    )
  )
),
90
225
}
0 commit comments