Skip to content
This repository was archived by the owner on Apr 28, 2025. It is now read-only.

Commit dccf32a

Browse files
authored
Merge pull request #365 from grafana/fix-scaling-dashboard-for-multi-zone-deployments
Fix scaling dashboard to work on multi-zone ingesters
2 parents 8c2d6c0 + 28561cb commit dccf32a

File tree

2 files changed

+59
-16
lines changed

2 files changed

+59
-16
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
* [BUGFIX] Fixed `CortexIngesterHasNotShippedBlocks` alert false positive in case an ingester instance had ingested samples in the past, then no traffic was received for a long period and then it started receiving samples again. #308
3636
* [BUGFIX] Alertmanager: fixed `--alertmanager.cluster.peers` CLI flag passed to alertmanager when HA is enabled. #329
3737
* [BUGFIX] Fixed `CortexInconsistentRuntimeConfig` metric. #335
38+
* [BUGFIX] Fixed scaling dashboard to work correctly when a Cortex service deployment spans multiple zones (each per-zone deployment name is expected to end with the `-zone-[a-z]` suffix). #365
3839

3940
## 1.9.0 / 2021-05-18
4041

cortex-mixin/recording_rules.libsonnet

Lines changed: 58 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -69,12 +69,20 @@ local utils = import 'mixin-utils/utils.libsonnet';
6969
rules: [
7070
{
7171
// Convenience rule to get the number of replicas for both a deployment and a statefulset.
72+
// Multi-zone deployments are grouped together by removing the "-zone-X" suffix.
7273
record: 'cluster_namespace_deployment:actual_replicas:count',
7374
expr: |||
74-
sum by (cluster, namespace, deployment) (kube_deployment_spec_replicas)
75-
or
7675
sum by (cluster, namespace, deployment) (
77-
label_replace(kube_statefulset_replicas, "deployment", "$1", "statefulset", "(.*)")
76+
label_replace(
77+
kube_deployment_spec_replicas,
78+
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
79+
# always matches everything and the (optional) zone is not removed.
80+
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
81+
)
82+
)
83+
or
84+
sum by (cluster, namespace, deployment) (
85+
label_replace(kube_statefulset_replicas, "deployment", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")
7886
)
7987
|||,
8088
},
@@ -188,7 +196,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
188196
expr: |||
189197
ceil(
190198
(sum by (cluster, namespace) (
191-
cortex_ingester_tsdb_storage_blocks_bytes{job=~".+/ingester"}
199+
cortex_ingester_tsdb_storage_blocks_bytes{job=~".+/ingester.*"}
192200
) / 4)
193201
/
194202
avg by (cluster, namespace) (
@@ -199,18 +207,25 @@ local utils = import 'mixin-utils/utils.libsonnet';
199207
},
200208
{
201209
// Convenience rule to get the CPU utilization for both a deployment and a statefulset.
210+
// Multi-zone deployments are grouped together by removing the "-zone-X" suffix.
202211
record: 'cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate',
203212
expr: |||
204213
sum by (cluster, namespace, deployment) (
205214
label_replace(
206-
node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate,
207-
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
215+
label_replace(
216+
node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate,
217+
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
218+
),
219+
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
220+
# always matches everything and the (optional) zone is not removed.
221+
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
208222
)
209223
)
210224
|||,
211225
},
212226
{
213227
// Convenience rule to get the CPU request for both a deployment and a statefulset.
228+
// Multi-zone deployments are grouped together by removing the "-zone-X" suffix.
214229
record: 'cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum',
215230
expr: |||
216231
# This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2
@@ -223,8 +238,13 @@ local utils = import 'mixin-utils/utils.libsonnet';
223238
(
224239
sum by (cluster, namespace, deployment) (
225240
label_replace(
226-
kube_pod_container_resource_requests_cpu_cores,
227-
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
241+
label_replace(
242+
kube_pod_container_resource_requests_cpu_cores,
243+
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
244+
),
245+
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
246+
# always matches everything and the (optional) zone is not removed.
247+
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
228248
)
229249
)
230250
)
@@ -234,8 +254,13 @@ local utils = import 'mixin-utils/utils.libsonnet';
234254
(
235255
sum by (cluster, namespace, deployment) (
236256
label_replace(
237-
kube_pod_container_resource_requests{resource="cpu"},
238-
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
257+
label_replace(
258+
kube_pod_container_resource_requests{resource="cpu"},
259+
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
260+
),
261+
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
262+
# always matches everything and the (optional) zone is not removed.
263+
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
239264
)
240265
)
241266
)
@@ -261,18 +286,25 @@ local utils = import 'mixin-utils/utils.libsonnet';
261286
},
262287
{
263288
// Convenience rule to get the Memory utilization for both a deployment and a statefulset.
289+
// Multi-zone deployments are grouped together by removing the "-zone-X" suffix.
264290
record: 'cluster_namespace_deployment:container_memory_usage_bytes:sum',
265291
expr: |||
266292
sum by (cluster, namespace, deployment) (
267293
label_replace(
268-
container_memory_usage_bytes,
269-
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
294+
label_replace(
295+
container_memory_usage_bytes,
296+
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
297+
),
298+
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
299+
# always matches everything and the (optional) zone is not removed.
300+
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
270301
)
271302
)
272303
|||,
273304
},
274305
{
275306
// Convenience rule to get the Memory request for both a deployment and a statefulset.
307+
// Multi-zone deployments are grouped together by removing the "-zone-X" suffix.
276308
record: 'cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum',
277309
expr: |||
278310
# This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2
@@ -285,8 +317,13 @@ local utils = import 'mixin-utils/utils.libsonnet';
285317
(
286318
sum by (cluster, namespace, deployment) (
287319
label_replace(
288-
kube_pod_container_resource_requests_memory_bytes,
289-
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
320+
label_replace(
321+
kube_pod_container_resource_requests_memory_bytes,
322+
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
323+
),
324+
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
325+
# always matches everything and the (optional) zone is not removed.
326+
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
290327
)
291328
)
292329
)
@@ -296,8 +333,13 @@ local utils = import 'mixin-utils/utils.libsonnet';
296333
(
297334
sum by (cluster, namespace, deployment) (
298335
label_replace(
299-
kube_pod_container_resource_requests{resource="memory"},
300-
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
336+
label_replace(
337+
kube_pod_container_resource_requests{resource="memory"},
338+
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
339+
),
340+
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
341+
# always matches everything and the (optional) zone is not removed.
342+
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
301343
)
302344
)
303345
)

0 commit comments

Comments
 (0)