This repository was archived by the owner on Apr 28, 2025. It is now read-only.

Commit 9b04c90

Merge pull request #278 from grafana/scaling-rules

Add recording rules to calculate Cortex scaling

2 parents 5cf0c4f + fbf3f98

File tree

4 files changed: +249 -99 lines

cortex-mixin/dashboards/scaling.libsonnet

Lines changed: 37 additions & 88 deletions
@@ -6,105 +6,54 @@ local utils = import 'mixin-utils/utils.libsonnet';
   ($.dashboard('Cortex / Scaling') + { uid: '88c041017b96856c9176e07cf557bdcf' })
   .addClusterSelectorTemplates()
   .addRow(
-    $.row('Workload-based scaling')
-    .addPanel(
-      $.panel('Workload-based scaling') + { sort: { col: 1, desc: false } } +
-      $.tablePanel([
-        |||
-          sum by (cluster, namespace, deployment) (
-            kube_deployment_spec_replicas{cluster=~"$cluster", namespace=~"$namespace", deployment=~"ingester|memcached"}
-            or
-            label_replace(
-              kube_statefulset_replicas{cluster=~"$cluster", namespace=~"$namespace", deployment=~"ingester|memcached"},
-              "deployment", "$1", "statefulset", "(.*)"
-            )
-          )
-        |||,
-        |||
-          quantile_over_time(0.99, sum by (cluster, namespace, deployment) (label_replace(rate(cortex_distributor_received_samples_total{cluster=~"$cluster", namespace=~"$namespace"}[1m]), "deployment", "ingester", "cluster", ".*"))[1h:])
-            * 3 / 80e3
-        |||,
-        |||
-          label_replace(
-            sum by(cluster, namespace) (
-              cortex_ingester_memory_series{cluster=~"$cluster", namespace=~"$namespace"}
-            ) / 1e+6,
-            "deployment", "ingester", "cluster", ".*"
-          )
-          or
-          label_replace(
-            sum by (cluster, namespace) (
-              4 * cortex_ingester_memory_series{cluster=~"$cluster", namespace=~"$namespace", job=~".+/ingester"}
-              *
-              cortex_ingester_chunk_size_bytes_sum{cluster=~"$cluster", namespace=~"$namespace", job=~".+/ingester"}
-              /
-              cortex_ingester_chunk_size_bytes_count{cluster=~"$cluster", namespace=~"$namespace", job=~".+/ingester"}
-            )
-            /
-            avg by (cluster, namespace) (memcached_limit_bytes{cluster=~"$cluster", namespace=~"$namespace", job=~".+/memcached"}),
-            "deployment", "memcached", "namespace", ".*"
-          )
+    ($.row('Cortex Service Scaling') + { height: '200px' })
+    .addPanel({
+      type: 'text',
+      title: '',
+      options: {
+        content: |||
+          This dashboard shows any services which are not scaled correctly.
+          The table below gives the required number of replicas and the reason why.
+          We only show services without enough replicas.
+
+          Reasons:
+          - **sample_rate**: There are not enough replicas to handle the
+            sample rate. Applies to distributors and ingesters.
+          - **active_series**: There are not enough replicas
+            to handle the number of active series. Applies to ingesters.
+          - **cpu_usage**: There are not enough replicas
+            based on the CPU usage of the jobs vs the resource requests.
+            Applies to all jobs.
+          - **memory_usage**: There are not enough replicas based on the memory
+            usage vs the resource requests. Applies to all jobs.
+          - **active_series_limits**: There are not enough replicas to hold 60% of the
+            sum of all the per-tenant series limits.
+          - **sample_rate_limits**: There are not enough replicas to handle 60% of the
+            sum of all the per-tenant rate limits.
         |||,
-      ], {
-        cluster: { alias: 'Cluster' },
-        namespace: { alias: 'Namespace' },
-        deployment: { alias: 'Deployment' },
-        'Value #A': { alias: 'Current Replicas', decimals: 0 },
-        'Value #B': { alias: 'Required Replicas, by ingestion rate', decimals: 0 },
-        'Value #C': { alias: 'Required Replicas, by active series', decimals: 0 },
-      })
-    )
+        mode: 'markdown',
+      },
+    })
   )
   .addRow(
-    ($.row('Resource-based scaling') + { height: '500px' })
+    ($.row('Scaling') + { height: '400px' })
     .addPanel(
-      $.panel('Resource-based scaling') + { sort: { col: 1, desc: false } } +
+      $.panel('Workload-based scaling') + { sort: { col: 0, desc: false } } +
       $.tablePanel([
         |||
-          sum by (cluster, namespace, deployment) (
-            kube_deployment_spec_replicas{cluster=~"$cluster", namespace=~"$namespace"}
-            or
-            label_replace(
-              kube_statefulset_replicas{cluster=~"$cluster", namespace=~"$namespace"},
-              "deployment", "$1", "statefulset", "(.*)"
-            )
-          )
-        |||,
-        |||
-          sum by (cluster, namespace, deployment) (
-            kube_deployment_spec_replicas{cluster=~"$cluster", namespace=~"$namespace"}
-            or
-            label_replace(
-              kube_statefulset_replicas{cluster=~"$cluster", namespace=~"$namespace"},
-              "deployment", "$1", "statefulset", "(.*)"
-            )
-          )
-          *
-          quantile_over_time(0.99, sum by (cluster, namespace, deployment) (label_replace(rate(container_cpu_usage_seconds_total{cluster=~"$cluster", namespace=~"$namespace"}[1m]), "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"))[24h:])
-          /
-          sum by (cluster, namespace, deployment) (label_replace(kube_pod_container_resource_requests_cpu_cores{cluster=~"$cluster", namespace=~"$namespace"}, "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"))
-        |||,
-        |||
-          sum by (cluster, namespace, deployment) (
-            kube_deployment_spec_replicas{cluster=~"$cluster", namespace=~"$namespace"}
-            or
-            label_replace(
-              kube_statefulset_replicas{cluster=~"$cluster", namespace=~"$namespace"},
-              "deployment", "$1", "statefulset", "(.*)"
-            )
+          sort_desc(
+            cluster_namespace_deployment_reason:required_replicas:count{cluster=~"$cluster", namespace=~"$namespace"}
+              > ignoring(reason) group_left
+            cluster_namespace_deployment:actual_replicas:count{cluster=~"$cluster", namespace=~"$namespace"}
           )
-          *
-          quantile_over_time(0.99, sum by (cluster, namespace, deployment) (label_replace(container_memory_usage_bytes{cluster=~"$cluster", namespace=~"$namespace"}, "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"))[24h:1m])
-          /
-          sum by (cluster, namespace, deployment) (label_replace(kube_pod_container_resource_requests_memory_bytes{cluster=~"$cluster", namespace=~"$namespace"}, "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"))
         |||,
       ], {
+        '__name__': { alias: 'Cluster', type: 'hidden' },
         cluster: { alias: 'Cluster' },
         namespace: { alias: 'Namespace' },
-        deployment: { alias: 'Deployment' },
-        'Value #A': { alias: 'Current Replicas', decimals: 0 },
-        'Value #B': { alias: 'Required Replicas, by CPU usage', decimals: 0 },
-        'Value #C': { alias: 'Required Replicas, by Memory usage', decimals: 0 },
+        deployment: { alias: 'Service' },
+        reason: { alias: 'Reason' },
+        'Value': { alias: 'Required Replicas', decimals: 0 },
       })
     )
   ),
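A note on the matching in the new table query: the left-hand recording rule carries an extra reason label, so the comparison is many-to-one. ignoring(reason) group_left makes that match legal, and > acts as a filter that keeps only left-hand series whose required replica count exceeds the actual count; sort_desc then puts the most under-provisioned services first. A minimal PromQL sketch of the behaviour, with invented sample values:

    # Hypothetical input series, for illustration only:
    #   required (left):  {deployment="ingester", reason="sample_rate"}   38
    #                     {deployment="ingester", reason="active_series"} 27
    #   actual (right):   {deployment="ingester"}                         30
    cluster_namespace_deployment_reason:required_replicas:count
      > ignoring(reason) group_left
    cluster_namespace_deployment:actual_replicas:count
    # Only the under-provisioned row survives, with its reason label intact:
    #   {deployment="ingester", reason="sample_rate"} 38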

cortex-mixin/recording_rules.libsonnet

Lines changed: 201 additions & 0 deletions
@@ -58,6 +58,207 @@ local utils = import 'mixin-utils/utils.libsonnet';
           },
         ],
       },
+      {
+        local _config = {
+          max_series_per_ingester: 1.5e6,
+          max_samples_per_sec_per_ingester: 80e3,
+          max_samples_per_sec_per_distributor: 240e3,
+          limit_utilisation_target: 0.6,
+        },
+        name: 'cortex_scaling_rules',
+        rules: [
+          {
+            // Convenience rule to get the number of replicas for both a deployment and a statefulset.
+            record: 'cluster_namespace_deployment:actual_replicas:count',
+            expr: |||
+              sum by (cluster, namespace, deployment) (kube_deployment_spec_replicas)
+              or
+              sum by (cluster, namespace, deployment) (
+                label_replace(kube_statefulset_replicas, "deployment", "$1", "statefulset", "(.*)")
+              )
+            |||,
+          },
+          {
+            // Distributors should be able to deal with 240k samples/s.
+            record: 'cluster_namespace_deployment_reason:required_replicas:count',
+            labels: {
+              deployment: 'distributor',
+              reason: 'sample_rate',
+            },
+            expr: |||
+              ceil(
+                quantile_over_time(0.99,
+                  sum by (cluster, namespace) (
+                    cluster_namespace_job:cortex_distributor_received_samples:rate5m
+                  )[24h:]
+                )
+                / %(max_samples_per_sec_per_distributor)s
+              )
+            ||| % _config,
+          },
+          {
+            // We should be able to cover 60% of our limits,
+            // and each distributor can handle 240k samples/s.
+            record: 'cluster_namespace_deployment_reason:required_replicas:count',
+            labels: {
+              deployment: 'distributor',
+              reason: 'sample_rate_limits',
+            },
+            expr: |||
+              ceil(
+                sum by (cluster, namespace) (cortex_overrides{limit_name="ingestion_rate"})
+                * %(limit_utilisation_target)s / %(max_samples_per_sec_per_distributor)s
+              )
+            ||| % _config,
+          },
+          {
+            // We want each ingester to deal with 80k samples/s.
+            // NB we measure this at the distributors and multiply by the replication factor (3).
+            record: 'cluster_namespace_deployment_reason:required_replicas:count',
+            labels: {
+              deployment: 'ingester',
+              reason: 'sample_rate',
+            },
+            expr: |||
+              ceil(
+                quantile_over_time(0.99,
+                  sum by (cluster, namespace) (
+                    cluster_namespace_job:cortex_distributor_received_samples:rate5m
+                  )[24h:]
+                )
+                * 3 / %(max_samples_per_sec_per_ingester)s
+              )
+            ||| % _config,
+          },
+          {
+            // Each ingester should have at most 1.5M series in memory.
+            record: 'cluster_namespace_deployment_reason:required_replicas:count',
+            labels: {
+              deployment: 'ingester',
+              reason: 'active_series',
+            },
+            expr: |||
+              ceil(
+                quantile_over_time(0.99,
+                  sum by (cluster, namespace) (
+                    cortex_ingester_memory_series
+                  )[24h:]
+                )
+                / %(max_series_per_ingester)s
+              )
+            ||| % _config,
+          },
+          {
+            // We should be able to cover 60% of our limits,
+            // and each ingester can hold 1.5M series in memory.
+            record: 'cluster_namespace_deployment_reason:required_replicas:count',
+            labels: {
+              deployment: 'ingester',
+              reason: 'active_series_limits',
+            },
+            expr: |||
+              ceil(
+                sum by (cluster, namespace) (cortex_overrides{limit_name="max_global_series_per_user"})
+                * 3 * %(limit_utilisation_target)s / %(max_series_per_ingester)s
+              )
+            ||| % _config,
+          },
+          {
+            // We should be able to cover 60% of our limits,
+            // and each ingester can handle 80k samples/s.
+            record: 'cluster_namespace_deployment_reason:required_replicas:count',
+            labels: {
+              deployment: 'ingester',
+              reason: 'sample_rate_limits',
+            },
+            expr: |||
+              ceil(
+                sum by (cluster, namespace) (cortex_overrides{limit_name="ingestion_rate"})
+                * %(limit_utilisation_target)s / %(max_samples_per_sec_per_ingester)s
+              )
+            ||| % _config,
+          },
+          {
+            // Ingesters store 96h of data on disk - we want memcached to store 1/4 of that.
+            record: 'cluster_namespace_deployment_reason:required_replicas:count',
+            labels: {
+              deployment: 'memcached',
+              reason: 'active_series',
+            },
+            expr: |||
+              ceil(
+                (sum by (cluster, namespace) (
+                  cortex_ingester_tsdb_storage_blocks_bytes{job=~".+/ingester"}
+                ) / 4)
+                /
+                avg by (cluster, namespace) (
+                  memcached_limit_bytes{job=~".+/memcached"}
+                )
+              )
+            |||,
+          },
+          {
+            // Jobs should be sized to their CPU usage.
+            // We do this by comparing 99th percentile usage over the last 24hrs to
+            // their current provisioned #replicas and resource requests.
+            record: 'cluster_namespace_deployment_reason:required_replicas:count',
+            labels: {
+              reason: 'cpu_usage',
+            },
+            expr: |||
+              ceil(
+                cluster_namespace_deployment:actual_replicas:count
+                *
+                quantile_over_time(0.99,
+                  sum by (cluster, namespace, deployment) (
+                    label_replace(
+                      node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate,
+                      "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
+                    )
+                  )[24h:5m]
+                )
+                /
+                sum by (cluster, namespace, deployment) (
+                  label_replace(
+                    kube_pod_container_resource_requests_cpu_cores,
+                    "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
+                  )
+                )
+              )
+            |||,
+          },
+          {
+            // Jobs should be sized to their memory usage.
+            // We do this by comparing 99th percentile usage over the last 24hrs to
+            // their current provisioned #replicas and resource requests.
+            record: 'cluster_namespace_deployment_reason:required_replicas:count',
+            labels: {
+              reason: 'memory_usage',
+            },
+            expr: |||
+              ceil(
+                cluster_namespace_deployment:actual_replicas:count
+                *
+                quantile_over_time(0.99,
+                  sum by (cluster, namespace, deployment) (
+                    label_replace(
+                      container_memory_usage_bytes,
+                      "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
+                    )
+                  )[24h:5m]
+                )
+                /
+                sum by (cluster, namespace, deployment) (
+                  label_replace(
+                    kube_pod_container_resource_requests_memory_bytes,
+                    "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
+                  )
+                )
+              )
+            |||,
+          },
+        ],
+      },
     ],
   },
 }
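To make the sizing arithmetic concrete, here is a worked example against the _config defaults above; the traffic figures are invented for illustration. Suppose the 99th-percentile ingest rate over 24h is 1,000,000 samples/s and the 99th-percentile in-memory series count is 40,000,000:

    distributor, sample_rate:  ceil(1e6 / 240e3)    = ceil(4.17)  =  5 replicas
    ingester, sample_rate:     ceil(1e6 * 3 / 80e3) = ceil(37.5)  = 38 replicas  (x3 for replication)
    ingester, active_series:   ceil(40e6 / 1.5e6)   = ceil(26.67) = 27 replicas

Each rule records its result as cluster_namespace_deployment_reason:required_replicas:count with its own reason label; the dashboard above then surfaces any service whose actual replica count falls below any of these figures.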

cortex/ingester.libsonnet

Lines changed: 1 addition & 1 deletion
@@ -43,7 +43,7 @@
   ),

   ingester_statefulset_args::
-    $._config.grpcConfig +
+    $._config.grpcConfig
     {
       'ingester.wal-enabled': true,
       'ingester.checkpoint-enabled': true,
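The only change here (mirrored in cortex/query-frontend.libsonnet below) is dropping the explicit +: in Jsonnet, an object literal that directly follows an expression composes with it implicitly, so both spellings yield the same object. A self-contained sketch, with invented field names:

    // implicit_plus.jsonnet - 'base { ... }' is sugar for 'base + { ... }'
    local grpcConfig = { 'grpc-max-recv-msg-size': 104857600 };

    local explicit = grpcConfig + { 'ingester.wal-enabled': true };
    local implicit = grpcConfig { 'ingester.wal-enabled': true };

    // Jsonnet compares objects structurally, so this assertion holds.
    assert explicit == implicit;
    { args: implicit }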

cortex/query-frontend.libsonnet

Lines changed: 10 additions & 10 deletions
@@ -2,7 +2,7 @@
   local container = $.core.v1.container,

   query_frontend_args::
-    $._config.grpcConfig +
+    $._config.grpcConfig
     {
       target: 'query-frontend',

@@ -38,17 +38,17 @@
       'limits.per-user-override-config': '/etc/cortex/overrides.yaml',
     } + (
       if $._config.queryFrontend.sharded_queries_enabled then
-      {
-        'querier.parallelise-shardable-queries': 'true',
+        {
+          'querier.parallelise-shardable-queries': 'true',

-        // in process tenant queues on frontends. We divide by the number of frontends; 2 in this case in order to apply the global limit in aggregate.
-        // basically base * shard_factor * query_split_factor / num_frontends where
-        'querier.max-outstanding-requests-per-tenant': std.floor(200 * $._config.queryFrontend.shard_factor * $._config.queryFrontend.query_split_factor / $._config.queryFrontend.replicas),
+          // in process tenant queues on frontends. We divide by the number of frontends; 2 in this case in order to apply the global limit in aggregate.
+          // basically base * shard_factor * query_split_factor / num_frontends where
+          'querier.max-outstanding-requests-per-tenant': std.floor(200 * $._config.queryFrontend.shard_factor * $._config.queryFrontend.query_split_factor / $._config.queryFrontend.replicas),

-        'querier.query-ingesters-within': $._config.queryConfig['querier.query-ingesters-within'],
-      } + $._config.storageConfig
-      else {}
-    ),
+          'querier.query-ingesters-within': $._config.queryConfig['querier.query-ingesters-within'],
+        } + $._config.storageConfig
+      else {}
+    ),

   query_frontend_container::
     container.new('query-frontend', $._images.query_frontend) +
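The second hunk above is an indentation-only cleanup, but the queue-size formula it touches is worth unpacking: a base per-tenant queue of 200 is scaled up by the sharding and query-splitting fan-out, then divided across the frontends so the limit applies in aggregate. A sketch of the arithmetic with invented values (shard_factor=16, query_split_factor=3, 2 frontend replicas; not necessarily this repo's defaults):

    // queue_size.jsonnet - illustrative only
    local queryFrontend = { shard_factor: 16, query_split_factor: 3, replicas: 2 };

    // base * shard_factor * query_split_factor / num_frontends
    std.floor(200 * queryFrontend.shard_factor * queryFrontend.query_split_factor / queryFrontend.replicas)  // = 4800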
