@@ -6,105 +6,54 @@ local utils = import 'mixin-utils/utils.libsonnet';
6
6
($.dashboard('Cortex / Scaling' ) + { uid: '88c041017b96856c9176e07cf557bdcf' })
7
7
.addClusterSelectorTemplates()
8
8
.addRow(
9
- $.row('Workload-based scaling' )
10
- .addPanel(
11
- $.panel('Workload-based scaling' ) + { sort: { col: 1 , desc: false } } +
12
- $.tablePanel([
13
- |||
14
- sum by (cluster, namespace, deployment) (
15
- kube_deployment_spec_replicas{cluster=~"$cluster", namespace=~"$namespace", deployment=~"ingester|memcached"}
16
- or
17
- label_replace(
18
- kube_statefulset_replicas{cluster=~"$cluster", namespace=~"$namespace", deployment=~"ingester|memcached"},
19
- "deployment", "$1", "statefulset", "(.*)"
20
- )
21
- )
22
- ||| ,
23
- |||
24
- quantile_over_time(0.99, sum by (cluster, namespace, deployment) (label_replace(rate(cortex_distributor_received_samples_total{cluster=~"$cluster", namespace=~"$namespace"}[1m]), "deployment", "ingester", "cluster", ".*"))[1h:])
25
- * 3 / 80e3
26
- ||| ,
27
- |||
28
- label_replace(
29
- sum by(cluster, namespace) (
30
- cortex_ingester_memory_series{cluster=~"$cluster", namespace=~"$namespace"}
31
- ) / 1e+6,
32
- "deployment", "ingester", "cluster", ".*"
33
- )
34
- or
35
- label_replace(
36
- sum by (cluster, namespace) (
37
- 4 * cortex_ingester_memory_series{cluster=~"$cluster", namespace=~"$namespace", job=~".+/ingester"}
38
- *
39
- cortex_ingester_chunk_size_bytes_sum{cluster=~"$cluster", namespace=~"$namespace", job=~".+/ingester"}
40
- /
41
- cortex_ingester_chunk_size_bytes_count{cluster=~"$cluster", namespace=~"$namespace", job=~".+/ingester"}
42
- )
43
- /
44
- avg by (cluster, namespace) (memcached_limit_bytes{cluster=~"$cluster", namespace=~"$namespace", job=~".+/memcached"}),
45
- "deployment", "memcached", "namespace", ".*"
46
- )
9
+ ($.row('Cortex Service Scaling' ) + { height: '200px' })
10
+ .addPanel({
11
+ type: 'text' ,
12
+ title: '' ,
13
+ options: {
14
+ content: |||
15
+ This dashboard shows any services which are not scaled correctly.
16
+ The table below gives the required number of replicas and the reason why.
17
+ We only show services without enough replicas.
18
+
19
+ Reasons:
20
+ - **sample_rate**: There are not enough replicas to handle the
21
+ sample rate. Applies to distributors and ingesters.
22
+ - **active_series**: There are not enough replicas
23
+ to handle the number of active series. Applies to ingesters.
24
+ - **cpu_usage**: There are not enough replicas
25
+ based on the CPU usage of the jobs vs the resource requests.
26
+ Applies to all jobs.
27
+ - **memory_usage**: There are not enough replicas based on the memory
28
+ usage vs the resource requests. Applies to all jobs.
29
+ - **active_series_limits**: There are not enough replicas to hold 60% of the
30
+ sum of all the per-tenant series limits.
31
+ - **sample_rate_limits**: There are not enough replicas to handle 60% of the
32
+ sum of all the per-tenant rate limits.
47
33
||| ,
48
- ], {
49
- cluster: { alias: 'Cluster' },
50
- namespace: { alias: 'Namespace' },
51
- deployment: { alias: 'Deployment' },
52
- 'Value #A' : { alias: 'Current Replicas' , decimals: 0 },
53
- 'Value #B' : { alias: 'Required Replicas, by ingestion rate' , decimals: 0 },
54
- 'Value #C' : { alias: 'Required Replicas, by active series' , decimals: 0 },
55
- })
56
- )
34
+ mode: 'markdown' ,
35
+ },
36
+ })
57
37
)
58
38
.addRow(
59
- ($.row('Resource-based scaling ' ) + { height: '500px ' })
39
+ ($.row('Scaling ' ) + { height: '400px ' })
60
40
.addPanel(
61
- $.panel('Resource -based scaling' ) + { sort: { col: 1 , desc: false } } +
41
+ $.panel('Workload -based scaling' ) + { sort: { col: 0 , desc: false } } +
62
42
$.tablePanel([
63
43
|||
64
- sum by (cluster, namespace, deployment) (
65
- kube_deployment_spec_replicas{cluster=~"$cluster", namespace=~"$namespace"}
66
- or
67
- label_replace(
68
- kube_statefulset_replicas{cluster=~"$cluster", namespace=~"$namespace"},
69
- "deployment", "$1", "statefulset", "(.*)"
70
- )
71
- )
72
- ||| ,
73
- |||
74
- sum by (cluster, namespace, deployment) (
75
- kube_deployment_spec_replicas{cluster=~"$cluster", namespace=~"$namespace"}
76
- or
77
- label_replace(
78
- kube_statefulset_replicas{cluster=~"$cluster", namespace=~"$namespace"},
79
- "deployment", "$1", "statefulset", "(.*)"
80
- )
81
- )
82
- *
83
- quantile_over_time(0.99, sum by (cluster, namespace, deployment) (label_replace(rate(container_cpu_usage_seconds_total{cluster=~"$cluster", namespace=~"$namespace"}[1m]), "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"))[24h:])
84
- /
85
- sum by (cluster, namespace, deployment) (label_replace(kube_pod_container_resource_requests_cpu_cores{cluster=~"$cluster", namespace=~"$namespace"}, "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"))
86
- ||| ,
87
- |||
88
- sum by (cluster, namespace, deployment) (
89
- kube_deployment_spec_replicas{cluster=~"$cluster", namespace=~"$namespace"}
90
- or
91
- label_replace(
92
- kube_statefulset_replicas{cluster=~"$cluster", namespace=~"$namespace"},
93
- "deployment", "$1", "statefulset", "(.*)"
94
- )
44
+ sort_desc(
45
+ cluster_namespace_deployment_reason:required_replicas:count{cluster=~"$cluster", namespace=~"$namespace"}
46
+ > ignoring(reason) group_left
47
+ cluster_namespace_deployment:actual_replicas:count{cluster=~"$cluster", namespace=~"$namespace"}
95
48
)
96
- *
97
- quantile_over_time(0.99, sum by (cluster, namespace, deployment) (label_replace(container_memory_usage_bytes{cluster=~"$cluster", namespace=~"$namespace"}, "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"))[24h:1m])
98
- /
99
- sum by (cluster, namespace, deployment) (label_replace(kube_pod_container_resource_requests_memory_bytes{cluster=~"$cluster", namespace=~"$namespace"}, "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"))
100
49
||| ,
101
50
], {
51
+ '__name__' : { alias: 'Cluster' , type: 'hidden' },
102
52
cluster: { alias: 'Cluster' },
103
53
namespace: { alias: 'Namespace' },
104
- deployment: { alias: 'Deployment' },
105
- 'Value #A' : { alias: 'Current Replicas' , decimals: 0 },
106
- 'Value #B' : { alias: 'Required Replicas, by CPU usage' , decimals: 0 },
107
- 'Value #C' : { alias: 'Required Replicas, by Memory usage' , decimals: 0 },
54
+ deployment: { alias: 'Service' },
55
+ reason: { alias: 'Reason' },
56
+ 'Value' : { alias: 'Required Replicas' , decimals: 0 },
108
57
})
109
58
)
110
59
),
0 commit comments