Skip to content

Commit a1c276d

Browse files
authored
Merge pull request #892 from SuperQ/superq/split-histogram
Split kube_apiserver rules
2 parents 883f294 + 0b456ff commit a1c276d

5 files changed

+306
-284
lines changed
Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
1+
{
2+
prometheusRules+:: {
3+
// SLO window as a duration string: the configured day count with a 'd' suffix.
// It is used both as a record-name suffix and as a range-selector duration below.
local SLODays = $._config.SLOs.apiserver.days + 'd',
4+
// Verb classes iterated by the comprehensions below: `type` becomes the value of the
// `verb` label on the recorded series, `selector` is the label matcher taken from _config.
local verbs = [
5+
{ type: 'read', selector: $._config.kubeApiserverReadSelector },
6+
{ type: 'write', selector: $._config.kubeApiserverWriteSelector },
7+
],
8+
9+
groups+: [
10+
{
11+
name: 'kube-apiserver-availability.rules',
12+
interval: '3m',
13+
rules: [
14+
{
15+
// Averages the 1h-increase series over the SLO window and scales it by 24 * days.
record: 'code_verb:apiserver_request_total:increase%s' % SLODays,
16+
expr: |||
17+
avg_over_time(code_verb:apiserver_request_total:increase1h[%s]) * 24 * %d
18+
||| % [SLODays, $._config.SLOs.apiserver.days],
19+
},
20+
] + [
21+
{
22+
record: 'code:apiserver_request_total:increase%s' % SLODays,
23+
expr: |||
24+
sum by (%s, code) (code_verb:apiserver_request_total:increase%s{%s})
25+
||| % [$._config.clusterLabel, SLODays, verb.selector],
26+
labels: {
27+
verb: verb.type,
28+
},
29+
}
30+
for verb in verbs
31+
] + [
32+
{
33+
// 1h increase of the SLI duration histogram's _count series, summed by cluster label,
// verb and scope, and restricted to series matching kubeApiserverSelector.
record: 'cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase1h',
34+
expr: |||
35+
sum by (%(clusterLabel)s, verb, scope) (increase(apiserver_request_sli_duration_seconds_count{%(kubeApiserverSelector)s}[1h]))
36+
||| % $._config,
37+
},
38+
{
39+
record: 'cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase%s' % SLODays,
40+
expr: |||
41+
sum by (%s, verb, scope) (avg_over_time(cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase1h[%s]) * 24 * %s)
42+
||| % [$._config.clusterLabel, SLODays, $._config.SLOs.apiserver.days],
43+
},
44+
{
45+
// 1h increase of the SLI duration histogram buckets, summed by cluster label, verb,
// scope and le. Restricted to kubeApiserverSelector, exactly like the corresponding
// _count rule, so that the availability rules subtract bucket and count values that
// come from the same set of series (other jobs exporting this histogram are excluded).
record: 'cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase1h',
46+
expr: |||
47+
sum by (%(clusterLabel)s, verb, scope, le) (increase(apiserver_request_sli_duration_seconds_bucket{%(kubeApiserverSelector)s}[1h]))
48+
||| % $._config,
49+
},
50+
{
51+
record: 'cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%s' % SLODays,
52+
expr: |||
53+
sum by (%s, verb, scope, le) (avg_over_time(cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase1h[%s]) * 24 * %s)
54+
||| % [$._config.clusterLabel, SLODays, $._config.SLOs.apiserver.days],
55+
},
56+
{
57+
record: 'apiserver_request:availability%s' % SLODays,
58+
expr: |||
59+
1 - (
60+
(
61+
# write too slow
62+
sum by (%(clusterLabel)s) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase%(SLODays)s{%(kubeApiserverWriteSelector)s})
63+
-
64+
sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverWriteSelector)s,le="%(kubeApiserverWriteLatency)s"})
65+
) +
66+
(
67+
# read too slow
68+
sum by (%(clusterLabel)s) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase%(SLODays)s{%(kubeApiserverReadSelector)s})
69+
-
70+
(
71+
(
72+
sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope=~"resource|",le="%(kubeApiserverReadResourceLatency)s"})
73+
or
74+
vector(0)
75+
)
76+
+
77+
sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope="namespace",le="%(kubeApiserverReadNamespaceLatency)s"})
78+
+
79+
sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope="cluster",le="%(kubeApiserverReadClusterLatency)s"})
80+
)
81+
) +
82+
# errors
83+
sum by (%(clusterLabel)s) (code:apiserver_request_total:increase%(SLODays)s{code=~"5.."} or vector(0))
84+
)
85+
/
86+
sum by (%(clusterLabel)s) (code:apiserver_request_total:increase%(SLODays)s)
87+
||| % ($._config { SLODays: SLODays }),
88+
labels: {
89+
verb: 'all',
90+
},
91+
},
92+
{
93+
record: 'apiserver_request:availability%s' % SLODays,
94+
expr: |||
95+
1 - (
96+
sum by (%(clusterLabel)s) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase%(SLODays)s{%(kubeApiserverReadSelector)s})
97+
-
98+
(
99+
# too slow
100+
(
101+
sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope=~"resource|",le="%(kubeApiserverReadResourceLatency)s"})
102+
or
103+
vector(0)
104+
)
105+
+
106+
sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope="namespace",le="%(kubeApiserverReadNamespaceLatency)s"})
107+
+
108+
sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope="cluster",le="%(kubeApiserverReadClusterLatency)s"})
109+
)
110+
+
111+
# errors
112+
sum by (%(clusterLabel)s) (code:apiserver_request_total:increase%(SLODays)s{verb="read",code=~"5.."} or vector(0))
113+
)
114+
/
115+
sum by (%(clusterLabel)s) (code:apiserver_request_total:increase%(SLODays)s{verb="read"})
116+
||| % ($._config { SLODays: SLODays, days: $._config.SLOs.apiserver.days }),
117+
labels: {
118+
verb: 'read',
119+
},
120+
},
121+
{
122+
record: 'apiserver_request:availability%s' % SLODays,
123+
expr: |||
124+
1 - (
125+
(
126+
# too slow
127+
sum by (%(clusterLabel)s) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase%(SLODays)s{%(kubeApiserverWriteSelector)s})
128+
-
129+
sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverWriteSelector)s,le="%(kubeApiserverWriteLatency)s"})
130+
)
131+
+
132+
# errors
133+
sum by (%(clusterLabel)s) (code:apiserver_request_total:increase%(SLODays)s{verb="write",code=~"5.."} or vector(0))
134+
)
135+
/
136+
sum by (%(clusterLabel)s) (code:apiserver_request_total:increase%(SLODays)s{verb="write"})
137+
||| % ($._config { SLODays: SLODays, days: $._config.SLOs.apiserver.days }),
138+
labels: {
139+
verb: 'write',
140+
},
141+
},
142+
] + [
143+
{
144+
record: 'code_resource:apiserver_request_total:rate5m',
145+
expr: |||
146+
sum by (%s,code,resource) (rate(apiserver_request_total{%s}[5m]))
147+
||| % [$._config.clusterLabel, std.join(',', [$._config.kubeApiserverSelector, verb.selector])],
148+
labels: {
149+
verb: verb.type,
150+
},
151+
}
152+
for verb in verbs
153+
] + [
154+
{
155+
record: 'code_verb:apiserver_request_total:increase1h',
156+
expr: |||
157+
sum by (%s, code, verb) (increase(apiserver_request_total{%s,verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"%s"}[1h]))
158+
||| % [$._config.clusterLabel, $._config.kubeApiserverSelector, code],
159+
}
160+
for code in ['2..', '3..', '4..', '5..']
161+
],
162+
},
163+
],
164+
},
165+
}
Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
{
2+
prometheusRules+:: {
3+
groups+: [
4+
{
5+
name: 'kube-apiserver-burnrate.rules',
6+
rules: [
7+
{
8+
record: 'apiserver_request:burnrate%(window)s' % w,
9+
expr: |||
10+
(
11+
(
12+
# too slow
13+
sum by (%(clusterLabel)s) (rate(apiserver_request_sli_duration_seconds_count{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,%(kubeApiserverNonStreamingSelector)s}[%(window)s]))
14+
-
15+
(
16+
(
17+
sum by (%(clusterLabel)s) (rate(apiserver_request_sli_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,%(kubeApiserverNonStreamingSelector)s,scope=~"resource|",le="%(kubeApiserverReadResourceLatency)s"}[%(window)s]))
18+
or
19+
vector(0)
20+
)
21+
+
22+
sum by (%(clusterLabel)s) (rate(apiserver_request_sli_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,%(kubeApiserverNonStreamingSelector)s,scope="namespace",le="%(kubeApiserverReadNamespaceLatency)s"}[%(window)s]))
23+
+
24+
sum by (%(clusterLabel)s) (rate(apiserver_request_sli_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,%(kubeApiserverNonStreamingSelector)s,scope="cluster",le="%(kubeApiserverReadClusterLatency)s"}[%(window)s]))
25+
)
26+
)
27+
+
28+
# errors
29+
sum by (%(clusterLabel)s) (rate(apiserver_request_total{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,code=~"5.."}[%(window)s]))
30+
)
31+
/
32+
sum by (%(clusterLabel)s) (rate(apiserver_request_total{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s}[%(window)s]))
33+
||| % {
34+
clusterLabel: $._config.clusterLabel,
35+
window: w,
36+
kubeApiserverSelector: $._config.kubeApiserverSelector,
37+
kubeApiserverReadSelector: $._config.kubeApiserverReadSelector,
38+
kubeApiserverNonStreamingSelector: $._config.kubeApiserverNonStreamingSelector,
39+
kubeApiserverReadResourceLatency: $._config.kubeApiserverReadResourceLatency,
40+
kubeApiserverReadNamespaceLatency: $._config.kubeApiserverReadNamespaceLatency,
41+
kubeApiserverReadClusterLatency: $._config.kubeApiserverReadClusterLatency,
42+
},
43+
labels: {
44+
verb: 'read',
45+
},
46+
}
47+
for w in std.set([ // Get the unique array of short and long window rates
48+
w.short
49+
for w in $._config.SLOs.apiserver.windows
50+
] + [
51+
w.long
52+
for w in $._config.SLOs.apiserver.windows
53+
])
54+
] + [
55+
{
56+
record: 'apiserver_request:burnrate%(window)s' % w,
57+
expr: |||
58+
(
59+
(
60+
# too slow
61+
sum by (%(clusterLabel)s) (rate(apiserver_request_sli_duration_seconds_count{%(kubeApiserverSelector)s,%(kubeApiserverWriteSelector)s,%(kubeApiserverNonStreamingSelector)s}[%(window)s]))
62+
-
63+
sum by (%(clusterLabel)s) (rate(apiserver_request_sli_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverWriteSelector)s,%(kubeApiserverNonStreamingSelector)s,le="%(kubeApiserverWriteLatency)s"}[%(window)s]))
64+
)
65+
+
66+
sum by (%(clusterLabel)s) (rate(apiserver_request_total{%(kubeApiserverSelector)s,%(kubeApiserverWriteSelector)s,code=~"5.."}[%(window)s]))
67+
)
68+
/
69+
sum by (%(clusterLabel)s) (rate(apiserver_request_total{%(kubeApiserverSelector)s,%(kubeApiserverWriteSelector)s}[%(window)s]))
70+
||| % {
71+
clusterLabel: $._config.clusterLabel,
72+
window: w,
73+
kubeApiserverSelector: $._config.kubeApiserverSelector,
74+
kubeApiserverWriteSelector: $._config.kubeApiserverWriteSelector,
75+
kubeApiserverNonStreamingSelector: $._config.kubeApiserverNonStreamingSelector,
76+
kubeApiserverWriteLatency: $._config.kubeApiserverWriteLatency,
77+
},
78+
labels: {
79+
verb: 'write',
80+
},
81+
}
82+
for w in std.set([ // Get the unique array of short and long window rates
83+
w.short
84+
for w in $._config.SLOs.apiserver.windows
85+
] + [
86+
w.long
87+
for w in $._config.SLOs.apiserver.windows
88+
])
89+
],
90+
},
91+
],
92+
},
93+
}
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
{
2+
// Label selectors and latency thresholds interpolated into the kube-apiserver rule expressions.
_config+:: {
3+
// Label matcher selecting the kube-apiserver job's series.
kubeApiserverSelector: 'job="kube-apiserver"',
4+
podLabel: 'pod',
5+
// Verbs classified as "read" requests.
kubeApiserverReadSelector: 'verb=~"LIST|GET"',
6+
// Verbs classified as "write" requests.
kubeApiserverWriteSelector: 'verb=~"POST|PUT|PATCH|DELETE"',
7+
// Excludes requests whose subresource is proxy, attach, log, exec or portforward.
kubeApiserverNonStreamingSelector: 'subresource!~"proxy|attach|log|exec|portforward"',
8+
// These are buckets that exist on the apiserver_request_sli_duration_seconds_bucket histogram.
9+
// They are what the Kubernetes SIG Scalability is using to measure availability of Kubernetes clusters.
10+
// If you want to change these, make sure the "le" buckets exist on the histogram!
// The values are inserted verbatim as `le="..."` label values in the rule expressions.
11+
kubeApiserverReadResourceLatency: '1',
12+
kubeApiserverReadNamespaceLatency: '5',
13+
kubeApiserverReadClusterLatency: '30',
14+
kubeApiserverWriteLatency: '1',
15+
},
16+
}
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
{
2+
// Adds the 'kube-apiserver-histogram.rules' group: one 0.99-quantile latency
// recording rule per verb class (read, write).
prometheusRules+:: {
3+
// Verb classes: `type` is written to the `verb` label, `selector` is the matcher from _config.
local verbs = [
4+
{ type: 'read', selector: $._config.kubeApiserverReadSelector },
5+
{ type: 'write', selector: $._config.kubeApiserverWriteSelector },
6+
],
7+
8+
groups+: [
9+
{
10+
name: 'kube-apiserver-histogram.rules',
11+
rules:
12+
[
13+
{
14+
// 0.99 quantile of apiserver_request_sli_duration_seconds over a 5m rate, grouped by
// cluster label and resource. The series selector joins the apiserver, verb-class and
// non-streaming matchers with commas; the trailing `> 0` keeps only positive results.
record: 'cluster_quantile:apiserver_request_sli_duration_seconds:histogram_quantile',
15+
expr: |||
16+
histogram_quantile(0.99, sum by (%s, le, resource) (rate(apiserver_request_sli_duration_seconds_bucket{%s}[5m]))) > 0
17+
||| % [$._config.clusterLabel, std.join(',', [$._config.kubeApiserverSelector, verb.selector, $._config.kubeApiserverNonStreamingSelector])],
18+
// Static labels attached to every recorded series.
labels: {
19+
verb: verb.type,
20+
quantile: '0.99',
21+
},
22+
}
23+
for verb in verbs
24+
],
25+
},
26+
],
27+
},
28+
}

0 commit comments

Comments
 (0)