|
5 | 5 | kubeApiserverReadSelector: 'verb=~"LIST|GET"',
|
6 | 6 | kubeApiserverWriteSelector: 'verb=~"POST|PUT|PATCH|DELETE"',
|
7 | 7 | kubeApiserverNonStreamingSelector: 'subresource!~"proxy|attach|log|exec|portforward"',
|
8 |
| - // These are buckets that exist on the apiserver_request_slo_duration_seconds_bucket histogram. |
| 8 | + // These are buckets that exist on the apiserver_request_sli_duration_seconds_bucket histogram. |
9 | 9 | // They are the buckets that Kubernetes SIG Scalability uses to measure the availability of Kubernetes clusters.
|
10 | 10 | // If you want to change these, make sure the "le" buckets exist on the histogram!
|
11 | 11 | kubeApiserverReadResourceLatency: '1',
|
|
31 | 31 | (
|
32 | 32 | (
|
33 | 33 | # too slow
|
34 |
| - sum by (%(clusterLabel)s) (rate(apiserver_request_slo_duration_seconds_count{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,%(kubeApiserverNonStreamingSelector)s}[%(window)s])) |
| 34 | + sum by (%(clusterLabel)s) (rate(apiserver_request_sli_duration_seconds_count{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,%(kubeApiserverNonStreamingSelector)s}[%(window)s])) |
35 | 35 | -
|
36 | 36 | (
|
37 | 37 | (
|
38 |
| - sum by (%(clusterLabel)s) (rate(apiserver_request_slo_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,%(kubeApiserverNonStreamingSelector)s,scope=~"resource|",le="%(kubeApiserverReadResourceLatency)s"}[%(window)s])) |
| 38 | + sum by (%(clusterLabel)s) (rate(apiserver_request_sli_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,%(kubeApiserverNonStreamingSelector)s,scope=~"resource|",le="%(kubeApiserverReadResourceLatency)s"}[%(window)s])) |
39 | 39 | or
|
40 | 40 | vector(0)
|
41 | 41 | )
|
42 | 42 | +
|
43 |
| - sum by (%(clusterLabel)s) (rate(apiserver_request_slo_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,%(kubeApiserverNonStreamingSelector)s,scope="namespace",le="%(kubeApiserverReadNamespaceLatency)s"}[%(window)s])) |
| 43 | + sum by (%(clusterLabel)s) (rate(apiserver_request_sli_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,%(kubeApiserverNonStreamingSelector)s,scope="namespace",le="%(kubeApiserverReadNamespaceLatency)s"}[%(window)s])) |
44 | 44 | +
|
45 |
| - sum by (%(clusterLabel)s) (rate(apiserver_request_slo_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,%(kubeApiserverNonStreamingSelector)s,scope="cluster",le="%(kubeApiserverReadClusterLatency)s"}[%(window)s])) |
| 45 | + sum by (%(clusterLabel)s) (rate(apiserver_request_sli_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,%(kubeApiserverNonStreamingSelector)s,scope="cluster",le="%(kubeApiserverReadClusterLatency)s"}[%(window)s])) |
46 | 46 | )
|
47 | 47 | )
|
48 | 48 | +
|
|
79 | 79 | (
|
80 | 80 | (
|
81 | 81 | # too slow
|
82 |
| - sum by (%(clusterLabel)s) (rate(apiserver_request_slo_duration_seconds_count{%(kubeApiserverSelector)s,%(kubeApiserverWriteSelector)s,%(kubeApiserverNonStreamingSelector)s}[%(window)s])) |
| 82 | + sum by (%(clusterLabel)s) (rate(apiserver_request_sli_duration_seconds_count{%(kubeApiserverSelector)s,%(kubeApiserverWriteSelector)s,%(kubeApiserverNonStreamingSelector)s}[%(window)s])) |
83 | 83 | -
|
84 |
| - sum by (%(clusterLabel)s) (rate(apiserver_request_slo_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverWriteSelector)s,%(kubeApiserverNonStreamingSelector)s,le="%(kubeApiserverWriteLatency)s"}[%(window)s])) |
| 84 | + sum by (%(clusterLabel)s) (rate(apiserver_request_sli_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverWriteSelector)s,%(kubeApiserverNonStreamingSelector)s,le="%(kubeApiserverWriteLatency)s"}[%(window)s])) |
85 | 85 | )
|
86 | 86 | +
|
87 | 87 | sum by (%(clusterLabel)s) (rate(apiserver_request_total{%(kubeApiserverSelector)s,%(kubeApiserverWriteSelector)s,code=~"5.."}[%(window)s]))
|
|
114 | 114 | rules:
|
115 | 115 | [
|
116 | 116 | {
|
117 |
| - record: 'cluster_quantile:apiserver_request_slo_duration_seconds:histogram_quantile', |
| 117 | + record: 'cluster_quantile:apiserver_request_sli_duration_seconds:histogram_quantile', |
118 | 118 | expr: |||
|
119 |
| - histogram_quantile(0.99, sum by (%s, le, resource) (rate(apiserver_request_slo_duration_seconds_bucket{%s}[5m]))) > 0 |
| 119 | + histogram_quantile(0.99, sum by (%s, le, resource) (rate(apiserver_request_sli_duration_seconds_bucket{%s}[5m]))) > 0 |
120 | 120 | ||| % [$._config.clusterLabel, std.join(',', [$._config.kubeApiserverSelector, verb.selector, $._config.kubeApiserverNonStreamingSelector])],
|
121 | 121 | labels: {
|
122 | 122 | verb: verb.type,
|
|
149 | 149 | for verb in verbs
|
150 | 150 | ] + [
|
151 | 151 | {
|
152 |
| - record: 'cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase1h', |
| 152 | + record: 'cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase1h', |
153 | 153 | expr: |||
|
154 |
| - sum by (%(clusterLabel)s, verb, scope) (increase(apiserver_request_slo_duration_seconds_count{%(kubeApiserverSelector)s}[1h])) |
| 154 | + sum by (%(clusterLabel)s, verb, scope) (increase(apiserver_request_sli_duration_seconds_count{%(kubeApiserverSelector)s}[1h])) |
155 | 155 | ||| % $._config,
|
156 | 156 | },
|
157 | 157 | {
|
158 |
| - record: 'cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase%s' % SLODays, |
| 158 | + record: 'cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase%s' % SLODays, |
159 | 159 | expr: |||
|
160 |
| - sum by (%s, verb, scope) (avg_over_time(cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase1h[%s]) * 24 * %s) |
| 160 | + sum by (%s, verb, scope) (avg_over_time(cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase1h[%s]) * 24 * %s) |
161 | 161 | ||| % [$._config.clusterLabel, SLODays, $._config.SLOs.apiserver.days],
|
162 | 162 | },
|
163 | 163 | {
|
164 |
| - record: 'cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase1h', |
| 164 | + record: 'cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase1h', |
165 | 165 | expr: |||
|
166 |
| - sum by (%(clusterLabel)s, verb, scope, le) (increase(apiserver_request_slo_duration_seconds_bucket[1h])) |
| 166 | + sum by (%(clusterLabel)s, verb, scope, le) (increase(apiserver_request_sli_duration_seconds_bucket[1h])) |
167 | 167 | ||| % $._config,
|
168 | 168 | },
|
169 | 169 | {
|
170 |
| - record: 'cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase%s' % SLODays, |
| 170 | + record: 'cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%s' % SLODays, |
171 | 171 | expr: |||
|
172 |
| - sum by (%s, verb, scope, le) (avg_over_time(cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase1h[%s]) * 24 * %s) |
| 172 | + sum by (%s, verb, scope, le) (avg_over_time(cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase1h[%s]) * 24 * %s) |
173 | 173 | ||| % [$._config.clusterLabel, SLODays, $._config.SLOs.apiserver.days],
|
174 | 174 | },
|
175 | 175 | {
|
|
178 | 178 | 1 - (
|
179 | 179 | (
|
180 | 180 | # write too slow
|
181 |
| - sum by (%(clusterLabel)s) (cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase%(SLODays)s{%(kubeApiserverWriteSelector)s}) |
| 181 | + sum by (%(clusterLabel)s) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase%(SLODays)s{%(kubeApiserverWriteSelector)s}) |
182 | 182 | -
|
183 |
| - sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverWriteSelector)s,le="%(kubeApiserverWriteLatency)s"}) |
| 183 | + sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverWriteSelector)s,le="%(kubeApiserverWriteLatency)s"}) |
184 | 184 | ) +
|
185 | 185 | (
|
186 | 186 | # read too slow
|
187 |
| - sum by (%(clusterLabel)s) (cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase%(SLODays)s{%(kubeApiserverReadSelector)s}) |
| 187 | + sum by (%(clusterLabel)s) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase%(SLODays)s{%(kubeApiserverReadSelector)s}) |
188 | 188 | -
|
189 | 189 | (
|
190 | 190 | (
|
191 |
| - sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope=~"resource|",le="%(kubeApiserverReadResourceLatency)s"}) |
| 191 | + sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope=~"resource|",le="%(kubeApiserverReadResourceLatency)s"}) |
192 | 192 | or
|
193 | 193 | vector(0)
|
194 | 194 | )
|
195 | 195 | +
|
196 |
| - sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope="namespace",le="%(kubeApiserverReadNamespaceLatency)s"}) |
| 196 | + sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope="namespace",le="%(kubeApiserverReadNamespaceLatency)s"}) |
197 | 197 | +
|
198 |
| - sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope="cluster",le="%(kubeApiserverReadClusterLatency)s"}) |
| 198 | + sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope="cluster",le="%(kubeApiserverReadClusterLatency)s"}) |
199 | 199 | )
|
200 | 200 | ) +
|
201 | 201 | # errors
|
|
212 | 212 | record: 'apiserver_request:availability%s' % SLODays,
|
213 | 213 | expr: |||
|
214 | 214 | 1 - (
|
215 |
| - sum by (%(clusterLabel)s) (cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase%(SLODays)s{%(kubeApiserverReadSelector)s}) |
| 215 | + sum by (%(clusterLabel)s) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase%(SLODays)s{%(kubeApiserverReadSelector)s}) |
216 | 216 | -
|
217 | 217 | (
|
218 | 218 | # too slow
|
219 | 219 | (
|
220 |
| - sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope=~"resource|",le="%(kubeApiserverReadResourceLatency)s"}) |
| 220 | + sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope=~"resource|",le="%(kubeApiserverReadResourceLatency)s"}) |
221 | 221 | or
|
222 | 222 | vector(0)
|
223 | 223 | )
|
224 | 224 | +
|
225 |
| - sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope="namespace",le="%(kubeApiserverReadNamespaceLatency)s"}) |
| 225 | + sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope="namespace",le="%(kubeApiserverReadNamespaceLatency)s"}) |
226 | 226 | +
|
227 |
| - sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope="cluster",le="%(kubeApiserverReadClusterLatency)s"}) |
| 227 | + sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope="cluster",le="%(kubeApiserverReadClusterLatency)s"}) |
228 | 228 | )
|
229 | 229 | +
|
230 | 230 | # errors
|
|
243 | 243 | 1 - (
|
244 | 244 | (
|
245 | 245 | # too slow
|
246 |
| - sum by (%(clusterLabel)s) (cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase%(SLODays)s{%(kubeApiserverWriteSelector)s}) |
| 246 | + sum by (%(clusterLabel)s) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase%(SLODays)s{%(kubeApiserverWriteSelector)s}) |
247 | 247 | -
|
248 |
| - sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverWriteSelector)s,le="%(kubeApiserverWriteLatency)s"}) |
| 248 | + sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverWriteSelector)s,le="%(kubeApiserverWriteLatency)s"}) |
249 | 249 | )
|
250 | 250 | +
|
251 | 251 | # errors
|
|
0 commit comments