|
5 | 5 | kubeApiserverReadSelector: 'verb=~"LIST|GET"', |
6 | 6 | kubeApiserverWriteSelector: 'verb=~"POST|PUT|PATCH|DELETE"', |
7 | 7 | kubeApiserverNonStreamingSelector: 'subresource!~"proxy|attach|log|exec|portforward"', |
8 | | - // These are buckets that exist on the apiserver_request_duration_seconds_bucket histogram. |
| 8 | + // These are buckets that exist on the apiserver_request_slo_duration_seconds_bucket histogram. |
9 | 9 | // They are what the Kubernetes SIG Scalability is using to measure availability of Kubernetes clusters. |
10 | 10 | // If you want to change these, make sure the "le" buckets exist on the histogram! |
11 | 11 | kubeApiserverReadResourceLatency: '1', |
|
31 | 31 | ( |
32 | 32 | ( |
33 | 33 | # too slow |
34 | | - sum by (%(clusterLabel)s) (rate(apiserver_request_duration_seconds_count{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,%(kubeApiserverNonStreamingSelector)s}[%(window)s])) |
| 34 | + sum by (%(clusterLabel)s) (rate(apiserver_request_slo_duration_seconds_count{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,%(kubeApiserverNonStreamingSelector)s}[%(window)s])) |
35 | 35 | - |
36 | 36 | ( |
37 | 37 | ( |
38 | | - sum by (%(clusterLabel)s) (rate(apiserver_request_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,%(kubeApiserverNonStreamingSelector)s,scope=~"resource|",le="%(kubeApiserverReadResourceLatency)s"}[%(window)s])) |
| 38 | + sum by (%(clusterLabel)s) (rate(apiserver_request_slo_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,%(kubeApiserverNonStreamingSelector)s,scope=~"resource|",le="%(kubeApiserverReadResourceLatency)s"}[%(window)s])) |
39 | 39 | or |
40 | 40 | vector(0) |
41 | 41 | ) |
42 | 42 | + |
43 | | - sum by (%(clusterLabel)s) (rate(apiserver_request_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,%(kubeApiserverNonStreamingSelector)s,scope="namespace",le="%(kubeApiserverReadNamespaceLatency)s"}[%(window)s])) |
| 43 | + sum by (%(clusterLabel)s) (rate(apiserver_request_slo_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,%(kubeApiserverNonStreamingSelector)s,scope="namespace",le="%(kubeApiserverReadNamespaceLatency)s"}[%(window)s])) |
44 | 44 | + |
45 | | - sum by (%(clusterLabel)s) (rate(apiserver_request_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,%(kubeApiserverNonStreamingSelector)s,scope="cluster",le="%(kubeApiserverReadClusterLatency)s"}[%(window)s])) |
| 45 | + sum by (%(clusterLabel)s) (rate(apiserver_request_slo_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,%(kubeApiserverNonStreamingSelector)s,scope="cluster",le="%(kubeApiserverReadClusterLatency)s"}[%(window)s])) |
46 | 46 | ) |
47 | 47 | ) |
48 | 48 | + |
|
79 | 79 | ( |
80 | 80 | ( |
81 | 81 | # too slow |
82 | | - sum by (%(clusterLabel)s) (rate(apiserver_request_duration_seconds_count{%(kubeApiserverSelector)s,%(kubeApiserverWriteSelector)s,%(kubeApiserverNonStreamingSelector)s}[%(window)s])) |
| 82 | + sum by (%(clusterLabel)s) (rate(apiserver_request_slo_duration_seconds_count{%(kubeApiserverSelector)s,%(kubeApiserverWriteSelector)s,%(kubeApiserverNonStreamingSelector)s}[%(window)s])) |
83 | 83 | - |
84 | | - sum by (%(clusterLabel)s) (rate(apiserver_request_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverWriteSelector)s,%(kubeApiserverNonStreamingSelector)s,le="%(kubeApiserverWriteLatency)s"}[%(window)s])) |
| 84 | + sum by (%(clusterLabel)s) (rate(apiserver_request_slo_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverWriteSelector)s,%(kubeApiserverNonStreamingSelector)s,le="%(kubeApiserverWriteLatency)s"}[%(window)s])) |
85 | 85 | ) |
86 | 86 | + |
87 | 87 | sum by (%(clusterLabel)s) (rate(apiserver_request_total{%(kubeApiserverSelector)s,%(kubeApiserverWriteSelector)s,code=~"5.."}[%(window)s])) |
|
114 | 114 | rules: |
115 | 115 | [ |
116 | 116 | { |
117 | | - record: 'cluster_quantile:apiserver_request_duration_seconds:histogram_quantile', |
| 117 | + record: 'cluster_quantile:apiserver_request_slo_duration_seconds:histogram_quantile', |
118 | 118 | expr: ||| |
119 | | - histogram_quantile(0.99, sum by (%s, le, resource) (rate(apiserver_request_duration_seconds_bucket{%s}[5m]))) > 0 |
| 119 | + histogram_quantile(0.99, sum by (%s, le, resource) (rate(apiserver_request_slo_duration_seconds_bucket{%s}[5m]))) > 0 |
120 | 120 | ||| % [$._config.clusterLabel, std.join(',', [$._config.kubeApiserverSelector, verb.selector, $._config.kubeApiserverNonStreamingSelector])], |
121 | 121 | labels: { |
122 | 122 | verb: verb.type, |
|
149 | 149 | for verb in verbs |
150 | 150 | ] + [ |
151 | 151 | { |
152 | | - record: 'cluster_verb_scope:apiserver_request_duration_seconds_count:increase1h', |
| 152 | + record: 'cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase1h', |
153 | 153 | expr: ||| |
154 | | - sum by (%(clusterLabel)s, verb, scope) (increase(apiserver_request_duration_seconds_count[1h])) |
| 154 | + sum by (%(clusterLabel)s, verb, scope) (increase(apiserver_request_slo_duration_seconds_count[1h])) |
155 | 155 | ||| % $._config, |
156 | 156 | }, |
157 | 157 | { |
158 | | - record: 'cluster_verb_scope:apiserver_request_duration_seconds_count:increase%s' % SLODays, |
| 158 | + record: 'cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase%s' % SLODays, |
159 | 159 | expr: ||| |
160 | | - sum by (%s, verb, scope) (avg_over_time(cluster_verb_scope:apiserver_request_duration_seconds_count:increase1h[%s]) * 24 * %s) |
| 160 | + sum by (%s, verb, scope) (avg_over_time(cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase1h[%s]) * 24 * %s) |
161 | 161 | ||| % [$._config.clusterLabel, SLODays, $._config.SLOs.apiserver.days], |
162 | 162 | }, |
163 | 163 | { |
164 | | - record: 'cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase1h', |
| 164 | + record: 'cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase1h', |
165 | 165 | expr: ||| |
166 | | - sum by (%(clusterLabel)s, verb, scope, le) (increase(apiserver_request_duration_seconds_bucket[1h])) |
| 166 | + sum by (%(clusterLabel)s, verb, scope, le) (increase(apiserver_request_slo_duration_seconds_bucket[1h])) |
167 | 167 | ||| % $._config, |
168 | 168 | }, |
169 | 169 | { |
170 | | - record: 'cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase%s' % SLODays, |
| 170 | + record: 'cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase%s' % SLODays, |
171 | 171 | expr: ||| |
172 | | - sum by (%s, verb, scope, le) (avg_over_time(cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase1h[%s]) * 24 * %s) |
| 172 | + sum by (%s, verb, scope, le) (avg_over_time(cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase1h[%s]) * 24 * %s) |
173 | 173 | ||| % [$._config.clusterLabel, SLODays, $._config.SLOs.apiserver.days], |
174 | 174 | }, |
175 | 175 | { |
|
178 | 178 | 1 - ( |
179 | 179 | ( |
180 | 180 | # write too slow |
181 | | - sum by (%(clusterLabel)s) (cluster_verb_scope:apiserver_request_duration_seconds_count:increase%(SLODays)s{%(kubeApiserverWriteSelector)s}) |
| 181 | + sum by (%(clusterLabel)s) (cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase%(SLODays)s{%(kubeApiserverWriteSelector)s}) |
182 | 182 | - |
183 | | - sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverWriteSelector)s,le="%(kubeApiserverWriteLatency)s"}) |
| 183 | + sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverWriteSelector)s,le="%(kubeApiserverWriteLatency)s"}) |
184 | 184 | ) + |
185 | 185 | ( |
186 | 186 | # read too slow |
187 | | - sum by (%(clusterLabel)s) (cluster_verb_scope:apiserver_request_duration_seconds_count:increase%(SLODays)s{%(kubeApiserverReadSelector)s}) |
| 187 | + sum by (%(clusterLabel)s) (cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase%(SLODays)s{%(kubeApiserverReadSelector)s}) |
188 | 188 | - |
189 | 189 | ( |
190 | 190 | ( |
191 | | - sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope=~"resource|",le="%(kubeApiserverReadResourceLatency)s"}) |
| 191 | + sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope=~"resource|",le="%(kubeApiserverReadResourceLatency)s"}) |
192 | 192 | or |
193 | 193 | vector(0) |
194 | 194 | ) |
195 | 195 | + |
196 | | - sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope="namespace",le="%(kubeApiserverReadNamespaceLatency)s"}) |
| 196 | + sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope="namespace",le="%(kubeApiserverReadNamespaceLatency)s"}) |
197 | 197 | + |
198 | | - sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope="cluster",le="%(kubeApiserverReadClusterLatency)s"}) |
| 198 | + sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope="cluster",le="%(kubeApiserverReadClusterLatency)s"}) |
199 | 199 | ) |
200 | 200 | ) + |
201 | 201 | # errors |
|
212 | 212 | record: 'apiserver_request:availability%s' % SLODays, |
213 | 213 | expr: ||| |
214 | 214 | 1 - ( |
215 | | - sum by (%(clusterLabel)s) (cluster_verb_scope:apiserver_request_duration_seconds_count:increase%(SLODays)s{%(kubeApiserverReadSelector)s}) |
| 215 | + sum by (%(clusterLabel)s) (cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase%(SLODays)s{%(kubeApiserverReadSelector)s}) |
216 | 216 | - |
217 | 217 | ( |
218 | 218 | # too slow |
219 | 219 | ( |
220 | | - sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope=~"resource|",le="%(kubeApiserverReadResourceLatency)s"}) |
| 220 | + sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope=~"resource|",le="%(kubeApiserverReadResourceLatency)s"}) |
221 | 221 | or |
222 | 222 | vector(0) |
223 | 223 | ) |
224 | 224 | + |
225 | | - sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope="namespace",le="%(kubeApiserverReadNamespaceLatency)s"}) |
| 225 | + sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope="namespace",le="%(kubeApiserverReadNamespaceLatency)s"}) |
226 | 226 | + |
227 | | - sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope="cluster",le="%(kubeApiserverReadClusterLatency)s"}) |
| 227 | + sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope="cluster",le="%(kubeApiserverReadClusterLatency)s"}) |
228 | 228 | ) |
229 | 229 | + |
230 | 230 | # errors |
|
243 | 243 | 1 - ( |
244 | 244 | ( |
245 | 245 | # too slow |
246 | | - sum by (%(clusterLabel)s) (cluster_verb_scope:apiserver_request_duration_seconds_count:increase%(SLODays)s{%(kubeApiserverWriteSelector)s}) |
| 246 | + sum by (%(clusterLabel)s) (cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase%(SLODays)s{%(kubeApiserverWriteSelector)s}) |
247 | 247 | - |
248 | | - sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverWriteSelector)s,le="%(kubeApiserverWriteLatency)s"}) |
| 248 | + sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverWriteSelector)s,le="%(kubeApiserverWriteLatency)s"}) |
249 | 249 | ) |
250 | 250 | + |
251 | 251 | # errors |
|
0 commit comments