Skip to content

Commit 834daaa

Browse files
sleepyfoodieskl
andauthored
feat: new container cpu usage recording rule using rate() (#1025)
* chore: revert irate function back to rate * chore: keep irate for node_namespace_pod_ontainer and add rate in as well * chore: add test for apps * Update tests/tests.yaml Co-authored-by: Stephen Lang <[email protected]> * chore: add 5m to queries to match recording name * chore: update readme * update readme * chore: fix readme formatting * Update README.md Co-authored-by: Stephen Lang <[email protected]> * Update dashboards/resources/node.libsonnet Co-authored-by: Stephen Lang <[email protected]> * try adding space --------- Co-authored-by: Stephen Lang <[email protected]>
1 parent 5894150 commit 834daaa

File tree

10 files changed

+53
-24
lines changed

10 files changed

+53
-24
lines changed

README.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,20 @@ Maintainers can trigger the [release workflow](.github/workflows/release.yaml) b
5656

5757
We wanted to backfill `release-0.1` to `release-0.12` to have a changelog, but we were not able to use a GitHub action in a newer commit to trigger a release that generates a changelog on older commits. See #489 for full discussion.
5858

59+
## Metrics Deprecation
60+
61+
The following recording rule is marked deprecated. It will be removed in v2.0.0.
62+
63+
```bash
64+
node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate
65+
```
66+
67+
It will be replaced by the following recording rule to preserve data points using `rate` and add `5m` to indicate the range of the rate query in the recording rule name.
68+
69+
```bash
70+
node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m
71+
```
72+
5973
## How to use
6074

6175
This mixin is designed to be vendored into the repo with your infrastructure config. To do this, use [jsonnet-bundler](https://github.com/jsonnet-bundler/jsonnet-bundler):

dashboards/resources/cluster.libsonnet

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ local var = g.dashboard.variable;
129129
+ tsPanel.queryOptions.withTargets([
130130
prometheus.new(
131131
'${datasource}',
132-
'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config
132+
'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config
133133
)
134134
+ prometheus.withLegendFormat('__auto'),
135135
]),
@@ -144,23 +144,23 @@ local var = g.dashboard.variable;
144144
+ prometheus.withInstant(true)
145145
+ prometheus.withFormat('table'),
146146

147-
prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config)
147+
prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config)
148148
+ prometheus.withInstant(true)
149149
+ prometheus.withFormat('table'),
150150

151151
prometheus.new('${datasource}', 'sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config)
152152
+ prometheus.withInstant(true)
153153
+ prometheus.withFormat('table'),
154154

155-
prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{%(clusterLabel)s="$cluster"}) by (namespace) / sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config)
155+
prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{%(clusterLabel)s="$cluster"}) by (namespace) / sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config)
156156
+ prometheus.withInstant(true)
157157
+ prometheus.withFormat('table'),
158158

159159
prometheus.new('${datasource}', 'sum(namespace_cpu:kube_pod_container_resource_limits:sum{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config)
160160
+ prometheus.withInstant(true)
161161
+ prometheus.withFormat('table'),
162162

163-
prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{%(clusterLabel)s="$cluster"}) by (namespace) / sum(namespace_cpu:kube_pod_container_resource_limits:sum{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config)
163+
prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{%(clusterLabel)s="$cluster"}) by (namespace) / sum(namespace_cpu:kube_pod_container_resource_limits:sum{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config)
164164
+ prometheus.withInstant(true)
165165
+ prometheus.withFormat('table'),
166166
])

dashboards/resources/multi-cluster.libsonnet

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -99,27 +99,27 @@ local var = g.dashboard.variable;
9999
cpuUsage: [
100100
tsPanel.new('CPU Usage')
101101
+ tsPanel.queryOptions.withTargets([
102-
prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate) by (%(clusterLabel)s)' % $._config)
102+
prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m) by (%(clusterLabel)s)' % $._config)
103103
+ prometheus.withLegendFormat('__auto'),
104104
]),
105105
],
106106

107107
cpuQuota: [
108108
g.panel.table.new('CPU Quota')
109109
+ g.panel.table.queryOptions.withTargets([
110-
prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate) by (%(clusterLabel)s)' % $._config)
110+
prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m) by (%(clusterLabel)s)' % $._config)
111111
+ prometheus.withInstant(true)
112112
+ prometheus.withFormat('table'),
113113
prometheus.new('${datasource}', 'sum(kube_pod_container_resource_requests{%(kubeStateMetricsSelector)s, resource="cpu"}) by (%(clusterLabel)s)' % $._config)
114114
+ prometheus.withInstant(true)
115115
+ prometheus.withFormat('table'),
116-
prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate) by (%(clusterLabel)s) / sum(kube_pod_container_resource_requests{%(kubeStateMetricsSelector)s, resource="cpu"}) by (%(clusterLabel)s)' % $._config)
116+
prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m) by (%(clusterLabel)s) / sum(kube_pod_container_resource_requests{%(kubeStateMetricsSelector)s, resource="cpu"}) by (%(clusterLabel)s)' % $._config)
117117
+ prometheus.withInstant(true)
118118
+ prometheus.withFormat('table'),
119119
prometheus.new('${datasource}', 'sum(kube_pod_container_resource_limits{%(kubeStateMetricsSelector)s, resource="cpu"}) by (%(clusterLabel)s)' % $._config)
120120
+ prometheus.withInstant(true)
121121
+ prometheus.withFormat('table'),
122-
prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate) by (%(clusterLabel)s) / sum(kube_pod_container_resource_limits{%(kubeStateMetricsSelector)s, resource="cpu"}) by (%(clusterLabel)s)' % $._config)
122+
prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m) by (%(clusterLabel)s) / sum(kube_pod_container_resource_limits{%(kubeStateMetricsSelector)s, resource="cpu"}) by (%(clusterLabel)s)' % $._config)
123123
+ prometheus.withInstant(true)
124124
+ prometheus.withFormat('table'),
125125
])

dashboards/resources/namespace.libsonnet

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -91,15 +91,15 @@ local var = g.dashboard.variable;
9191
statPanel(
9292
'CPU Utilisation (from requests)',
9393
'percentunit',
94-
'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{%(clusterLabel)s="$cluster", namespace="$namespace"}) / sum(kube_pod_container_resource_requests{%(kubeStateMetricsSelector)s, %(clusterLabel)s="$cluster", namespace="$namespace", resource="cpu"})' % $._config
94+
'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{%(clusterLabel)s="$cluster", namespace="$namespace"}) / sum(kube_pod_container_resource_requests{%(kubeStateMetricsSelector)s, %(clusterLabel)s="$cluster", namespace="$namespace", resource="cpu"})' % $._config
9595
)
9696
+ stat.gridPos.withW(6)
9797
+ stat.gridPos.withH(3),
9898

9999
statPanel(
100100
'CPU Utilisation (from limits)',
101101
'percentunit',
102-
'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{%(clusterLabel)s="$cluster", namespace="$namespace"}) / sum(kube_pod_container_resource_limits{%(kubeStateMetricsSelector)s, %(clusterLabel)s="$cluster", namespace="$namespace", resource="cpu"})' % $._config
102+
'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{%(clusterLabel)s="$cluster", namespace="$namespace"}) / sum(kube_pod_container_resource_limits{%(kubeStateMetricsSelector)s, %(clusterLabel)s="$cluster", namespace="$namespace", resource="cpu"})' % $._config
103103
)
104104
+ stat.gridPos.withW(6)
105105
+ stat.gridPos.withH(3),
@@ -125,7 +125,7 @@ local var = g.dashboard.variable;
125125
+ tsPanel.queryOptions.withTargets([
126126
prometheus.new(
127127
'${datasource}',
128-
'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{%(clusterLabel)s="$cluster", namespace="$namespace"}) by (pod)' % $._config
128+
'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{%(clusterLabel)s="$cluster", namespace="$namespace"}) by (pod)' % $._config
129129
)
130130
+ prometheus.withLegendFormat('__auto'),
131131

@@ -197,19 +197,19 @@ local var = g.dashboard.variable;
197197
table.new('CPU Quota')
198198
+ table.gridPos.withW(24)
199199
+ table.queryOptions.withTargets([
200-
prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{%(clusterLabel)s="$cluster", namespace="$namespace"}) by (pod)' % $._config)
200+
prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{%(clusterLabel)s="$cluster", namespace="$namespace"}) by (pod)' % $._config)
201201
+ prometheus.withInstant(true)
202202
+ prometheus.withFormat('table'),
203203
prometheus.new('${datasource}', 'sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{%(clusterLabel)s="$cluster", namespace="$namespace"}) by (pod)' % $._config)
204204
+ prometheus.withInstant(true)
205205
+ prometheus.withFormat('table'),
206-
prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{%(clusterLabel)s="$cluster", namespace="$namespace"}) by (pod) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{%(clusterLabel)s="$cluster", namespace="$namespace"}) by (pod)' % $._config)
206+
prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{%(clusterLabel)s="$cluster", namespace="$namespace"}) by (pod) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{%(clusterLabel)s="$cluster", namespace="$namespace"}) by (pod)' % $._config)
207207
+ prometheus.withInstant(true)
208208
+ prometheus.withFormat('table'),
209209
prometheus.new('${datasource}', 'sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{%(clusterLabel)s="$cluster", namespace="$namespace"}) by (pod)' % $._config)
210210
+ prometheus.withInstant(true)
211211
+ prometheus.withFormat('table'),
212-
prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{%(clusterLabel)s="$cluster", namespace="$namespace"}) by (pod) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{%(clusterLabel)s="$cluster", namespace="$namespace"}) by (pod)' % $._config)
212+
prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{%(clusterLabel)s="$cluster", namespace="$namespace"}) by (pod) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{%(clusterLabel)s="$cluster", namespace="$namespace"}) by (pod)' % $._config)
213213
+ prometheus.withInstant(true)
214214
+ prometheus.withFormat('table'),
215215
])

dashboards/resources/node.libsonnet

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ local var = g.dashboard.variable;
8787

8888
prometheus.new(
8989
'${datasource}',
90-
'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{%(clusterLabel)s="$cluster", node=~"$node"}) by (pod)' % $._config,
90+
'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{%(clusterLabel)s="$cluster", node=~"$node"}) by (pod)' % $._config,
9191
)
9292
+ prometheus.withLegendFormat('{{pod}}'),
9393
])
@@ -105,19 +105,19 @@ local var = g.dashboard.variable;
105105

106106
table.new('CPU Quota')
107107
+ table.queryOptions.withTargets([
108-
prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{%(clusterLabel)s="$cluster", node=~"$node"}) by (pod)' % $._config)
108+
prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{%(clusterLabel)s="$cluster", node=~"$node"}) by (pod)' % $._config)
109109
+ prometheus.withInstant(true)
110110
+ prometheus.withFormat('table'),
111111
prometheus.new('${datasource}', 'sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{%(clusterLabel)s="$cluster", node=~"$node"}) by (pod)' % $._config)
112112
+ prometheus.withInstant(true)
113113
+ prometheus.withFormat('table'),
114-
prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{%(clusterLabel)s="$cluster", node=~"$node"}) by (pod) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{%(clusterLabel)s="$cluster", node=~"$node"}) by (pod)' % $._config)
114+
prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{%(clusterLabel)s="$cluster", node=~"$node"}) by (pod) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{%(clusterLabel)s="$cluster", node=~"$node"}) by (pod)' % $._config)
115115
+ prometheus.withInstant(true)
116116
+ prometheus.withFormat('table'),
117117
prometheus.new('${datasource}', 'sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{%(clusterLabel)s="$cluster", node=~"$node"}) by (pod)' % $._config)
118118
+ prometheus.withInstant(true)
119119
+ prometheus.withFormat('table'),
120-
prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{%(clusterLabel)s="$cluster", node=~"$node"}) by (pod) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{%(clusterLabel)s="$cluster", node=~"$node"}) by (pod)' % $._config)
120+
prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{%(clusterLabel)s="$cluster", node=~"$node"}) by (pod) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{%(clusterLabel)s="$cluster", node=~"$node"}) by (pod)' % $._config)
121121
+ prometheus.withInstant(true)
122122
+ prometheus.withFormat('table'),
123123
])

dashboards/resources/pod.libsonnet

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ local var = g.dashboard.variable;
9494
+ tsPanel.queryOptions.withTargets([
9595
prometheus.new(
9696
'${datasource}',
97-
'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace="$namespace", pod="$pod", %(clusterLabel)s="$cluster", container!=""}) by (container)' % $._config
97+
'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{namespace="$namespace", pod="$pod", %(clusterLabel)s="$cluster", container!=""}) by (container)' % $._config
9898
)
9999
+ prometheus.withLegendFormat('__auto'),
100100

@@ -208,19 +208,19 @@ local var = g.dashboard.variable;
208208
table.new('CPU Quota')
209209
+ table.gridPos.withW(24)
210210
+ table.queryOptions.withTargets([
211-
prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container!=""}) by (container)' % $._config)
211+
prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container!=""}) by (container)' % $._config)
212212
+ prometheus.withInstant(true)
213213
+ prometheus.withFormat('table'),
214214
prometheus.new('${datasource}', 'sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container!=""}) by (container)' % $._config)
215215
+ prometheus.withInstant(true)
216216
+ prometheus.withFormat('table'),
217-
prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container!=""}) by (container) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container!=""}) by (container)' % $._config)
217+
prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container!=""}) by (container) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container!=""}) by (container)' % $._config)
218218
+ prometheus.withInstant(true)
219219
+ prometheus.withFormat('table'),
220220
prometheus.new('${datasource}', 'sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container!=""}) by (container)' % $._config)
221221
+ prometheus.withInstant(true)
222222
+ prometheus.withFormat('table'),
223-
prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container!=""}) by (container) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container!=""}) by (container)' % $._config)
223+
prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container!=""}) by (container) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container!=""}) by (container)' % $._config)
224224
+ prometheus.withInstant(true)
225225
+ prometheus.withFormat('table'),
226226
])

dashboards/resources/workload-namespace.libsonnet

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ local var = g.dashboard.variable;
9191

9292
local cpuUsageQuery = |||
9393
sum(
94-
node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{%(clusterLabel)s="$cluster", namespace="$namespace"}
94+
node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{%(clusterLabel)s="$cluster", namespace="$namespace"}
9595
* on(namespace,pod)
9696
group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", namespace="$namespace", workload_type=~"$type"}
9797
) by (workload, workload_type)

dashboards/resources/workload.libsonnet

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ local var = g.dashboard.variable;
103103

104104
local cpuUsageQuery = |||
105105
sum(
106-
node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{%(clusterLabel)s="$cluster", namespace="$namespace"}
106+
node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{%(clusterLabel)s="$cluster", namespace="$namespace"}
107107
* on(namespace,pod)
108108
group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", namespace="$namespace", workload="$workload", workload_type=~"$type"}
109109
) by (pod)

0 commit comments

Comments
 (0)