From 943b74dd67bd034e76e6ca925aca5642bb9b3e69 Mon Sep 17 00:00:00 2001 From: Serena K Date: Thu, 6 Feb 2025 13:38:14 -0500 Subject: [PATCH 01/11] chore: revert irate function back to rate --- dashboards/resources/cluster.libsonnet | 8 ++++---- dashboards/resources/multi-cluster.libsonnet | 8 ++++---- dashboards/resources/namespace.libsonnet | 12 ++++++------ dashboards/resources/node.libsonnet | 8 ++++---- dashboards/resources/pod.libsonnet | 8 ++++---- dashboards/resources/workload-namespace.libsonnet | 2 +- dashboards/resources/workload.libsonnet | 2 +- rules/apps.libsonnet | 4 ++-- tests/tests.yaml | 4 ++-- 9 files changed, 28 insertions(+), 28 deletions(-) diff --git a/dashboards/resources/cluster.libsonnet b/dashboards/resources/cluster.libsonnet index 9f83c905e..789f46dea 100644 --- a/dashboards/resources/cluster.libsonnet +++ b/dashboards/resources/cluster.libsonnet @@ -129,7 +129,7 @@ local var = g.dashboard.variable; + tsPanel.queryOptions.withTargets([ prometheus.new( '${datasource}', - 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config + 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config ) + prometheus.withLegendFormat('__auto'), ]), @@ -144,7 +144,7 @@ local var = g.dashboard.variable; + prometheus.withInstant(true) + prometheus.withFormat('table'), - prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config) + prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config) + prometheus.withInstant(true) + prometheus.withFormat('table'), @@ -152,7 +152,7 @@ local var = g.dashboard.variable; + prometheus.withInstant(true) + prometheus.withFormat('table'), - 
prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{%(clusterLabel)s="$cluster"}) by (namespace) / sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config) + prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{%(clusterLabel)s="$cluster"}) by (namespace) / sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config) + prometheus.withInstant(true) + prometheus.withFormat('table'), @@ -160,7 +160,7 @@ local var = g.dashboard.variable; + prometheus.withInstant(true) + prometheus.withFormat('table'), - prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{%(clusterLabel)s="$cluster"}) by (namespace) / sum(namespace_cpu:kube_pod_container_resource_limits:sum{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config) + prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{%(clusterLabel)s="$cluster"}) by (namespace) / sum(namespace_cpu:kube_pod_container_resource_limits:sum{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config) + prometheus.withInstant(true) + prometheus.withFormat('table'), ]) diff --git a/dashboards/resources/multi-cluster.libsonnet b/dashboards/resources/multi-cluster.libsonnet index e68734967..56aa70087 100644 --- a/dashboards/resources/multi-cluster.libsonnet +++ b/dashboards/resources/multi-cluster.libsonnet @@ -99,7 +99,7 @@ local var = g.dashboard.variable; cpuUsage: [ tsPanel.new('CPU Usage') + tsPanel.queryOptions.withTargets([ - prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate) by (%(clusterLabel)s)' % $._config) + prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate) by 
(%(clusterLabel)s)' % $._config) + prometheus.withLegendFormat('__auto'), ]), ], @@ -107,19 +107,19 @@ local var = g.dashboard.variable; cpuQuota: [ g.panel.table.new('CPU Quota') + g.panel.table.queryOptions.withTargets([ - prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate) by (%(clusterLabel)s)' % $._config) + prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate) by (%(clusterLabel)s)' % $._config) + prometheus.withInstant(true) + prometheus.withFormat('table'), prometheus.new('${datasource}', 'sum(kube_pod_container_resource_requests{%(kubeStateMetricsSelector)s, resource="cpu"}) by (%(clusterLabel)s)' % $._config) + prometheus.withInstant(true) + prometheus.withFormat('table'), - prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate) by (%(clusterLabel)s) / sum(kube_pod_container_resource_requests{%(kubeStateMetricsSelector)s, resource="cpu"}) by (%(clusterLabel)s)' % $._config) + prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate) by (%(clusterLabel)s) / sum(kube_pod_container_resource_requests{%(kubeStateMetricsSelector)s, resource="cpu"}) by (%(clusterLabel)s)' % $._config) + prometheus.withInstant(true) + prometheus.withFormat('table'), prometheus.new('${datasource}', 'sum(kube_pod_container_resource_limits{%(kubeStateMetricsSelector)s, resource="cpu"}) by (%(clusterLabel)s)' % $._config) + prometheus.withInstant(true) + prometheus.withFormat('table'), - prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate) by (%(clusterLabel)s) / sum(kube_pod_container_resource_limits{%(kubeStateMetricsSelector)s, resource="cpu"}) by (%(clusterLabel)s)' % $._config) + prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate) by 
(%(clusterLabel)s) / sum(kube_pod_container_resource_limits{%(kubeStateMetricsSelector)s, resource="cpu"}) by (%(clusterLabel)s)' % $._config) + prometheus.withInstant(true) + prometheus.withFormat('table'), ]) diff --git a/dashboards/resources/namespace.libsonnet b/dashboards/resources/namespace.libsonnet index 50ab9011b..a60729511 100644 --- a/dashboards/resources/namespace.libsonnet +++ b/dashboards/resources/namespace.libsonnet @@ -91,7 +91,7 @@ local var = g.dashboard.variable; statPanel( 'CPU Utilisation (from requests)', 'percentunit', - 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{%(clusterLabel)s="$cluster", namespace="$namespace"}) / sum(kube_pod_container_resource_requests{%(kubeStateMetricsSelector)s, %(clusterLabel)s="$cluster", namespace="$namespace", resource="cpu"})' % $._config + 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{%(clusterLabel)s="$cluster", namespace="$namespace"}) / sum(kube_pod_container_resource_requests{%(kubeStateMetricsSelector)s, %(clusterLabel)s="$cluster", namespace="$namespace", resource="cpu"})' % $._config ) + stat.gridPos.withW(6) + stat.gridPos.withH(3), @@ -99,7 +99,7 @@ local var = g.dashboard.variable; statPanel( 'CPU Utilisation (from limits)', 'percentunit', - 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{%(clusterLabel)s="$cluster", namespace="$namespace"}) / sum(kube_pod_container_resource_limits{%(kubeStateMetricsSelector)s, %(clusterLabel)s="$cluster", namespace="$namespace", resource="cpu"})' % $._config + 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{%(clusterLabel)s="$cluster", namespace="$namespace"}) / sum(kube_pod_container_resource_limits{%(kubeStateMetricsSelector)s, %(clusterLabel)s="$cluster", namespace="$namespace", resource="cpu"})' % $._config ) + stat.gridPos.withW(6) + stat.gridPos.withH(3), @@ -125,7 +125,7 @@ local var = g.dashboard.variable; + 
tsPanel.queryOptions.withTargets([ prometheus.new( '${datasource}', - 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{%(clusterLabel)s="$cluster", namespace="$namespace"}) by (pod)' % $._config + 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{%(clusterLabel)s="$cluster", namespace="$namespace"}) by (pod)' % $._config ) + prometheus.withLegendFormat('__auto'), @@ -197,19 +197,19 @@ local var = g.dashboard.variable; table.new('CPU Quota') + table.gridPos.withW(24) + table.queryOptions.withTargets([ - prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{%(clusterLabel)s="$cluster", namespace="$namespace"}) by (pod)' % $._config) + prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{%(clusterLabel)s="$cluster", namespace="$namespace"}) by (pod)' % $._config) + prometheus.withInstant(true) + prometheus.withFormat('table'), prometheus.new('${datasource}', 'sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{%(clusterLabel)s="$cluster", namespace="$namespace"}) by (pod)' % $._config) + prometheus.withInstant(true) + prometheus.withFormat('table'), - prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{%(clusterLabel)s="$cluster", namespace="$namespace"}) by (pod) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{%(clusterLabel)s="$cluster", namespace="$namespace"}) by (pod)' % $._config) + prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{%(clusterLabel)s="$cluster", namespace="$namespace"}) by (pod) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{%(clusterLabel)s="$cluster", namespace="$namespace"}) by (pod)' % $._config) + prometheus.withInstant(true) + prometheus.withFormat('table'), 
prometheus.new('${datasource}', 'sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{%(clusterLabel)s="$cluster", namespace="$namespace"}) by (pod)' % $._config) + prometheus.withInstant(true) + prometheus.withFormat('table'), - prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{%(clusterLabel)s="$cluster", namespace="$namespace"}) by (pod) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{%(clusterLabel)s="$cluster", namespace="$namespace"}) by (pod)' % $._config) + prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{%(clusterLabel)s="$cluster", namespace="$namespace"}) by (pod) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{%(clusterLabel)s="$cluster", namespace="$namespace"}) by (pod)' % $._config) + prometheus.withInstant(true) + prometheus.withFormat('table'), ]) diff --git a/dashboards/resources/node.libsonnet b/dashboards/resources/node.libsonnet index 35282ef62..2e8e0d9bb 100644 --- a/dashboards/resources/node.libsonnet +++ b/dashboards/resources/node.libsonnet @@ -87,7 +87,7 @@ local var = g.dashboard.variable; prometheus.new( '${datasource}', - 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{%(clusterLabel)s="$cluster", node=~"$node"}) by (pod)' % $._config, + 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{%(clusterLabel)s="$cluster", node=~"$node"}) by (pod)' % $._config, ) + prometheus.withLegendFormat('{{pod}}'), ]) @@ -105,19 +105,19 @@ local var = g.dashboard.variable; table.new('CPU Quota') + table.queryOptions.withTargets([ - prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{%(clusterLabel)s="$cluster", node=~"$node"}) by (pod)' % $._config) + prometheus.new('${datasource}', 
'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{%(clusterLabel)s="$cluster", node=~"$node"}) by (pod)' % $._config) + prometheus.withInstant(true) + prometheus.withFormat('table'), prometheus.new('${datasource}', 'sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{%(clusterLabel)s="$cluster", node=~"$node"}) by (pod)' % $._config) + prometheus.withInstant(true) + prometheus.withFormat('table'), - prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{%(clusterLabel)s="$cluster", node=~"$node"}) by (pod) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{%(clusterLabel)s="$cluster", node=~"$node"}) by (pod)' % $._config) + prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{%(clusterLabel)s="$cluster", node=~"$node"}) by (pod) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{%(clusterLabel)s="$cluster", node=~"$node"}) by (pod)' % $._config) + prometheus.withInstant(true) + prometheus.withFormat('table'), prometheus.new('${datasource}', 'sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{%(clusterLabel)s="$cluster", node=~"$node"}) by (pod)' % $._config) + prometheus.withInstant(true) + prometheus.withFormat('table'), - prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{%(clusterLabel)s="$cluster", node=~"$node"}) by (pod) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{%(clusterLabel)s="$cluster", node=~"$node"}) by (pod)' % $._config) + prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{%(clusterLabel)s="$cluster", node=~"$node"}) by (pod) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{%(clusterLabel)s="$cluster", node=~"$node"}) by (pod)' % $._config) + 
prometheus.withInstant(true) + prometheus.withFormat('table'), ]) diff --git a/dashboards/resources/pod.libsonnet b/dashboards/resources/pod.libsonnet index b7be85fd5..f869e9810 100644 --- a/dashboards/resources/pod.libsonnet +++ b/dashboards/resources/pod.libsonnet @@ -94,7 +94,7 @@ local var = g.dashboard.variable; + tsPanel.queryOptions.withTargets([ prometheus.new( '${datasource}', - 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace="$namespace", pod="$pod", %(clusterLabel)s="$cluster", container!=""}) by (container)' % $._config + 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{namespace="$namespace", pod="$pod", %(clusterLabel)s="$cluster", container!=""}) by (container)' % $._config ) + prometheus.withLegendFormat('__auto'), @@ -208,19 +208,19 @@ local var = g.dashboard.variable; table.new('CPU Quota') + table.gridPos.withW(24) + table.queryOptions.withTargets([ - prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container!=""}) by (container)' % $._config) + prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container!=""}) by (container)' % $._config) + prometheus.withInstant(true) + prometheus.withFormat('table'), prometheus.new('${datasource}', 'sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container!=""}) by (container)' % $._config) + prometheus.withInstant(true) + prometheus.withFormat('table'), - prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container!=""}) by (container) / 
sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container!=""}) by (container)' % $._config) + prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container!=""}) by (container) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container!=""}) by (container)' % $._config) + prometheus.withInstant(true) + prometheus.withFormat('table'), prometheus.new('${datasource}', 'sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container!=""}) by (container)' % $._config) + prometheus.withInstant(true) + prometheus.withFormat('table'), - prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container!=""}) by (container) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container!=""}) by (container)' % $._config) + prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container!=""}) by (container) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container!=""}) by (container)' % $._config) + prometheus.withInstant(true) + prometheus.withFormat('table'), ]) diff --git a/dashboards/resources/workload-namespace.libsonnet b/dashboards/resources/workload-namespace.libsonnet index cbe8b08d3..1b07ffb93 100644 --- a/dashboards/resources/workload-namespace.libsonnet +++ 
b/dashboards/resources/workload-namespace.libsonnet @@ -91,7 +91,7 @@ local var = g.dashboard.variable; local cpuUsageQuery = ||| sum( - node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{%(clusterLabel)s="$cluster", namespace="$namespace"} + node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{%(clusterLabel)s="$cluster", namespace="$namespace"} * on(namespace,pod) group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", namespace="$namespace", workload_type=~"$type"} ) by (workload, workload_type) diff --git a/dashboards/resources/workload.libsonnet b/dashboards/resources/workload.libsonnet index 21ffcdbd8..c442fb5d1 100644 --- a/dashboards/resources/workload.libsonnet +++ b/dashboards/resources/workload.libsonnet @@ -103,7 +103,7 @@ local var = g.dashboard.variable; local cpuUsageQuery = ||| sum( - node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{%(clusterLabel)s="$cluster", namespace="$namespace"} + node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{%(clusterLabel)s="$cluster", namespace="$namespace"} * on(namespace,pod) group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", namespace="$namespace", workload="$workload", workload_type=~"$type"} ) by (pod) diff --git a/rules/apps.libsonnet b/rules/apps.libsonnet index 7cd9c0804..1dc319c44 100644 --- a/rules/apps.libsonnet +++ b/rules/apps.libsonnet @@ -13,10 +13,10 @@ // Reduces cardinality of this timeseries by #cores, which makes it // more useable in dashboards. Also, allows us to do things like // quantile_over_time(...) which would otherwise not be possible. 
- record: 'node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate', + record: 'node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate', expr: ||| sum by (%(clusterLabel)s, namespace, pod, container) ( - irate(container_cpu_usage_seconds_total{%(cadvisorSelector)s, image!=""}[5m]) + rate(container_cpu_usage_seconds_total{%(cadvisorSelector)s, image!=""}[5m]) ) * on (%(clusterLabel)s, namespace, pod) group_left(node) topk by (%(clusterLabel)s, namespace, pod) ( 1, max by(%(clusterLabel)s, namespace, pod, node) (kube_pod_info{node!=""}) ) diff --git a/tests/tests.yaml b/tests/tests.yaml index 49a81c730..875c4a147 100644 --- a/tests/tests.yaml +++ b/tests/tests.yaml @@ -773,10 +773,10 @@ tests: values: '1+0x5' promql_expr_test: - eval_time: 5m - expr: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate + expr: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate exp_samples: - value: 5.0e-2 - labels: 'node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster="kubernetes",namespace="monitoring", pod="alertmanager-main-0", container="alertmanager",node="node1"}' + labels: 'node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster="kubernetes",namespace="monitoring", pod="alertmanager-main-0", container="alertmanager",node="node1"}' - interval: 1m input_series: From d9264b5c22f3d8bf52394401b92b4dc371f7a10e Mon Sep 17 00:00:00 2001 From: Serena K Date: Fri, 7 Feb 2025 08:06:16 -0500 Subject: [PATCH 02/11] chore: keep irate for node_namespace_pod_container and add rate in as well --- rules/apps.libsonnet | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/rules/apps.libsonnet b/rules/apps.libsonnet index 1dc319c44..decc0125c 100644 --- a/rules/apps.libsonnet +++ b/rules/apps.libsonnet @@ -13,7 +13,7 @@ // Reduces cardinality of this timeseries by #cores, which makes it // more useable in dashboards. 
Also, allows us to do things like // quantile_over_time(...) which would otherwise not be possible. - record: 'node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate', + record: 'node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m', expr: ||| sum by (%(clusterLabel)s, namespace, pod, container) ( rate(container_cpu_usage_seconds_total{%(cadvisorSelector)s, image!=""}[5m]) @@ -22,6 +22,16 @@ ) ||| % $._config, }, + { + record: 'node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate', + expr: ||| + sum by (%(clusterLabel)s, namespace, pod, container) ( + irate(container_cpu_usage_seconds_total{%(cadvisorSelector)s, image!=""}[5m]) + ) * on (%(clusterLabel)s, namespace, pod) group_left(node) topk by (%(clusterLabel)s, namespace, pod) ( + 1, max by(%(clusterLabel)s, namespace, pod, node) (kube_pod_info{node!=""}) + ) + ||| % $._config, + }, ], }, { From c7ba140fa79ef463a5dd00f45f53f03ac9777ab5 Mon Sep 17 00:00:00 2001 From: Serena K Date: Fri, 7 Feb 2025 13:31:25 -0500 Subject: [PATCH 03/11] chore: add test for apps --- tests/tests.yaml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/tests.yaml b/tests/tests.yaml index 875c4a147..54c5c0ca8 100644 --- a/tests/tests.yaml +++ b/tests/tests.yaml @@ -773,7 +773,12 @@ tests: values: '1+0x5' promql_expr_test: - eval_time: 5m - expr: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate + expr: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate + exp_samples: + - value: 5.0e-2 + labels: 'node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster="kubernetes",namespace="monitoring", pod="alertmanager-main-0", container="alertmanager",node="node1"}' + - eval_time: 5m + expr: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m exp_samples: - value: 5.0e-2 labels: 
'node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster="kubernetes",namespace="monitoring", pod="alertmanager-main-0", container="alertmanager",node="node1"}' From ae2c9a66704bb47503a20f1d82ea30527d232e64 Mon Sep 17 00:00:00 2001 From: Serena <26805916+sleepyfoodie@users.noreply.github.com> Date: Fri, 14 Feb 2025 13:53:35 -0500 Subject: [PATCH 04/11] Update tests/tests.yaml Co-authored-by: Stephen Lang --- tests/tests.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/tests.yaml b/tests/tests.yaml index 54c5c0ca8..8977d712c 100644 --- a/tests/tests.yaml +++ b/tests/tests.yaml @@ -781,7 +781,7 @@ tests: expr: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m exp_samples: - value: 5.0e-2 - labels: 'node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster="kubernetes",namespace="monitoring", pod="alertmanager-main-0", container="alertmanager",node="node1"}' + labels: 'node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{cluster="kubernetes",namespace="monitoring", pod="alertmanager-main-0", container="alertmanager",node="node1"}' - interval: 1m input_series: From c2a9315d642ff9d1d258bfaffae07e09e9da04ed Mon Sep 17 00:00:00 2001 From: Serena K Date: Fri, 14 Feb 2025 14:51:18 -0500 Subject: [PATCH 05/11] chore: add 5m to queries to match recording name --- dashboards/resources/cluster.libsonnet | 8 ++++---- dashboards/resources/multi-cluster.libsonnet | 8 ++++---- dashboards/resources/namespace.libsonnet | 12 ++++++------ dashboards/resources/node.libsonnet | 8 ++++---- dashboards/resources/pod.libsonnet | 8 ++++---- dashboards/resources/workload-namespace.libsonnet | 2 +- dashboards/resources/workload.libsonnet | 2 +- 7 files changed, 24 insertions(+), 24 deletions(-) diff --git a/dashboards/resources/cluster.libsonnet b/dashboards/resources/cluster.libsonnet index 789f46dea..e06522ad2 100644 --- a/dashboards/resources/cluster.libsonnet +++ 
b/dashboards/resources/cluster.libsonnet @@ -129,7 +129,7 @@ local var = g.dashboard.variable; + tsPanel.queryOptions.withTargets([ prometheus.new( '${datasource}', - 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config + 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config ) + prometheus.withLegendFormat('__auto'), ]), @@ -144,7 +144,7 @@ local var = g.dashboard.variable; + prometheus.withInstant(true) + prometheus.withFormat('table'), - prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config) + prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config) + prometheus.withInstant(true) + prometheus.withFormat('table'), @@ -152,7 +152,7 @@ local var = g.dashboard.variable; + prometheus.withInstant(true) + prometheus.withFormat('table'), - prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{%(clusterLabel)s="$cluster"}) by (namespace) / sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config) + prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{%(clusterLabel)s="$cluster"}) by (namespace) / sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config) + prometheus.withInstant(true) + prometheus.withFormat('table'), @@ -160,7 +160,7 @@ local var = g.dashboard.variable; + prometheus.withInstant(true) + prometheus.withFormat('table'), - prometheus.new('${datasource}', 
'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{%(clusterLabel)s="$cluster"}) by (namespace) / sum(namespace_cpu:kube_pod_container_resource_limits:sum{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config) + prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{%(clusterLabel)s="$cluster"}) by (namespace) / sum(namespace_cpu:kube_pod_container_resource_limits:sum{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config) + prometheus.withInstant(true) + prometheus.withFormat('table'), ]) diff --git a/dashboards/resources/multi-cluster.libsonnet b/dashboards/resources/multi-cluster.libsonnet index 56aa70087..9807327ba 100644 --- a/dashboards/resources/multi-cluster.libsonnet +++ b/dashboards/resources/multi-cluster.libsonnet @@ -99,7 +99,7 @@ local var = g.dashboard.variable; cpuUsage: [ tsPanel.new('CPU Usage') + tsPanel.queryOptions.withTargets([ - prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate) by (%(clusterLabel)s)' % $._config) + prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m) by (%(clusterLabel)s)' % $._config) + prometheus.withLegendFormat('__auto'), ]), ], @@ -107,19 +107,19 @@ local var = g.dashboard.variable; cpuQuota: [ g.panel.table.new('CPU Quota') + g.panel.table.queryOptions.withTargets([ - prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate) by (%(clusterLabel)s)' % $._config) + prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m) by (%(clusterLabel)s)' % $._config) + prometheus.withInstant(true) + prometheus.withFormat('table'), prometheus.new('${datasource}', 'sum(kube_pod_container_resource_requests{%(kubeStateMetricsSelector)s, resource="cpu"}) by (%(clusterLabel)s)' % $._config) + prometheus.withInstant(true) + 
prometheus.withFormat('table'), - prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate) by (%(clusterLabel)s) / sum(kube_pod_container_resource_requests{%(kubeStateMetricsSelector)s, resource="cpu"}) by (%(clusterLabel)s)' % $._config) + prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m) by (%(clusterLabel)s) / sum(kube_pod_container_resource_requests{%(kubeStateMetricsSelector)s, resource="cpu"}) by (%(clusterLabel)s)' % $._config) + prometheus.withInstant(true) + prometheus.withFormat('table'), prometheus.new('${datasource}', 'sum(kube_pod_container_resource_limits{%(kubeStateMetricsSelector)s, resource="cpu"}) by (%(clusterLabel)s)' % $._config) + prometheus.withInstant(true) + prometheus.withFormat('table'), - prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate) by (%(clusterLabel)s) / sum(kube_pod_container_resource_limits{%(kubeStateMetricsSelector)s, resource="cpu"}) by (%(clusterLabel)s)' % $._config) + prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m) by (%(clusterLabel)s) / sum(kube_pod_container_resource_limits{%(kubeStateMetricsSelector)s, resource="cpu"}) by (%(clusterLabel)s)' % $._config) + prometheus.withInstant(true) + prometheus.withFormat('table'), ]) diff --git a/dashboards/resources/namespace.libsonnet b/dashboards/resources/namespace.libsonnet index a60729511..4423f8968 100644 --- a/dashboards/resources/namespace.libsonnet +++ b/dashboards/resources/namespace.libsonnet @@ -91,7 +91,7 @@ local var = g.dashboard.variable; statPanel( 'CPU Utilisation (from requests)', 'percentunit', - 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{%(clusterLabel)s="$cluster", namespace="$namespace"}) / sum(kube_pod_container_resource_requests{%(kubeStateMetricsSelector)s, %(clusterLabel)s="$cluster", 
namespace="$namespace", resource="cpu"})' % $._config + 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{%(clusterLabel)s="$cluster", namespace="$namespace"}) / sum(kube_pod_container_resource_requests{%(kubeStateMetricsSelector)s, %(clusterLabel)s="$cluster", namespace="$namespace", resource="cpu"})' % $._config ) + stat.gridPos.withW(6) + stat.gridPos.withH(3), @@ -99,7 +99,7 @@ local var = g.dashboard.variable; statPanel( 'CPU Utilisation (from limits)', 'percentunit', - 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{%(clusterLabel)s="$cluster", namespace="$namespace"}) / sum(kube_pod_container_resource_limits{%(kubeStateMetricsSelector)s, %(clusterLabel)s="$cluster", namespace="$namespace", resource="cpu"})' % $._config + 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{%(clusterLabel)s="$cluster", namespace="$namespace"}) / sum(kube_pod_container_resource_limits{%(kubeStateMetricsSelector)s, %(clusterLabel)s="$cluster", namespace="$namespace", resource="cpu"})' % $._config ) + stat.gridPos.withW(6) + stat.gridPos.withH(3), @@ -125,7 +125,7 @@ local var = g.dashboard.variable; + tsPanel.queryOptions.withTargets([ prometheus.new( '${datasource}', - 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{%(clusterLabel)s="$cluster", namespace="$namespace"}) by (pod)' % $._config + 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{%(clusterLabel)s="$cluster", namespace="$namespace"}) by (pod)' % $._config ) + prometheus.withLegendFormat('__auto'), @@ -197,19 +197,19 @@ local var = g.dashboard.variable; table.new('CPU Quota') + table.gridPos.withW(24) + table.queryOptions.withTargets([ - prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{%(clusterLabel)s="$cluster", namespace="$namespace"}) by (pod)' % $._config) + prometheus.new('${datasource}', 
'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{%(clusterLabel)s="$cluster", namespace="$namespace"}) by (pod)' % $._config) + prometheus.withInstant(true) + prometheus.withFormat('table'), prometheus.new('${datasource}', 'sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{%(clusterLabel)s="$cluster", namespace="$namespace"}) by (pod)' % $._config) + prometheus.withInstant(true) + prometheus.withFormat('table'), - prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{%(clusterLabel)s="$cluster", namespace="$namespace"}) by (pod) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{%(clusterLabel)s="$cluster", namespace="$namespace"}) by (pod)' % $._config) + prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{%(clusterLabel)s="$cluster", namespace="$namespace"}) by (pod) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{%(clusterLabel)s="$cluster", namespace="$namespace"}) by (pod)' % $._config) + prometheus.withInstant(true) + prometheus.withFormat('table'), prometheus.new('${datasource}', 'sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{%(clusterLabel)s="$cluster", namespace="$namespace"}) by (pod)' % $._config) + prometheus.withInstant(true) + prometheus.withFormat('table'), - prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{%(clusterLabel)s="$cluster", namespace="$namespace"}) by (pod) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{%(clusterLabel)s="$cluster", namespace="$namespace"}) by (pod)' % $._config) + prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{%(clusterLabel)s="$cluster", namespace="$namespace"}) by (pod) / 
sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{%(clusterLabel)s="$cluster", namespace="$namespace"}) by (pod)' % $._config) + prometheus.withInstant(true) + prometheus.withFormat('table'), ]) diff --git a/dashboards/resources/node.libsonnet b/dashboards/resources/node.libsonnet index 2e8e0d9bb..7ea8d4cff 100644 --- a/dashboards/resources/node.libsonnet +++ b/dashboards/resources/node.libsonnet @@ -87,7 +87,7 @@ local var = g.dashboard.variable; prometheus.new( '${datasource}', - 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{%(clusterLabel)s="$cluster", node=~"$node"}) by (pod)' % $._config, + 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{%(clusterLabel)s="$cluster", node=~"$node"}) by (pod)' % $._config, ) + prometheus.withLegendFormat('{{pod}}'), ]) @@ -105,19 +105,19 @@ local var = g.dashboard.variable; table.new('CPU Quota') + table.queryOptions.withTargets([ - prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{%(clusterLabel)s="$cluster", node=~"$node"}) by (pod)' % $._config) + prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{%(clusterLabel)s="$cluster", node=~"$node"}) by (pod)' % $._config) + prometheus.withInstant(true) + prometheus.withFormat('table'), prometheus.new('${datasource}', 'sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{%(clusterLabel)s="$cluster", node=~"$node"}) by (pod)' % $._config) + prometheus.withInstant(true) + prometheus.withFormat('table'), - prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{%(clusterLabel)s="$cluster", node=~"$node"}) by (pod) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{%(clusterLabel)s="$cluster", node=~"$node"}) by (pod)' % $._config) + prometheus.new('${datasource}', 
'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{%(clusterLabel)s="$cluster", node=~"$node"}) by (pod) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{%(clusterLabel)s="$cluster", node=~"$node"}) by (pod)' % $._config) + prometheus.withInstant(true) + prometheus.withFormat('table'), prometheus.new('${datasource}', 'sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{%(clusterLabel)s="$cluster", node=~"$node"}) by (pod)' % $._config) + prometheus.withInstant(true) + prometheus.withFormat('table'), - prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{%(clusterLabel)s="$cluster", node=~"$node"}) by (pod) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{%(clusterLabel)s="$cluster", node=~"$node"}) by (pod)' % $._config) + prometheus.new('${datasource}', 'sum(5m{%(clusterLabel)s="$cluster", node=~"$node"}) by (pod) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{%(clusterLabel)s="$cluster", node=~"$node"}) by (pod)' % $._config) + prometheus.withInstant(true) + prometheus.withFormat('table'), ]) diff --git a/dashboards/resources/pod.libsonnet b/dashboards/resources/pod.libsonnet index f869e9810..ae623fc62 100644 --- a/dashboards/resources/pod.libsonnet +++ b/dashboards/resources/pod.libsonnet @@ -94,7 +94,7 @@ local var = g.dashboard.variable; + tsPanel.queryOptions.withTargets([ prometheus.new( '${datasource}', - 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{namespace="$namespace", pod="$pod", %(clusterLabel)s="$cluster", container!=""}) by (container)' % $._config + 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{namespace="$namespace", pod="$pod", %(clusterLabel)s="$cluster", container!=""}) by (container)' % $._config ) + prometheus.withLegendFormat('__auto'), @@ -208,19 +208,19 @@ local var = g.dashboard.variable; 
table.new('CPU Quota') + table.gridPos.withW(24) + table.queryOptions.withTargets([ - prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container!=""}) by (container)' % $._config) + prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container!=""}) by (container)' % $._config) + prometheus.withInstant(true) + prometheus.withFormat('table'), prometheus.new('${datasource}', 'sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container!=""}) by (container)' % $._config) + prometheus.withInstant(true) + prometheus.withFormat('table'), - prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container!=""}) by (container) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container!=""}) by (container)' % $._config) + prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container!=""}) by (container) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container!=""}) by (container)' % $._config) + prometheus.withInstant(true) + prometheus.withFormat('table'), prometheus.new('${datasource}', 'sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container!=""}) by (container)' % $._config) + prometheus.withInstant(true) + 
prometheus.withFormat('table'), - prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container!=""}) by (container) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container!=""}) by (container)' % $._config) + prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container!=""}) by (container) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container!=""}) by (container)' % $._config) + prometheus.withInstant(true) + prometheus.withFormat('table'), ]) diff --git a/dashboards/resources/workload-namespace.libsonnet b/dashboards/resources/workload-namespace.libsonnet index 1b07ffb93..2f781fb7c 100644 --- a/dashboards/resources/workload-namespace.libsonnet +++ b/dashboards/resources/workload-namespace.libsonnet @@ -91,7 +91,7 @@ local var = g.dashboard.variable; local cpuUsageQuery = ||| sum( - node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{%(clusterLabel)s="$cluster", namespace="$namespace"} + node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{%(clusterLabel)s="$cluster", namespace="$namespace"} * on(namespace,pod) group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", namespace="$namespace", workload_type=~"$type"} ) by (workload, workload_type) diff --git a/dashboards/resources/workload.libsonnet b/dashboards/resources/workload.libsonnet index c442fb5d1..996851c8a 100644 --- a/dashboards/resources/workload.libsonnet +++ b/dashboards/resources/workload.libsonnet @@ -103,7 +103,7 @@ local var = g.dashboard.variable; local cpuUsageQuery = ||| sum( 
- node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{%(clusterLabel)s="$cluster", namespace="$namespace"} + node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{%(clusterLabel)s="$cluster", namespace="$namespace"} * on(namespace,pod) group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", namespace="$namespace", workload="$workload", workload_type=~"$type"} ) by (pod) From 13c42ae734db46ef007e454a6291b19e9dd4fab3 Mon Sep 17 00:00:00 2001 From: Serena K Date: Fri, 14 Feb 2025 15:00:55 -0500 Subject: [PATCH 06/11] chore: update readme --- README.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/README.md b/README.md index 8e1d88f03..9f1e03e64 100644 --- a/README.md +++ b/README.md @@ -56,6 +56,19 @@ Maintainers can trigger the [release workflow](.github/workflows/release.yaml) b We wanted to backfill `release-0.1` to `release-0.12` to have a changelog, but we were not able to use a GitHub action in a newer commit to trigger a release that generates a changelog on older commits. See #489 for full discussion. +## Metrics Deprecation + +The following recording rule is marked deprecated. They will be removed in v2.0.0. + ```bash + node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate + ``` +It will be replaced by the following recording rule to +- use `rate` over `irate` in favour of preserving more points of data +- add `5m` in recording rule name to indicate rate + ```bash + node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m + ``` + ## How to use This mixin is designed to be vendored into the repo with your infrastructure config. 
To do this, use [jsonnet-bundler](https://github.com/jsonnet-bundler/jsonnet-bundler): From b39c5e8ce95c4089603fab4ddf47a4ba05742a0a Mon Sep 17 00:00:00 2001 From: Serena K Date: Fri, 14 Feb 2025 15:11:35 -0500 Subject: [PATCH 07/11] update readme --- README.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 9f1e03e64..c4e97e52f 100644 --- a/README.md +++ b/README.md @@ -62,12 +62,13 @@ The following recording rule is marked deprecated. They will be removed in v2.0. ```bash node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate ``` -It will be replaced by the following recording rule to -- use `rate` over `irate` in favour of preserving more points of data -- add `5m` in recording rule name to indicate rate +It will be replaced by the following recording rule ```bash node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m ``` +Reasons for replacement: +- use `rate` over `irate` in favour of preserving more points of data +- add `5m` in recording rule name to indicate rate ## How to use From 48b28f4b0978b7cbe46eaec4cc6d4f7086266bf5 Mon Sep 17 00:00:00 2001 From: Serena K Date: Fri, 14 Feb 2025 15:14:30 -0500 Subject: [PATCH 08/11] chore: fix readme formatting --- README.md | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index c4e97e52f..fbdca661f 100644 --- a/README.md +++ b/README.md @@ -59,16 +59,13 @@ We wanted to backfill `release-0.1` to `release-0.12` to have a changelog, but w ## Metrics Deprecation The following recording rule is marked deprecated. They will be removed in v2.0.0. 
- ```bash - node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate - ``` -It will be replaced by the following recording rule - ```bash - node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m - ``` -Reasons for replacement: -- use `rate` over `irate` in favour of preserving more points of data -- add `5m` in recording rule name to indicate rate +```bash +node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate +``` +It will be replaced by the following recording rule to preserve data points using `rate` and add `5m` to indicate rate in the recording rule name. +```bash +node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m +``` ## How to use From ff2894b453efd3603e38741c83b255e3bf7544ce Mon Sep 17 00:00:00 2001 From: Serena <26805916+sleepyfoodie@users.noreply.github.com> Date: Tue, 18 Feb 2025 08:40:28 -0500 Subject: [PATCH 09/11] Update README.md Co-authored-by: Stephen Lang --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index fbdca661f..5539b308f 100644 --- a/README.md +++ b/README.md @@ -62,7 +62,7 @@ The following recording rule is marked deprecated. They will be removed in v2.0. ```bash node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate ``` -It will be replaced by the following recording rule to preserve data points using `rate` and add `5m` to indicate rate in the recording rule name. +It will be replaced by the following recording rule to preserve data points using `rate` and add `5m` to indicate the range of the rate query in the recording rule name. 
```bash node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m ``` From b94046a9f41d9a8cf339fe8c297266baca11e934 Mon Sep 17 00:00:00 2001 From: Serena <26805916+sleepyfoodie@users.noreply.github.com> Date: Tue, 18 Feb 2025 08:41:55 -0500 Subject: [PATCH 10/11] Update dashboards/resources/node.libsonnet Co-authored-by: Stephen Lang --- dashboards/resources/node.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dashboards/resources/node.libsonnet b/dashboards/resources/node.libsonnet index 7ea8d4cff..e03fe29c4 100644 --- a/dashboards/resources/node.libsonnet +++ b/dashboards/resources/node.libsonnet @@ -117,7 +117,7 @@ local var = g.dashboard.variable; prometheus.new('${datasource}', 'sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{%(clusterLabel)s="$cluster", node=~"$node"}) by (pod)' % $._config) + prometheus.withInstant(true) + prometheus.withFormat('table'), - prometheus.new('${datasource}', 'sum(5m{%(clusterLabel)s="$cluster", node=~"$node"}) by (pod) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{%(clusterLabel)s="$cluster", node=~"$node"}) by (pod)' % $._config) + prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{%(clusterLabel)s="$cluster", node=~"$node"}) by (pod) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{%(clusterLabel)s="$cluster", node=~"$node"}) by (pod)' % $._config) + prometheus.withInstant(true) + prometheus.withFormat('table'), ]) From c7cf95c8b1e7bb556f2186e89bc5b44ce2d388e7 Mon Sep 17 00:00:00 2001 From: Serena K Date: Tue, 18 Feb 2025 08:50:03 -0500 Subject: [PATCH 11/11] try adding space --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 5539b308f..b2690f01f 100644 --- a/README.md +++ b/README.md @@ -58,11 +58,14 @@ We wanted to backfill `release-0.1` to `release-0.12` to have a changelog, but w ## 
Metrics Deprecation -The following recording rule is marked deprecated. They will be removed in v2.0.0. +The following recording rule is marked deprecated. It will be removed in v2.0.0. + ```bash node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate ``` + It will be replaced by the following recording rule to preserve data points using `rate` and add `5m` to indicate the range of the rate query in the recording rule name. + ```bash node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m ```