fix: CPUThrottlingHigh multiple series error (#1119)

skl · web-flow · commit 830bad01f230 · 2025-10-24T12:15:41.000+01:00
diff --git a/alerts/resource_alerts.libsonnet b/alerts/resource_alerts.libsonnet
@@ -216,10 +216,22 @@ local utils = import '../lib/utils.libsonnet';
           {
             alert: 'CPUThrottlingHigh',
             expr: |||
-              sum(increase(container_cpu_cfs_throttled_periods_total{container!="", %(cadvisorSelector)s, %(cpuThrottlingSelector)s}[5m])) without (id, metrics_path, name, image, endpoint, job, node)
-                / on (%(clusterLabel)s, %(namespaceLabel)s, pod, container, instance) group_left
-              sum(increase(container_cpu_cfs_periods_total{%(cadvisorSelector)s, %(cpuThrottlingSelector)s}[5m])) without (id, metrics_path, name, image, endpoint, job, node)
-                > ( %(cpuThrottlingPercent)s / 100 )
+              sum without (id, metrics_path, name, image, endpoint, job, node) (
+                topk by (%(clusterLabel)s, %(namespaceLabel)s, pod, container, instance) (1,
+                  increase(
+                    container_cpu_cfs_throttled_periods_total{container!="", %(cadvisorSelector)s, %(cpuThrottlingSelector)s}
+                  [5m])
+                )
+              )
+              / on (%(clusterLabel)s, %(namespaceLabel)s, pod, container, instance) group_left
+              sum without (id, metrics_path, name, image, endpoint, job, node) (
+                topk by (%(clusterLabel)s, %(namespaceLabel)s, pod, container, instance) (1,
+                  increase(
+                    container_cpu_cfs_periods_total{%(cadvisorSelector)s, %(cpuThrottlingSelector)s}
+                  [5m])
+                )
+              )
+              > ( %(cpuThrottlingPercent)s / 100 )
             ||| % $._config,
             'for': '15m',
             labels: {
diff --git a/tests/resource_alerts-test.yaml b/tests/resource_alerts-test.yaml
@@ -0,0 +1,31 @@
+rule_files:  # alert rules exercised by the tests below
+- ../prometheus_alerts.yaml
+
+tests:
+- interval: 1m  # spacing between consecutive samples of every input series
+  name: CPUThrottlingHigh fires when CPU throttling exceeds threshold
+  input_series:
+  - series: 'container_cpu_cfs_throttled_periods_total{cluster="cluster1", namespace="default", pod="test-pod", container="test-container", instance="node1", job="cadvisor"}'
+    values: '0+30x20'  # counter starts at 0 and grows by 30 per interval
+  - series: 'container_cpu_cfs_periods_total{cluster="cluster1", namespace="default", pod="test-pod", container="test-container", instance="node1", job="cadvisor"}'
+    values: '0+100x20'  # grows by 100 per interval, so throttled / total is 30 / 100
+  # Same periods series plus one extra label: the rule must still evaluate when the denominator side holds duplicate series.
+  - series: 'container_cpu_cfs_periods_total{cluster="cluster1", namespace="default", pod="test-pod", container="test-container", instance="node1", job="cadvisor", node_kubernetes_io_exclude_from_external_load_balancers="karpenter"}'
+    values: '0+100x20'
+  alert_rule_test:
+  - eval_time: 14m  # no exp_alerts listed: nothing may be firing yet because the rule's 'for' is 15m
+    alertname: CPUThrottlingHigh
+  - eval_time: 20m  # past the 15m 'for' duration, so the alert is expected to be firing
+    alertname: CPUThrottlingHigh
+    exp_alerts:
+    - exp_labels:  # no job label here: the rule aggregates with "sum without (..., job, ...)"
+        severity: "info"
+        cluster: "cluster1"
+        namespace: "default"
+        pod: "test-pod"
+        container: "test-container"
+        instance: "node1"
+      exp_annotations:
+        description: "30% throttling of CPU in namespace default for container test-container in pod test-pod."
+        summary: "Processes experience elevated CPU throttling."
+        runbook_url: "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh"