Skip to content

Commit 830bad0

Browse files
authored
fix: CPUThrottlingHigh multiple series error (#1119)
1 parent 8fcc368 commit 830bad0

File tree

2 files changed

+47
-4
lines changed

2 files changed

+47
-4
lines changed

alerts/resource_alerts.libsonnet

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -216,10 +216,22 @@ local utils = import '../lib/utils.libsonnet';
216216
{
217217
alert: 'CPUThrottlingHigh',
218218
expr: |||
219-
sum(increase(container_cpu_cfs_throttled_periods_total{container!="", %(cadvisorSelector)s, %(cpuThrottlingSelector)s}[5m])) without (id, metrics_path, name, image, endpoint, job, node)
220-
/ on (%(clusterLabel)s, %(namespaceLabel)s, pod, container, instance) group_left
221-
sum(increase(container_cpu_cfs_periods_total{%(cadvisorSelector)s, %(cpuThrottlingSelector)s}[5m])) without (id, metrics_path, name, image, endpoint, job, node)
222-
> ( %(cpuThrottlingPercent)s / 100 )
219+
sum without (id, metrics_path, name, image, endpoint, job, node) (
220+
topk by (%(clusterLabel)s, %(namespaceLabel)s, pod, container, instance) (1,
221+
increase(
222+
container_cpu_cfs_throttled_periods_total{container!="", %(cadvisorSelector)s, %(cpuThrottlingSelector)s}
223+
[5m])
224+
)
225+
)
226+
/ on (%(clusterLabel)s, %(namespaceLabel)s, pod, container, instance) group_left
227+
sum without (id, metrics_path, name, image, endpoint, job, node) (
228+
topk by (%(clusterLabel)s, %(namespaceLabel)s, pod, container, instance) (1,
229+
increase(
230+
container_cpu_cfs_periods_total{%(cadvisorSelector)s, %(cpuThrottlingSelector)s}
231+
[5m])
232+
)
233+
)
234+
> ( %(cpuThrottlingPercent)s / 100 )
223235
||| % $._config,
224236
'for': '15m',
225237
labels: {

tests/resource_alerts-test.yaml

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
rule_files:
2+
- ../prometheus_alerts.yaml
3+
4+
tests:
5+
- interval: 1m
6+
name: CPUThrottlingHigh fires when CPU throttling exceeds threshold
7+
input_series:
8+
- series: 'container_cpu_cfs_throttled_periods_total{cluster="cluster1", namespace="default", pod="test-pod", container="test-container", instance="node1", job="cadvisor"}'
9+
values: '0+30x20'
10+
- series: 'container_cpu_cfs_periods_total{cluster="cluster1", namespace="default", pod="test-pod", container="test-container", instance="node1", job="cadvisor"}'
11+
values: '0+100x20'
12+
# Same series as above plus one extra label: ensures the alert rule can still be evaluated when duplicate series exist.
13+
- series: 'container_cpu_cfs_periods_total{cluster="cluster1", namespace="default", pod="test-pod", container="test-container", instance="node1", job="cadvisor", node_kubernetes_io_exclude_from_external_load_balancers="karpenter"}'
14+
values: '0+100x20'
15+
alert_rule_test:
16+
- eval_time: 14m
17+
alertname: CPUThrottlingHigh
18+
- eval_time: 20m
19+
alertname: CPUThrottlingHigh
20+
exp_alerts:
21+
- exp_labels:
22+
severity: "info"
23+
cluster: "cluster1"
24+
namespace: "default"
25+
pod: "test-pod"
26+
container: "test-container"
27+
instance: "node1"
28+
exp_annotations:
29+
description: "30% throttling of CPU in namespace default for container test-container in pod test-pod."
30+
summary: "Processes experience elevated CPU throttling."
31+
runbook_url: "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh"

0 commit comments

Comments
 (0)