Skip to content

Commit 2ef386b

Browse files
committed
fix: KubeletTooManyPods duplicate series error
1 parent 9ceec88 commit 2ef386b

File tree

2 files changed

+36
-23
lines changed

2 files changed

+36
-23
lines changed

alerts/kubelet.libsonnet

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -60,12 +60,8 @@ local utils = import '../lib/utils.libsonnet';
6060
// Some node has a capacity of 1 like AWS's Fargate and only exists while a pod is running on it.
6161
// We have to ignore this special node in the KubeletTooManyPods alert.
6262
expr: |||
63-
count by (%(clusterLabel)s, node) (
64-
(kube_pod_status_phase{%(kubeStateMetricsSelector)s, phase="Running"} == 1)
65-
* on (%(clusterLabel)s, namespace, pod) group_left (node)
66-
group by (%(clusterLabel)s, namespace, pod, node) (
67-
kube_pod_info{%(kubeStateMetricsSelector)s}
68-
)
63+
max by (%(clusterLabel)s, node) (
64+
label_replace(kubelet_running_pods{%(kubeletSelector)s} > 1, "node", "$1", "instance", "(.*)")
6965
)
7066
/
7167
max by (%(clusterLabel)s, node) (

tests.yaml

Lines changed: 34 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -403,22 +403,13 @@ tests:
403403
- eval_time: 61m
404404
alertname: KubePersistentVolumeInodesFillingUp
405405

406-
- interval: 1m
406+
- name: KubeletTooManyPods alert (single-node)
407+
interval: 1m
407408
input_series:
408-
- series: 'kube_node_status_capacity{resource="pods",instance="172.17.0.5:8443",cluster="kubernetes",node="minikube",job="kube-state-metrics",namespace="kube-system"}'
409-
values: '3+0x15'
410-
- series: 'kube_pod_info{endpoint="https-main",instance="172.17.0.5:8443",job="kube-state-metrics",cluster="kubernetes",namespace="kube-system",node="minikube",pod="pod-1",service="kube-state-metrics"}'
411-
values: '1+0x15'
412-
- series: 'kube_pod_status_phase{endpoint="https-main",instance="172.17.0.5:8443",job="kube-state-metrics",cluster="kubernetes",namespace="kube-system",phase="Running",pod="pod-1",service="kube-state-metrics"}'
413-
values: '1+0x15'
414-
- series: 'kube_pod_info{endpoint="https-main",instance="172.17.0.5:8443",job="kube-state-metrics",cluster="kubernetes",namespace="kube-system",node="minikube",pod="pod-2",service="kube-state-metrics"}'
415-
values: '1+0x15'
416-
- series: 'kube_pod_status_phase{endpoint="https-main",instance="172.17.0.5:8443",job="kube-state-metrics",cluster="kubernetes",namespace="kube-system",phase="Running",pod="pod-2",service="kube-state-metrics"}'
417-
values: '1+0x15'
418-
- series: 'kube_pod_info{endpoint="https-main",instance="172.17.0.5:8443",job="kube-state-metrics",cluster="kubernetes",namespace="kube-system",node="minikube",pod="pod-3",service="kube-state-metrics"}'
419-
values: '1+0x15'
420-
- series: 'kube_pod_status_phase{endpoint="https-main",instance="172.17.0.5:8443",job="kube-state-metrics",cluster="kubernetes",namespace="kube-system",phase="Running",pod="pod-3",service="kube-state-metrics"}'
421-
values: '1+0x15'
409+
- series: 'kubelet_running_pods{cluster="kubernetes", instance="node0", job="kubelet"}'
410+
values: '3x15'
411+
- series: 'kube_node_status_capacity{cluster="kubernetes", node="node0", job="kube-state-metrics", resource="pods", unit="integer"}'
412+
values: '3x15'
422413
alert_rule_test:
423414
- eval_time: 10m
424415
alertname: KubeletTooManyPods
@@ -427,11 +418,37 @@ tests:
427418
exp_alerts:
428419
- exp_labels:
429420
cluster: kubernetes
430-
node: minikube
421+
node: node0
422+
severity: info
423+
exp_annotations:
424+
summary: "Kubelet is running at capacity."
425+
description: "Kubelet 'node0' is running at 100% of its Pod capacity."
426+
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
427+
428+
- name: KubeletTooManyPods alert (multi-node)
429+
interval: 1m
430+
input_series:
431+
- series: 'kubelet_running_pods{cluster="kubernetes", instance="node0", job="kubelet"}'
432+
values: '3x15'
433+
- series: 'kube_node_status_capacity{cluster="kubernetes", node="node0", job="kube-state-metrics", resource="pods", unit="integer"}'
434+
values: '6x15'
435+
- series: 'kubelet_running_pods{cluster="kubernetes", instance="node1", job="kubelet"}'
436+
values: '3x15'
437+
- series: 'kube_node_status_capacity{cluster="kubernetes", node="node1", job="kube-state-metrics", resource="pods", unit="integer"}'
438+
values: '3x15'
439+
alert_rule_test:
440+
- eval_time: 10m
441+
alertname: KubeletTooManyPods
442+
- eval_time: 15m
443+
alertname: KubeletTooManyPods
444+
exp_alerts:
445+
- exp_labels:
446+
cluster: kubernetes
447+
node: node1
431448
severity: info
432449
exp_annotations:
433450
summary: "Kubelet is running at capacity."
434-
description: "Kubelet 'minikube' is running at 100% of its Pod capacity."
451+
description: "Kubelet 'node1' is running at 100% of its Pod capacity."
435452
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
436453

437454
- interval: 1m

0 commit comments

Comments
 (0)