Skip to content

Commit 4e1cecf

Browse files
authored
fix(KubeletTooManyPods): handle inconsistent instance labels (#1021)
1 parent b9f0943 commit 4e1cecf

File tree

2 files changed

+20
-6
lines changed

2 files changed

+20
-6
lines changed

alerts/kubelet.libsonnet

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -60,10 +60,16 @@ local utils = import '../lib/utils.libsonnet';
6060
// Some nodes, such as those backing AWS Fargate, have a capacity of 1 and only exist while a pod is running on them.
6161
// These special nodes have to be ignored in the KubeletTooManyPods alert.
6262
expr: |||
63-
max by (cluster, node) (
64-
label_replace(kubelet_running_pods{%(kubeletSelector)s} > 1, "node", "$1", "instance", "(.*)")
63+
(
64+
max by (%(clusterLabel)s, instance) (
65+
kubelet_running_pods{%(kubeletSelector)s} > 1
66+
)
67+
* on (%(clusterLabel)s, instance) group_left(node)  # attach the node name, since instance may be an IP address
68+
max by (%(clusterLabel)s, instance, node) (
69+
kubelet_node_name{%(kubeletSelector)s}
70+
)
6571
)
66-
/
72+
/ on (%(clusterLabel)s, node) group_left()  # match capacity per node while keeping the left-hand labels
6773
max by (%(clusterLabel)s, node) (
6874
kube_node_status_capacity{%(kubeStateMetricsSelector)s, resource="pods"} != 1
6975
) > 0.95

tests.yaml

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -403,11 +403,13 @@ tests:
403403
- eval_time: 61m
404404
alertname: KubePersistentVolumeInodesFillingUp
405405

406-
- name: KubeletTooManyPods alert (single-node)
406+
- name: KubeletTooManyPods alert (single-node with instance as IP address)
407407
interval: 1m
408408
input_series:
409-
- series: 'kubelet_running_pods{cluster="kubernetes", instance="node0", job="kubelet"}'
409+
- series: 'kubelet_running_pods{cluster="kubernetes", instance="10.0.0.0", job="kubelet"}'
410410
values: '3x15'
411+
- series: 'kubelet_node_name{cluster="kubernetes", instance="10.0.0.0", node="node0", job="kubelet"}'
412+
values: '1x15'
411413
- series: 'kube_node_status_capacity{cluster="kubernetes", node="node0", job="kube-state-metrics", resource="pods", unit="integer"}'
412414
values: '3x15'
413415
alert_rule_test:
@@ -418,22 +420,27 @@ tests:
418420
exp_alerts:
419421
- exp_labels:
420422
cluster: kubernetes
423+
instance: 10.0.0.0
421424
node: node0
422425
severity: info
423426
exp_annotations:
424427
summary: "Kubelet is running at capacity."
425428
description: "Kubelet 'node0' is running at 100% of its Pod capacity."
426429
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
427430

428-
- name: KubeletTooManyPods alert (multi-node)
431+
- name: KubeletTooManyPods alert (multi-node with instance as node name)
429432
interval: 1m
430433
input_series:
431434
- series: 'kubelet_running_pods{cluster="kubernetes", instance="node0", job="kubelet"}'
432435
values: '3x15'
436+
- series: 'kubelet_node_name{cluster="kubernetes", instance="node0", node="node0", job="kubelet"}'
437+
values: '1x15'
433438
- series: 'kube_node_status_capacity{cluster="kubernetes", node="node0", job="kube-state-metrics", resource="pods", unit="integer"}'
434439
values: '6x15'
435440
- series: 'kubelet_running_pods{cluster="kubernetes", instance="node1", job="kubelet"}'
436441
values: '3x15'
442+
- series: 'kubelet_node_name{cluster="kubernetes", instance="node1", node="node1", job="kubelet"}'
443+
values: '1x15'
437444
- series: 'kube_node_status_capacity{cluster="kubernetes", node="node1", job="kube-state-metrics", resource="pods", unit="integer"}'
438445
values: '3x15'
439446
alert_rule_test:
@@ -444,6 +451,7 @@ tests:
444451
exp_alerts:
445452
- exp_labels:
446453
cluster: kubernetes
454+
instance: node1
447455
node: node1
448456
severity: info
449457
exp_annotations:

0 commit comments

Comments
 (0)