diff --git a/alerts/kubelet.libsonnet b/alerts/kubelet.libsonnet index e09170f8e..b08d11388 100644 --- a/alerts/kubelet.libsonnet +++ b/alerts/kubelet.libsonnet @@ -60,12 +60,8 @@ local utils = import '../lib/utils.libsonnet'; // Some node has a capacity of 1 like AWS's Fargate and only exists while a pod is running on it. // We have to ignore this special node in the KubeletTooManyPods alert. expr: ||| - count by (%(clusterLabel)s, node) ( - (kube_pod_status_phase{%(kubeStateMetricsSelector)s, phase="Running"} == 1) - * on (%(clusterLabel)s, namespace, pod) group_left (node) - group by (%(clusterLabel)s, namespace, pod, node) ( - kube_pod_info{%(kubeStateMetricsSelector)s} - ) + max by (%(clusterLabel)s, node) ( + label_replace(kubelet_running_pods{%(kubeletSelector)s} > 1, "node", "$1", "instance", "(.*)") ) / max by (%(clusterLabel)s, node) ( diff --git a/tests.yaml b/tests.yaml index 3dd5920ea..32c1a52e2 100644 --- a/tests.yaml +++ b/tests.yaml @@ -403,22 +403,13 @@ tests: - eval_time: 61m alertname: KubePersistentVolumeInodesFillingUp -- interval: 1m +- name: KubeletTooManyPods alert (single-node) + interval: 1m input_series: - - series: 'kube_node_status_capacity{resource="pods",instance="172.17.0.5:8443",cluster="kubernetes",node="minikube",job="kube-state-metrics",namespace="kube-system"}' - values: '3+0x15' - - series: 'kube_pod_info{endpoint="https-main",instance="172.17.0.5:8443",job="kube-state-metrics",cluster="kubernetes",namespace="kube-system",node="minikube",pod="pod-1",service="kube-state-metrics"}' - values: '1+0x15' - - series: 'kube_pod_status_phase{endpoint="https-main",instance="172.17.0.5:8443",job="kube-state-metrics",cluster="kubernetes",namespace="kube-system",phase="Running",pod="pod-1",service="kube-state-metrics"}' - values: '1+0x15' - - series: 'kube_pod_info{endpoint="https-main",instance="172.17.0.5:8443",job="kube-state-metrics",cluster="kubernetes",namespace="kube-system",node="minikube",pod="pod-2",service="kube-state-metrics"}' - values: 
'1+0x15' - - series: 'kube_pod_status_phase{endpoint="https-main",instance="172.17.0.5:8443",job="kube-state-metrics",cluster="kubernetes",namespace="kube-system",phase="Running",pod="pod-2",service="kube-state-metrics"}' - values: '1+0x15' - - series: 'kube_pod_info{endpoint="https-main",instance="172.17.0.5:8443",job="kube-state-metrics",cluster="kubernetes",namespace="kube-system",node="minikube",pod="pod-3",service="kube-state-metrics"}' - values: '1+0x15' - - series: 'kube_pod_status_phase{endpoint="https-main",instance="172.17.0.5:8443",job="kube-state-metrics",cluster="kubernetes",namespace="kube-system",phase="Running",pod="pod-3",service="kube-state-metrics"}' - values: '1+0x15' + - series: 'kubelet_running_pods{cluster="kubernetes", instance="node0", job="kubelet"}' + values: '3x15' + - series: 'kube_node_status_capacity{cluster="kubernetes", node="node0", job="kube-state-metrics", resource="pods", unit="integer"}' + values: '3x15' alert_rule_test: - eval_time: 10m alertname: KubeletTooManyPods @@ -427,11 +418,37 @@ tests: exp_alerts: - exp_labels: cluster: kubernetes - node: minikube + node: node0 + severity: info + exp_annotations: + summary: "Kubelet is running at capacity." + description: "Kubelet 'node0' is running at 100% of its Pod capacity." 
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods + +- name: KubeletTooManyPods alert (multi-node) + interval: 1m + input_series: + - series: 'kubelet_running_pods{cluster="kubernetes", instance="node0", job="kubelet"}' + values: '3x15' + - series: 'kube_node_status_capacity{cluster="kubernetes", node="node0", job="kube-state-metrics", resource="pods", unit="integer"}' + values: '6x15' + - series: 'kubelet_running_pods{cluster="kubernetes", instance="node1", job="kubelet"}' + values: '3x15' + - series: 'kube_node_status_capacity{cluster="kubernetes", node="node1", job="kube-state-metrics", resource="pods", unit="integer"}' + values: '3x15' + alert_rule_test: + - eval_time: 10m + alertname: KubeletTooManyPods + - eval_time: 15m + alertname: KubeletTooManyPods + exp_alerts: + - exp_labels: + cluster: kubernetes + node: node1 severity: info exp_annotations: summary: "Kubelet is running at capacity." - description: "Kubelet 'minikube' is running at 100% of its Pod capacity." + description: "Kubelet 'node1' is running at 100% of its Pod capacity." runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods - interval: 1m