feat(alerts): New KubePdbNotEnoughHealthyPods alert

skl · skl · commit 87f56da0f4e9 · 2025-03-25T12:10:25.000Z
diff --git a/alerts/apps_alerts.libsonnet b/alerts/apps_alerts.libsonnet
@@ -5,6 +5,7 @@ local utils = import '../lib/utils.libsonnet';
     kubeStateMetricsSelector: error 'must provide selector for kube-state-metrics',
     kubeJobTimeoutDuration: error 'must provide value for kubeJobTimeoutDuration',
     kubeDaemonSetRolloutStuckFor: '15m',
+    kubePdbNotEnoughHealthyPodsFor: '15m',
     namespaceSelector: null,
     prefixedNamespaceSelector: if self.namespaceSelector != null then self.namespaceSelector + ',' else '',
   },
@@ -359,6 +360,28 @@ local utils = import '../lib/utils.libsonnet';
             'for': '15m',
             alert: 'KubeHpaMaxedOut',
           },
+          {  // Alert rule: a PodDisruptionBudget reports fewer currently healthy pods than it desires.
+            expr: |||
+              (
+                kube_poddisruptionbudget_status_desired_healthy{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
+                -
+                kube_poddisruptionbudget_status_current_healthy{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
+              )
+              > 0
+            ||| % $._config,  // fills the %(...)s selector placeholders from $._config
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              description: 'PDB %s{{ $labels.namespace }}/{{ $labels.poddisruptionbudget }} expects {{ $value }} more healthy pods. The desired number of healthy pods has not been met for at least %s.' % [
+                utils.ifShowMultiCluster($._config, '{{ $labels.%(clusterLabel)s }}/' % $._config),  // first %s: cluster prefix; presumably empty unless multi-cluster display is enabled - confirm in utils
+                $._config.kubePdbNotEnoughHealthyPodsFor,  // second %s: same duration that is used for 'for' below
+              ],
+              summary: 'PDB does not have enough healthy pods.',
+            },
+            'for': $._config.kubePdbNotEnoughHealthyPodsFor,  // configurable hold time; _config defaults it to '15m'
+            alert: 'KubePdbNotEnoughHealthyPods',
+          },
         ],
       },
     ],
diff --git a/runbook.md b/runbook.md
@@ -119,6 +119,16 @@ This page collects this repositories alerts and begins the process of describing
 + *Action*: Check the job using `kubectl describe job <job>` and look at the pod logs using `kubectl logs <pod>` for further information.
 + *Runbook*: [Link](https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobfailed/)
 
+##### Alert Name: "KubePdbNotEnoughHealthyPods"
++ *Message*: `PDB {{ $labels.namespace }}/{{ $labels.poddisruptionbudget }} expects {{ $value }} more healthy pods. The desired number of healthy pods has not been met for at least 15m.`
++ *Severity*: warning
++ *Action*: Check the status of the PDB using
+`kubectl get poddisruptionbudgets <pdb> -o yaml` and
+compare `status.currentHealthy` with `status.desiredHealthy`.
+Check the Kubernetes documentation for more information about
+[pod disruptions](https://kubernetes.io/docs/concepts/workloads/pods/disruptions/).
++ *Runbook*: [Link](https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepdbnotenoughhealthypods/)
+
 ### Group Name: "kubernetes-resources"
 
 ##### Alert Name: "KubeCPUOvercommit"
diff --git a/tests/apps_alerts-test.yaml b/tests/apps_alerts-test.yaml
@@ -0,0 +1,27 @@
+rule_files:
+- ../prometheus_alerts.yaml
+
+tests:
+- interval: 1m  # spacing between consecutive samples of every input series
+  name: KubePdbNotEnoughHealthyPods fires when current healthy pods are less than desired
+  input_series:
+  - series: 'kube_poddisruptionbudget_status_desired_healthy{cluster="cluster1", namespace="ns1", poddisruptionbudget="pdb1", job="kube-state-metrics"}'
+    values: '4x15'  # constant 4: the initial sample plus 15 repeats (minutes 0-15)
+  - series: 'kube_poddisruptionbudget_status_current_healthy{cluster="cluster1", namespace="ns1", poddisruptionbudget="pdb1", job="kube-state-metrics"}'
+    values: '3x15'  # constant 3: always one pod short of the desired count
+  alert_rule_test:
+  - eval_time: 14m  # no exp_alerts listed: the alert must not be firing yet
+    alertname: KubePdbNotEnoughHealthyPods
+  - eval_time: 15m  # the alert listed below must be firing at this point
+    alertname: KubePdbNotEnoughHealthyPods
+    exp_alerts:
+    - exp_labels:
+        severity: "warning"
+        cluster: "cluster1"
+        namespace: "ns1"
+        poddisruptionbudget: "pdb1"
+        job: "kube-state-metrics"
+      exp_annotations:
+        description: "PDB ns1/pdb1 expects 1 more healthy pods. The desired number of healthy pods has not been met for at least 15m."
+        runbook_url: "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepdbnotenoughhealthypods"
+        summary: "PDB does not have enough healthy pods."