Skip to content

Commit b131a73

Browse files
authored
feat(alerts): New KubePdbNotEnoughHealthyPods alert (#1045)
* feat(alerts): New KubePdbNotEnoughHealthyPods alert * chore: markdownfmt * fix: recording rule name does not match level:metric:operation format, must contain at least one colon
1 parent 30488b9 commit b131a73

File tree

4 files changed

+57
-2
lines changed

4 files changed

+57
-2
lines changed

alerts/apps_alerts.libsonnet

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ local utils = import '../lib/utils.libsonnet';
55
kubeStateMetricsSelector: error 'must provide selector for kube-state-metrics',
66
kubeJobTimeoutDuration: error 'must provide value for kubeJobTimeoutDuration',
77
kubeDaemonSetRolloutStuckFor: '15m',
8+
kubePdbNotEnoughHealthyPodsFor: '15m',
89
namespaceSelector: null,
910
prefixedNamespaceSelector: if self.namespaceSelector != null then self.namespaceSelector + ',' else '',
1011
},
@@ -359,6 +360,28 @@ local utils = import '../lib/utils.libsonnet';
359360
'for': '15m',
360361
alert: 'KubeHpaMaxedOut',
361362
},
363+
{
364+
expr: |||
365+
(
366+
kube_poddisruptionbudget_status_desired_healthy{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
367+
-
368+
kube_poddisruptionbudget_status_current_healthy{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
369+
)
370+
> 0
371+
||| % $._config,
372+
labels: {
373+
severity: 'warning',
374+
},
375+
annotations: {
376+
description: 'PDB %s{{ $labels.namespace }}/{{ $labels.poddisruptionbudget }} expects {{ $value }} more healthy pods. The desired number of healthy pods has not been met for at least %s.' % [
377+
utils.ifShowMultiCluster($._config, '{{ $labels.%(clusterLabel)s }}/' % $._config),
378+
$._config.kubePdbNotEnoughHealthyPodsFor,
379+
],
380+
summary: 'PDB does not have enough healthy pods.',
381+
},
382+
'for': $._config.kubePdbNotEnoughHealthyPodsFor,
383+
alert: 'KubePdbNotEnoughHealthyPods',
384+
},
362385
],
363386
},
364387
],

rules/rules.libsonnet

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,5 +2,4 @@
22
(import 'apps.libsonnet') +
33
(import 'kube_scheduler.libsonnet') +
44
(import 'node.libsonnet') +
5-
(import 'kubelet.libsonnet') +
6-
(import 'windows.libsonnet')
5+
(import 'kubelet.libsonnet')

runbook.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,12 @@ This page collects this repositories alerts and begins the process of describing
119119
+ *Action*: Check the job using `kubectl describe job <job>` and look at the pod logs using `kubectl logs <pod>` for further information.
120120
+ *Runbook*: [Link](https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobfailed/)
121121

122+
##### Alert Name: "KubePdbNotEnoughHealthyPods"
123+
+ *Message*: `PDB {{ $labels.namespace }}/{{ $labels.poddisruptionbudget }} expects {{ $value }} more healthy pods. The desired number of healthy pods has not been met for at least 15m.`
124+
+ *Severity*: warning
125+
+ *Action*: Check the status of the PDB using `kubectl get poddisruptionbudgets <pdb> -o yaml` and compare `status.currentHealthy` with `status.desiredHealthy`. Check the Kubernetes documentation for more information about [pod disruptions](https://kubernetes.io/docs/concepts/workloads/pods/disruptions/).
126+
+ *Runbook*: [Link](https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepdbnotenoughhealthypods/)
127+
122128
### Group Name: "kubernetes-resources"
123129

124130
##### Alert Name: "KubeCPUOvercommit"

tests/apps_alerts-test.yaml

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
rule_files:
2+
- ../prometheus_alerts.yaml
3+
4+
tests:
5+
- interval: 1m
6+
name: KubePdbNotEnoughHealthyPods fires when current healthy pods are less than desired
7+
input_series:
8+
- series: 'kube_poddisruptionbudget_status_desired_healthy{cluster="cluster1", namespace="ns1", poddisruptionbudget="pdb1", job="kube-state-metrics"}'
9+
values: '4x15'
10+
- series: 'kube_poddisruptionbudget_status_current_healthy{cluster="cluster1", namespace="ns1", poddisruptionbudget="pdb1", job="kube-state-metrics"}'
11+
values: '3x15'
12+
alert_rule_test:
13+
- eval_time: 14m
14+
alertname: KubePdbNotEnoughHealthyPods
15+
- eval_time: 15m
16+
alertname: KubePdbNotEnoughHealthyPods
17+
exp_alerts:
18+
- exp_labels:
19+
severity: "warning"
20+
cluster: "cluster1"
21+
namespace: "ns1"
22+
poddisruptionbudget: "pdb1"
23+
job: "kube-state-metrics"
24+
exp_annotations:
25+
description: "PDB ns1/pdb1 expects 1 more healthy pods. The desired number of healthy pods has not been met for at least 15m."
26+
runbook_url: "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepdbnotenoughhealthypods"
27+
summary: "PDB does not have enough healthy pods."

0 commit comments

Comments
 (0)