Skip to content

Commit 3f64e21

Browse files
committed
rename KubeNodeEviction, fix test case
Signed-off-by: TheRealNoob <[email protected]>
1 parent 0478f9b commit 3f64e21

File tree

3 files changed

+11
-15
lines changed

3 files changed

+11
-15
lines changed

alerts/kubelet.libsonnet

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ local utils = import '../lib/utils.libsonnet';
1414
kubeletCertExpirationCriticalSeconds: 1 * 24 * 3600,
1515

1616
// Evictions per second that will trigger an alert. The default value will trigger on any evictions.
17-
evictionRateThreshold: 0.0,
17+
KubeNodeEvictionRateThreshold: 0.0,
1818
},
1919

2020
prometheusAlerts+:: {
@@ -125,9 +125,13 @@ local utils = import '../lib/utils.libsonnet';
125125
},
126126
},
127127
{
128-
alert: 'KubeNodeEvictions',
128+
alert: 'KubeNodeEviction',
129129
expr: |||
130-
sum(rate(kubelet_evictions{%(kubeletSelector)s}[15m])) by(%(clusterLabel)s, eviction_signal, node) > %(evictionRateThreshold)s
130+
sum(rate(kubelet_evictions{%(kubeletSelector)s}[15m])) by(%(clusterLabel)s, eviction_signal, instance) > %(KubeNodeEvictionRateThreshold)s
131+
* on (%(clusterLabel)s, instance) group_left(node)
132+
max by (%(clusterLabel)s, instance, node) (
133+
kubelet_node_name{%(kubeletSelector)s}
134+
)
131135
||| % $._config,
132136
labels: {
133137
severity: 'info',

runbook.md

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -191,8 +191,6 @@ This page collects this repository's alerts and begins the process of describing
191191
+ *Severity*: info
192192
+ *Runbook*: [Link](https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodepressure/)
193193

194-
# If soft thresholds are crossed, pods will be evicted respecting TerminationGracePeriod. If Hard thresholds are crossed grace period will be ignored.
195-
196194
##### Alert Name: "KubeNodeUnreachable"
197195
+ *Message*: `{{ $labels.node }} is unreachable and some workloads may be rescheduled.`
198196
+ *Severity*: warning
@@ -208,10 +206,10 @@ This page collects this repository's alerts and begins the process of describing
208206
+ *Severity*: warning
209207
+ *Runbook*: [Link](https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodereadinessflapping/)
210208

211-
##### Alert Name: "KubeNodeEvictions"
209+
##### Alert Name: "KubeNodeEviction"
212210
+ *Message*: `Node {{ $labels.node }} is evicting Pods due to {{ $labels.eviction_signal }}. Eviction occurs when eviction thresholds are crossed, typically caused by Pods exceeding RAM/ephemeral-storage limits.`
213211
+ *Severity*: info
214-
+ *Runbook*: [Link](https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodeevictions)
212+
+ *Runbook*: [Link](https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodeeviction)
215213

216214
##### Alert Name: "KubeletPlegDurationHigh"
217215
+ *Message*: `The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}.`

tests/tests.yaml

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -696,23 +696,17 @@ tests:
696696
values: '1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 3 3 3 3 3'
697697
alert_rule_test:
698698
- eval_time: 18m
699-
alertname: KubeNodeEvictions
699+
alertname: KubeNodeEviction
700700
exp_alerts:
701701
- exp_labels:
702702
eviction_signal: memory.available
703703
cluster: kubernetes
704704
node: minikube
705705
severity: info
706-
endpoint: https-main
707-
instance: 10.0.2.15:10250
708-
job: kubelet
709-
namespace: monitoring
710-
pod: kube-state-metrics-b894d84cc-d6htw
711-
service: kube-state-metrics
712706
exp_annotations:
713707
summary: "Node is evicting pods."
714708
description: 'Node minikube is evicting Pods due to memory.available. Eviction occurs when eviction thresholds are crossed, typically caused by Pods exceeding RAM/ephemeral-storage limits.'
715-
runbook_url: 'https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodeevictions'
709+
runbook_url: 'https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodeeviction'
716710

717711
# Verify that node:node_num_cpu:sum triggers no many-to-many errors.
718712
- interval: 1m

0 commit comments

Comments
 (0)