From 4cecfedb1d656de227e354910e8e6a6c0c19bf3d Mon Sep 17 00:00:00 2001 From: TheRealNoob Date: Fri, 17 Jan 2025 13:10:10 -0600 Subject: [PATCH 01/19] feat: create alert "KubeletEvictingPods" Signed-off-by: TheRealNoob --- alerts/kubelet.libsonnet | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/alerts/kubelet.libsonnet b/alerts/kubelet.libsonnet index 1bc4ea558..7931ebdef 100644 --- a/alerts/kubelet.libsonnet +++ b/alerts/kubelet.libsonnet @@ -221,6 +221,24 @@ local utils = import '../lib/utils.libsonnet'; summary: 'Kubelet has failed to renew its server certificate.', }, }, + { + alert: 'KubeletEvictingPods', + expr: ||| + changes(kubelet_evictions{%(kubeletSelector)s}[10m]) > 0 + and + resets(kubelet_evictions{%(kubeletSelector)s}[10m]) == 0 + ||| % $._config, + labels: { + severity: 'warning', + }, + 'for': '2m', + annotations: { + description: 'Kubelet on node {{ $labels.node }} is evicting pods due to resource pressure.' % [ + utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), + ], + summary: 'Kubelet is evicting pods.', + }, + }, (import '../lib/absent_alert.libsonnet') { componentName:: 'Kubelet', selector:: $._config.kubeletSelector, From 7da52fc07c9043c3c3ebf0c90ef555cfa33d3b8d Mon Sep 17 00:00:00 2001 From: TheRealNoob Date: Tue, 21 Jan 2025 12:39:24 -0600 Subject: [PATCH 02/19] fix syntax --- alerts/kubelet.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/alerts/kubelet.libsonnet b/alerts/kubelet.libsonnet index 7931ebdef..268e5de01 100644 --- a/alerts/kubelet.libsonnet +++ b/alerts/kubelet.libsonnet @@ -233,7 +233,7 @@ local utils = import '../lib/utils.libsonnet'; }, 'for': '2m', annotations: { - description: 'Kubelet on node {{ $labels.node }} is evicting pods due to resource pressure.' 
% [ + description: 'Kubelet on node {{ $labels.node }} is evicting pods due to resource pressure.%s' % [ utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), ], summary: 'Kubelet is evicting pods.', From ac9e4858ea38923427cde92b597092a3fd3cd87a Mon Sep 17 00:00:00 2001 From: TheRealNoob Date: Fri, 24 Jan 2025 11:02:25 -0600 Subject: [PATCH 03/19] move to resources.libsonnet Signed-off-by: TheRealNoob --- alerts/kubelet.libsonnet | 18 ------------------ alerts/resource_alerts.libsonnet | 18 ++++++++++++++++++ 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/alerts/kubelet.libsonnet b/alerts/kubelet.libsonnet index 68ef11f90..e09170f8e 100644 --- a/alerts/kubelet.libsonnet +++ b/alerts/kubelet.libsonnet @@ -225,24 +225,6 @@ local utils = import '../lib/utils.libsonnet'; summary: 'Kubelet has failed to renew its server certificate.', }, }, - { - alert: 'KubeletEvictingPods', - expr: ||| - changes(kubelet_evictions{%(kubeletSelector)s}[10m]) > 0 - and - resets(kubelet_evictions{%(kubeletSelector)s}[10m]) == 0 - ||| % $._config, - labels: { - severity: 'warning', - }, - 'for': '2m', - annotations: { - description: 'Kubelet on node {{ $labels.node }} is evicting pods due to resource pressure.%s' % [ - utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), - ], - summary: 'Kubelet is evicting pods.', - }, - }, (import '../lib/absent_alert.libsonnet') { componentName:: 'Kubelet', selector:: $._config.kubeletSelector, diff --git a/alerts/resource_alerts.libsonnet b/alerts/resource_alerts.libsonnet index 75abb51af..5ecb2dbb4 100644 --- a/alerts/resource_alerts.libsonnet +++ b/alerts/resource_alerts.libsonnet @@ -17,6 +17,8 @@ local utils = import '../lib/utils.libsonnet'; // See https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/FAQ.md#how-can-i-configure-overprovisioning-with-cluster-autoscaler // for more details. 
ignoringOverprovisionedWorkloadSelector: '', + // Max evictions per second that will trigger an alert. The default value generally allows for only one pod occasionally being evicted. Any more evictions than that will trigger the alert. + highEvictionRateThreshold: 0.002, }, prometheusAlerts+:: { @@ -223,6 +225,22 @@ local utils = import '../lib/utils.libsonnet'; summary: 'Processes experience elevated CPU throttling.', }, }, + { + alert: 'KubeEvictionRateHigh', + expr: ||| + sum(rate(kubelet_evictions[15m])) by (%(clusterLabel)s) > %(highEvictionRateThreshold)s + ||| % $._config, + labels: { + severity: 'warning', + }, + 'for': '1m', + annotations: { + description: 'The cluster is evicting Pods at an unexpectedly high rate. This is typically caused by pods frequently exceeding RAM/ephemeral-storage limits or by nodes being NotReady for extended periods.%s' % [ + utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), + ], + summary: 'Cluster is evicting pods at an unexpectedly high rate.', + }, + }, ], }, ], From 3707ece6019a8ec4ad213d199a787d4280d82b02 Mon Sep 17 00:00:00 2001 From: TheRealNoob Date: Fri, 24 Jan 2025 11:21:14 -0600 Subject: [PATCH 04/19] add selector filter Signed-off-by: TheRealNoob --- alerts/resource_alerts.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/alerts/resource_alerts.libsonnet b/alerts/resource_alerts.libsonnet index 5ecb2dbb4..7a6d3a948 100644 --- a/alerts/resource_alerts.libsonnet +++ b/alerts/resource_alerts.libsonnet @@ -228,7 +228,7 @@ local utils = import '../lib/utils.libsonnet'; { alert: 'KubeEvictionRateHigh', expr: ||| - sum(rate(kubelet_evictions[15m])) by (%(clusterLabel)s) > %(highEvictionRateThreshold)s + sum(rate(kubelet_evictions{%(kubeletSelector)s}[15m])) by (%(clusterLabel)s) > %(highEvictionRateThreshold)s ||| % $._config, labels: { severity: 'warning', From c6c29b69b0bc54ee284ed13a5c907156867ed46e Mon Sep 17 00:00:00 2001 From: TheRealNoob Date: 
Thu, 30 Jan 2025 08:44:11 -0800 Subject: [PATCH 05/19] move {{cluster}} injection Co-authored-by: Stephen Lang --- alerts/resource_alerts.libsonnet | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/alerts/resource_alerts.libsonnet b/alerts/resource_alerts.libsonnet index 7a6d3a948..83c8f1a4b 100644 --- a/alerts/resource_alerts.libsonnet +++ b/alerts/resource_alerts.libsonnet @@ -235,8 +235,8 @@ local utils = import '../lib/utils.libsonnet'; }, 'for': '1m', annotations: { - description: 'The cluster is evicting Pods at an unexpectedly high rate. This is typically caused by pods frequently exceeding RAM/ephemeral-storage limits or by nodes being NotReady for extended periods.%s' % [ - utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), + description: 'The cluster%s is evicting Pods at an unexpectedly high rate. This is typically caused by pods frequently exceeding RAM/ephemeral-storage limits or by nodes being NotReady for extended periods.' % [ + utils.ifShowMultiCluster($._config, ' {{ $labels.%(clusterLabel)s }}' % $._config), ], summary: 'Cluster is evicting pods at an unexpectedly high rate.', }, From 72d20551d7f8544a1f0b6c45b41a3a1ba69a39b4 Mon Sep 17 00:00:00 2001 From: TheRealNoob Date: Thu, 6 Feb 2025 14:34:24 -0600 Subject: [PATCH 06/19] redo alerts Signed-off-by: TheRealNoob --- alerts/kubelet.libsonnet | 37 ++++++++++++++++++++++++++++++++ alerts/resource_alerts.libsonnet | 18 ---------------- 2 files changed, 37 insertions(+), 18 deletions(-) diff --git a/alerts/kubelet.libsonnet b/alerts/kubelet.libsonnet index e09170f8e..7c450f25c 100644 --- a/alerts/kubelet.libsonnet +++ b/alerts/kubelet.libsonnet @@ -12,6 +12,9 @@ local utils = import '../lib/utils.libsonnet'; kubeletCertExpirationWarningSeconds: 7 * 24 * 3600, kubeletCertExpirationCriticalSeconds: 1 * 24 * 3600, + + // Evictions per second that will trigger an alert. The default value will trigger on any evictions. 
+ evictionRateThreshold: 0.0, }, prometheusAlerts+:: { @@ -37,6 +40,24 @@ local utils = import '../lib/utils.libsonnet'; 'for': '15m', alert: 'KubeNodeNotReady', }, + { + alert: 'KubeNodePressure', + expr: ||| + kube_node_status_condition{%(kubeStateMetricsSelector)s,condition=~".+Pressure",status="true"} == 1 + and on (%(clusterLabel)s, node) + kube_node_spec_unschedulable{%(kubeStateMetricsSelector)s} == 0 + ||| % $._config, + labels: { + severity: 'info', + }, + 'for': '10m', + annotations: { + description: '{{ $labels.node }}%s has active Condition {{ $labels.condition }}. This is caused by resource usage exceeding eviction thresholds.' % [ + utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), + ], + summary: 'Node has as active Condition.', + }, + }, { expr: ||| (kube_node_spec_taint{%(kubeStateMetricsSelector)s,key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{%(kubeStateMetricsSelector)s,key=~"%(kubeNodeUnreachableIgnoreKeys)s"}) == 1 @@ -101,6 +122,22 @@ local utils = import '../lib/utils.libsonnet'; summary: 'Node readiness status is flapping.', }, }, + { + alert: 'KubeEvictions', + expr: ||| + sum(rate(kubelet_evictions{%(kubeletSelector)s}[15m])) by(%(clusterLabel)s, eviction_signal) > %(evictionRateThreshold)s + ||| % $._config, + labels: { + severity: 'info', + }, + 'for': '0s', + annotations: { + description: 'The%s cluster is evicting Pods. This is caused by conditions such as MemoryPressure, DiskPressure, or PIDPressure. These are typically caused by Pods exceeding RAM/ephemeral-storage limits.' 
% [ + utils.ifShowMultiCluster($._config, ' {{ $labels.%(clusterLabel)s }}' % $._config), + ], + summary: 'Cluster is evicting pods due to {{ $labels.eviction_signal }}.', + }, + }, { alert: 'KubeletPlegDurationHigh', expr: ||| diff --git a/alerts/resource_alerts.libsonnet b/alerts/resource_alerts.libsonnet index 83c8f1a4b..75abb51af 100644 --- a/alerts/resource_alerts.libsonnet +++ b/alerts/resource_alerts.libsonnet @@ -17,8 +17,6 @@ local utils = import '../lib/utils.libsonnet'; // See https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/FAQ.md#how-can-i-configure-overprovisioning-with-cluster-autoscaler // for more details. ignoringOverprovisionedWorkloadSelector: '', - // Max evictions per second that will trigger an alert. The default value generally allows for only one pod occasionally being evicted. Any more evictions than that will trigger the alert. - highEvictionRateThreshold: 0.002, }, prometheusAlerts+:: { @@ -225,22 +223,6 @@ local utils = import '../lib/utils.libsonnet'; summary: 'Processes experience elevated CPU throttling.', }, }, - { - alert: 'KubeEvictionRateHigh', - expr: ||| - sum(rate(kubelet_evictions{%(kubeletSelector)s}[15m])) by (%(clusterLabel)s) > %(highEvictionRateThreshold)s - ||| % $._config, - labels: { - severity: 'warning', - }, - 'for': '1m', - annotations: { - description: 'The cluster%s is evicting Pods at an unexpectedly high rate. This is typically caused by pods frequently exceeding RAM/ephemeral-storage limits or by nodes being NotReady for extended periods.' 
% [ - utils.ifShowMultiCluster($._config, ' {{ $labels.%(clusterLabel)s }}' % $._config), - ], - summary: 'Cluster is evicting pods at an unexpectedly high rate.', - }, - }, ], }, ], From 09488cb9731d9a177d0271057553ca015fb1cb2d Mon Sep 17 00:00:00 2001 From: TheRealNoob Date: Thu, 6 Feb 2025 14:56:18 -0600 Subject: [PATCH 07/19] update runbook Signed-off-by: TheRealNoob --- alerts/kubelet.libsonnet | 10 +++++----- runbook.md | 13 ++++++++++++- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/alerts/kubelet.libsonnet b/alerts/kubelet.libsonnet index 7c450f25c..c2d9930b8 100644 --- a/alerts/kubelet.libsonnet +++ b/alerts/kubelet.libsonnet @@ -123,19 +123,19 @@ local utils = import '../lib/utils.libsonnet'; }, }, { - alert: 'KubeEvictions', + alert: 'KubeNodeEvictions', expr: ||| - sum(rate(kubelet_evictions{%(kubeletSelector)s}[15m])) by(%(clusterLabel)s, eviction_signal) > %(evictionRateThreshold)s + sum(rate(kubelet_evictions{%(kubeletSelector)s}[15m])) by(%(clusterLabel)s, eviction_signal, node) > %(evictionRateThreshold)s ||| % $._config, labels: { severity: 'info', }, 'for': '0s', annotations: { - description: 'The%s cluster is evicting Pods. This is caused by conditions such as MemoryPressure, DiskPressure, or PIDPressure. These are typically caused by Pods exceeding RAM/ephemeral-storage limits.' % [ - utils.ifShowMultiCluster($._config, ' {{ $labels.%(clusterLabel)s }}' % $._config), + description: 'Node {{ $labels.node }}%s is evicting Pods due to {{ $labels.eviction_signal }}. Eviction occurs when eviction thresholds are crossed, typically caused by Pods exceeding RAM/ephemeral-storage limits.' 
% [ + utils.ifShowMultiCluster($._config, ' in {{ $labels.%(clusterLabel)s }}' % $._config), ], - summary: 'Cluster is evicting pods due to {{ $labels.eviction_signal }}.', + summary: 'Node is evicting pods.', }, }, { diff --git a/runbook.md b/runbook.md index f30238902..4014366d6 100644 --- a/runbook.md +++ b/runbook.md @@ -182,10 +182,16 @@ This page collects this repositories alerts and begins the process of describing ### Group Name: "kubernetes-system" ##### Alert Name: "KubeNodeNotReady" -+ *Message*: `{{ $labels.node }} has been unready for more than 15 minutes."` ++ *Message*: `{{ $labels.node }} has been unready for more than 15 minutes.` + *Severity*: warning + *Runbook*: [Link](https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodenotready/) +##### Alert Name: "KubeNodePressure" ++ *Message*: `{{ $labels.node }} has active Condition {{ $labels.condition }}. This is caused by resource usage exceeding eviction thresholds.` ++ *Severity*: info ++ *Runbook*: [Link](https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodepressure/) +# If soft thresholds are crossed, pods will be evicted respecting TerminationGracePeriod. If Hard thresholds are crossed grace period will be ignored. + ##### Alert Name: "KubeNodeUnreachable" + *Message*: `{{ $labels.node }} is unreachable and some workloads may be rescheduled.` + *Severity*: warning @@ -201,6 +207,11 @@ This page collects this repositories alerts and begins the process of describing + *Severity*: warning + *Runbook*: [Link](https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodereadinessflapping/) +##### Alert Name: "KubeNodeEvictions" ++ *Message*: `Node {{ $labels.node }} is evicting Pods due to {{ $labels.eviction_signal }}. 
Eviction occurs when eviction thresholds are crossed, typically caused by Pods exceeding RAM/ephemeral-storage limits.` ++ *Severity*: info ++ *Runbook*: [Link](https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodeevictions) + ##### Alert Name: "KubeletPlegDurationHigh" + *Message*: `The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}.` + *Severity*: warning From f296e4d6995a44a976b32d5ffb0f822ea39e912e Mon Sep 17 00:00:00 2001 From: TheRealNoob Date: Thu, 6 Feb 2025 16:06:56 -0600 Subject: [PATCH 08/19] add tests Signed-off-by: TheRealNoob --- tests.yaml | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/tests.yaml b/tests.yaml index 3dd5920ea..6313ad7b0 100644 --- a/tests.yaml +++ b/tests.yaml @@ -601,6 +601,39 @@ tests: description: 'minikube has been unready for more than 15 minutes.' runbook_url: 'https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready' +- interval: 1m + input_series: + # node=minikube is uncordoned so we expect the alert to fire + - series: 'kube_node_status_condition{condition="MemoryPressure",status="true",endpoint="https-main",cluster="kubernetes",instance="10.0.2.15:10250",job="kube-state-metrics",namespace="monitoring",node="minikube",pod="kube-state-metrics-b894d84cc-d6htw",service="kube-state-metrics"}' + values: '0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' + - series: 'kube_node_spec_unschedulable{endpoint="https-main",cluster="kubernetes",instance="10.0.2.15:10250",job="kube-state-metrics",namespace="monitoring",node="minikube",pod="kube-state-metrics-b894d84cc-d6htw",service="kube-state-metrics"}' + values: '0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0' + # node=minikube2 is cordoned so we expect the alert to not fire + - series: 
'kube_node_status_condition{condition="MemoryPressure",status="true",endpoint="https-main",cluster="kubernetes",instance="10.0.2.15:10250",job="kube-state-metrics",namespace="monitoring",node="minikube2",pod="kube-state-metrics-b894d84cc-d6htw",service="kube-state-metrics"}' + values: '0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' + - series: 'kube_node_spec_unschedulable{endpoint="https-main",cluster="kubernetes",instance="10.0.2.15:10250",job="kube-state-metrics",namespace="monitoring",node="minikube2",pod="kube-state-metrics-b894d84cc-d6htw",service="kube-state-metrics"}' + values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' + alert_rule_test: + - eval_time: 15m + alertname: KubeNodePressure + exp_alerts: + - exp_labels: + condition: MemoryPressure + status: "true" + cluster: kubernetes + node: minikube + severity: warning + endpoint: https-main + instance: 10.0.2.15:10250 + job: kube-state-metrics + namespace: monitoring + pod: kube-state-metrics-b894d84cc-d6htw + service: kube-state-metrics + exp_annotations: + summary: "Node has as active Condition." + description: 'minikube has active Condition MemoryPressure. This is caused by resource usage exceeding eviction thresholds.' + runbook_url: 'https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodepressure' + - interval: 1m input_series: # node=minikube is uncordoned so we expect the alert to fire @@ -626,6 +659,32 @@ tests: description: 'The readiness status of node minikube has changed 9 times in the last 15 minutes.' runbook_url: 'https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodereadinessflapping' +- interval: 1m + input_series: + # This metric only appears with non-zero values, meaning the transition from 0 --> 1 doesn't trigger an alert + # However, since that's undesired behavior it'd be kinda pointless to test for it, even though it's expected. 
+ - series: 'kubelet_evictions{eviction_signal="memory.available",endpoint="https-main",cluster="kubernetes",instance="10.0.2.15:10250",job="kube-state-metrics",namespace="monitoring",node="minikube",pod="kube-state-metrics-b894d84cc-d6htw",service="kube-state-metrics"}' + values: '1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 3 3 3 3 3' + alert_rule_test: + - eval_time: 18m + alertname: KubeNodeEvictions + exp_alerts: + - exp_labels: + eviction_signal: memory.available + cluster: kubernetes + node: minikube + severity: warning + endpoint: https-main + instance: 10.0.2.15:10250 + job: kube-state-metrics + namespace: monitoring + pod: kube-state-metrics-b894d84cc-d6htw + service: kube-state-metrics + exp_annotations: + summary: "Node is evicting pods." + description: 'Node minikube is evicting Pods due to memory.available. Eviction occurs when eviction thresholds are crossed, typically caused by Pods exceeding RAM/ephemeral-storage limits.' + runbook_url: 'https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodeevictions' + # Verify that node:node_num_cpu:sum triggers no many-to-many errors. - interval: 1m input_series: From 611836038000c715c62227bb3edecf2c232518a9 Mon Sep 17 00:00:00 2001 From: TheRealNoob Date: Fri, 7 Feb 2025 12:02:39 -0600 Subject: [PATCH 09/19] fix tests Signed-off-by: TheRealNoob --- tests.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests.yaml b/tests.yaml index 6313ad7b0..e5d5e9a8a 100644 --- a/tests.yaml +++ b/tests.yaml @@ -622,7 +622,7 @@ tests: status: "true" cluster: kubernetes node: minikube - severity: warning + severity: info endpoint: https-main instance: 10.0.2.15:10250 job: kube-state-metrics @@ -663,7 +663,7 @@ tests: input_series: # This metric only appears with non-zero values, meaning the transition from 0 --> 1 doesn't trigger an alert # However, since that's undesired behavior it'd be kinda pointless to test for it, even though it's expected. 
- - series: 'kubelet_evictions{eviction_signal="memory.available",endpoint="https-main",cluster="kubernetes",instance="10.0.2.15:10250",job="kube-state-metrics",namespace="monitoring",node="minikube",pod="kube-state-metrics-b894d84cc-d6htw",service="kube-state-metrics"}' + - series: 'kubelet_evictions{eviction_signal="memory.available",endpoint="https-main",cluster="kubernetes",instance="10.0.2.15:10250",job="kubelet",namespace="monitoring",node="minikube",pod="kube-state-metrics-b894d84cc-d6htw",service="kube-state-metrics"}' values: '1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 3 3 3 3 3' alert_rule_test: - eval_time: 18m @@ -673,10 +673,10 @@ tests: eviction_signal: memory.available cluster: kubernetes node: minikube - severity: warning + severity: info endpoint: https-main instance: 10.0.2.15:10250 - job: kube-state-metrics + job: kubelet namespace: monitoring pod: kube-state-metrics-b894d84cc-d6htw service: kube-state-metrics From d13e847b1afdc3db6c6b4214bb7e5faab39669f7 Mon Sep 17 00:00:00 2001 From: TheRealNoob Date: Fri, 7 Feb 2025 12:13:59 -0600 Subject: [PATCH 10/19] fix "smelly selector" syntax preference this turned out to be a good change because it made me realize there was an additional label value here that wasn't being handled.
Signed-off-by: TheRealNoob --- alerts/kubelet.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/alerts/kubelet.libsonnet b/alerts/kubelet.libsonnet index c2d9930b8..027a7fd3b 100644 --- a/alerts/kubelet.libsonnet +++ b/alerts/kubelet.libsonnet @@ -43,7 +43,7 @@ local utils = import '../lib/utils.libsonnet'; { alert: 'KubeNodePressure', expr: ||| - kube_node_status_condition{%(kubeStateMetricsSelector)s,condition=~".+Pressure",status="true"} == 1 + kube_node_status_condition{%(kubeStateMetricsSelector)s,condition=!"Ready",status="true"} == 1 and on (%(clusterLabel)s, node) kube_node_spec_unschedulable{%(kubeStateMetricsSelector)s} == 0 ||| % $._config, From 5b396f18db92002434da2100815ec80b383b70b2 Mon Sep 17 00:00:00 2001 From: TheRealNoob Date: Fri, 7 Feb 2025 10:15:23 -0800 Subject: [PATCH 11/19] Update alerts/kubelet.libsonnet Co-authored-by: Stephen Lang --- alerts/kubelet.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/alerts/kubelet.libsonnet b/alerts/kubelet.libsonnet index 027a7fd3b..9491c9df5 100644 --- a/alerts/kubelet.libsonnet +++ b/alerts/kubelet.libsonnet @@ -133,7 +133,7 @@ local utils = import '../lib/utils.libsonnet'; 'for': '0s', annotations: { description: 'Node {{ $labels.node }}%s is evicting Pods due to {{ $labels.eviction_signal }}. Eviction occurs when eviction thresholds are crossed, typically caused by Pods exceeding RAM/ephemeral-storage limits.' 
% [ - utils.ifShowMultiCluster($._config, ' in {{ $labels.%(clusterLabel)s }}' % $._config), + utils.ifShowMultiCluster($._config, ' on {{ $labels.%(clusterLabel)s }}' % $._config), ], summary: 'Node is evicting pods.', }, From 4a2fe4c1d4b89ba7ab1d2a9b691247fa8ac6634f Mon Sep 17 00:00:00 2001 From: TheRealNoob Date: Fri, 7 Feb 2025 11:27:38 -0800 Subject: [PATCH 12/19] Update alerts/kubelet.libsonnet Co-authored-by: Stephen Lang --- alerts/kubelet.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/alerts/kubelet.libsonnet b/alerts/kubelet.libsonnet index 7de87a5a2..d5584e996 100644 --- a/alerts/kubelet.libsonnet +++ b/alerts/kubelet.libsonnet @@ -43,7 +43,7 @@ local utils = import '../lib/utils.libsonnet'; { alert: 'KubeNodePressure', expr: ||| - kube_node_status_condition{%(kubeStateMetricsSelector)s,condition=!"Ready",status="true"} == 1 + kube_node_status_condition{%(kubeStateMetricsSelector)s,condition=~"(MemoryPressure|DiskPressure|PIDPressure)",status="true"} == 1 and on (%(clusterLabel)s, node) kube_node_spec_unschedulable{%(kubeStateMetricsSelector)s} == 0 ||| % $._config, From 87f2ff2058f2ceeb9d94ce8e18b9498a17e3d398 Mon Sep 17 00:00:00 2001 From: TheRealNoob Date: Fri, 7 Feb 2025 18:01:50 -0600 Subject: [PATCH 13/19] add test KubeNodePressure Signed-off-by: TheRealNoob --- tests/tests.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/tests.yaml b/tests/tests.yaml index 2d883b782..07c00694d 100644 --- a/tests/tests.yaml +++ b/tests/tests.yaml @@ -639,6 +639,10 @@ tests: - series: 'kube_node_spec_unschedulable{endpoint="https-main",cluster="kubernetes",instance="10.0.2.15:10250",job="kube-state-metrics",namespace="monitoring",node="minikube2",pod="kube-state-metrics-b894d84cc-d6htw",service="kube-state-metrics"}' values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' alert_rule_test: + # does not meet 'for: 10m' threshold + - eval_time: 12m + alertname: KubeNodePressure + exp_alerts: [] - eval_time: 15m alertname: 
KubeNodePressure exp_alerts: From 0478f9b2078bb72087c09b852f0e9aadfdd70fb8 Mon Sep 17 00:00:00 2001 From: Stephen Lang Date: Fri, 14 Feb 2025 11:48:59 +0000 Subject: [PATCH 14/19] chore: make --always-make markdownfmt --- runbook.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/runbook.md b/runbook.md index 4014366d6..d3cc2b92b 100644 --- a/runbook.md +++ b/runbook.md @@ -190,7 +190,8 @@ This page collects this repositories alerts and begins the process of describing + *Message*: `{{ $labels.node }} has active Condition {{ $labels.condition }}. This is caused by resource usage exceeding eviction thresholds.` + *Severity*: info + *Runbook*: [Link](https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodepressure/) -# If soft thresholds are crossed, pods will be evicted respecting TerminationGracePeriod. If Hard thresholds are crossed grace period will be ignored. + +# If soft thresholds are crossed, pods will be evicted respecting TerminationGracePeriod. If Hard thresholds are crossed grace period will be ignored. ##### Alert Name: "KubeNodeUnreachable" + *Message*: `{{ $labels.node }} is unreachable and some workloads may be rescheduled.` @@ -208,7 +209,7 @@ This page collects this repositories alerts and begins the process of describing + *Runbook*: [Link](https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodereadinessflapping/) ##### Alert Name: "KubeNodeEvictions" -+ *Message*: `Node {{ $labels.node }} is evicting Pods due to {{ $labels.eviction_signal }}. Eviction occurs when eviction thresholds are crossed, typically caused by Pods exceeding RAM/ephemeral-storage limits.` ++ *Message*: `Node {{ $labels.node }} is evicting Pods due to {{ $labels.eviction_signal }}. 
Eviction occurs when eviction thresholds are crossed, typically caused by Pods exceeding RAM/ephemeral-storage limits.` + *Severity*: info + *Runbook*: [Link](https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodeevictions) From 3f64e21a14dc8f43e396fc420efdf2e9313af849 Mon Sep 17 00:00:00 2001 From: TheRealNoob Date: Mon, 17 Feb 2025 21:45:21 -0600 Subject: [PATCH 15/19] rename KubeNodeEviction, fix test case Signed-off-by: TheRealNoob --- alerts/kubelet.libsonnet | 10 +++++++--- runbook.md | 6 ++---- tests/tests.yaml | 10 ++-------- 3 files changed, 11 insertions(+), 15 deletions(-) diff --git a/alerts/kubelet.libsonnet b/alerts/kubelet.libsonnet index d5584e996..ee742adad 100644 --- a/alerts/kubelet.libsonnet +++ b/alerts/kubelet.libsonnet @@ -14,7 +14,7 @@ local utils = import '../lib/utils.libsonnet'; kubeletCertExpirationCriticalSeconds: 1 * 24 * 3600, // Evictions per second that will trigger an alert. The default value will trigger on any evictions. 
- evictionRateThreshold: 0.0, + KubeNodeEvictionRateThreshold: 0.0, }, prometheusAlerts+:: { @@ -125,9 +125,13 @@ local utils = import '../lib/utils.libsonnet'; }, }, { - alert: 'KubeNodeEvictions', + alert: 'KubeNodeEviction', expr: ||| - sum(rate(kubelet_evictions{%(kubeletSelector)s}[15m])) by(%(clusterLabel)s, eviction_signal, node) > %(evictionRateThreshold)s + sum(rate(kubelet_evictions{%(kubeletSelector)s}[15m])) by(%(clusterLabel)s, eviction_signal, instance) > %(KubeNodeEvictionRateThreshold)s + * on (%(clusterLabel)s, instance) group_left(node) + max by (%(clusterLabel)s, instance, node) ( + kubelet_node_name{%(kubeletSelector)s} + ) ||| % $._config, labels: { severity: 'info', diff --git a/runbook.md b/runbook.md index d3cc2b92b..7a33cd03e 100644 --- a/runbook.md +++ b/runbook.md @@ -191,8 +191,6 @@ This page collects this repositories alerts and begins the process of describing + *Severity*: info + *Runbook*: [Link](https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodepressure/) -# If soft thresholds are crossed, pods will be evicted respecting TerminationGracePeriod. If Hard thresholds are crossed grace period will be ignored. - ##### Alert Name: "KubeNodeUnreachable" + *Message*: `{{ $labels.node }} is unreachable and some workloads may be rescheduled.` + *Severity*: warning @@ -208,10 +206,10 @@ This page collects this repositories alerts and begins the process of describing + *Severity*: warning + *Runbook*: [Link](https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodereadinessflapping/) -##### Alert Name: "KubeNodeEvictions" +##### Alert Name: "KubeNodeEviction" + *Message*: `Node {{ $labels.node }} is evicting Pods due to {{ $labels.eviction_signal }}. 
Eviction occurs when eviction thresholds are crossed, typically caused by Pods exceeding RAM/ephemeral-storage limits.` + *Severity*: info -+ *Runbook*: [Link](https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodeevictions) ++ *Runbook*: [Link](https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodeeviction) ##### Alert Name: "KubeletPlegDurationHigh" + *Message*: `The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}.` diff --git a/tests/tests.yaml b/tests/tests.yaml index 07c00694d..718029c86 100644 --- a/tests/tests.yaml +++ b/tests/tests.yaml @@ -696,23 +696,17 @@ tests: values: '1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 3 3 3 3 3' alert_rule_test: - eval_time: 18m - alertname: KubeNodeEvictions + alertname: KubeNodeEviction exp_alerts: - exp_labels: eviction_signal: memory.available cluster: kubernetes node: minikube severity: info - endpoint: https-main - instance: 10.0.2.15:10250 - job: kubelet - namespace: monitoring - pod: kube-state-metrics-b894d84cc-d6htw - service: kube-state-metrics exp_annotations: summary: "Node is evicting pods." description: 'Node minikube is evicting Pods due to memory.available. Eviction occurs when eviction thresholds are crossed, typically caused by Pods exceeding RAM/ephemeral-storage limits.' - runbook_url: 'https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodeevictions' + runbook_url: 'https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodeeviction' # Verify that node:node_num_cpu:sum triggers no many-to-many errors. 
- interval: 1m From 8e5b1bcd4ae441ddd39b43ad34302fb09689c786 Mon Sep 17 00:00:00 2001 From: TheRealNoob Date: Mon, 17 Feb 2025 21:52:00 -0600 Subject: [PATCH 16/19] remove in-progress change Signed-off-by: TheRealNoob --- alerts/kubelet.libsonnet | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/alerts/kubelet.libsonnet b/alerts/kubelet.libsonnet index ee742adad..559929510 100644 --- a/alerts/kubelet.libsonnet +++ b/alerts/kubelet.libsonnet @@ -127,11 +127,7 @@ local utils = import '../lib/utils.libsonnet'; { alert: 'KubeNodeEviction', expr: ||| - sum(rate(kubelet_evictions{%(kubeletSelector)s}[15m])) by(%(clusterLabel)s, eviction_signal, instance) > %(KubeNodeEvictionRateThreshold)s - * on (%(clusterLabel)s, instance) group_left(node) - max by (%(clusterLabel)s, instance, node) ( - kubelet_node_name{%(kubeletSelector)s} - ) + sum(rate(kubelet_evictions{%(kubeletSelector)s}[15m])) by(%(clusterLabel)s, eviction_signal, node) > %(KubeNodeEvictionRateThreshold)s ||| % $._config, labels: { severity: 'info', From 9bd81afeb6b1e2b10a26e6fbd6f997d077e4815d Mon Sep 17 00:00:00 2001 From: TheRealNoob Date: Fri, 21 Feb 2025 18:52:58 -0800 Subject: [PATCH 17/19] Update kubelet.libsonnet update KubeNodeEviction query --- alerts/kubelet.libsonnet | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/alerts/kubelet.libsonnet b/alerts/kubelet.libsonnet index 559929510..c47aff89c 100644 --- a/alerts/kubelet.libsonnet +++ b/alerts/kubelet.libsonnet @@ -127,7 +127,12 @@ local utils = import '../lib/utils.libsonnet'; { alert: 'KubeNodeEviction', expr: ||| - sum(rate(kubelet_evictions{%(kubeletSelector)s}[15m])) by(%(clusterLabel)s, eviction_signal, node) > %(KubeNodeEvictionRateThreshold)s + sum(rate(kubelet_evictions{%(kubeletSelector)s}[15m])) by(%(clusterLabel)s, eviction_signal, instance) + * on (%(clusterLabel)s, instance) group_left(node) + max by (%(clusterLabel)s, instance, node) ( + kubelet_node_name{%(kubeletSelector)s} + ) + > 
%(KubeNodeEvictionRateThreshold)s ||| % $._config, labels: { severity: 'info', From 53c589fd4ae9dde5a70eba3d1abbcf3b9e56d6b6 Mon Sep 17 00:00:00 2001 From: TheRealNoob Date: Fri, 21 Feb 2025 19:06:29 -0800 Subject: [PATCH 18/19] Update tests.yaml update KubeNodeEviction test case --- tests/tests.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/tests.yaml b/tests/tests.yaml index 718029c86..05163ac53 100644 --- a/tests/tests.yaml +++ b/tests/tests.yaml @@ -694,6 +694,8 @@ tests: # However, since that's undesired behavior it'd be kinda pointless to test for it, even though it's expected. - series: 'kubelet_evictions{eviction_signal="memory.available",endpoint="https-main",cluster="kubernetes",instance="10.0.2.15:10250",job="kubelet",namespace="monitoring",node="minikube",pod="kube-state-metrics-b894d84cc-d6htw",service="kube-state-metrics"}' values: '1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 3 3 3 3 3' + - series: 'kubelet_node_name{cluster="kubernetes", instance="10.0.2.15:10250", node="minikube", job="kubelet"}' + values: '1x20' alert_rule_test: - eval_time: 18m alertname: KubeNodeEviction From dfb87a29721894c7645b6d881aff13a50a5fcbad Mon Sep 17 00:00:00 2001 From: TheRealNoob Date: Wed, 26 Feb 2025 13:23:59 -0800 Subject: [PATCH 19/19] update test KubeNodeEviction Co-authored-by: Stephen Lang --- tests/tests.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/tests.yaml b/tests/tests.yaml index 05163ac53..24dcfba67 100644 --- a/tests/tests.yaml +++ b/tests/tests.yaml @@ -704,6 +704,7 @@ tests: eviction_signal: memory.available cluster: kubernetes node: minikube + instance: 10.0.2.15:10250 severity: info exp_annotations: summary: "Node is evicting pods."