From e88580b9a261cee1a317888a191aa2fe14693459 Mon Sep 17 00:00:00 2001 From: TheRealNoob Date: Fri, 17 Jan 2025 12:30:52 -0600 Subject: [PATCH 1/4] feat: filter NodeReadiness alerts on uncordoned status Signed-off-by: TheRealNoob --- alerts/kubelet.libsonnet | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/alerts/kubelet.libsonnet b/alerts/kubelet.libsonnet index 1bc4ea558..3aac388eb 100644 --- a/alerts/kubelet.libsonnet +++ b/alerts/kubelet.libsonnet @@ -22,6 +22,8 @@ local utils = import '../lib/utils.libsonnet'; { expr: ||| kube_node_status_condition{%(kubeStateMetricsSelector)s,condition="Ready",status="true"} == 0 + and on (%(clusterLabel)s, node) + kube_node_spec_unschedulable{%(kubeStateMetricsSelector)s} == 0 ||| % $._config, labels: { severity: 'warning', @@ -30,7 +32,7 @@ local utils = import '../lib/utils.libsonnet'; description: '{{ $labels.node }} has been unready for more than 15 minutes%s.' % [ utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), ], - summary: 'Node is not ready.', + summary: 'Schedulable Node is not ready.', }, 'for': '15m', alert: 'KubeNodeNotReady', @@ -85,6 +87,8 @@ local utils = import '../lib/utils.libsonnet'; alert: 'KubeNodeReadinessFlapping', expr: ||| sum(changes(kube_node_status_condition{%(kubeStateMetricsSelector)s,status="true",condition="Ready"}[15m])) by (%(clusterLabel)s, node) > 2 + and on (%(clusterLabel)s, node) + kube_node_spec_unschedulable{%(kubeStateMetricsSelector)s} == 0 ||| % $._config, 'for': '15m', labels: { @@ -94,7 +98,7 @@ local utils = import '../lib/utils.libsonnet'; description: 'The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes%s.' 
% [ utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), ], - summary: 'Node readiness status is flapping.', + summary: 'Schedulable Node readiness status is flapping.', }, }, { From 30d35ea6a6760f29afc20b7a61ec4491a60e710a Mon Sep 17 00:00:00 2001 From: TheRealNoob Date: Wed, 22 Jan 2025 15:28:24 -0600 Subject: [PATCH 2/4] add tests Signed-off-by: TheRealNoob --- alerts/kubelet.libsonnet | 4 ++-- tests.yaml | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/alerts/kubelet.libsonnet b/alerts/kubelet.libsonnet index 3aac388eb..e09170f8e 100644 --- a/alerts/kubelet.libsonnet +++ b/alerts/kubelet.libsonnet @@ -32,7 +32,7 @@ local utils = import '../lib/utils.libsonnet'; description: '{{ $labels.node }} has been unready for more than 15 minutes%s.' % [ utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), ], - summary: 'Schedulable Node is not ready.', + summary: 'Node is not ready.', }, 'for': '15m', alert: 'KubeNodeNotReady', @@ -98,7 +98,7 @@ local utils = import '../lib/utils.libsonnet'; description: 'The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes%s.' 
% [ utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), ], - summary: 'Schedulable Node readiness status is flapping.', + summary: 'Node readiness status is flapping.', }, }, { diff --git a/tests.yaml b/tests.yaml index b73aca723..839f108aa 100644 --- a/tests.yaml +++ b/tests.yaml @@ -570,8 +570,41 @@ tests: - interval: 1m input_series: + # node=minikube is unready and uncordoned so we expect the alert to fire - series: 'kube_node_status_condition{condition="Ready",endpoint="https-main",cluster="kubernetes",instance="10.0.2.15:10250",job="kube-state-metrics",namespace="monitoring",node="minikube",pod="kube-state-metrics-b894d84cc-d6htw",service="kube-state-metrics",status="true"}' + values: '0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0' + - series: 'kube_node_spec_unschedulable{endpoint="https-main",cluster="kubernetes",instance="10.0.2.15:10250",job="kube-state-metrics",namespace="monitoring",node="minikube",pod="kube-state-metrics-b894d84cc-d6htw",service="kube-state-metrics",status="true"}' + values: '0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0' + # node=minikube2 is equally unready (matches the status="true" selector) but cordoned, so only the unschedulable filter keeps the alert from firing + - series: 'kube_node_status_condition{condition="Ready",endpoint="https-main",cluster="kubernetes",instance="10.0.2.15:10250",job="kube-state-metrics",namespace="monitoring",node="minikube2",pod="kube-state-metrics-b894d84cc-f5e9f",service="kube-state-metrics",status="true"}' + values: '0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0' + - series: 'kube_node_spec_unschedulable{endpoint="https-main",cluster="kubernetes",instance="10.0.2.15:10250",job="kube-state-metrics",namespace="monitoring",node="minikube2",pod="kube-state-metrics-b894d84cc-f5e9f",service="kube-state-metrics"}' + values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' + alert_rule_test: + - eval_time: 18m + alertname: KubeNodeNotReady + exp_alerts: + - exp_labels: + cluster: kubernetes + node: minikube + severity: warning + exp_annotations: + summary: "Node is not ready." 
+ description: 'minikube has been unready for more than 15 minutes.' + runbook_url: 'https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready"' + +- interval: 1m + input_series: + # node=minikube is uncordoned so we expect the alert to fire + - series: 'kube_node_status_condition{condition="Ready",endpoint="https-main",cluster="kubernetes",instance="10.0.2.15:10250",job="kube-state-metrics",namespace="monitoring",node="minikube",pod="kube-state-metrics-b894d84cc-d6htw",service="kube-state-metrics",status="true"}' + values: '1 0 1 0 1 0 0 0 1 0 1 0 0 0 1 0 1 0 0 1' + - series: 'kube_node_spec_unschedulable{endpoint="https-main",cluster="kubernetes",instance="10.0.2.15:10250",job="kube-state-metrics",namespace="monitoring",node="minikube",pod="kube-state-metrics-b894d84cc-d6htw",service="kube-state-metrics",status="true"}' + values: '0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0' + # node=minikube2 is cordoned so we expect the alert to not fire + - series: 'kube_node_status_condition{condition="Ready",endpoint="https-main",cluster="kubernetes",instance="10.0.2.15:10250",job="kube-state-metrics",namespace="monitoring",node="minikube2",pod="kube-state-metrics-b894d84cc-f5e9f",service="kube-state-metrics",status="true"}' values: '1 0 1 0 1 0 0 0 1 0 1 0 0 0 1 0 1 0 0 1' + - series: 'kube_node_spec_unschedulable{endpoint="https-main",cluster="kubernetes",instance="10.0.2.15:10250",job="kube-state-metrics",namespace="monitoring",node="minikube2",pod="kube-state-metrics-b894d84cc-f5e9f",service="kube-state-metrics"}' + values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' alert_rule_test: - eval_time: 18m alertname: KubeNodeReadinessFlapping From 16880375db4ba84a9203d60f7b18369f30bc4946 Mon Sep 17 00:00:00 2001 From: Stephen Lang Date: Thu, 23 Jan 2025 11:59:46 +0000 Subject: [PATCH 3/4] Update tests.yaml --- tests.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests.yaml b/tests.yaml index 839f108aa..f82678611 
100644 --- a/tests.yaml +++ b/tests.yaml @@ -588,6 +588,14 @@ tests: cluster: kubernetes node: minikube severity: warning + condition: Ready + endpoint: https-main + instance: 10.0.2.15:10250 + job: kube-state-metrics + namespace: monitoring + pod: kube-state-metrics-b894d84cc-d6htw + service: kube-state-metrics + status: "true" exp_annotations: summary: "Node is not ready." description: 'minikube has been unready for more than 15 minutes.' From d2f9d0a580796f274fb09abe7067de8d179131ca Mon Sep 17 00:00:00 2001 From: Stephen Lang Date: Thu, 23 Jan 2025 12:07:18 +0000 Subject: [PATCH 4/4] Update tests.yaml --- tests.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests.yaml b/tests.yaml index f82678611..3dd5920ea 100644 --- a/tests.yaml +++ b/tests.yaml @@ -599,7 +599,7 @@ tests: exp_annotations: summary: "Node is not ready." description: 'minikube has been unready for more than 15 minutes.' - runbook_url: 'https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready"' + runbook_url: 'https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready' - interval: 1m input_series: