alerts/kubelet.libsonnet (37 additions, 0 deletions)
@@ -12,6 +12,9 @@ local utils = import '../lib/utils.libsonnet';

kubeletCertExpirationWarningSeconds: 7 * 24 * 3600,
kubeletCertExpirationCriticalSeconds: 1 * 24 * 3600,

// Evictions per second that will trigger an alert. The default value will trigger on any evictions.
evictionRateThreshold: 0.0,
},

prometheusAlerts+:: {
@@ -37,6 +40,24 @@ local utils = import '../lib/utils.libsonnet';
'for': '15m',
alert: 'KubeNodeNotReady',
},
{
alert: 'KubeNodePressure',
expr: |||
kube_node_status_condition{%(kubeStateMetricsSelector)s,condition!="Ready",status="true"} == 1
and on (%(clusterLabel)s, node)
kube_node_spec_unschedulable{%(kubeStateMetricsSelector)s} == 0
||| % $._config,
labels: {
severity: 'info',
},
'for': '10m',
annotations: {
description: '{{ $labels.node }}%s has active Condition {{ $labels.condition }}. This is caused by resource usage exceeding eviction thresholds.' % [
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
],
summary: 'Node has an active Condition.',
},
},
{
expr: |||
(kube_node_spec_taint{%(kubeStateMetricsSelector)s,key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{%(kubeStateMetricsSelector)s,key=~"%(kubeNodeUnreachableIgnoreKeys)s"}) == 1
@@ -101,6 +122,22 @@ local utils = import '../lib/utils.libsonnet';
summary: 'Node readiness status is flapping.',
},
},
{
alert: 'KubeNodeEvictions',
expr: |||
sum(rate(kubelet_evictions{%(kubeletSelector)s}[15m])) by(%(clusterLabel)s, eviction_signal, node) > %(evictionRateThreshold)s
||| % $._config,
labels: {
severity: 'info',
},
'for': '0s',
annotations: {
description: 'Node {{ $labels.node }}%s is evicting Pods due to {{ $labels.eviction_signal }}. Eviction occurs when eviction thresholds are crossed, typically caused by Pods exceeding RAM/ephemeral-storage limits.' % [
utils.ifShowMultiCluster($._config, ' on {{ $labels.%(clusterLabel)s }}' % $._config),
],
summary: 'Node is evicting pods.',
},
},
{
alert: 'KubeletPlegDurationHigh',
expr: |||
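The `evictionRateThreshold` option added above defaults to `0.0`, so the new KubeNodeEvictions alert fires on any sustained eviction rate. A minimal sketch of how a consumer of this mixin could raise that threshold, assuming the repository's standard `mixin.libsonnet` entry point; the import path and the 0.1 evictions/second value are illustrative, not taken from this PR:

```jsonnet
// Sketch only: import path and the 0.1 value are illustrative assumptions.
local kubernetesMixin = import 'kubernetes-mixin/mixin.libsonnet';

kubernetesMixin {
  _config+:: {
    // Fire KubeNodeEvictions only above ~6 evictions per minute per node.
    evictionRateThreshold: 0.1,
  },
}
```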
runbook.md (12 additions, 1 deletion)
@@ -182,10 +182,16 @@ This page collects this repository's alerts and begins the process of describing
### Group Name: "kubernetes-system"

##### Alert Name: "KubeNodeNotReady"
+ *Message*: `{{ $labels.node }} has been unready for more than 15 minutes.`
+ *Severity*: warning
+ *Runbook*: [Link](https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodenotready/)

##### Alert Name: "KubeNodePressure"
+ *Message*: `{{ $labels.node }} has active Condition {{ $labels.condition }}. This is caused by resource usage exceeding eviction thresholds.`
+ *Severity*: info
+ *Runbook*: [Link](https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodepressure/)
+ *Note*: If soft eviction thresholds are crossed, Pods are evicted with their termination grace period respected; if hard thresholds are crossed, the grace period is ignored.
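For background, the thresholds mentioned in this note are set in the kubelet's configuration. A rough KubeletConfiguration sketch (field names are from `kubelet.config.k8s.io/v1beta1`; the values are illustrative, not recommendations) contrasting soft thresholds, which honor a grace period, with hard thresholds, which do not:

```yaml
# Illustrative values only; not recommendations.
apiVersion: kubelet.config.k8s.io/v1beta1
kind: KubeletConfiguration
evictionSoft:
  memory.available: "500Mi"        # soft threshold: eviction waits out the grace period below
evictionSoftGracePeriod:
  memory.available: "1m30s"
evictionMaxPodGracePeriod: 60      # upper bound on Pod termination grace for soft evictions
evictionHard:
  memory.available: "100Mi"        # hard threshold: Pods are killed immediately, grace period ignored
  nodefs.available: "10%"
```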

##### Alert Name: "KubeNodeUnreachable"
+ *Message*: `{{ $labels.node }} is unreachable and some workloads may be rescheduled.`
+ *Severity*: warning
@@ -201,6 +207,11 @@ This page collects this repository's alerts and begins the process of describing
+ *Severity*: warning
+ *Runbook*: [Link](https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodereadinessflapping/)

##### Alert Name: "KubeNodeEvictions"
+ *Message*: `Node {{ $labels.node }} is evicting Pods due to {{ $labels.eviction_signal }}. Eviction occurs when eviction thresholds are crossed, typically caused by Pods exceeding RAM/ephemeral-storage limits.`
+ *Severity*: info
+ *Runbook*: [Link](https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodeevictions)

##### Alert Name: "KubeletPlegDurationHigh"
+ *Message*: `The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}.`
+ *Severity*: warning
tests.yaml (59 additions, 0 deletions)
@@ -601,6 +601,39 @@ tests:
description: 'minikube has been unready for more than 15 minutes.'
runbook_url: 'https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready'

- interval: 1m
input_series:
# node=minikube is uncordoned so we expect the alert to fire
- series: 'kube_node_status_condition{condition="MemoryPressure",status="true",endpoint="https-main",cluster="kubernetes",instance="10.0.2.15:10250",job="kube-state-metrics",namespace="monitoring",node="minikube",pod="kube-state-metrics-b894d84cc-d6htw",service="kube-state-metrics"}'
values: '0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
- series: 'kube_node_spec_unschedulable{endpoint="https-main",cluster="kubernetes",instance="10.0.2.15:10250",job="kube-state-metrics",namespace="monitoring",node="minikube",pod="kube-state-metrics-b894d84cc-d6htw",service="kube-state-metrics"}'
values: '0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'
# node=minikube2 is cordoned so we expect the alert to not fire
- series: 'kube_node_status_condition{condition="MemoryPressure",status="true",endpoint="https-main",cluster="kubernetes",instance="10.0.2.15:10250",job="kube-state-metrics",namespace="monitoring",node="minikube2",pod="kube-state-metrics-b894d84cc-d6htw",service="kube-state-metrics"}'
values: '0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
- series: 'kube_node_spec_unschedulable{endpoint="https-main",cluster="kubernetes",instance="10.0.2.15:10250",job="kube-state-metrics",namespace="monitoring",node="minikube2",pod="kube-state-metrics-b894d84cc-d6htw",service="kube-state-metrics"}'
values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
alert_rule_test:
- eval_time: 15m
alertname: KubeNodePressure
exp_alerts:
- exp_labels:
condition: MemoryPressure
status: "true"
cluster: kubernetes
node: minikube
severity: info
endpoint: https-main
instance: 10.0.2.15:10250
job: kube-state-metrics
namespace: monitoring
pod: kube-state-metrics-b894d84cc-d6htw
service: kube-state-metrics
exp_annotations:
summary: "Node has an active Condition."
description: 'minikube has active Condition MemoryPressure. This is caused by resource usage exceeding eviction thresholds.'
runbook_url: 'https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodepressure'

- interval: 1m
input_series:
# node=minikube is uncordoned so we expect the alert to fire
@@ -626,6 +659,32 @@ tests:
description: 'The readiness status of node minikube has changed 9 times in the last 15 minutes.'
runbook_url: 'https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodereadinessflapping'

- interval: 1m
input_series:
# kubelet_evictions is only exposed once it has a non-zero value, so the initial 0 -> 1 transition
# produces no rate increase and will not fire the alert. That is expected (if undesirable) behavior, so it is not tested here.
- series: 'kubelet_evictions{eviction_signal="memory.available",endpoint="https-main",cluster="kubernetes",instance="10.0.2.15:10250",job="kubelet",namespace="monitoring",node="minikube",pod="kube-state-metrics-b894d84cc-d6htw",service="kube-state-metrics"}'
values: '1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 3 3 3 3 3'
alert_rule_test:
- eval_time: 18m
alertname: KubeNodeEvictions
exp_alerts:
- exp_labels:
eviction_signal: memory.available
cluster: kubernetes
node: minikube
severity: info
exp_annotations:
summary: "Node is evicting pods."
description: 'Node minikube is evicting Pods due to memory.available. Eviction occurs when eviction thresholds are crossed, typically caused by Pods exceeding RAM/ephemeral-storage limits.'
runbook_url: 'https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodeevictions'

# Verify that node:node_num_cpu:sum triggers no many-to-many errors.
- interval: 1m
input_series: