Skip to content

Commit 612e6eb

Browse files
authored
Merge pull request #476 from brancz/improved-daemonset-alert
alerts: Improve DaemonSet rollout alert taking progress into account
2 parents 9d2c182 + cc3ee2c commit 612e6eb

File tree

2 files changed

+140
-4
lines changed

2 files changed

+140
-4
lines changed

alerts/apps_alerts.libsonnet

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -150,15 +150,35 @@
150150
{
151151
alert: 'KubeDaemonSetRolloutStuck',
152152
expr: |||
153-
kube_daemonset_status_number_ready{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
154-
/
155-
kube_daemonset_status_desired_number_scheduled{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s} < 1.00
153+
(
154+
(
155+
kube_daemonset_status_current_number_scheduled{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
156+
!=
157+
kube_daemonset_status_desired_number_scheduled{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
158+
) or (
159+
kube_daemonset_status_number_misscheduled{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
160+
!=
161+
0
162+
) or (
163+
kube_daemonset_updated_number_scheduled{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
164+
!=
165+
kube_daemonset_status_desired_number_scheduled{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
166+
) or (
167+
kube_daemonset_status_number_available{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
168+
!=
169+
kube_daemonset_status_desired_number_scheduled{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
170+
)
171+
) and (
172+
changes(kube_daemonset_updated_number_scheduled{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}[5m])
173+
==
174+
0
175+
)
156176
||| % $._config,
157177
labels: {
158178
severity: 'warning',
159179
},
160180
annotations: {
161-
message: 'Only {{ $value | humanizePercentage }} of the desired Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are scheduled and ready.',
181+
message: 'DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15 minutes.',
162182
},
163183
'for': '15m',
164184
},

tests.yaml

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -377,3 +377,119 @@ tests:
377377
exp_samples:
378378
- value: 1.0e+3
379379
labels: 'node_namespace_pod_container:container_memory_swap{container="alertmanager",endpoint="https",id="/kubepods.slice/kubepods-burstable.slice/kubepods-burstable-pod3426a9c5_53d6_4736_9ca8_f575828e3e4b.slice/crio-f0d7fb2c909605aad16946ff065a42b25cdcdb812459e712ecdd6bce8a3ed6cb.scope",image="quay.io/prometheus/alertmanager:latest",instance="instance1",job="cadvisor",name="name1",namespace="monitoring",node="node1",pod="alertmanager-main-0",service="kubelet"}'
380+
- interval: 1m
381+
# Current number scheduled is unequal to desired and the rollout is not progressing.
382+
input_series:
383+
- series: 'kube_daemonset_status_current_number_scheduled{job="kube-state-metrics",namespace="monitoring",daemonset="node-exporter"}'
384+
values: '4 4 4 4 3 4 4 4 3 4 4 4 3 4 4 4 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4'
385+
- series: 'kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics",namespace="monitoring",daemonset="node-exporter"}'
386+
values: '4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4'
387+
- series: 'kube_daemonset_status_number_misscheduled{job="kube-state-metrics",namespace="monitoring",daemonset="node-exporter"}'
388+
values: '0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'
389+
- series: 'kube_daemonset_updated_number_scheduled{job="kube-state-metrics",namespace="monitoring",daemonset="node-exporter"}'
390+
values: '4 4 0 0 0 1 1 1 1 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4'
391+
- series: 'kube_daemonset_status_number_available{job="kube-state-metrics",namespace="monitoring",daemonset="node-exporter"}'
392+
values: '4 4 4 3 3 3 4 3 3 3 4 3 3 3 4 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4'
393+
alert_rule_test:
394+
- eval_time: 32m
395+
alertname: KubeDaemonSetRolloutStuck
396+
- eval_time: 33m
397+
alertname: KubeDaemonSetRolloutStuck
398+
exp_alerts:
399+
- exp_labels:
400+
job: kube-state-metrics
401+
namespace: monitoring
402+
daemonset: node-exporter
403+
severity: warning
404+
exp_annotations:
405+
message: 'DaemonSet monitoring/node-exporter has not finished or progressed for at least 15 minutes.'
406+
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck
407+
- eval_time: 34m
408+
alertname: KubeDaemonSetRolloutStuck
409+
- interval: 1m
410+
# Number misscheduled is non-zero.
411+
input_series:
412+
- series: 'kube_daemonset_status_current_number_scheduled{job="kube-state-metrics",namespace="monitoring",daemonset="node-exporter"}'
413+
values: '4 4 4 4 3 4 4 4 3 4 4 4 3 4 4 4 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4'
414+
- series: 'kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics",namespace="monitoring",daemonset="node-exporter"}'
415+
values: '4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4'
416+
- series: 'kube_daemonset_status_number_misscheduled{job="kube-state-metrics",namespace="monitoring",daemonset="node-exporter"}'
417+
values: '0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0'
418+
- series: 'kube_daemonset_updated_number_scheduled{job="kube-state-metrics",namespace="monitoring",daemonset="node-exporter"}'
419+
values: '4 4 0 0 0 1 1 1 1 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4'
420+
- series: 'kube_daemonset_status_number_available{job="kube-state-metrics",namespace="monitoring",daemonset="node-exporter"}'
421+
values: '4 4 4 3 3 3 4 3 3 3 4 3 3 3 4 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4'
422+
alert_rule_test:
423+
- eval_time: 32m
424+
alertname: KubeDaemonSetRolloutStuck
425+
- eval_time: 33m
426+
alertname: KubeDaemonSetRolloutStuck
427+
exp_alerts:
428+
- exp_labels:
429+
job: kube-state-metrics
430+
namespace: monitoring
431+
daemonset: node-exporter
432+
severity: warning
433+
exp_annotations:
434+
message: 'DaemonSet monitoring/node-exporter has not finished or progressed for at least 15 minutes.'
435+
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck
436+
- eval_time: 34m
437+
alertname: KubeDaemonSetRolloutStuck
438+
- interval: 1m
439+
# Updated number scheduled is unequal to desired.
440+
input_series:
441+
- series: 'kube_daemonset_status_current_number_scheduled{job="kube-state-metrics",namespace="monitoring",daemonset="node-exporter"}'
442+
values: '4 4 4 4 3 4 4 4 3 4 4 4 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4'
443+
- series: 'kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics",namespace="monitoring",daemonset="node-exporter"}'
444+
values: '4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4'
445+
- series: 'kube_daemonset_status_number_misscheduled{job="kube-state-metrics",namespace="monitoring",daemonset="node-exporter"}'
446+
values: '0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'
447+
- series: 'kube_daemonset_updated_number_scheduled{job="kube-state-metrics",namespace="monitoring",daemonset="node-exporter"}'
448+
values: '4 4 0 0 0 1 1 1 1 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4'
449+
- series: 'kube_daemonset_status_number_available{job="kube-state-metrics",namespace="monitoring",daemonset="node-exporter"}'
450+
values: '4 4 4 3 3 3 4 3 3 3 4 3 3 3 4 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4'
451+
alert_rule_test:
452+
- eval_time: 32m
453+
alertname: KubeDaemonSetRolloutStuck
454+
- eval_time: 33m
455+
alertname: KubeDaemonSetRolloutStuck
456+
exp_alerts:
457+
- exp_labels:
458+
job: kube-state-metrics
459+
namespace: monitoring
460+
daemonset: node-exporter
461+
severity: warning
462+
exp_annotations:
463+
message: 'DaemonSet monitoring/node-exporter has not finished or progressed for at least 15 minutes.'
464+
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck
465+
- eval_time: 34m
466+
alertname: KubeDaemonSetRolloutStuck
467+
- interval: 1m
468+
# Number available is unequal to desired.
469+
input_series:
470+
- series: 'kube_daemonset_status_current_number_scheduled{job="kube-state-metrics",namespace="monitoring",daemonset="node-exporter"}'
471+
values: '4 4 4 4 3 4 4 4 3 4 4 4 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4'
472+
- series: 'kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics",namespace="monitoring",daemonset="node-exporter"}'
473+
values: '4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4'
474+
- series: 'kube_daemonset_status_number_misscheduled{job="kube-state-metrics",namespace="monitoring",daemonset="node-exporter"}'
475+
values: '0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'
476+
- series: 'kube_daemonset_updated_number_scheduled{job="kube-state-metrics",namespace="monitoring",daemonset="node-exporter"}'
477+
values: '4 4 0 0 0 1 1 1 1 2 2 2 2 3 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4'
478+
- series: 'kube_daemonset_status_number_available{job="kube-state-metrics",namespace="monitoring",daemonset="node-exporter"}'
479+
values: '4 4 4 3 3 3 4 3 3 3 4 3 3 3 4 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4'
480+
alert_rule_test:
481+
- eval_time: 34m
482+
alertname: KubeDaemonSetRolloutStuck
483+
- eval_time: 35m
484+
alertname: KubeDaemonSetRolloutStuck
485+
exp_alerts:
486+
- exp_labels:
487+
job: kube-state-metrics
488+
namespace: monitoring
489+
daemonset: node-exporter
490+
severity: warning
491+
exp_annotations:
492+
message: 'DaemonSet monitoring/node-exporter has not finished or progressed for at least 15 minutes.'
493+
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck
494+
- eval_time: 36m
495+
alertname: KubeDaemonSetRolloutStuck

0 commit comments

Comments
 (0)