Skip to content

Commit 5c2dd2c

Browse files
authored
Merge pull request #746 from arajkumar/rename-KubeJobCompletion
Rename KubeJobCompletion to KubeJobNotCompleted and fix alert expression
2 parents b9b9443 + b70203c commit 5c2dd2c

File tree

4 files changed

+44
-6
lines changed

4 files changed

+44
-6
lines changed

alerts/apps_alerts.libsonnet

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
{
22
_config+:: {
33
kubeStateMetricsSelector: error 'must provide selector for kube-state-metrics',
4+
kubeJobTimeoutDuration: error 'must provide value for kubeJobTimeoutDuration',
45
namespaceSelector: null,
56
prefixedNamespaceSelector: if self.namespaceSelector != null then self.namespaceSelector + ',' else '',
67
},
@@ -235,16 +236,17 @@
235236
'for': '15m',
236237
},
237238
{
238-
alert: 'KubeJobCompletion',
239+
alert: 'KubeJobNotCompleted',
239240
expr: |||
240-
kube_job_spec_completions{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s} - kube_job_status_succeeded{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s} > 0
241+
time() - max by(namespace, job_name) (kube_job_status_start_time{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
242+
and
243+
kube_job_status_active{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s} > 0) > %(kubeJobTimeoutDuration)s
241244
||| % $._config,
242-
'for': '12h',
243245
labels: {
244246
severity: 'warning',
245247
},
246248
annotations: {
247-
description: 'Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than 12 hours to complete.',
249+
description: 'Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than {{ "%(kubeJobTimeoutDuration)s" | humanizeDuration }} to complete.' % $._config,
248250
summary: 'Job did not complete in time',
249251
},
250252
},

config.libsonnet

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,5 +106,8 @@
106106
// so that specific storage alerts will not fire. With the default selector, adding a label `excluded-from-alerts: 'true'`
107107
// to the PVC will have the desired effect.
108108
pvExcludedSelector: 'label_excluded_from_alerts="true"',
109+
110+
// Default timeout, in seconds, for Kubernetes Jobs. Jobs that remain active longer than this duration trigger the KubeJobNotCompleted alert.
111+
kubeJobTimeoutDuration: 12 * 60 * 60,
109112
},
110113
}

runbook.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,8 +57,8 @@ This page collects this repository's alerts and begins the process of describing
5757
+ *Message*: `A number of pods of daemonset {{$labels.namespace}}/{{$labels.daemonset}} are running where they are not supposed to run.`
5858
+ *Severity*: warning
5959

60-
##### Alert Name: "KubeJobCompletion"
61-
+ *Message*: `Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than 1h to complete.`
60+
##### Alert Name: "KubeJobNotCompleted"
61+
+ *Message*: `Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than {{ "43200" | humanizeDuration }} to complete.`
6262
+ *Severity*: warning
6363
+ *Action*: Check the job using `kubectl describe job <job>` and look at the pod logs using `kubectl logs <pod>` for further information.
6464

tests.yaml

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -843,6 +843,39 @@ tests:
843843
description: "Job ns1/job-1597623120 failed to complete. Removing failed job after investigation should clear this alert."
844844
runbook_url: "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed"
845845

846+
- interval: 1m
847+
input_series:
848+
- series: 'kube_job_status_start_time{namespace="ns1", job="kube-state-metrics", instance="instance1", job_name="job1"}'
849+
values: '0+0x200 _x500 0+0x40'
850+
- series: 'kube_job_status_active{namespace="ns1", job="kube-state-metrics", instance="instance1", job_name="job1"}'
851+
values: '1x200 _x500 1x40'
852+
alert_rule_test:
853+
- eval_time: 6h
854+
alertname: KubeJobNotCompleted
855+
- eval_time: 12h1m
856+
alertname: KubeJobNotCompleted
857+
exp_alerts:
858+
- exp_labels:
859+
namespace: ns1
860+
job_name: job1
861+
severity: warning
862+
exp_annotations:
863+
summary: "Job did not complete in time"
864+
description: "Job ns1/job1 is taking more than 12h 0m 0s to complete."
865+
runbook_url: "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobnotcompleted"
866+
867+
- interval: 1m
868+
input_series:
869+
- series: 'kube_job_status_start_time{namespace="ns1", job="kube-state-metrics", instance="instance1", job_name="job1"}'
870+
values: '0+0x740'
871+
- series: 'kube_job_status_active{namespace="ns1", job="kube-state-metrics", instance="instance1", job_name="job1"}'
872+
values: '1+0x710 0x30'
873+
alert_rule_test:
874+
- eval_time: 6h
875+
alertname: KubeJobNotCompleted
876+
- eval_time: 12h
877+
alertname: KubeJobNotCompleted
878+
846879
- interval: 1m
847880
input_series:
848881
- series: 'apiserver_request_terminations_total{job="kube-apiserver",apiserver="kube-apiserver"}'

0 commit comments

Comments
 (0)