Skip to content

Commit b70203c

Browse files
committed
Rely on kube_job_status_start_time for alerting rather than the `for` clause
Signed-off-by: Arunprasad Rajkumar <[email protected]>
1 parent 3c3b42d commit b70203c

File tree

4 files changed

+12
-17
lines changed

4 files changed

+12
-17
lines changed

alerts/apps_alerts.libsonnet

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
{
22
_config+:: {
33
kubeStateMetricsSelector: error 'must provide selector for kube-state-metrics',
4+
kubeJobTimeoutDuration: error 'must provide value for kubeJobTimeoutDuration',
45
namespaceSelector: null,
56
prefixedNamespaceSelector: if self.namespaceSelector != null then self.namespaceSelector + ',' else '',
67
},
@@ -237,16 +238,15 @@
237238
{
238239
alert: 'KubeJobNotCompleted',
239240
expr: |||
240-
time() - kube_job_status_start_time{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
241+
time() - max by(namespace, job_name) (kube_job_status_start_time{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
241242
and
242-
kube_job_status_active{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s} > 0
243+
kube_job_status_active{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s} > 0) > %(kubeJobTimeoutDuration)s
243244
||| % $._config,
244-
'for': '12h',
245245
labels: {
246246
severity: 'warning',
247247
},
248248
annotations: {
249-
description: 'Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than 12 hours to complete.',
249+
description: 'Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than {{ "%(kubeJobTimeoutDuration)s" | humanizeDuration }} to complete.' % $._config,
250250
summary: 'Job did not complete in time',
251251
},
252252
},

config.libsonnet

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,5 +106,8 @@
106106
// so that specific storage alerts will not fire. With the default selector, adding a label `excluded-from-alerts: 'true'`
107107
// to the PVC will have the desired effect.
108108
pvExcludedSelector: 'label_excluded_from_alerts="true"',
109+
110+
// Default timeout, in seconds, for Kubernetes Jobs. Jobs that stay active longer than this duration trigger the KubeJobNotCompleted alert.
111+
kubeJobTimeoutDuration: 12 * 60 * 60,
109112
},
110113
}

runbook.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ This page collects this repository's alerts and begins the process of describing
5858
+ *Severity*: warning
5959

6060
##### Alert Name: "KubeJobNotCompleted"
61-
+ *Message*: `Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than 12h to complete.`
61+
+ *Message*: `Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than {{ "%(kubeJobTimeoutDuration)s" | humanizeDuration }} to complete.`
6262
+ *Severity*: warning
6363
+ *Action*: Check the job using `kubectl describe job <job>` and look at the pod logs using `kubectl logs <pod>` for further information.
6464

tests.yaml

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -846,30 +846,22 @@ tests:
846846
- interval: 1m
847847
input_series:
848848
- series: 'kube_job_status_start_time{namespace="ns1", job="kube-state-metrics", instance="instance1", job_name="job1"}'
849-
values: '0+0x740'
849+
values: '0+0x200 _x500 0+0x40'
850850
- series: 'kube_job_status_active{namespace="ns1", job="kube-state-metrics", instance="instance1", job_name="job1"}'
851-
values: '1+0x740'
852-
promql_expr_test:
853-
- eval_time: 12h
854-
expr: time() - kube_job_status_start_time and kube_job_status_active > 0
855-
exp_samples:
856-
- value: 43200
857-
labels: '{namespace="ns1", job="kube-state-metrics", instance="instance1", job_name="job1"}'
851+
values: '1x200 _x500 1x40'
858852
alert_rule_test:
859853
- eval_time: 6h
860854
alertname: KubeJobNotCompleted
861-
- eval_time: 12h
855+
- eval_time: 12h1m
862856
alertname: KubeJobNotCompleted
863857
exp_alerts:
864858
- exp_labels:
865859
namespace: ns1
866860
job_name: job1
867861
severity: warning
868-
instance: instance1
869-
job: kube-state-metrics
870862
exp_annotations:
871863
summary: "Job did not complete in time"
872-
description: "Job ns1/job1 is taking more than 12 hours to complete."
864+
description: "Job ns1/job1 is taking more than 12h 0m 0s to complete."
873865
runbook_url: "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobnotcompleted"
874866

875867
- interval: 1m

0 commit comments

Comments
 (0)