Skip to content

Commit d6e5fb4

Browse files
authored
Make Job PodFailurePolicy e2e tests resilient to random failures to prepare them for conformance (kubernetes#126169)
* Make JobPodFailurePolicy tests for ignore resilient to random failures * Increase parallelism and evict in parallel * Code review fixes to the job e2e tests
1 parent bfd91fb commit d6e5fb4

File tree

2 files changed

+104
-154
lines changed

2 files changed

+104
-154
lines changed

test/e2e/apps/job.go

Lines changed: 87 additions & 142 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ import (
3939
"k8s.io/client-go/tools/cache"
4040
watchtools "k8s.io/client-go/tools/watch"
4141
"k8s.io/client-go/util/retry"
42+
"k8s.io/client-go/util/workqueue"
4243
batchinternal "k8s.io/kubernetes/pkg/apis/batch"
4344
"k8s.io/kubernetes/test/e2e/framework"
4445
e2ejob "k8s.io/kubernetes/test/e2e/framework/job"
@@ -140,169 +141,110 @@ var _ = SIGDescribe("Job", func() {
140141
framework.ExpectNoError(err, "failed to ensure job failure in namespace: %s", f.Namespace.Name)
141142
})
142143

143-
ginkgo.It("should allow to use the pod failure policy to not count the failure towards the backoffLimit", func(ctx context.Context) {
144-
145-
// We set the backoffLimit to 0 so that any pod failure would trigger
146-
// job failure if not for the pod failure policy to ignore the failed
147-
// pods from counting them towards the backoffLimit. Also, we fail the
148-
// pod only once so that the job eventually succeeds.
149-
// In order to ensure a Job's pod fails once before succeeding we force
150-
// the Job's Pods to be scheduled to a single Node and use a hostPath
151-
// volume to persist data across new Pods.
152-
parallelism := int32(2)
153-
completions := int32(4)
154-
backoffLimit := int32(0)
155-
156-
ginkgo.By("Looking for a node to schedule job pod")
157-
node, err := e2enode.GetRandomReadySchedulableNode(ctx, f.ClientSet)
158-
framework.ExpectNoError(err)
159-
160-
ginkgo.By("Creating a job")
161-
job := e2ejob.NewTestJobOnNode("failOnce", "pod-failure-ignore", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit, node.Name)
162-
job.Spec.PodFailurePolicy = &batchv1.PodFailurePolicy{
163-
Rules: []batchv1.PodFailurePolicyRule{
164-
{
165-
Action: batchv1.PodFailurePolicyActionIgnore,
166-
OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
167-
Operator: batchv1.PodFailurePolicyOnExitCodesOpIn,
168-
Values: []int32{1},
169-
},
170-
},
171-
},
172-
}
173-
job, err = e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
174-
framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
175-
176-
ginkgo.By("Ensuring job reaches completions")
177-
err = e2ejob.WaitForJobComplete(ctx, f.ClientSet, f.Namespace.Name, job.Name, ptr.To(batchv1.JobReasonCompletionsReached), completions)
178-
framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", f.Namespace.Name)
179-
})
180-
181144
/*
182-
Testname: Ensure pod failure policy allows to ignore failure for an evicted pod; matching on the exit code
183-
Description: This test is using an indexed job. The pod corresponding to the 0th index
184-
creates a marker file on the host and runs 'forever' until evicted. We use
185-
the non-0-indexed pods to determine if the marker file is already
186-
created by the 0th indexed pod - the non-0-indexed pods fail and restart
187-
until the marker file is created (their potential failures are ignored
188-
based on the exit code). Once the marker file is created the 0th indexed
189-
pod is evicted (DisruptionTarget condition is added in the process),
190-
after restart it runs to successful completion.
191-
Steps:
192-
1. Select a node to run all Job's pods to ensure the host marker file is accessible by all pods
193-
2. Create the indexed job with pod failure policy which ignores failed pods with 137 exit code
194-
3. Await for all non-0-indexed pods to succeed to ensure the marker file is created by the 0-indexed pod
195-
4. Make sure the 0-indexed pod is running
196-
5. Evict the 0-indexed pod, the failure is ignored as it matches the pod failure policy
197-
6. Await for the job to successfully complete
145+
Testname: Ensure pod failure policy allows to ignore failure matching on the exit code
146+
Description: This test is using an indexed job. The pod corresponding to each index
147+
creates a marker file on the host and runs 'forever' until evicted. Once
148+
the marker file is created the pod succeeds seeing it on restart. Thus,
149+
we trigger one failure per index due to eviction, so the Job would be
150+
marked as failed, if not for the ignore rule matching on exit codes.
198151
*/
199-
ginkgo.It("should allow to use a pod failure policy to ignore failure for an evicted pod; matching on the exit code", func(ctx context.Context) {
200-
// We set the backoffLimit to 0 so that any pod failure would trigger
201-
// job failure if not for the pod failure policy to ignore the failed
202-
// pods from counting them towards the backoffLimit.
203-
parallelism := int32(2)
204-
completions := int32(4)
205-
backoffLimit := int32(0)
152+
ginkgo.It("should allow to use a pod failure policy to ignore failure matching on exit code", func(ctx context.Context) {
153+
// We set the backoffLimit = numPods-1 so that we can tolerate random
154+
// failures (like OutOfPods from kubelet). Yet, the Job would fail if the
155+
// pod failures were not ignored.
156+
numPods := 3
157+
parallelism := int32(numPods)
158+
completions := int32(numPods)
159+
backoffLimit := int32(numPods) - 1
206160

207161
ginkgo.By("Looking for a node to schedule job pods")
208162
node, err := e2enode.GetRandomReadySchedulableNode(ctx, f.ClientSet)
209163
framework.ExpectNoError(err)
210164

211165
ginkgo.By("Creating a job")
212-
job := e2ejob.NewTestJobOnNode("notTerminateOnce", "evicted-pod-ignore-on-exit-code", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit, node.Name)
166+
job := e2ejob.NewTestJobOnNode("notTerminateOncePerIndex", "evicted-pod-ignore-on-exit-code", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit, node.Name)
213167
job.Spec.CompletionMode = ptr.To(batchv1.IndexedCompletion)
214168
job.Spec.PodFailurePolicy = &batchv1.PodFailurePolicy{
215169
Rules: []batchv1.PodFailurePolicyRule{
216170
{
217-
// Ignore failures of the non 0-indexed pods which fail until the marker file is created
218-
// And the 137 in the 0-indexed pod due to eviction.
171+
// Ignore the pod failure caused by the eviction based on the
172+
// exit code corresponding to SIGKILL.
219173
Action: batchv1.PodFailurePolicyActionIgnore,
220174
OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
221175
Operator: batchv1.PodFailurePolicyOnExitCodesOpIn,
222-
Values: []int32{1, 137},
176+
Values: []int32{137},
223177
},
224178
},
225179
},
226180
}
227181
job, err = e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
228182
framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
229183

230-
ginkgo.By("Awaiting for all non 0-indexed pods to succeed to ensure the marker file is created")
231-
err = e2ejob.WaitForJobPodsSucceeded(ctx, f.ClientSet, f.Namespace.Name, job.Name, completions-1)
232-
framework.ExpectNoError(err, "failed to await for all non 0-indexed pods to succeed for job: %s/%s", job.Name, job.Namespace)
233-
234-
ginkgo.By("Awaiting for the 0-indexed pod to be running")
235-
err = e2ejob.WaitForJobPodsRunning(ctx, f.ClientSet, f.Namespace.Name, job.Name, 1)
236-
framework.ExpectNoError(err, "failed to await for the 0-indexed pod to be running for the job: %s/%s", job.Name, job.Namespace)
184+
ginkgo.By("Waiting for all the pods to be ready")
185+
err = e2ejob.WaitForJobReady(ctx, f.ClientSet, f.Namespace.Name, job.Name, ptr.To(int32(numPods)))
186+
framework.ExpectNoError(err, "failed to await for all pods to be ready for job: %s/%s", job.Name, job.Namespace)
237187

188+
ginkgo.By("Fetch all running pods")
238189
pods, err := e2ejob.GetAllRunningJobPods(ctx, f.ClientSet, f.Namespace.Name, job.Name)
239190
framework.ExpectNoError(err, "failed to get running pods for the job: %s/%s", job.Name, job.Namespace)
240-
gomega.Expect(pods).To(gomega.HaveLen(1), "Exactly one running pod is expected")
241-
pod := pods[0]
242-
ginkgo.By(fmt.Sprintf("Evicting the running pod: %s/%s", pod.Name, pod.Namespace))
243-
evictTarget := &policyv1.Eviction{
244-
ObjectMeta: metav1.ObjectMeta{
245-
Name: pod.Name,
246-
Namespace: pod.Namespace,
247-
},
248-
}
249-
err = f.ClientSet.CoreV1().Pods(pod.Namespace).EvictV1(ctx, evictTarget)
250-
framework.ExpectNoError(err, "failed to evict the pod: %s/%s", pod.Name, pod.Namespace)
191+
gomega.Expect(pods).To(gomega.HaveLen(numPods), "Number of running pods doesn't match parallelism")
192+
193+
ginkgo.By("Evict all the Pods")
194+
workqueue.ParallelizeUntil(ctx, numPods, numPods, func(index int) {
195+
defer ginkgo.GinkgoRecover()
196+
197+
pod := pods[index]
198+
ginkgo.By(fmt.Sprintf("Evicting the running pod: %s/%s", pod.Name, pod.Namespace))
199+
evictTarget := &policyv1.Eviction{
200+
ObjectMeta: metav1.ObjectMeta{
201+
Name: pod.Name,
202+
Namespace: pod.Namespace,
203+
},
204+
}
205+
err = f.ClientSet.CoreV1().Pods(pod.Namespace).EvictV1(ctx, evictTarget)
206+
framework.ExpectNoError(err, "failed to evict the pod: %s/%s", pod.Name, pod.Namespace)
251207

252-
ginkgo.By(fmt.Sprintf("Awaiting for the pod: %s/%s to be deleted", pod.Name, pod.Namespace))
253-
err = e2epod.WaitForPodNotFoundInNamespace(ctx, f.ClientSet, pod.Name, pod.Namespace, f.Timeouts.PodDelete)
254-
framework.ExpectNoError(err, "failed to await for the pod to be deleted: %s/%s", pod.Name, pod.Namespace)
208+
ginkgo.By(fmt.Sprintf("Awaiting for the pod: %s/%s to be deleted", pod.Name, pod.Namespace))
209+
err = e2epod.WaitForPodNotFoundInNamespace(ctx, f.ClientSet, pod.Name, pod.Namespace, f.Timeouts.PodDelete)
210+
framework.ExpectNoError(err, "failed to await for all pods to be deleted: %s/%s", pod.Name, pod.Namespace)
211+
})
255212

256213
ginkgo.By("Ensuring job reaches completions")
257-
err = e2ejob.WaitForJobComplete(ctx, f.ClientSet, f.Namespace.Name, job.Name, ptr.To(batchv1.JobReasonCompletionsReached), completions)
214+
err = e2ejob.WaitForJobComplete(ctx, f.ClientSet, f.Namespace.Name, job.Name, nil, completions)
258215
framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", f.Namespace.Name)
259216
})
260217

261218
/*
262-
Testname: Ensure pod failure policy allows to ignore failure for an evicted pod; matching on the DisruptionTarget condition
263-
Description: This test is using an indexed job. The pod corresponding to the 0th index
264-
creates a marker file on the host and runs 'forever' until evicted. We use
265-
the non-0-indexed pods to determine if the marker file is already
266-
created by the 0th indexed pod - the non-0-indexed pods fail and restart
267-
until the marker file is created (their potential failures are ignored
268-
based on the exit code). Once the marker file is created the 0th indexed
269-
pod is evicted (DisruptionTarget condition is added in the process),
270-
after restart it runs to successful completion.
271-
Steps:
272-
1. Select a node to run all Job's pods to ensure the host marker file is accessible by all pods
273-
2. Create the indexed job with pod failure policy which ignores failed pods with DisruptionTarget condition
274-
3. Await for all non-0-indexed pods to succeed to ensure the marker file is created by the 0-indexed pod
275-
4. Make sure the 0-indexed pod is running
276-
5. Evict the 0-indexed pod, the failure is ignored as it matches the pod failure policy
277-
6. Await for the job to successfully complete
219+
Testname: Ensure pod failure policy allows to ignore failure matching on the DisruptionTarget condition
220+
Description: This test is using an indexed job. The pod corresponding to each index
221+
creates a marker file on the host and runs 'forever' until evicted. Once
222+
the marker file is created the pod succeeds seeing it on restart. Thus,
223+
we trigger one failure per index due to eviction (DisruptionTarget
224+
condition is added in the process). The Job would be marked as failed,
225+
if not for the ignore rule matching on the DisruptionTarget condition.
278226
*/
279-
ginkgo.It("should allow to use a pod failure policy to ignore failure for an evicted pod; matching on the DisruptionTarget condition", func(ctx context.Context) {
280-
// We set the backoffLimit to 0 so that any pod failure would trigger
281-
// job failure if not for the pod failure policy to ignore the failed
282-
// pods from counting them towards the backoffLimit.
283-
parallelism := int32(2)
284-
completions := int32(4)
285-
backoffLimit := int32(0)
227+
ginkgo.It("should allow to use a pod failure policy to ignore failure matching on DisruptionTarget condition", func(ctx context.Context) {
228+
// We set the backoffLimit = numPods-1 so that we can tolerate random
229+
// failures (like OutOfPods from kubelet). Yet, the Job would fail if the
230+
// pod failures were not be ignored.
231+
numPods := 3
232+
parallelism := int32(numPods)
233+
completions := int32(numPods)
234+
backoffLimit := int32(numPods) - 1
286235

287236
ginkgo.By("Looking for a node to schedule job pods")
288237
node, err := e2enode.GetRandomReadySchedulableNode(ctx, f.ClientSet)
289238
framework.ExpectNoError(err)
290239

291240
ginkgo.By("Creating a job")
292-
job := e2ejob.NewTestJobOnNode("notTerminateOnce", "evicted-pod-ignore-on-disruption-condition", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit, node.Name)
241+
job := e2ejob.NewTestJobOnNode("notTerminateOncePerIndex", "evicted-pod-ignore-on-disruption-condition", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit, node.Name)
293242
job.Spec.CompletionMode = ptr.To(batchv1.IndexedCompletion)
294243
job.Spec.PodFailurePolicy = &batchv1.PodFailurePolicy{
295244
Rules: []batchv1.PodFailurePolicyRule{
296245
{
297-
// Ignore failures of the non 0-indexed pods which fail until the marker file is created
298-
Action: batchv1.PodFailurePolicyActionIgnore,
299-
OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
300-
Operator: batchv1.PodFailurePolicyOnExitCodesOpIn,
301-
Values: []int32{1},
302-
},
303-
},
304-
{
305-
// Ignore the pod failure caused by the eviction
246+
// Ignore the pod failure caused by the eviction based on the
247+
// DisruptionTarget condition
306248
Action: batchv1.PodFailurePolicyActionIgnore,
307249
OnPodConditions: []batchv1.PodFailurePolicyOnPodConditionsPattern{
308250
{
@@ -316,34 +258,37 @@ var _ = SIGDescribe("Job", func() {
316258
job, err = e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
317259
framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
318260

319-
ginkgo.By("Awaiting for all non 0-indexed pods to succeed to ensure the marker file is created")
320-
err = e2ejob.WaitForJobPodsSucceeded(ctx, f.ClientSet, f.Namespace.Name, job.Name, completions-1)
321-
framework.ExpectNoError(err, "failed to await for all non 0-indexed pods to succeed for job: %s/%s", job.Name, job.Namespace)
322-
323-
ginkgo.By("Awaiting for the 0-indexed pod to be running")
324-
err = e2ejob.WaitForJobPodsRunning(ctx, f.ClientSet, f.Namespace.Name, job.Name, 1)
325-
framework.ExpectNoError(err, "failed to await for the 0-indexed pod to be running for the job: %s/%s", job.Name, job.Namespace)
261+
ginkgo.By("Waiting for all the pods to be ready")
262+
err = e2ejob.WaitForJobReady(ctx, f.ClientSet, f.Namespace.Name, job.Name, ptr.To(int32(numPods)))
263+
framework.ExpectNoError(err, "failed to await for all pods to be ready for job: %s/%s", job.Name, job.Namespace)
326264

265+
ginkgo.By("Fetch all running pods")
327266
pods, err := e2ejob.GetAllRunningJobPods(ctx, f.ClientSet, f.Namespace.Name, job.Name)
328267
framework.ExpectNoError(err, "failed to get running pods for the job: %s/%s", job.Name, job.Namespace)
329-
gomega.Expect(pods).To(gomega.HaveLen(1), "Exactly one running pod is expected")
330-
pod := pods[0]
331-
ginkgo.By(fmt.Sprintf("Evicting the running pod: %s/%s", pod.Name, pod.Namespace))
332-
evictTarget := &policyv1.Eviction{
333-
ObjectMeta: metav1.ObjectMeta{
334-
Name: pod.Name,
335-
Namespace: pod.Namespace,
336-
},
337-
}
338-
err = f.ClientSet.CoreV1().Pods(pod.Namespace).EvictV1(ctx, evictTarget)
339-
framework.ExpectNoError(err, "failed to evict the pod: %s/%s", pod.Name, pod.Namespace)
268+
gomega.Expect(pods).To(gomega.HaveLen(numPods), "Number of running pods doesn't match parallelism")
269+
270+
ginkgo.By("Evict all the Pods")
271+
workqueue.ParallelizeUntil(ctx, numPods, numPods, func(index int) {
272+
defer ginkgo.GinkgoRecover()
273+
274+
pod := pods[index]
275+
ginkgo.By(fmt.Sprintf("Evicting the running pod: %s/%s", pod.Name, pod.Namespace))
276+
evictTarget := &policyv1.Eviction{
277+
ObjectMeta: metav1.ObjectMeta{
278+
Name: pod.Name,
279+
Namespace: pod.Namespace,
280+
},
281+
}
282+
err = f.ClientSet.CoreV1().Pods(pod.Namespace).EvictV1(ctx, evictTarget)
283+
framework.ExpectNoError(err, "failed to evict the pod: %s/%s", pod.Name, pod.Namespace)
340284

341-
ginkgo.By(fmt.Sprintf("Awaiting for the pod: %s/%s to be deleted", pod.Name, pod.Namespace))
342-
err = e2epod.WaitForPodNotFoundInNamespace(ctx, f.ClientSet, pod.Name, pod.Namespace, f.Timeouts.PodDelete)
343-
framework.ExpectNoError(err, "failed to await for the pod to be deleted: %s/%s", pod.Name, pod.Namespace)
285+
ginkgo.By(fmt.Sprintf("Awaiting for the pod: %s/%s to be deleted", pod.Name, pod.Namespace))
286+
err = e2epod.WaitForPodNotFoundInNamespace(ctx, f.ClientSet, pod.Name, pod.Namespace, f.Timeouts.PodDelete)
287+
framework.ExpectNoError(err, "failed to await for all pods to be deleted: %s/%s", pod.Name, pod.Namespace)
288+
})
344289

345290
ginkgo.By("Ensuring job reaches completions")
346-
err = e2ejob.WaitForJobComplete(ctx, f.ClientSet, f.Namespace.Name, job.Name, ptr.To(batchv1.JobReasonCompletionsReached), completions)
291+
err = e2ejob.WaitForJobComplete(ctx, f.ClientSet, f.Namespace.Name, job.Name, nil, completions)
347292
framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", f.Namespace.Name)
348293
})
349294

test/e2e/framework/job/fixtures.go

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -136,27 +136,32 @@ func NewTestJobOnNode(behavior, name string, rPol v1.RestartPolicy, parallelism,
136136
exit 1
137137
fi
138138
`}
139-
case "notTerminateOnce":
140-
// Do not terminate the 0-indexed pod in the first run and
141-
// succeed the second time. Fail the non-0-indexed pods until
142-
// the marker file is created by the 0-indexed pod. The fact that
143-
// the non-0-indexed pods are succeeded is used to determine that the
144-
// 0th indexed pod already created the marker file.
139+
case "notTerminateOncePerIndex":
140+
// Use marker files per index. If the given marker file already exists
141+
// then terminate successfully. Otherwise create the marker file and
142+
// sleep "forever" awaiting delete request.
145143
setupHostPathDirectory(job)
146144
job.Spec.Template.Spec.TerminationGracePeriodSeconds = ptr.To(int64(1))
147145
job.Spec.Template.Spec.Containers[0].Command = []string{"/bin/sh", "-c"}
148146
job.Spec.Template.Spec.Containers[0].Args = []string{`
149-
if [[ -r /data/foo ]]
147+
if [[ -r /data/foo-$JOB_COMPLETION_INDEX ]]
150148
then
151149
exit 0
152-
elif [[ $JOB_COMPLETION_INDEX -eq 0 ]]
153-
then
154-
touch /data/foo
155-
sleep 1000000
156150
else
157-
exit 1
151+
touch /data/foo-$JOB_COMPLETION_INDEX
152+
sleep 1000000
158153
fi
159154
`}
155+
// Add readiness probe to allow the test client to check if the marker
156+
// file is already created before evicting the Pod.
157+
job.Spec.Template.Spec.Containers[0].ReadinessProbe = &v1.Probe{
158+
PeriodSeconds: 1,
159+
ProbeHandler: v1.ProbeHandler{
160+
Exec: &v1.ExecAction{
161+
Command: []string{"/bin/sh", "-c", "cat /data/foo-$JOB_COMPLETION_INDEX"},
162+
},
163+
},
164+
}
160165
}
161166
return job
162167
}

0 commit comments

Comments
 (0)