@@ -39,6 +39,7 @@ import (
    "k8s.io/client-go/tools/cache"
    watchtools "k8s.io/client-go/tools/watch"
    "k8s.io/client-go/util/retry"
+    "k8s.io/client-go/util/workqueue"
    batchinternal "k8s.io/kubernetes/pkg/apis/batch"
    "k8s.io/kubernetes/test/e2e/framework"
    e2ejob "k8s.io/kubernetes/test/e2e/framework/job"
@@ -140,169 +141,110 @@ var _ = SIGDescribe("Job", func() {
        framework.ExpectNoError(err, "failed to ensure job failure in namespace: %s", f.Namespace.Name)
    })

-    ginkgo.It("should allow to use the pod failure policy to not count the failure towards the backoffLimit", func(ctx context.Context) {
-
-        // We set the backoffLimit to 0 so that any pod failure would trigger
-        // job failure if not for the pod failure policy to ignore the failed
-        // pods from counting them towards the backoffLimit. Also, we fail the
-        // pod only once so that the job eventually succeeds.
-        // In order to ensure a Job's pod fails once before succeeding we force
-        // the Job's Pods to be scheduled to a single Node and use a hostPath
-        // volume to persist data across new Pods.
-        parallelism := int32(2)
-        completions := int32(4)
-        backoffLimit := int32(0)
-
-        ginkgo.By("Looking for a node to schedule job pod")
-        node, err := e2enode.GetRandomReadySchedulableNode(ctx, f.ClientSet)
-        framework.ExpectNoError(err)
-
-        ginkgo.By("Creating a job")
-        job := e2ejob.NewTestJobOnNode("failOnce", "pod-failure-ignore", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit, node.Name)
-        job.Spec.PodFailurePolicy = &batchv1.PodFailurePolicy{
-            Rules: []batchv1.PodFailurePolicyRule{
-                {
-                    Action: batchv1.PodFailurePolicyActionIgnore,
-                    OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
-                        Operator: batchv1.PodFailurePolicyOnExitCodesOpIn,
-                        Values:   []int32{1},
-                    },
-                },
-            },
-        }
-        job, err = e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
-        framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
-
-        ginkgo.By("Ensuring job reaches completions")
-        err = e2ejob.WaitForJobComplete(ctx, f.ClientSet, f.Namespace.Name, job.Name, ptr.To(batchv1.JobReasonCompletionsReached), completions)
-        framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", f.Namespace.Name)
-    })
-
    /*
-        Testname: Ensure pod failure policy allows to ignore failure for an evicted pod; matching on the exit code
-        Description: This test is using an indexed job. The pod corresponding to the 0th index
-        creates a marker file on the host and runs 'forever' until evicted. We use
-        the non-0-indexed pods to determine if the marker file is already
-        created by the 0th indexed pod - the non-0-indexed pods fail and restart
-        until the marker file is created (their potential failures are ignored
-        based on the exit code). Once the marker file is created the 0th indexed
-        pod is evicted (DisruptionTarget condition is added in the process),
-        after restart it runs to successful completion.
-        Steps:
-        1. Select a node to run all Job's pods to ensure the host marker file is accessible by all pods
-        2. Create the indexed job with pod failure policy which ignores failed pods with 137 exit code
-        3. Await for all non-0-indexed pods to succeed to ensure the marker file is created by the 0-indexed pod
-        4. Make sure the 0-indexed pod is running
-        5. Evict the 0-indexed pod, the failure is ignored as it matches the pod failure policy
-        6. Await for the job to successfully complete
+        Testname: Ensure pod failure policy allows to ignore failure matching on the exit code
+        Description: This test is using an indexed job. The pod corresponding to each index
+        creates a marker file on the host and runs 'forever' until evicted. Once
+        the marker file is created the pod succeeds seeing it on restart. Thus,
+        we trigger one failure per index due to eviction, so the Job would be
+        marked as failed, if not for the ignore rule matching on exit codes.
    */
- ginkgo .It ("should allow to use a pod failure policy to ignore failure for an evicted pod; matching on the exit code" , func (ctx context.Context ) {
200
- // We set the backoffLimit to 0 so that any pod failure would trigger
201
- // job failure if not for the pod failure policy to ignore the failed
202
- // pods from counting them towards the backoffLimit.
203
- parallelism := int32 (2 )
204
- completions := int32 (4 )
205
- backoffLimit := int32 (0 )
+    ginkgo.It("should allow to use a pod failure policy to ignore failure matching on exit code", func(ctx context.Context) {
+        // We set the backoffLimit = numPods-1 so that we can tolerate random
+        // failures (like OutOfPods from kubelet). Yet, the Job would fail if the
+        // pod failures were not ignored.
+        numPods := 3
+        parallelism := int32(numPods)
+        completions := int32(numPods)
+        backoffLimit := int32(numPods) - 1

ginkgo .By ("Looking for a node to schedule job pods" )
208
162
node , err := e2enode .GetRandomReadySchedulableNode (ctx , f .ClientSet )
209
163
framework .ExpectNoError (err )
210
164
211
165
ginkgo .By ("Creating a job" )
212
- job := e2ejob .NewTestJobOnNode ("notTerminateOnce " , "evicted-pod-ignore-on-exit-code" , v1 .RestartPolicyNever , parallelism , completions , nil , backoffLimit , node .Name )
166
+ job := e2ejob .NewTestJobOnNode ("notTerminateOncePerIndex " , "evicted-pod-ignore-on-exit-code" , v1 .RestartPolicyNever , parallelism , completions , nil , backoffLimit , node .Name )
213
167
job .Spec .CompletionMode = ptr .To (batchv1 .IndexedCompletion )
214
168
job .Spec .PodFailurePolicy = & batchv1.PodFailurePolicy {
215
169
Rules : []batchv1.PodFailurePolicyRule {
216
170
{
217
- // Ignore failures of the non 0-indexed pods which fail until the marker file is created
218
- // And the 137 in the 0-indexed pod due to eviction .
171
+ // Ignore the pod failure caused by the eviction based on the
172
+ // exit code corresponding to SIGKILL .
219
173
Action : batchv1 .PodFailurePolicyActionIgnore ,
220
174
OnExitCodes : & batchv1.PodFailurePolicyOnExitCodesRequirement {
221
175
Operator : batchv1 .PodFailurePolicyOnExitCodesOpIn ,
222
- Values : []int32 {1 , 137 },
176
+ Values : []int32 {137 },
223
177
},
224
178
},
225
179
},
226
180
}
227
181
job , err = e2ejob .CreateJob (ctx , f .ClientSet , f .Namespace .Name , job )
228
182
framework .ExpectNoError (err , "failed to create job in namespace: %s" , f .Namespace .Name )
229
183
230
- ginkgo .By ("Awaiting for all non 0-indexed pods to succeed to ensure the marker file is created" )
231
- err = e2ejob .WaitForJobPodsSucceeded (ctx , f .ClientSet , f .Namespace .Name , job .Name , completions - 1 )
232
- framework .ExpectNoError (err , "failed to await for all non 0-indexed pods to succeed for job: %s/%s" , job .Name , job .Namespace )
233
-
234
- ginkgo .By ("Awaiting for the 0-indexed pod to be running" )
235
- err = e2ejob .WaitForJobPodsRunning (ctx , f .ClientSet , f .Namespace .Name , job .Name , 1 )
236
- framework .ExpectNoError (err , "failed to await for the 0-indexed pod to be running for the job: %s/%s" , job .Name , job .Namespace )
184
+ ginkgo .By ("Waiting for all the pods to be ready" )
185
+ err = e2ejob .WaitForJobReady (ctx , f .ClientSet , f .Namespace .Name , job .Name , ptr .To (int32 (numPods )))
186
+ framework .ExpectNoError (err , "failed to await for all pods to be ready for job: %s/%s" , job .Name , job .Namespace )
237
187
188
+ ginkgo .By ("Fetch all running pods" )
238
189
pods , err := e2ejob .GetAllRunningJobPods (ctx , f .ClientSet , f .Namespace .Name , job .Name )
239
190
framework .ExpectNoError (err , "failed to get running pods for the job: %s/%s" , job .Name , job .Namespace )
240
- gomega .Expect (pods ).To (gomega .HaveLen (1 ), "Exactly one running pod is expected" )
241
- pod := pods [0 ]
242
- ginkgo .By (fmt .Sprintf ("Evicting the running pod: %s/%s" , pod .Name , pod .Namespace ))
243
- evictTarget := & policyv1.Eviction {
244
- ObjectMeta : metav1.ObjectMeta {
245
- Name : pod .Name ,
246
- Namespace : pod .Namespace ,
247
- },
248
- }
249
- err = f .ClientSet .CoreV1 ().Pods (pod .Namespace ).EvictV1 (ctx , evictTarget )
250
- framework .ExpectNoError (err , "failed to evict the pod: %s/%s" , pod .Name , pod .Namespace )
191
+ gomega .Expect (pods ).To (gomega .HaveLen (numPods ), "Number of running pods doesn't match parallelism" )
192
+
193
+ ginkgo .By ("Evict all the Pods" )
194
+ workqueue .ParallelizeUntil (ctx , numPods , numPods , func (index int ) {
195
+ defer ginkgo .GinkgoRecover ()
196
+
197
+ pod := pods [index ]
198
+ ginkgo .By (fmt .Sprintf ("Evicting the running pod: %s/%s" , pod .Name , pod .Namespace ))
199
+ evictTarget := & policyv1.Eviction {
200
+ ObjectMeta : metav1.ObjectMeta {
201
+ Name : pod .Name ,
202
+ Namespace : pod .Namespace ,
203
+ },
204
+ }
205
+ err = f .ClientSet .CoreV1 ().Pods (pod .Namespace ).EvictV1 (ctx , evictTarget )
206
+ framework .ExpectNoError (err , "failed to evict the pod: %s/%s" , pod .Name , pod .Namespace )
251
207
252
- ginkgo .By (fmt .Sprintf ("Awaiting for the pod: %s/%s to be deleted" , pod .Name , pod .Namespace ))
253
- err = e2epod .WaitForPodNotFoundInNamespace (ctx , f .ClientSet , pod .Name , pod .Namespace , f .Timeouts .PodDelete )
254
- framework .ExpectNoError (err , "failed to await for the pod to be deleted: %s/%s" , pod .Name , pod .Namespace )
208
+ ginkgo .By (fmt .Sprintf ("Awaiting for the pod: %s/%s to be deleted" , pod .Name , pod .Namespace ))
209
+ err = e2epod .WaitForPodNotFoundInNamespace (ctx , f .ClientSet , pod .Name , pod .Namespace , f .Timeouts .PodDelete )
210
+ framework .ExpectNoError (err , "failed to await for all pods to be deleted: %s/%s" , pod .Name , pod .Namespace )
211
+ })
255
212
256
213
ginkgo .By ("Ensuring job reaches completions" )
257
- err = e2ejob .WaitForJobComplete (ctx , f .ClientSet , f .Namespace .Name , job .Name , ptr . To ( batchv1 . JobReasonCompletionsReached ) , completions )
214
+ err = e2ejob .WaitForJobComplete (ctx , f .ClientSet , f .Namespace .Name , job .Name , nil , completions )
258
215
framework .ExpectNoError (err , "failed to ensure job completion in namespace: %s" , f .Namespace .Name )
259
216
})
260
217
261
218
/*
-        Testname: Ensure pod failure policy allows to ignore failure for an evicted pod; matching on the DisruptionTarget condition
-        Description: This test is using an indexed job. The pod corresponding to the 0th index
-        creates a marker file on the host and runs 'forever' until evicted. We use
-        the non-0-indexed pods to determine if the marker file is already
-        created by the 0th indexed pod - the non-0-indexed pods fail and restart
-        until the marker file is created (their potential failures are ignored
-        based on the exit code). Once the marker file is created the 0th indexed
-        pod is evicted (DisruptionTarget condition is added in the process),
-        after restart it runs to successful completion.
-        Steps:
-        1. Select a node to run all Job's pods to ensure the host marker file is accessible by all pods
-        2. Create the indexed job with pod failure policy which ignores failed pods with DisruptionTarget condition
-        3. Await for all non-0-indexed pods to succeed to ensure the marker file is created by the 0-indexed pod
-        4. Make sure the 0-indexed pod is running
-        5. Evict the 0-indexed pod, the failure is ignored as it matches the pod failure policy
-        6. Await for the job to successfully complete
+        Testname: Ensure pod failure policy allows to ignore failure matching on the DisruptionTarget condition
+        Description: This test is using an indexed job. The pod corresponding to each index
+        creates a marker file on the host and runs 'forever' until evicted. Once
+        the marker file is created the pod succeeds seeing it on restart. Thus,
+        we trigger one failure per index due to eviction (DisruptionTarget
+        condition is added in the process). The Job would be marked as failed,
+        if not for the ignore rule matching on the DisruptionTarget condition.
    */
-    ginkgo.It("should allow to use a pod failure policy to ignore failure for an evicted pod; matching on the DisruptionTarget condition", func(ctx context.Context) {
-        // We set the backoffLimit to 0 so that any pod failure would trigger
-        // job failure if not for the pod failure policy to ignore the failed
-        // pods from counting them towards the backoffLimit.
-        parallelism := int32(2)
-        completions := int32(4)
-        backoffLimit := int32(0)
+    ginkgo.It("should allow to use a pod failure policy to ignore failure matching on DisruptionTarget condition", func(ctx context.Context) {
+        // We set the backoffLimit = numPods-1 so that we can tolerate random
+        // failures (like OutOfPods from kubelet). Yet, the Job would fail if the
+        // pod failures were not ignored.
+        numPods := 3
+        parallelism := int32(numPods)
+        completions := int32(numPods)
+        backoffLimit := int32(numPods) - 1

        ginkgo.By("Looking for a node to schedule job pods")
        node, err := e2enode.GetRandomReadySchedulableNode(ctx, f.ClientSet)
        framework.ExpectNoError(err)

        ginkgo.By("Creating a job")
-        job := e2ejob.NewTestJobOnNode("notTerminateOnce", "evicted-pod-ignore-on-disruption-condition", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit, node.Name)
+        job := e2ejob.NewTestJobOnNode("notTerminateOncePerIndex", "evicted-pod-ignore-on-disruption-condition", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit, node.Name)
        job.Spec.CompletionMode = ptr.To(batchv1.IndexedCompletion)
        job.Spec.PodFailurePolicy = &batchv1.PodFailurePolicy{
            Rules: []batchv1.PodFailurePolicyRule{
                {
-                    // Ignore failures of the non 0-indexed pods which fail until the marker file is created
-                    Action: batchv1.PodFailurePolicyActionIgnore,
-                    OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
-                        Operator: batchv1.PodFailurePolicyOnExitCodesOpIn,
-                        Values:   []int32{1},
-                    },
-                },
-                {
-                    // Ignore the pod failure caused by the eviction
+                    // Ignore the pod failure caused by the eviction based on the
+                    // DisruptionTarget condition
                    Action: batchv1.PodFailurePolicyActionIgnore,
                    OnPodConditions: []batchv1.PodFailurePolicyOnPodConditionsPattern{
                        {
@@ -316,34 +258,37 @@ var _ = SIGDescribe("Job", func() {
        job, err = e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
        framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)

-        ginkgo.By("Awaiting for all non 0-indexed pods to succeed to ensure the marker file is created")
-        err = e2ejob.WaitForJobPodsSucceeded(ctx, f.ClientSet, f.Namespace.Name, job.Name, completions-1)
-        framework.ExpectNoError(err, "failed to await for all non 0-indexed pods to succeed for job: %s/%s", job.Name, job.Namespace)
-
-        ginkgo.By("Awaiting for the 0-indexed pod to be running")
-        err = e2ejob.WaitForJobPodsRunning(ctx, f.ClientSet, f.Namespace.Name, job.Name, 1)
-        framework.ExpectNoError(err, "failed to await for the 0-indexed pod to be running for the job: %s/%s", job.Name, job.Namespace)
+        ginkgo.By("Waiting for all the pods to be ready")
+        err = e2ejob.WaitForJobReady(ctx, f.ClientSet, f.Namespace.Name, job.Name, ptr.To(int32(numPods)))
+        framework.ExpectNoError(err, "failed to await for all pods to be ready for job: %s/%s", job.Name, job.Namespace)

+        ginkgo.By("Fetch all running pods")
        pods, err := e2ejob.GetAllRunningJobPods(ctx, f.ClientSet, f.Namespace.Name, job.Name)
        framework.ExpectNoError(err, "failed to get running pods for the job: %s/%s", job.Name, job.Namespace)
-        gomega.Expect(pods).To(gomega.HaveLen(1), "Exactly one running pod is expected")
-        pod := pods[0]
-        ginkgo.By(fmt.Sprintf("Evicting the running pod: %s/%s", pod.Name, pod.Namespace))
-        evictTarget := &policyv1.Eviction{
-            ObjectMeta: metav1.ObjectMeta{
-                Name:      pod.Name,
-                Namespace: pod.Namespace,
-            },
-        }
-        err = f.ClientSet.CoreV1().Pods(pod.Namespace).EvictV1(ctx, evictTarget)
-        framework.ExpectNoError(err, "failed to evict the pod: %s/%s", pod.Name, pod.Namespace)
+        gomega.Expect(pods).To(gomega.HaveLen(numPods), "Number of running pods doesn't match parallelism")
+
+        ginkgo.By("Evict all the Pods")
+        workqueue.ParallelizeUntil(ctx, numPods, numPods, func(index int) {
+            defer ginkgo.GinkgoRecover()
+
+            pod := pods[index]
+            ginkgo.By(fmt.Sprintf("Evicting the running pod: %s/%s", pod.Name, pod.Namespace))
+            evictTarget := &policyv1.Eviction{
+                ObjectMeta: metav1.ObjectMeta{
+                    Name:      pod.Name,
+                    Namespace: pod.Namespace,
+                },
+            }
+            err = f.ClientSet.CoreV1().Pods(pod.Namespace).EvictV1(ctx, evictTarget)
+            framework.ExpectNoError(err, "failed to evict the pod: %s/%s", pod.Name, pod.Namespace)

-        ginkgo.By(fmt.Sprintf("Awaiting for the pod: %s/%s to be deleted", pod.Name, pod.Namespace))
-        err = e2epod.WaitForPodNotFoundInNamespace(ctx, f.ClientSet, pod.Name, pod.Namespace, f.Timeouts.PodDelete)
-        framework.ExpectNoError(err, "failed to await for the pod to be deleted: %s/%s", pod.Name, pod.Namespace)
+            ginkgo.By(fmt.Sprintf("Awaiting for the pod: %s/%s to be deleted", pod.Name, pod.Namespace))
+            err = e2epod.WaitForPodNotFoundInNamespace(ctx, f.ClientSet, pod.Name, pod.Namespace, f.Timeouts.PodDelete)
+            framework.ExpectNoError(err, "failed to await for all pods to be deleted: %s/%s", pod.Name, pod.Namespace)
+        })

        ginkgo.By("Ensuring job reaches completions")
-        err = e2ejob.WaitForJobComplete(ctx, f.ClientSet, f.Namespace.Name, job.Name, ptr.To(batchv1.JobReasonCompletionsReached), completions)
+        err = e2ejob.WaitForJobComplete(ctx, f.ClientSet, f.Namespace.Name, job.Name, nil, completions)
        framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", f.Namespace.Name)
    })
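For reference, here is a rough, self-contained sketch of the Job shape the updated tests exercise, written against the public batch/v1 API rather than the e2e helpers. The function name, object name, container name, image, and command below are illustrative placeholders and not part of this PR; the point is the combination the tests rely on: an Indexed Job with backoffLimit = numPods-1 whose podFailurePolicy ignores pod failures exiting with code 137 (SIGKILL, as happens on eviction), so those failures do not count towards the backoffLimit.

package jobexample

import (
    batchv1 "k8s.io/api/batch/v1"
    corev1 "k8s.io/api/core/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/utils/ptr"
)

// evictionTolerantJob sketches an Indexed Job whose pod failure policy
// ignores pod failures that exit with code 137, so evictions are not
// counted towards the backoffLimit. Names, image and command are
// placeholders, not values used by the e2e test.
func evictionTolerantJob(numPods int32) *batchv1.Job {
    return &batchv1.Job{
        ObjectMeta: metav1.ObjectMeta{Name: "evicted-pod-ignore-on-exit-code"},
        Spec: batchv1.JobSpec{
            Parallelism:    ptr.To(numPods),
            Completions:    ptr.To(numPods),
            BackoffLimit:   ptr.To(numPods - 1),
            CompletionMode: ptr.To(batchv1.IndexedCompletion),
            PodFailurePolicy: &batchv1.PodFailurePolicy{
                Rules: []batchv1.PodFailurePolicyRule{{
                    Action: batchv1.PodFailurePolicyActionIgnore,
                    OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
                        Operator: batchv1.PodFailurePolicyOnExitCodesOpIn,
                        Values:   []int32{137},
                    },
                }},
            },
            Template: corev1.PodTemplateSpec{
                Spec: corev1.PodSpec{
                    RestartPolicy: corev1.RestartPolicyNever,
                    Containers: []corev1.Container{{
                        Name:    "worker",
                        Image:   "busybox",                        // placeholder image
                        Command: []string{"sh", "-c", "sleep 1d"}, // placeholder workload
                    }},
                },
            },
        },
    }
}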
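The diff also introduces workqueue.ParallelizeUntil to evict all of a Job's pods concurrently, one worker per pod. A minimal standalone sketch of how that helper behaves, assuming k8s.io/client-go is available in the module (the pod names here are placeholders):

package main

import (
    "context"
    "fmt"

    "k8s.io/client-go/util/workqueue"
)

func main() {
    ctx := context.Background()
    pods := []string{"pod-0", "pod-1", "pod-2"} // placeholder pod names

    // Run one worker per piece; the callback receives the index of the piece
    // it should process. ParallelizeUntil blocks until every piece has been
    // processed or the context is cancelled.
    workqueue.ParallelizeUntil(ctx, len(pods), len(pods), func(index int) {
        fmt.Printf("evicting %s\n", pods[index])
    })
}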