Commit e98c33b

switch feature flag to beta for pod replacement policy and add e2e test
- update pod replacement policy feature flag comment and refactor the e2e test for pod replacement policy
- minor fixes for pod replacement policy and e2e test
- fix wrong assertions for pod replacement policy e2e test
- more fixes to pod replacement policy e2e test
- refactor PodReplacementPolicy e2e test to use finalizers
- fix unit tests when pod replacement policy feature flag is promoted to beta
- fix podgc controller unit tests when pod replacement feature is enabled
- fix lint issue in pod replacement policy e2e test
- assert no error in defer function for removing finalizer in pod replacement policy e2e test
- implement test using a sh trap for pod replacement policy
- reduce sleep after SIGTERM in pod replacement policy e2e test to 5s
1 parent: 191abe3

5 files changed: +255 additions, -136 deletions


pkg/controller/job/job_controller_test.go

Lines changed: 11 additions & 0 deletions
@@ -3452,6 +3452,7 @@ func TestSyncJobWithJobBackoffLimitPerIndex(t *testing.T) {
 			wantStatus: batch.JobStatus{
 				Failed: 1,
 				Succeeded: 2,
+				Terminating: ptr.To[int32](0),
 				CompletedIndexes: "0,1",
 				FailedIndexes: ptr.To(""),
 				UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
@@ -3483,6 +3484,7 @@ func TestSyncJobWithJobBackoffLimitPerIndex(t *testing.T) {
 			},
 			wantStatus: batch.JobStatus{
 				Active: 2,
+				Terminating: ptr.To[int32](0),
 				UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
 				FailedIndexes: ptr.To(""),
 			},
@@ -3509,6 +3511,7 @@ func TestSyncJobWithJobBackoffLimitPerIndex(t *testing.T) {
 			wantStatus: batch.JobStatus{
 				Active: 2,
 				Failed: 1,
+				Terminating: ptr.To[int32](0),
 				UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
 				FailedIndexes: ptr.To(""),
 			},
@@ -3535,6 +3538,7 @@ func TestSyncJobWithJobBackoffLimitPerIndex(t *testing.T) {
 				Active: 1,
 				Failed: 1,
 				FailedIndexes: ptr.To("0"),
+				Terminating: ptr.To[int32](0),
 				UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
 			},
 		},
@@ -3583,6 +3587,7 @@ func TestSyncJobWithJobBackoffLimitPerIndex(t *testing.T) {
 				Active: 1,
 				Failed: 1,
 				FailedIndexes: ptr.To("0"),
+				Terminating: ptr.To[int32](0),
 				UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
 			},
 		},
@@ -3632,6 +3637,7 @@ func TestSyncJobWithJobBackoffLimitPerIndex(t *testing.T) {
 				Active: 0,
 				Failed: 1,
 				FailedIndexes: ptr.To(""),
+				Terminating: ptr.To[int32](0),
 				UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
 				Conditions: []batch.JobCondition{
 					{
@@ -3695,6 +3701,7 @@ func TestSyncJobWithJobBackoffLimitPerIndex(t *testing.T) {
 				Active: 2,
 				Failed: 0,
 				FailedIndexes: ptr.To(""),
+				Terminating: ptr.To[int32](0),
 				UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
 			},
 		},
@@ -3721,6 +3728,7 @@ func TestSyncJobWithJobBackoffLimitPerIndex(t *testing.T) {
 				Failed: 2,
 				Succeeded: 0,
 				FailedIndexes: ptr.To(""),
+				Terminating: ptr.To[int32](0),
 				UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
 				Conditions: []batch.JobCondition{
 					{
@@ -3754,6 +3762,7 @@ func TestSyncJobWithJobBackoffLimitPerIndex(t *testing.T) {
 			wantStatus: batch.JobStatus{
 				Failed: 1,
 				Succeeded: 1,
+				Terminating: ptr.To[int32](0),
 				FailedIndexes: ptr.To("0"),
 				CompletedIndexes: "1",
 				UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
@@ -3792,6 +3801,7 @@ func TestSyncJobWithJobBackoffLimitPerIndex(t *testing.T) {
 			wantStatus: batch.JobStatus{
 				Failed: 3,
 				Succeeded: 1,
+				Terminating: ptr.To[int32](0),
 				FailedIndexes: ptr.To("0,2"),
 				CompletedIndexes: "1",
 				UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
@@ -3830,6 +3840,7 @@ func TestSyncJobWithJobBackoffLimitPerIndex(t *testing.T) {
 			wantStatus: batch.JobStatus{
 				Active: 2,
 				Succeeded: 1,
+				Terminating: ptr.To[int32](0),
 				CompletedIndexes: "1",
 				UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
 			},

pkg/controller/podgc/gc_controller_test.go

Lines changed: 4 additions & 0 deletions
@@ -295,6 +295,7 @@ func TestGCOrphaned(t *testing.T) {
 			},
 			itemsInQueue: 1,
 			deletedPodNames: sets.NewString("a"),
+			patchedPodNames: sets.NewString("a"),
 		},
 		{
 			name: "some nodes missing",
@@ -308,6 +309,7 @@ func TestGCOrphaned(t *testing.T) {
 			},
 			itemsInQueue: 1,
 			deletedPodNames: sets.NewString("a", "c", "d"),
+			patchedPodNames: sets.NewString("d"),
 		},
 		{
 			name: "node added to client after quarantine",
@@ -457,6 +459,7 @@ func TestGCUnscheduledTerminating(t *testing.T) {
 				{name: "c", phase: v1.PodRunning, deletionTimeStamp: &metav1.Time{}, nodeName: ""},
 			},
 			deletedPodNames: sets.NewString("a", "b", "c"),
+			patchedPodNames: sets.NewString("c"),
 		},
 		{
 			name: "Scheduled pod in any phase must not be deleted",
@@ -607,6 +610,7 @@ func TestGCTerminating(t *testing.T) {
 				{name: "e6", phase: v1.PodUnknown, nodeName: "worker-5"},
 			},
 			deletedPodNames: sets.NewString("b1", "b4", "b5", "b6"),
+			patchedPodNames: sets.NewString("b1", "b4", "b5", "b6"),
 		},
 		{
 			name: "pods deleted from node tained out-of-service; PodDisruptionConditions enabled",

pkg/features/kube_features.go

Lines changed: 2 additions & 1 deletion
@@ -389,6 +389,7 @@ const (
 	// owner: @kannon92
 	// kep : https://kep.k8s.io/3939
 	// alpha: v1.28
+	// beta: v1.29
 	//
 	// Allow users to specify recreating pods of a job only when
 	// pods have fully terminated.
@@ -990,7 +991,7 @@ var defaultKubernetesFeatureGates = map[featuregate.Feature]featuregate.FeatureS
 
 	JobPodFailurePolicy: {Default: true, PreRelease: featuregate.Beta},
 
-	JobPodReplacementPolicy: {Default: false, PreRelease: featuregate.Alpha},
+	JobPodReplacementPolicy: {Default: true, PreRelease: featuregate.Beta},
 
 	JobReadyPods: {Default: true, PreRelease: featuregate.GA, LockToDefault: true}, // remove in 1.31
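With the gate now on by default, the user-facing field described by the comment above can be set directly on a Job spec. The sketch below shows such a spec; the object name, image, and command are illustrative only, and batchv1.Failed is the same policy value the new e2e test uses:

package main

import (
	batchv1 "k8s.io/api/batch/v1"
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/utils/ptr"
)

// newJobWithFailedReplacementPolicy builds a Job whose pods are only replaced
// once the previous pod has fully terminated and been marked Failed.
func newJobWithFailedReplacementPolicy() *batchv1.Job {
	return &batchv1.Job{
		ObjectMeta: metav1.ObjectMeta{Name: "replace-after-failed"},
		Spec: batchv1.JobSpec{
			PodReplacementPolicy: ptr.To(batchv1.Failed),
			Template: corev1.PodTemplateSpec{
				Spec: corev1.PodSpec{
					RestartPolicy: corev1.RestartPolicyNever,
					Containers: []corev1.Container{{
						Name:    "main",
						Image:   "busybox",
						Command: []string{"sh", "-c", "sleep 60"},
					}},
				},
			},
		},
	}
}

func main() { _ = newJobWithFailedReplacementPolicy() }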

test/e2e/apps/job.go

Lines changed: 55 additions & 0 deletions
@@ -344,6 +344,61 @@ var _ = SIGDescribe("Job", func() {
 		}
 	})
 
+	ginkgo.It("should recreate pods only after they have failed if pod replacement policy is set to Failed", func(ctx context.Context) {
+		ginkgo.By("Creating a job")
+		job := e2ejob.NewTestJob("", "pod-recreate-failed", v1.RestartPolicyNever, 1, 1, nil, 1)
+		job.Spec.PodReplacementPolicy = ptr.To(batchv1.Failed)
+		job.Spec.Template.Spec.Containers[0].Command = []string{"/bin/sh", "-c", `_term(){
+	sleep 5
+	exit 143
+}
+trap _term SIGTERM
+while true; do
+	sleep 1
+done`}
+		job, err := e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
+		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
+
+		err = e2ejob.WaitForJobPodsRunning(ctx, f.ClientSet, f.Namespace.Name, job.Name, 1)
+		framework.ExpectNoError(err, "failed to wait for job pod to become running in namespace: %s", f.Namespace.Name)
+
+		ginkgo.By("Deleting job pod")
+		pods, err := e2ejob.GetJobPods(ctx, f.ClientSet, f.Namespace.Name, job.Name)
+		framework.ExpectNoError(err, "failed to get pod list for job %s in namespace: %s", job.Name, f.Namespace.Name)
+
+		framework.ExpectNoError(e2epod.DeletePodsWithGracePeriod(ctx, f.ClientSet, pods.Items, 30), "failed to delete pods in namespace: %s", f.Namespace.Name)
+
+		ginkgo.By("Ensuring pod does not get recreated while it is in terminating state")
+		err = e2ejob.WaitForJobState(ctx, f.ClientSet, f.Namespace.Name, job.Name, f.Timeouts.PodDelete, func(job *batchv1.Job) string {
+			if job.Status.Active == 0 && job.Status.Failed == 0 && *job.Status.Terminating == 1 {
+				return ""
+			} else {
+				return fmt.Sprintf(
+					"expected job to have 0 active pod, 0 failed pod and 1 terminating pods, but got %d active pods, %d failed pods and %d terminating pods",
+					job.Status.Active,
+					job.Status.Failed,
+					*job.Status.Terminating,
+				)
+			}
+		})
+		framework.ExpectNoError(err, "failed to ensure pod is not recreated while it is in terminating state")
+
+		ginkgo.By("Ensuring pod gets recreated after it has failed")
+		err = e2ejob.WaitForJobState(ctx, f.ClientSet, f.Namespace.Name, job.Name, f.Timeouts.PodDelete, func(job *batchv1.Job) string {
+			if job.Status.Active == 1 && job.Status.Failed == 1 && *job.Status.Terminating == 0 {
+				return ""
+			} else {
+				return fmt.Sprintf(
+					"expected job to have 1 active pods, 1 failed pods and 0 terminating pod, but got %d active pods, %d failed pods and %d terminating pods",
+					job.Status.Active,
+					job.Status.Failed,
+					*job.Status.Terminating,
+				)
+			}
+		})
+		framework.ExpectNoError(err, "failed to wait for pod to get recreated")
+	})
+
 	/*
 		Release: v1.24
 		Testname: Ensure Pods of an Indexed Job get a unique index.
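A note on the test's container command: the SIGTERM trap sleeps 5 seconds so the pod stays observable in the terminating state (Terminating == 1) after the graceful delete, then exits with 143, the conventional exit code for a process ended by SIGTERM (128 plus signal number 15). Because the pod template uses RestartPolicyNever, that non-zero exit leaves the pod Failed, which is exactly the event PodReplacementPolicy: Failed waits for before creating a replacement. A trivial check of the exit-code arithmetic:

package main

import "fmt"

func main() {
	const sigterm = 15         // signal number for SIGTERM
	fmt.Println(128 + sigterm) // 143 — the exit code the trap handler uses
}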
