Commit e98c33b

switch feature flag to beta for pod replacement policy and add e2e test
- update pod replacement policy feature flag comment and refactor the e2e test for pod replacement policy
- minor fixes for pod replacement policy and e2e test
- fix wrong assertions for pod replacement policy e2e test
- more fixes to pod replacement policy e2e test
- refactor PodReplacementPolicy e2e test to use finalizers
- fix unit tests when pod replacement policy feature flag is promoted to beta
- fix podgc controller unit tests when pod replacement feature is enabled
- fix lint issue in pod replacement policy e2e test
- assert no error in defer function for removing finalizer in pod replacement policy e2e test
- implement test using a sh trap for pod replacement policy
- reduce sleep after SIGTERM in pod replacement policy e2e test to 5s
1 parent: 191abe3

5 files changed: +255 additions, -136 deletions


pkg/controller/job/job_controller_test.go

Lines changed: 11 additions & 0 deletions
@@ -3452,6 +3452,7 @@ func TestSyncJobWithJobBackoffLimitPerIndex(t *testing.T) {
 			wantStatus: batch.JobStatus{
 				Failed: 1,
 				Succeeded: 2,
+				Terminating: ptr.To[int32](0),
 				CompletedIndexes: "0,1",
 				FailedIndexes: ptr.To(""),
 				UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
@@ -3483,6 +3484,7 @@ func TestSyncJobWithJobBackoffLimitPerIndex(t *testing.T) {
 			},
 			wantStatus: batch.JobStatus{
 				Active: 2,
+				Terminating: ptr.To[int32](0),
 				UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
 				FailedIndexes: ptr.To(""),
 			},
@@ -3509,6 +3511,7 @@ func TestSyncJobWithJobBackoffLimitPerIndex(t *testing.T) {
 			wantStatus: batch.JobStatus{
 				Active: 2,
 				Failed: 1,
+				Terminating: ptr.To[int32](0),
 				UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
 				FailedIndexes: ptr.To(""),
 			},
@@ -3535,6 +3538,7 @@ func TestSyncJobWithJobBackoffLimitPerIndex(t *testing.T) {
 				Active: 1,
 				Failed: 1,
 				FailedIndexes: ptr.To("0"),
+				Terminating: ptr.To[int32](0),
 				UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
 			},
 		},
@@ -3583,6 +3587,7 @@ func TestSyncJobWithJobBackoffLimitPerIndex(t *testing.T) {
 				Active: 1,
 				Failed: 1,
 				FailedIndexes: ptr.To("0"),
+				Terminating: ptr.To[int32](0),
 				UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
 			},
 		},
@@ -3632,6 +3637,7 @@ func TestSyncJobWithJobBackoffLimitPerIndex(t *testing.T) {
 				Active: 0,
 				Failed: 1,
 				FailedIndexes: ptr.To(""),
+				Terminating: ptr.To[int32](0),
 				UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
 				Conditions: []batch.JobCondition{
 					{
@@ -3695,6 +3701,7 @@ func TestSyncJobWithJobBackoffLimitPerIndex(t *testing.T) {
 				Active: 2,
 				Failed: 0,
 				FailedIndexes: ptr.To(""),
+				Terminating: ptr.To[int32](0),
 				UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
 			},
 		},
@@ -3721,6 +3728,7 @@ func TestSyncJobWithJobBackoffLimitPerIndex(t *testing.T) {
 				Failed: 2,
 				Succeeded: 0,
 				FailedIndexes: ptr.To(""),
+				Terminating: ptr.To[int32](0),
 				UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
 				Conditions: []batch.JobCondition{
 					{
@@ -3754,6 +3762,7 @@ func TestSyncJobWithJobBackoffLimitPerIndex(t *testing.T) {
 			wantStatus: batch.JobStatus{
 				Failed: 1,
 				Succeeded: 1,
+				Terminating: ptr.To[int32](0),
 				FailedIndexes: ptr.To("0"),
 				CompletedIndexes: "1",
 				UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
@@ -3792,6 +3801,7 @@ func TestSyncJobWithJobBackoffLimitPerIndex(t *testing.T) {
 			wantStatus: batch.JobStatus{
 				Failed: 3,
 				Succeeded: 1,
+				Terminating: ptr.To[int32](0),
 				FailedIndexes: ptr.To("0,2"),
 				CompletedIndexes: "1",
 				UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
@@ -3830,6 +3840,7 @@ func TestSyncJobWithJobBackoffLimitPerIndex(t *testing.T) {
 			wantStatus: batch.JobStatus{
 				Active: 2,
 				Succeeded: 1,
+				Terminating: ptr.To[int32](0),
 				CompletedIndexes: "1",
 				UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
 			},

pkg/controller/podgc/gc_controller_test.go

Lines changed: 4 additions & 0 deletions
@@ -295,6 +295,7 @@ func TestGCOrphaned(t *testing.T) {
 			},
 			itemsInQueue: 1,
 			deletedPodNames: sets.NewString("a"),
+			patchedPodNames: sets.NewString("a"),
 		},
 		{
 			name: "some nodes missing",
@@ -308,6 +309,7 @@ func TestGCOrphaned(t *testing.T) {
 			},
 			itemsInQueue: 1,
 			deletedPodNames: sets.NewString("a", "c", "d"),
+			patchedPodNames: sets.NewString("d"),
 		},
 		{
 			name: "node added to client after quarantine",
@@ -457,6 +459,7 @@ func TestGCUnscheduledTerminating(t *testing.T) {
 				{name: "c", phase: v1.PodRunning, deletionTimeStamp: &metav1.Time{}, nodeName: ""},
 			},
 			deletedPodNames: sets.NewString("a", "b", "c"),
+			patchedPodNames: sets.NewString("c"),
 		},
 		{
 			name: "Scheduled pod in any phase must not be deleted",
@@ -607,6 +610,7 @@ func TestGCTerminating(t *testing.T) {
 				{name: "e6", phase: v1.PodUnknown, nodeName: "worker-5"},
 			},
 			deletedPodNames: sets.NewString("b1", "b4", "b5", "b6"),
+			patchedPodNames: sets.NewString("b1", "b4", "b5", "b6"),
 		},
 		{
 			name: "pods deleted from node tained out-of-service; PodDisruptionConditions enabled",

pkg/features/kube_features.go

Lines changed: 2 additions & 1 deletion
@@ -389,6 +389,7 @@ const (
 	// owner: @kannon92
 	// kep : https://kep.k8s.io/3939
 	// alpha: v1.28
+	// beta: v1.29
 	//
 	// Allow users to specify recreating pods of a job only when
 	// pods have fully terminated.
@@ -990,7 +991,7 @@ var defaultKubernetesFeatureGates = map[featuregate.Feature]featuregate.FeatureS
 
 	JobPodFailurePolicy: {Default: true, PreRelease: featuregate.Beta},
 
-	JobPodReplacementPolicy: {Default: false, PreRelease: featuregate.Alpha},
+	JobPodReplacementPolicy: {Default: true, PreRelease: featuregate.Beta},
 
 	JobReadyPods: {Default: true, PreRelease: featuregate.GA, LockToDefault: true}, // remove in 1.31
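With the gate now on by default, the user-facing field described by the comment above can be set directly on a Job spec. The sketch below shows such a spec; the object name, image, and command are illustrative only, and batchv1.Failed is the same policy value the new e2e test uses:

package main

import (
	batchv1 "k8s.io/api/batch/v1"
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/utils/ptr"
)

// newJobWithFailedReplacementPolicy builds a Job whose pods are only replaced
// once the previous pod has fully terminated and been marked Failed.
func newJobWithFailedReplacementPolicy() *batchv1.Job {
	return &batchv1.Job{
		ObjectMeta: metav1.ObjectMeta{Name: "replace-after-failed"},
		Spec: batchv1.JobSpec{
			PodReplacementPolicy: ptr.To(batchv1.Failed),
			Template: corev1.PodTemplateSpec{
				Spec: corev1.PodSpec{
					RestartPolicy: corev1.RestartPolicyNever,
					Containers: []corev1.Container{{
						Name:    "main",
						Image:   "busybox",
						Command: []string{"sh", "-c", "sleep 60"},
					}},
				},
			},
		},
	}
}

func main() { _ = newJobWithFailedReplacementPolicy() }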

test/e2e/apps/job.go

Lines changed: 55 additions & 0 deletions
@@ -344,6 +344,61 @@ var _ = SIGDescribe("Job", func() {
 		}
 	})
 
+	ginkgo.It("should recreate pods only after they have failed if pod replacement policy is set to Failed", func(ctx context.Context) {
+		ginkgo.By("Creating a job")
+		job := e2ejob.NewTestJob("", "pod-recreate-failed", v1.RestartPolicyNever, 1, 1, nil, 1)
+		job.Spec.PodReplacementPolicy = ptr.To(batchv1.Failed)
+		job.Spec.Template.Spec.Containers[0].Command = []string{"/bin/sh", "-c", `_term(){
+	sleep 5
+	exit 143
+}
+trap _term SIGTERM
+while true; do
+	sleep 1
+done`}
+		job, err := e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
+		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
+
+		err = e2ejob.WaitForJobPodsRunning(ctx, f.ClientSet, f.Namespace.Name, job.Name, 1)
+		framework.ExpectNoError(err, "failed to wait for job pod to become running in namespace: %s", f.Namespace.Name)
+
+		ginkgo.By("Deleting job pod")
+		pods, err := e2ejob.GetJobPods(ctx, f.ClientSet, f.Namespace.Name, job.Name)
+		framework.ExpectNoError(err, "failed to get pod list for job %s in namespace: %s", job.Name, f.Namespace.Name)
+
+		framework.ExpectNoError(e2epod.DeletePodsWithGracePeriod(ctx, f.ClientSet, pods.Items, 30), "failed to delete pods in namespace: %s", f.Namespace.Name)
+
+		ginkgo.By("Ensuring pod does not get recreated while it is in terminating state")
+		err = e2ejob.WaitForJobState(ctx, f.ClientSet, f.Namespace.Name, job.Name, f.Timeouts.PodDelete, func(job *batchv1.Job) string {
+			if job.Status.Active == 0 && job.Status.Failed == 0 && *job.Status.Terminating == 1 {
+				return ""
+			} else {
+				return fmt.Sprintf(
+					"expected job to have 0 active pod, 0 failed pod and 1 terminating pods, but got %d active pods, %d failed pods and %d terminating pods",
+					job.Status.Active,
+					job.Status.Failed,
+					*job.Status.Terminating,
+				)
+			}
+		})
+		framework.ExpectNoError(err, "failed to ensure pod is not recreated while it is in terminating state")
+
+		ginkgo.By("Ensuring pod gets recreated after it has failed")
+		err = e2ejob.WaitForJobState(ctx, f.ClientSet, f.Namespace.Name, job.Name, f.Timeouts.PodDelete, func(job *batchv1.Job) string {
+			if job.Status.Active == 1 && job.Status.Failed == 1 && *job.Status.Terminating == 0 {
+				return ""
+			} else {
+				return fmt.Sprintf(
+					"expected job to have 1 active pods, 1 failed pods and 0 terminating pod, but got %d active pods, %d failed pods and %d terminating pods",
+					job.Status.Active,
+					job.Status.Failed,
+					*job.Status.Terminating,
+				)
+			}
+		})
+		framework.ExpectNoError(err, "failed to wait for pod to get recreated")
+	})
+
 	/*
 		Release: v1.24
 		Testname: Ensure Pods of an Indexed Job get a unique index.
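A note on the test's container command: the SIGTERM trap sleeps 5 seconds so the pod stays observable in the terminating state (Terminating == 1) after the graceful delete, then exits with 143, the conventional exit code for a process ended by SIGTERM (128 plus signal number 15). Because the pod template uses RestartPolicyNever, that non-zero exit leaves the pod Failed, which is exactly the event PodReplacementPolicy: Failed waits for before creating a replacement. A trivial check of the exit-code arithmetic:

package main

import "fmt"

func main() {
	const sigterm = 15         // signal number for SIGTERM
	fmt.Println(128 + sigterm) // 143 — the exit code the trap handler uses
}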
