
Commit 6f4ad7a

Merge pull request kubernetes#121491 from dejanzele/pod-replacement-policy-e2e
Switch feature flag to beta for pod replacement policy and add e2e test
2 parents f4f5d07 + e98c33b commit 6f4ad7a

5 files changed: +255 -136 lines changed
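The promoted JobPodReplacementPolicy feature lets a Job ask for replacement pods to be created only once the previous pods have fully failed, rather than as soon as they begin terminating. A minimal sketch of a Job using the field via the Go client types (the object name, image, and command are illustrative, not from this commit):

package main

import (
    "fmt"

    batchv1 "k8s.io/api/batch/v1"
    corev1 "k8s.io/api/core/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/utils/ptr"
)

func main() {
    // A Job that only creates replacement pods once the old pods have
    // fully terminated (reached the Failed phase), instead of as soon
    // as they start terminating.
    job := &batchv1.Job{
        ObjectMeta: metav1.ObjectMeta{Name: "example"}, // hypothetical name
        Spec: batchv1.JobSpec{
            PodReplacementPolicy: ptr.To(batchv1.Failed),
            Template: corev1.PodTemplateSpec{
                Spec: corev1.PodSpec{
                    RestartPolicy: corev1.RestartPolicyNever,
                    Containers: []corev1.Container{{
                        Name:    "worker",
                        Image:   "busybox", // hypothetical image
                        Command: []string{"sh", "-c", "sleep 30"},
                    }},
                },
            },
        },
    }
    fmt.Println(*job.Spec.PodReplacementPolicy) // Failed
}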

pkg/controller/job/job_controller_test.go

Lines changed: 11 additions & 0 deletions
@@ -3452,6 +3452,7 @@ func TestSyncJobWithJobBackoffLimitPerIndex(t *testing.T) {
             wantStatus: batch.JobStatus{
                 Failed: 1,
                 Succeeded: 2,
+                Terminating: ptr.To[int32](0),
                 CompletedIndexes: "0,1",
                 FailedIndexes: ptr.To(""),
                 UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
@@ -3483,6 +3484,7 @@ func TestSyncJobWithJobBackoffLimitPerIndex(t *testing.T) {
             },
             wantStatus: batch.JobStatus{
                 Active: 2,
+                Terminating: ptr.To[int32](0),
                 UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
                 FailedIndexes: ptr.To(""),
             },
@@ -3509,6 +3511,7 @@ func TestSyncJobWithJobBackoffLimitPerIndex(t *testing.T) {
             wantStatus: batch.JobStatus{
                 Active: 2,
                 Failed: 1,
+                Terminating: ptr.To[int32](0),
                 UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
                 FailedIndexes: ptr.To(""),
             },
@@ -3535,6 +3538,7 @@ func TestSyncJobWithJobBackoffLimitPerIndex(t *testing.T) {
                 Active: 1,
                 Failed: 1,
                 FailedIndexes: ptr.To("0"),
+                Terminating: ptr.To[int32](0),
                 UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
             },
         },
@@ -3583,6 +3587,7 @@ func TestSyncJobWithJobBackoffLimitPerIndex(t *testing.T) {
                 Active: 1,
                 Failed: 1,
                 FailedIndexes: ptr.To("0"),
+                Terminating: ptr.To[int32](0),
                 UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
             },
         },
@@ -3632,6 +3637,7 @@ func TestSyncJobWithJobBackoffLimitPerIndex(t *testing.T) {
                 Active: 0,
                 Failed: 1,
                 FailedIndexes: ptr.To(""),
+                Terminating: ptr.To[int32](0),
                 UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
                 Conditions: []batch.JobCondition{
                     {
@@ -3695,6 +3701,7 @@ func TestSyncJobWithJobBackoffLimitPerIndex(t *testing.T) {
                 Active: 2,
                 Failed: 0,
                 FailedIndexes: ptr.To(""),
+                Terminating: ptr.To[int32](0),
                 UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
             },
         },
@@ -3721,6 +3728,7 @@ func TestSyncJobWithJobBackoffLimitPerIndex(t *testing.T) {
                 Failed: 2,
                 Succeeded: 0,
                 FailedIndexes: ptr.To(""),
+                Terminating: ptr.To[int32](0),
                 UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
                 Conditions: []batch.JobCondition{
                     {
@@ -3754,6 +3762,7 @@ func TestSyncJobWithJobBackoffLimitPerIndex(t *testing.T) {
             wantStatus: batch.JobStatus{
                 Failed: 1,
                 Succeeded: 1,
+                Terminating: ptr.To[int32](0),
                 FailedIndexes: ptr.To("0"),
                 CompletedIndexes: "1",
                 UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
@@ -3792,6 +3801,7 @@ func TestSyncJobWithJobBackoffLimitPerIndex(t *testing.T) {
             wantStatus: batch.JobStatus{
                 Failed: 3,
                 Succeeded: 1,
+                Terminating: ptr.To[int32](0),
                 FailedIndexes: ptr.To("0,2"),
                 CompletedIndexes: "1",
                 UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
@@ -3830,6 +3840,7 @@ func TestSyncJobWithJobBackoffLimitPerIndex(t *testing.T) {
             wantStatus: batch.JobStatus{
                 Active: 2,
                 Succeeded: 1,
+                Terminating: ptr.To[int32](0),
                 CompletedIndexes: "1",
                 UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
             },
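Each wantStatus above gains Terminating: ptr.To[int32](0) because, with the gate now on by default, the Job controller reports a terminating-pod count in status. The field is a *int32, so nil can signal "not tracked" while ptr.To[int32](0) is an explicit zero. A small sketch of the ptr.To helper from k8s.io/utils/ptr:

package main

import (
    "fmt"

    "k8s.io/utils/ptr"
)

func main() {
    // ptr.To returns a pointer to a copy of its argument; the tests use
    // the same pattern to build the expected status.terminating value.
    terminating := ptr.To[int32](0)

    // A nil pointer means the field is not populated (gate disabled),
    // which is distinct from an explicit zero.
    var unset *int32
    fmt.Println(*terminating == 0, unset == nil) // true true
}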

pkg/controller/podgc/gc_controller_test.go

Lines changed: 4 additions & 0 deletions
@@ -295,6 +295,7 @@ func TestGCOrphaned(t *testing.T) {
             },
             itemsInQueue: 1,
             deletedPodNames: sets.NewString("a"),
+            patchedPodNames: sets.NewString("a"),
         },
         {
             name: "some nodes missing",
@@ -308,6 +309,7 @@ func TestGCOrphaned(t *testing.T) {
             },
             itemsInQueue: 1,
             deletedPodNames: sets.NewString("a", "c", "d"),
+            patchedPodNames: sets.NewString("d"),
         },
         {
             name: "node added to client after quarantine",
@@ -457,6 +459,7 @@ func TestGCUnscheduledTerminating(t *testing.T) {
                 {name: "c", phase: v1.PodRunning, deletionTimeStamp: &metav1.Time{}, nodeName: ""},
             },
             deletedPodNames: sets.NewString("a", "b", "c"),
+            patchedPodNames: sets.NewString("c"),
         },
         {
             name: "Scheduled pod in any phase must not be deleted",
@@ -607,6 +610,7 @@ func TestGCTerminating(t *testing.T) {
                 {name: "e6", phase: v1.PodUnknown, nodeName: "worker-5"},
             },
             deletedPodNames: sets.NewString("b1", "b4", "b5", "b6"),
+            patchedPodNames: sets.NewString("b1", "b4", "b5", "b6"),
         },
         {
             name: "pods deleted from node tained out-of-service; PodDisruptionConditions enabled",

pkg/features/kube_features.go

Lines changed: 2 additions & 1 deletion
@@ -390,6 +390,7 @@ const (
     // owner: @kannon92
     // kep: https://kep.k8s.io/3939
     // alpha: v1.28
+    // beta: v1.29
     //
     // Allow users to specify recreating pods of a job only when
     // pods have fully terminated.
@@ -991,7 +992,7 @@ var defaultKubernetesFeatureGates = map[featuregate.Feature]featuregate.FeatureSpec{

     JobPodFailurePolicy: {Default: true, PreRelease: featuregate.Beta},

-    JobPodReplacementPolicy: {Default: false, PreRelease: featuregate.Alpha},
+    JobPodReplacementPolicy: {Default: true, PreRelease: featuregate.Beta},

     JobReadyPods: {Default: true, PreRelease: featuregate.GA, LockToDefault: true}, // remove in 1.31

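Promoting the gate to beta flips its default to true, so code guarded by it now runs without any cluster configuration. A sketch of what such a guard typically looks like, using the real DefaultFeatureGate API; the helper name and exact logic are illustrative, not the Job controller's actual code:

package job

import (
    batchv1 "k8s.io/api/batch/v1"
    utilfeature "k8s.io/apiserver/pkg/util/feature"

    "k8s.io/kubernetes/pkg/features"
)

// onlyReplaceFailedPods (illustrative) reports whether replacement pods
// should wait for old pods to reach the Failed phase. With the gate now
// beta (default true), the first branch no longer short-circuits by default.
func onlyReplaceFailedPods(policy *batchv1.PodReplacementPolicy) bool {
    if !utilfeature.DefaultFeatureGate.Enabled(features.JobPodReplacementPolicy) {
        return false
    }
    return policy != nil && *policy == batchv1.Failed
}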
test/e2e/apps/job.go

Lines changed: 55 additions & 0 deletions
@@ -344,6 +344,61 @@ var _ = SIGDescribe("Job", func() {
         }
     })

+    ginkgo.It("should recreate pods only after they have failed if pod replacement policy is set to Failed", func(ctx context.Context) {
+        ginkgo.By("Creating a job")
+        job := e2ejob.NewTestJob("", "pod-recreate-failed", v1.RestartPolicyNever, 1, 1, nil, 1)
+        job.Spec.PodReplacementPolicy = ptr.To(batchv1.Failed)
+        job.Spec.Template.Spec.Containers[0].Command = []string{"/bin/sh", "-c", `_term(){
+    sleep 5
+    exit 143
+}
+trap _term SIGTERM
+while true; do
+    sleep 1
+done`}
+        job, err := e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
+        framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
+
+        err = e2ejob.WaitForJobPodsRunning(ctx, f.ClientSet, f.Namespace.Name, job.Name, 1)
+        framework.ExpectNoError(err, "failed to wait for job pod to become running in namespace: %s", f.Namespace.Name)
+
+        ginkgo.By("Deleting job pod")
+        pods, err := e2ejob.GetJobPods(ctx, f.ClientSet, f.Namespace.Name, job.Name)
+        framework.ExpectNoError(err, "failed to get pod list for job %s in namespace: %s", job.Name, f.Namespace.Name)
+
+        framework.ExpectNoError(e2epod.DeletePodsWithGracePeriod(ctx, f.ClientSet, pods.Items, 30), "failed to delete pods in namespace: %s", f.Namespace.Name)
+
+        ginkgo.By("Ensuring pod does not get recreated while it is in terminating state")
+        err = e2ejob.WaitForJobState(ctx, f.ClientSet, f.Namespace.Name, job.Name, f.Timeouts.PodDelete, func(job *batchv1.Job) string {
+            if job.Status.Active == 0 && job.Status.Failed == 0 && *job.Status.Terminating == 1 {
+                return ""
+            } else {
+                return fmt.Sprintf(
+                    "expected job to have 0 active pods, 0 failed pods and 1 terminating pod, but got %d active pods, %d failed pods and %d terminating pods",
+                    job.Status.Active,
+                    job.Status.Failed,
+                    *job.Status.Terminating,
+                )
+            }
+        })
+        framework.ExpectNoError(err, "failed to ensure pod is not recreated while it is in terminating state")
+
+        ginkgo.By("Ensuring pod gets recreated after it has failed")
+        err = e2ejob.WaitForJobState(ctx, f.ClientSet, f.Namespace.Name, job.Name, f.Timeouts.PodDelete, func(job *batchv1.Job) string {
+            if job.Status.Active == 1 && job.Status.Failed == 1 && *job.Status.Terminating == 0 {
+                return ""
+            } else {
+                return fmt.Sprintf(
+                    "expected job to have 1 active pod, 1 failed pod and 0 terminating pods, but got %d active pods, %d failed pods and %d terminating pods",
+                    job.Status.Active,
+                    job.Status.Failed,
+                    *job.Status.Terminating,
+                )
+            }
+        })
+        framework.ExpectNoError(err, "failed to wait for pod to get recreated")
+    })
+
     /*
         Release: v1.24
         Testname: Ensure Pods of an Indexed Job get a unique index.
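A note on the container command in the added test: the shell traps SIGTERM, sleeps five seconds, and exits 143 (128 + 15, the conventional exit status for SIGTERM), so a deleted pod lingers in the terminating state long enough for the first WaitForJobState assertion to observe one terminating pod before the pod fails and is replaced.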
