Skip to content

Commit 19a25ba

Browse files
authored
Merge pull request kubernetes#119159 from alculquicondor/fix-job-uncounted
Only declare job as finished after removing all finalizers
2 parents 63e6d81 + f7a1fb7 commit 19a25ba

File tree

2 files changed

+7
-7
lines changed

2 files changed

+7
-7
lines changed

pkg/controller/job/job_controller.go

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -792,12 +792,7 @@ func (jm *Controller) syncJob(ctx context.Context, key string) (rErr error) {
792792
var manageJobErr error
793793
var finishedCondition *batch.JobCondition
794794

795-
jobHasNewFailure := failed > job.Status.Failed
796-
// new failures happen when status does not reflect the failures and active
797-
// is different than parallelism, otherwise the previous controller loop
798-
// failed updating status so even if we pick up failure it is not a new one
799-
exceedsBackoffLimit := jobHasNewFailure && (active != *job.Spec.Parallelism) &&
800-
(failed > *job.Spec.BackoffLimit)
795+
exceedsBackoffLimit := failed > *job.Spec.BackoffLimit
801796

802797
if feature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) {
803798
if failureTargetCondition := findConditionByType(job.Status.Conditions, batch.JobFailureTarget); failureTargetCondition != nil {
@@ -999,6 +994,7 @@ func (jm *Controller) trackJobStatusAndRemoveFinalizers(ctx context.Context, job
999994
needsFlush = true
1000995
}
1001996
podFailureCountByPolicyAction := map[string]int{}
997+
reachedMaxUncountedPods := false
1002998
for _, pod := range pods {
1003999
if !hasJobTrackingFinalizer(pod) || expectedRmFinalizers.Has(string(pod.UID)) {
10041000
// This pod was processed in a previous sync.
@@ -1049,6 +1045,7 @@ func (jm *Controller) trackJobStatusAndRemoveFinalizers(ctx context.Context, job
10491045
//
10501046
// The job will be synced again because the Job status and Pod updates
10511047
// will put the Job back to the work queue.
1048+
reachedMaxUncountedPods = true
10521049
break
10531050
}
10541051
}
@@ -1077,7 +1074,7 @@ func (jm *Controller) trackJobStatusAndRemoveFinalizers(ctx context.Context, job
10771074
if job, needsFlush, err = jm.flushUncountedAndRemoveFinalizers(ctx, job, podsToRemoveFinalizer, uidsWithFinalizer, &oldCounters, podFailureCountByPolicyAction, needsFlush, newBackoffRecord); err != nil {
10781075
return err
10791076
}
1080-
jobFinished := jm.enactJobFinished(job, finishedCond)
1077+
jobFinished := !reachedMaxUncountedPods && jm.enactJobFinished(job, finishedCond)
10811078
if jobFinished {
10821079
needsFlush = true
10831080
}

test/integration/job/job_test.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1341,6 +1341,9 @@ func TestOrphanPodsFinalizersClearedWithGC(t *testing.T) {
13411341
}
13421342

13431343
func TestFinalizersClearedWhenBackoffLimitExceeded(t *testing.T) {
1344+
// Set a maximum number of uncounted pods below parallelism, to ensure it
1345+
// doesn't affect the termination of pods.
1346+
t.Cleanup(setDuringTest(&jobcontroller.MaxUncountedPods, 50))
13441347
closeFn, restConfig, clientSet, ns := setup(t, "simple")
13451348
defer closeFn()
13461349
ctx, cancel := startJobControllerAndWaitForCaches(restConfig)

0 commit comments

Comments
 (0)