Commit 7dd2948

count terminating pods when deleting active pods for failed jobs
1 parent bc8ec4f

File tree

pkg/controller/job/job_controller.go
pkg/controller/job/job_controller_test.go
test/integration/job/job_test.go

3 files changed: +55 -14 lines
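The fix is one pattern applied at each of the three places where the Job controller deletes active pods: after the deletion, the deleted count is also added to the job's terminating counter, guarded by the JobPodReplacementPolicy feature gate. A minimal, self-contained sketch of that accounting (simplified signature and plain int32 counters, not the controller's real types):

package main

import "fmt"

// deleteActivePods mirrors the accounting added in this commit: pods removed
// from the active count are still running down their graceful-termination
// period, so with the replacement-policy gate enabled they are counted as
// terminating rather than silently dropped.
func deleteActivePods(active, terminating *int32, deleted int32, gateEnabled bool) {
	*active -= deleted
	if gateEnabled {
		*terminating += deleted
	}
}

func main() {
	active, terminating := int32(3), int32(0)
	deleteActivePods(&active, &terminating, 2, true)
	fmt.Printf("active=%d terminating=%d\n", active, terminating) // active=1 terminating=2
}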

pkg/controller/job/job_controller.go (9 additions, 0 deletions)

@@ -899,6 +899,9 @@ func (jm *Controller) syncJob(ctx context.Context, key string) (rErr error) {
 			jobCtx.finishedCondition = nil
 		}
 		active -= deleted
+		if feature.DefaultFeatureGate.Enabled(features.JobPodReplacementPolicy) {
+			*jobCtx.terminating += deleted
+		}
 		manageJobErr = err
 	} else {
 		manageJobCalled := false
@@ -1504,6 +1507,9 @@ func (jm *Controller) manageJob(ctx context.Context, job *batch.Job, jobCtx *syn
 		jm.expectations.ExpectDeletions(logger, jobKey, len(podsToDelete))
 		removed, err := jm.deleteJobPods(ctx, job, jobKey, podsToDelete)
 		active -= removed
+		if feature.DefaultFeatureGate.Enabled(features.JobPodReplacementPolicy) {
+			*jobCtx.terminating += removed
+		}
 		return active, metrics.JobSyncActionPodsDeleted, err
 	}
 
@@ -1553,6 +1559,9 @@ func (jm *Controller) manageJob(ctx context.Context, job *batch.Job, jobCtx *syn
 		logger.V(4).Info("Too many pods running for job", "job", klog.KObj(job), "deleted", len(podsToDelete), "target", wantActive)
 		removed, err := jm.deleteJobPods(ctx, job, jobKey, podsToDelete)
 		active -= removed
+		if feature.DefaultFeatureGate.Enabled(features.JobPodReplacementPolicy) {
+			*jobCtx.terminating += removed
+		}
 		// While it is possible for a Job to require both pod creations and
 		// deletions at the same time (e.g. indexed Jobs with repeated indexes), we
 		// restrict ourselves to either just pod deletion or pod creation in any
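The same gate-guarded increment now appears verbatim at all three call sites. A follow-up could factor it into a helper; the sketch below is purely hypothetical (the helper name and its plain *int32 parameter are invented here, not part of the commit):

// addDeletedToTerminating centralizes the gate-guarded accounting repeated at
// the three deletion sites above; callers would pass jobCtx.terminating.
func addDeletedToTerminating(terminating *int32, deleted int32) {
	if feature.DefaultFeatureGate.Enabled(features.JobPodReplacementPolicy) {
		*terminating += deleted
	}
}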

pkg/controller/job/job_controller_test.go (8 additions, 8 deletions)

@@ -400,10 +400,10 @@ func TestControllerSyncJob(t *testing.T) {
 			backoffLimit:            6,
 			activePods:              2,
 			failedPods:              0,
-			terminatingPods:         4,
+			terminatingPods:         5,
 			podReplacementPolicy:    podReplacementPolicy(batch.Failed),
 			jobPodReplacementPolicy: true,
-			expectedTerminating:     ptr.To[int32](4),
+			expectedTerminating:     ptr.To[int32](6),
 			expectedReady:           ptr.To[int32](0),
 			expectedActive:          1,
 			expectedDeletions:       1,
@@ -3640,7 +3640,7 @@ func TestSyncJobWithJobSuccessPolicy(t *testing.T) {
 			wantStatus: batch.JobStatus{
 				Failed:                  1,
 				Succeeded:               1,
-				Terminating:             ptr.To[int32](0),
+				Terminating:             ptr.To[int32](2),
 				CompletedIndexes:        "1",
 				UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
 				Conditions: []batch.JobCondition{
@@ -3697,7 +3697,7 @@ func TestSyncJobWithJobSuccessPolicy(t *testing.T) {
 			wantStatus: batch.JobStatus{
 				Failed:                  1,
 				Succeeded:               1,
-				Terminating:             ptr.To[int32](0),
+				Terminating:             ptr.To[int32](1),
 				CompletedIndexes:        "1",
 				UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
 				Conditions: []batch.JobCondition{
@@ -3746,7 +3746,7 @@ func TestSyncJobWithJobSuccessPolicy(t *testing.T) {
 			wantStatus: batch.JobStatus{
 				Failed:                  1,
 				Succeeded:               1,
-				Terminating:             ptr.To[int32](0),
+				Terminating:             ptr.To[int32](1),
 				CompletedIndexes:        "1",
 				FailedIndexes:           ptr.To(""),
 				UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
@@ -3868,7 +3868,7 @@ func TestSyncJobWithJobSuccessPolicy(t *testing.T) {
 			wantStatus: batch.JobStatus{
 				Failed:                  1,
 				Succeeded:               1,
-				Terminating:             ptr.To[int32](0),
+				Terminating:             ptr.To[int32](2),
 				CompletedIndexes:        "1",
 				UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
 				Conditions: []batch.JobCondition{
@@ -3931,7 +3931,7 @@ func TestSyncJobWithJobSuccessPolicy(t *testing.T) {
 			wantStatus: batch.JobStatus{
 				Failed:                  2,
 				Succeeded:               1,
-				Terminating:             ptr.To[int32](0),
+				Terminating:             ptr.To[int32](1),
 				CompletedIndexes:        "1",
 				UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
 				Conditions: []batch.JobCondition{
@@ -4761,7 +4761,7 @@ func TestSyncJobWithJobBackoffLimitPerIndex(t *testing.T) {
 			wantStatus: batch.JobStatus{
 				Failed:                  3,
 				Succeeded:               1,
-				Terminating:             ptr.To[int32](0),
+				Terminating:             ptr.To[int32](1),
 				FailedIndexes:           ptr.To("0,2"),
 				CompletedIndexes:        "1",
 				UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
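The unit-test deltas follow directly from the new accounting. In TestControllerSyncJob, the case now starts with 5 terminating pods and expects 1 deletion, so expectedTerminating becomes 6: the 5 pods already terminating plus the 1 active pod the controller deletes, which the old code left out of the count. The success-policy and backoff-limit-per-index expectations likewise move off 0 because pods deleted once the job's outcome is decided now surface in Terminating.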

test/integration/job/job_test.go (38 additions, 6 deletions)

@@ -514,6 +514,7 @@ func TestSuccessPolicy(t *testing.T) {
 		wantActiveIndexes    sets.Set[int]
 		wantCompletedIndexes string
 		wantFailedIndexes    *string
+		wantTerminating      *int32
 	}
 
 	podTemplateSpec := v1.PodTemplateSpec{
@@ -561,6 +562,7 @@ func TestSuccessPolicy(t *testing.T) {
 					wantFailed:           0,
 					wantSucceeded:        1,
 					wantCompletedIndexes: "0",
+					wantTerminating:      ptr.To(int32(0)),
 				},
 			},
 			wantConditionTypes: []batchv1.JobConditionType{batchv1.JobSuccessCriteriaMet, batchv1.JobComplete},
@@ -595,6 +597,7 @@ func TestSuccessPolicy(t *testing.T) {
 					wantFailed:           0,
 					wantSucceeded:        1,
 					wantCompletedIndexes: "0",
+					wantTerminating:      ptr.To(int32(0)),
 				},
 			},
 			wantConditionTypes: []batchv1.JobConditionType{batchv1.JobComplete},
@@ -630,6 +633,7 @@ func TestSuccessPolicy(t *testing.T) {
 					wantActiveIndexes: sets.New(0, 1),
 					wantFailed:        0,
 					wantSucceeded:     0,
+					wantTerminating:   ptr.To(int32(0)),
 				},
 				{
 					index: 1,
@@ -640,6 +644,7 @@ func TestSuccessPolicy(t *testing.T) {
 					wantFailed:           0,
 					wantSucceeded:        1,
 					wantCompletedIndexes: "1",
+					wantTerminating:      ptr.To(int32(1)),
 				},
 			},
 			wantConditionTypes: []batchv1.JobConditionType{batchv1.JobSuccessCriteriaMet, batchv1.JobComplete},
@@ -675,6 +680,7 @@ func TestSuccessPolicy(t *testing.T) {
 					wantActiveIndexes: sets.New(0, 1),
 					wantFailed:        0,
 					wantSucceeded:     0,
+					wantTerminating:   ptr.To(int32(0)),
 				},
 				{
 					index: 1,
@@ -685,6 +691,7 @@ func TestSuccessPolicy(t *testing.T) {
 					wantFailed:           0,
 					wantSucceeded:        1,
 					wantCompletedIndexes: "1",
+					wantTerminating:      ptr.To(int32(1)),
 				},
 			},
 			wantConditionTypes: []batchv1.JobConditionType{batchv1.JobSuccessCriteriaMet, batchv1.JobComplete},
@@ -723,6 +730,7 @@ func TestSuccessPolicy(t *testing.T) {
 					wantFailed:        1,
 					wantFailedIndexes: ptr.To("0"),
 					wantSucceeded:     0,
+					wantTerminating:   ptr.To(int32(0)),
 				},
 				{
 					index: 1,
@@ -734,6 +742,7 @@ func TestSuccessPolicy(t *testing.T) {
 					wantSucceeded:        1,
 					wantFailedIndexes:    ptr.To("0"),
 					wantCompletedIndexes: "1",
+					wantTerminating:      ptr.To(int32(0)),
 				},
 			},
 			wantConditionTypes: []batchv1.JobConditionType{batchv1.JobFailed},
@@ -774,7 +783,7 @@ func TestSuccessPolicy(t *testing.T) {
 				Succeeded:   podTermination.wantSucceeded,
 				Failed:      podTermination.wantFailed,
 				Ready:       ptr.To[int32](0),
-				Terminating: ptr.To[int32](0),
+				Terminating: podTermination.wantTerminating,
 			})
 			validateIndexedJobPods(ctx, t, clientSet, jobObj, podTermination.wantActiveIndexes, podTermination.wantCompletedIndexes, podTermination.wantFailedIndexes)
 		}
@@ -861,7 +870,7 @@ func TestSuccessPolicy_ReEnabling(t *testing.T) {
 		Active:      0,
 		Succeeded:   3,
 		Ready:       ptr.To[int32](0),
-		Terminating: ptr.To[int32](0),
+		Terminating: ptr.To[int32](2),
 	})
 	validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New[int](), "0-2", nil)
 
@@ -1168,6 +1177,7 @@ func TestBackoffLimitPerIndex(t *testing.T) {
 		wantCompletedIndexes           string
 		wantFailedIndexes              *string
 		wantReplacementPodFailureCount *int
+		wantTerminating                *int32
 	}
 
 	podTemplateSpec := v1.PodTemplateSpec{
@@ -1208,6 +1218,7 @@ func TestBackoffLimitPerIndex(t *testing.T) {
 					wantActiveIndexes:              sets.New(0, 1),
 					wantFailedIndexes:              ptr.To(""),
 					wantReplacementPodFailureCount: ptr.To(1),
+					wantTerminating:                ptr.To(int32(0)),
 				},
 			},
 			wantJobConditionType: batchv1.JobComplete,
@@ -1238,6 +1249,7 @@ func TestBackoffLimitPerIndex(t *testing.T) {
 					wantActiveIndexes:              sets.New(0, 1),
 					wantFailedIndexes:              ptr.To(""),
 					wantReplacementPodFailureCount: ptr.To(1),
+					wantTerminating:                ptr.To(int32(0)),
 				},
 				{
 					status: v1.PodStatus{
@@ -1248,6 +1260,7 @@ func TestBackoffLimitPerIndex(t *testing.T) {
 					wantActiveIndexes:              sets.New(0, 1),
 					wantFailedIndexes:              ptr.To(""),
 					wantReplacementPodFailureCount: ptr.To(2),
+					wantTerminating:                ptr.To(int32(0)),
 				},
 				{
 					status: v1.PodStatus{
@@ -1257,6 +1270,7 @@ func TestBackoffLimitPerIndex(t *testing.T) {
 					wantFailed:        3,
 					wantActiveIndexes: sets.New(1),
 					wantFailedIndexes: ptr.To("0"),
+					wantTerminating:   ptr.To(int32(0)),
 				},
 			},
 			wantJobConditionType: batchv1.JobFailed,
@@ -1292,6 +1306,7 @@ func TestBackoffLimitPerIndex(t *testing.T) {
 					wantFailed:        1,
 					wantActiveIndexes: sets.New(0, 1, 2),
 					wantFailedIndexes: ptr.To(""),
+					wantTerminating:   ptr.To(int32(0)),
 				},
 				{
 					index: 1,
@@ -1302,6 +1317,7 @@ func TestBackoffLimitPerIndex(t *testing.T) {
 					wantFailed:        2,
 					wantActiveIndexes: sets.New(0, 1, 2),
 					wantFailedIndexes: ptr.To(""),
+					wantTerminating:   ptr.To(int32(0)),
 				},
 				{
 					index: 2,
@@ -1310,6 +1326,7 @@ func TestBackoffLimitPerIndex(t *testing.T) {
 					},
 					wantFailed:        5,
 					wantFailedIndexes: ptr.To(""),
+					wantTerminating:   ptr.To(int32(2)),
 				},
 			},
 			wantJobConditionType: batchv1.JobFailed,
@@ -1344,6 +1361,7 @@ func TestBackoffLimitPerIndex(t *testing.T) {
 					wantFailed:        1,
 					wantActiveIndexes: sets.New(1),
 					wantFailedIndexes: ptr.To("0"),
+					wantTerminating:   ptr.To(int32(0)),
 				},
 				{
 					index: 1,
@@ -1354,6 +1372,7 @@ func TestBackoffLimitPerIndex(t *testing.T) {
 					wantSucceeded:        1,
 					wantFailedIndexes:    ptr.To("0"),
 					wantCompletedIndexes: "1",
+					wantTerminating:      ptr.To(int32(0)),
 				},
 			},
 			wantJobConditionType: batchv1.JobFailed,
@@ -1389,6 +1408,7 @@ func TestBackoffLimitPerIndex(t *testing.T) {
 					wantFailed:        1,
 					wantActiveIndexes: sets.New(1, 2),
 					wantFailedIndexes: ptr.To("0"),
+					wantTerminating:   ptr.To(int32(0)),
 				},
 				{
 					index: 1,
@@ -1398,6 +1418,7 @@ func TestBackoffLimitPerIndex(t *testing.T) {
 					wantActive:        0,
 					wantFailed:        3,
 					wantFailedIndexes: ptr.To("0,1"),
+					wantTerminating:   ptr.To(int32(1)),
 				},
 			},
 			wantJobConditionType: batchv1.JobFailed,
@@ -1457,6 +1478,7 @@ func TestBackoffLimitPerIndex(t *testing.T) {
 					wantFailed:        1,
 					wantActiveIndexes: sets.New(1),
 					wantFailedIndexes: ptr.To("0"),
+					wantTerminating:   ptr.To(int32(0)),
 				},
 				{
 					index: 1,
@@ -1471,6 +1493,7 @@ func TestBackoffLimitPerIndex(t *testing.T) {
 					},
 					wantFailed:        2,
 					wantFailedIndexes: ptr.To("0,1"),
+					wantTerminating:   ptr.To(int32(0)),
 				},
 			},
 			wantJobConditionType: batchv1.JobFailed,
@@ -1517,7 +1540,7 @@ func TestBackoffLimitPerIndex(t *testing.T) {
 				Succeeded:   podTermination.wantSucceeded,
 				Failed:      podTermination.wantFailed,
 				Ready:       ptr.To[int32](0),
-				Terminating: ptr.To[int32](0),
+				Terminating: podTermination.wantTerminating,
 			})
 			validateIndexedJobPods(ctx, t, clientSet, jobObj, podTermination.wantActiveIndexes, podTermination.wantCompletedIndexes, podTermination.wantFailedIndexes)
 			if podTermination.wantReplacementPodFailureCount != nil {
@@ -2730,6 +2753,7 @@ func TestElasticIndexedJob(t *testing.T) {
 		wantFailed           int
 		wantRemainingIndexes sets.Set[int]
 		wantActivePods       int
+		wantTerminating      *int32
 	}
 	cases := map[string]struct {
 		featureGate bool
@@ -2739,7 +2763,8 @@ func TestElasticIndexedJob(t *testing.T) {
 		"feature flag off, mutation not allowed": {
 			jobUpdates: []jobUpdate{
 				{
-					completions: ptr.To[int32](4),
+					completions:     ptr.To[int32](4),
+					wantTerminating: ptr.To[int32](0),
 				},
 			},
 			wantErr: apierrors.NewInvalid(
@@ -2756,6 +2781,7 @@ func TestElasticIndexedJob(t *testing.T) {
 					completions:          ptr.To[int32](4),
 					succeedIndexes:       []int{0, 1, 2, 3},
 					wantSucceededIndexes: "0-3",
+					wantTerminating:      ptr.To[int32](0),
 				},
 			},
 		},
@@ -2770,6 +2796,7 @@ func TestElasticIndexedJob(t *testing.T) {
 					wantFailed:           1,
 					wantRemainingIndexes: sets.New(0, 2),
 					wantActivePods:       2,
+					wantTerminating:      ptr.To[int32](0),
 				},
 				// Scale down completions 3->1, verify prev failure out of range still counts
 				// but succeeded out of range does not.
@@ -2778,6 +2805,7 @@ func TestElasticIndexedJob(t *testing.T) {
 					succeedIndexes:       []int{0},
 					wantSucceededIndexes: "0",
 					wantFailed:           1,
+					wantTerminating:      ptr.To[int32](0),
 				},
 			},
 		},
@@ -2790,26 +2818,30 @@ func TestElasticIndexedJob(t *testing.T) {
 					wantSucceededIndexes: "2",
 					wantRemainingIndexes: sets.New(0, 1),
 					wantActivePods:       2,
+					wantTerminating:      ptr.To[int32](0),
 				},
 				// Scale completions down 3->2 to exclude previously succeeded index.
 				{
 					completions:          ptr.To[int32](2),
 					wantRemainingIndexes: sets.New(0, 1),
 					wantActivePods:       2,
+					wantTerminating:      ptr.To[int32](0),
 				},
 				// Scale completions back up to include previously succeeded index that was temporarily out of range.
 				{
 					completions:          ptr.To[int32](3),
 					succeedIndexes:       []int{0, 1, 2},
 					wantSucceededIndexes: "0-2",
+					wantTerminating:      ptr.To[int32](0),
 				},
 			},
 		},
 		"scale down to 0, verify that the job succeeds": {
 			featureGate: true,
 			jobUpdates: []jobUpdate{
 				{
-					completions: ptr.To[int32](0),
+					completions:     ptr.To[int32](0),
+					wantTerminating: ptr.To[int32](3),
 				},
 			},
 		},
@@ -2887,7 +2919,7 @@ func TestElasticIndexedJob(t *testing.T) {
 				Succeeded:   len(update.succeedIndexes),
 				Failed:      update.wantFailed,
 				Ready:       ptr.To[int32](0),
-				Terminating: ptr.To[int32](0),
+				Terminating: update.wantTerminating,
 			})
 			validateIndexedJobPods(ctx, t, clientSet, jobObj, update.wantRemainingIndexes, update.wantSucceededIndexes, nil)
 		}
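In the integration tests, the hard-coded Terminating: ptr.To[int32](0) expectation becomes a per-case wantTerminating *int32 field, letting each case assert exactly how many pods are left terminating (for example, the scale-down-to-0 case expects 3, presumably its previously active pods all being deleted at once). Note that ptr.To(int32(0)) and ptr.To[int32](0), both used above, are equivalent calls to the generic k8s.io/utils/ptr.To; a tiny sketch:

package main

import (
	"fmt"

	"k8s.io/utils/ptr"
)

func main() {
	a := ptr.To(int32(2)) // type argument inferred from the int32 conversion
	b := ptr.To[int32](2) // explicit type argument, untyped constant 2
	fmt.Println(*a == *b) // true
}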
