@@ -131,6 +131,20 @@ type Controller struct {
 	podBackoffStore *backoffStore
 }
 
+type syncJobContext struct {
+	job                           *batch.Job
+	pods                          []*v1.Pod
+	finishedCondition             *batch.JobCondition
+	activePods                    []*v1.Pod
+	succeeded                     int32
+	prevSucceededIndexes          orderedIntervals
+	succeededIndexes              orderedIntervals
+	newBackoffRecord              backoffRecord
+	expectedRmFinalizers          sets.Set[string]
+	uncounted                     *uncountedTerminatedPods
+	podFailureCountByPolicyAction map[string]int
+}
+
 // NewController creates a new Job controller that keeps the relevant pods
 // in sync with their corresponding Job objects.
 func NewController(ctx context.Context, podInformer coreinformers.PodInformer, jobInformer batchinformers.JobInformer, kubeClient clientset.Interface) *Controller {
@@ -742,6 +756,9 @@ func (jm *Controller) syncJob(ctx context.Context, key string) (rErr error) {
 		return nil
 	}
 
+	syncJobContext := &syncJobContext{}
+	syncJobContext.job = &job
+
 	completionMode := getCompletionMode(&job)
 	action := metrics.JobSyncActionReconciling
 
@@ -759,7 +776,8 @@ func (jm *Controller) syncJob(ctx context.Context, key string) (rErr error) {
 		job.Status.UncountedTerminatedPods = &batch.UncountedTerminatedPods{}
 	}
 	uncounted := newUncountedTerminatedPods(*job.Status.UncountedTerminatedPods)
-	expectedRmFinalizers := jm.finalizerExpectations.getExpectedUIDs(key)
+	syncJobContext.uncounted = uncounted
+	syncJobContext.expectedRmFinalizers = jm.finalizerExpectations.getExpectedUIDs(key)
 
 	// Check the expectations of the job before counting active pods, otherwise a new pod can sneak in
 	// and update the expectations after we've retrieved active pods from the store. If a new pod enters
@@ -771,10 +789,12 @@ func (jm *Controller) syncJob(ctx context.Context, key string) (rErr error) {
 		return err
 	}
 
+	syncJobContext.pods = pods
 	activePods := controller.FilterActivePods(pods)
+	syncJobContext.activePods = activePods
 	active := int32(len(activePods))
-	newSucceededPods, newFailedPods := getNewFinishedPods(&job, pods, uncounted, expectedRmFinalizers)
-	succeeded := job.Status.Succeeded + int32(len(newSucceededPods)) + int32(len(uncounted.succeeded))
+	newSucceededPods, newFailedPods := getNewFinishedPods(syncJobContext)
+	syncJobContext.succeeded = job.Status.Succeeded + int32(len(newSucceededPods)) + int32(len(uncounted.succeeded))
 	failed := job.Status.Failed + int32(len(newFailedPods)) + int32(len(uncounted.failed))
 	var ready *int32
 	if feature.DefaultFeatureGate.Enabled(features.JobReadyPods) {
@@ -787,28 +807,27 @@ func (jm *Controller) syncJob(ctx context.Context, key string) (rErr error) {
 		job.Status.StartTime = &now
 	}
 
-	newBackoffRecord := jm.podBackoffStore.newBackoffRecord(key, newSucceededPods, newFailedPods)
+	syncJobContext.newBackoffRecord = jm.podBackoffStore.newBackoffRecord(key, newSucceededPods, newFailedPods)
 
 	var manageJobErr error
-	var finishedCondition *batch.JobCondition
 
 	exceedsBackoffLimit := failed > *job.Spec.BackoffLimit
 
 	if feature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) {
 		if failureTargetCondition := findConditionByType(job.Status.Conditions, batch.JobFailureTarget); failureTargetCondition != nil {
-			finishedCondition = newFailedConditionForFailureTarget(failureTargetCondition, jm.clock.Now())
+			syncJobContext.finishedCondition = newFailedConditionForFailureTarget(failureTargetCondition, jm.clock.Now())
 		} else if failJobMessage := getFailJobMessage(&job, pods); failJobMessage != nil {
 			// Prepare the interim FailureTarget condition to record the failure message before the finalizers (allowing removal of the pods) are removed.
-			finishedCondition = newCondition(batch.JobFailureTarget, v1.ConditionTrue, jobConditionReasonPodFailurePolicy, *failJobMessage, jm.clock.Now())
+			syncJobContext.finishedCondition = newCondition(batch.JobFailureTarget, v1.ConditionTrue, jobConditionReasonPodFailurePolicy, *failJobMessage, jm.clock.Now())
 		}
 	}
-	if finishedCondition == nil {
+	if syncJobContext.finishedCondition == nil {
 		if exceedsBackoffLimit || pastBackoffLimitOnFailure(&job, pods) {
 			// check if the number of pod restart exceeds backoff (for restart OnFailure only)
 			// OR if the number of failed jobs increased since the last syncJob
-			finishedCondition = newCondition(batch.JobFailed, v1.ConditionTrue, "BackoffLimitExceeded", "Job has reached the specified backoff limit", jm.clock.Now())
+			syncJobContext.finishedCondition = newCondition(batch.JobFailed, v1.ConditionTrue, "BackoffLimitExceeded", "Job has reached the specified backoff limit", jm.clock.Now())
 		} else if jm.pastActiveDeadline(&job) {
-			finishedCondition = newCondition(batch.JobFailed, v1.ConditionTrue, "DeadlineExceeded", "Job was active longer than specified deadline", jm.clock.Now())
+			syncJobContext.finishedCondition = newCondition(batch.JobFailed, v1.ConditionTrue, "DeadlineExceeded", "Job was active longer than specified deadline", jm.clock.Now())
 		} else if job.Spec.ActiveDeadlineSeconds != nil && !jobSuspended(&job) {
 			syncDuration := time.Duration(*job.Spec.ActiveDeadlineSeconds)*time.Second - jm.clock.Since(job.Status.StartTime.Time)
 			logger.V(2).Info("Job has activeDeadlineSeconds configuration. Will sync this job again", "key", key, "nextSyncIn", syncDuration)
@@ -819,23 +838,25 @@ func (jm *Controller) syncJob(ctx context.Context, key string) (rErr error) {
 	var prevSucceededIndexes, succeededIndexes orderedIntervals
 	if isIndexedJob(&job) {
 		prevSucceededIndexes, succeededIndexes = calculateSucceededIndexes(logger, &job, pods)
-		succeeded = int32(succeededIndexes.total())
+		syncJobContext.prevSucceededIndexes = prevSucceededIndexes
+		syncJobContext.succeededIndexes = succeededIndexes
+		syncJobContext.succeeded = int32(succeededIndexes.total())
 	}
 	suspendCondChanged := false
 	// Remove active pods if Job failed.
-	if finishedCondition != nil {
+	if syncJobContext.finishedCondition != nil {
 		deleted, err := jm.deleteActivePods(ctx, &job, activePods)
 		if deleted != active || !satisfiedExpectations {
 			// Can't declare the Job as finished yet, as there might be remaining
 			// pod finalizers or pods that are not in the informer's cache yet.
-			finishedCondition = nil
+			syncJobContext.finishedCondition = nil
 		}
 		active -= deleted
 		manageJobErr = err
 	} else {
 		manageJobCalled := false
 		if satisfiedExpectations && job.DeletionTimestamp == nil {
-			active, action, manageJobErr = jm.manageJob(ctx, &job, activePods, succeeded, succeededIndexes, newBackoffRecord)
+			active, action, manageJobErr = jm.manageJob(ctx, syncJobContext)
 			manageJobCalled = true
 		}
 		complete := false
@@ -846,16 +867,16 @@ func (jm *Controller) syncJob(ctx context.Context, key string) (rErr error) {
 			// not expected to fail, but if they do, the failure is ignored. Once any
 			// pod succeeds, the controller waits for remaining pods to finish, and
 			// then the job is complete.
-			complete = succeeded > 0 && active == 0
+			complete = syncJobContext.succeeded > 0 && active == 0
 		} else {
 			// Job specifies a number of completions. This type of job signals
 			// success by having that number of successes. Since we do not
 			// start more pods than there are remaining completions, there should
 			// not be any remaining active pods once this count is reached.
-			complete = succeeded >= *job.Spec.Completions && active == 0
+			complete = syncJobContext.succeeded >= *job.Spec.Completions && active == 0
 		}
 		if complete {
-			finishedCondition = newCondition(batch.JobComplete, v1.ConditionTrue, "", "", jm.clock.Now())
+			syncJobContext.finishedCondition = newCondition(batch.JobComplete, v1.ConditionTrue, "", "", jm.clock.Now())
 		} else if manageJobCalled {
 			// Update the conditions / emit events only if manageJob was called in
 			// this syncJob. Otherwise wait for the right syncJob call to make
@@ -891,7 +912,7 @@ func (jm *Controller) syncJob(ctx context.Context, key string) (rErr error) {
 	needsStatusUpdate := suspendCondChanged || active != job.Status.Active || !pointer.Int32Equal(ready, job.Status.Ready)
 	job.Status.Active = active
 	job.Status.Ready = ready
-	err = jm.trackJobStatusAndRemoveFinalizers(ctx, &job, pods, prevSucceededIndexes, *uncounted, expectedRmFinalizers, finishedCondition, needsStatusUpdate, newBackoffRecord)
+	err = jm.trackJobStatusAndRemoveFinalizers(ctx, syncJobContext, needsStatusUpdate)
 	if err != nil {
 		return fmt.Errorf("tracking status: %w", err)
 	}
@@ -970,8 +991,15 @@ func (jm *Controller) deleteJobPods(ctx context.Context, job *batch.Job, jobKey
 //
 // It does this up to a limited number of Pods so that the size of .status
 // doesn't grow too much and this sync doesn't starve other Jobs.
-func (jm *Controller) trackJobStatusAndRemoveFinalizers(ctx context.Context, job *batch.Job, pods []*v1.Pod, succeededIndexes orderedIntervals, uncounted uncountedTerminatedPods, expectedRmFinalizers sets.Set[string], finishedCond *batch.JobCondition, needsFlush bool, newBackoffRecord backoffRecord) error {
+func (jm *Controller) trackJobStatusAndRemoveFinalizers(ctx context.Context, syncJobContext *syncJobContext, needsFlush bool) error {
 	logger := klog.FromContext(ctx)
+	job := syncJobContext.job
+	pods := syncJobContext.pods
+	finishedCond := syncJobContext.finishedCondition
+	expectedRmFinalizers := syncJobContext.expectedRmFinalizers
+	succeededIndexes := syncJobContext.succeededIndexes
+	uncounted := syncJobContext.uncounted
+
 	isIndexed := isIndexedJob(job)
 	var podsToRemoveFinalizer []*v1.Pod
 	uncountedStatus := job.Status.UncountedTerminatedPods
@@ -1070,8 +1098,9 @@ func (jm *Controller) trackJobStatusAndRemoveFinalizers(ctx context.Context, job
 			finishedCond = newFailedConditionForFailureTarget(finishedCond, jm.clock.Now())
 		}
 	}
+	syncJobContext.podFailureCountByPolicyAction = podFailureCountByPolicyAction
 	var err error
-	if job, needsFlush, err = jm.flushUncountedAndRemoveFinalizers(ctx, job, podsToRemoveFinalizer, uidsWithFinalizer, &oldCounters, podFailureCountByPolicyAction, needsFlush, newBackoffRecord); err != nil {
+	if job, needsFlush, err = jm.flushUncountedAndRemoveFinalizers(ctx, syncJobContext, podsToRemoveFinalizer, uidsWithFinalizer, &oldCounters, needsFlush); err != nil {
 		return err
 	}
 	jobFinished := !reachedMaxUncountedPods && jm.enactJobFinished(job, finishedCond)
@@ -1101,8 +1130,11 @@ func (jm *Controller) trackJobStatusAndRemoveFinalizers(ctx context.Context, job
 //
 // Returns whether there are pending changes in the Job status that need to be
 // flushed in subsequent calls.
-func (jm *Controller) flushUncountedAndRemoveFinalizers(ctx context.Context, job *batch.Job, podsToRemoveFinalizer []*v1.Pod, uidsWithFinalizer sets.Set[string], oldCounters *batch.JobStatus, podFailureCountByPolicyAction map[string]int, needsFlush bool, newBackoffRecord backoffRecord) (*batch.Job, bool, error) {
+func (jm *Controller) flushUncountedAndRemoveFinalizers(ctx context.Context, syncJobContext *syncJobContext, podsToRemoveFinalizer []*v1.Pod, uidsWithFinalizer sets.Set[string], oldCounters *batch.JobStatus, needsFlush bool) (*batch.Job, bool, error) {
 	logger := klog.FromContext(ctx)
+	job := syncJobContext.job
+	newBackoffRecord := syncJobContext.newBackoffRecord
+	podFailureCountByPolicyAction := syncJobContext.podFailureCountByPolicyAction
 	var err error
 	if needsFlush {
 		if job, err = jm.updateStatusHandler(ctx, job); err != nil {
@@ -1337,11 +1369,12 @@ func getFailJobMessage(job *batch.Job, pods []*v1.Pod) *string {
 
 // getNewFinishedPods returns the list of newly succeeded and failed pods that are not accounted
 // in the job status. The list of failed pods can be affected by the podFailurePolicy.
-func getNewFinishedPods(job *batch.Job, pods []*v1.Pod, uncounted *uncountedTerminatedPods, expectedRmFinalizers sets.Set[string]) (succeededPods, failedPods []*v1.Pod) {
-	succeededPods = getValidPodsWithFilter(job, pods, uncounted.Succeeded(), expectedRmFinalizers, func(p *v1.Pod) bool {
+func getNewFinishedPods(syncJobContext *syncJobContext) (succeededPods, failedPods []*v1.Pod) {
+	job := syncJobContext.job
+	succeededPods = getValidPodsWithFilter(syncJobContext, syncJobContext.uncounted.Succeeded(), func(p *v1.Pod) bool {
 		return p.Status.Phase == v1.PodSucceeded
 	})
-	failedPods = getValidPodsWithFilter(job, pods, uncounted.Failed(), expectedRmFinalizers, func(p *v1.Pod) bool {
+	failedPods = getValidPodsWithFilter(syncJobContext, syncJobContext.uncounted.Failed(), func(p *v1.Pod) bool {
 		if feature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) && job.Spec.PodFailurePolicy != nil {
 			if !isPodFailed(p, job) {
 				return false
@@ -1365,8 +1398,14 @@ func jobSuspended(job *batch.Job) bool {
 // pods according to what is specified in the job.Spec.
 // Respects back-off; does not create new pods if the back-off time has not passed
 // Does NOT modify <activePods>.
-func (jm *Controller) manageJob(ctx context.Context, job *batch.Job, activePods []*v1.Pod, succeeded int32, succeededIndexes []interval, newBackoffRecord backoffRecord) (int32, string, error) {
+func (jm *Controller) manageJob(ctx context.Context, syncJobContext *syncJobContext) (int32, string, error) {
 	logger := klog.FromContext(ctx)
+	job := syncJobContext.job
+	activePods := syncJobContext.activePods
+	succeeded := syncJobContext.succeeded
+	succeededIndexes := syncJobContext.succeededIndexes
+	newBackoffRecord := syncJobContext.newBackoffRecord
+
 	active := int32(len(activePods))
 	parallelism := *job.Spec.Parallelism
 	jobKey, err := controller.KeyFunc(job)
@@ -1561,7 +1600,10 @@ func (jm *Controller) patchJob(ctx context.Context, job *batch.Job, data []byte)
 // getValidPodsWithFilter returns the valid pods that pass the filter.
 // Pods are valid if they have a finalizer or in uncounted set
 // and, for Indexed Jobs, a valid completion index.
-func getValidPodsWithFilter(job *batch.Job, pods []*v1.Pod, uncounted sets.Set[string], expectedRmFinalizers sets.Set[string], filter func(*v1.Pod) bool) []*v1.Pod {
+func getValidPodsWithFilter(synJobContext *syncJobContext, uncounted sets.Set[string], filter func(*v1.Pod) bool) []*v1.Pod {
+	job := synJobContext.job
+	pods := synJobContext.pods
+	expectedRmFinalizers := synJobContext.expectedRmFinalizers
 	var result []*v1.Pod
 	for _, p := range pods {
 		uid := string(p.UID)
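Note on the pattern applied throughout this diff: the change is a parameter-object refactor. syncJob builds one syncJobContext value, fills its fields as it computes them, and passes a single pointer to manageJob, trackJobStatusAndRemoveFinalizers, flushUncountedAndRemoveFinalizers, getNewFinishedPods, and getValidPodsWithFilter instead of threading long positional parameter lists. The standalone sketch below is only an illustration of that move; the names (jobSummary, report) are hypothetical and not part of the controller.

package main

import "fmt"

// jobSummary plays the role of syncJobContext above: it bundles the values
// computed during one sync pass so helpers take a single pointer instead of
// many positional arguments.
type jobSummary struct {
	name      string
	active    int32
	succeeded int32
	failed    int32
}

// Before the refactor a helper would look like:
//   func report(name string, active, succeeded, failed int32) string
// After it, the helper reads what it needs from the shared struct.
func report(s *jobSummary) string {
	return fmt.Sprintf("%s: active=%d succeeded=%d failed=%d",
		s.name, s.active, s.succeeded, s.failed)
}

func main() {
	// The caller fills the struct incrementally, mirroring how syncJob
	// populates syncJobContext field by field before calling helpers.
	s := &jobSummary{name: "demo"}
	s.active = 2
	s.succeeded = 3
	s.failed = 1
	fmt.Println(report(s))
}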