@@ -54,9 +54,7 @@ import (
5454
5555 v1 "k8s.io/api/core/v1"
5656
57- "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/queuejobresources"
5857 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/queuejobresources/genericresource"
59- respod "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/queuejobresources/pod"
6058 "k8s.io/apimachinery/pkg/labels"
6159
6260 arbv1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1"
@@ -79,9 +77,9 @@ type XController struct {
7977
8078 appwrapperInformer arbinformers.AppWrapperInformer
8179 // resources registered for the AppWrapper
82- qjobRegisteredResources queuejobresources.RegisteredResources
80+ // qjobRegisteredResources queuejobresources.RegisteredResources
8381 // controllers for these resources
84- qjobResControls map [arbv1.ResourceType ]queuejobresources.Interface
82+ // qjobResControls map[arbv1.ResourceType]queuejobresources.Interface
8583
8684 // Captures all available resources in the cluster
8785 genericresources * genericresource.GenericResources
@@ -140,9 +138,9 @@ type JobAndClusterAgent struct {
140138}
141139
142140// RegisterAllQueueJobResourceTypes - registers all resources
143- func RegisterAllQueueJobResourceTypes (regs * queuejobresources.RegisteredResources ) {
144- respod .Register (regs )
145- }
141+ // func RegisterAllQueueJobResourceTypes(regs *queuejobresources.RegisteredResources) {
142+ // respod.Register(regs)
143+ // }
146144
147145func GetQueueJobKey (obj interface {}) (string , error ) {
148146 qj , ok := obj .(* arbv1.AppWrapper )
@@ -153,6 +151,47 @@ func GetQueueJobKey(obj interface{}) (string, error) {
153151 return fmt .Sprintf ("%s/%s" , qj .Namespace , qj .Name ), nil
154152}
155153
154+ //UpdateQueueJobStatus was part of pod informer, this is now a method of queuejob_controller file.
155+ //This change is done in an effort to simplify the controller and enable to move to controller runtime.
156+ func (qjm * XController ) UpdateQueueJobStatus (queuejob * arbv1.AppWrapper ) error {
157+
158+ labelSelector := fmt .Sprintf ("%s=%s" , "appwrapper.mcad.ibm.com" , queuejob .Name )
159+ pods , errt := qjm .clients .CoreV1 ().Pods ("" ).List (context .TODO (), metav1.ListOptions {LabelSelector : labelSelector })
160+ if errt != nil {
161+ return errt
162+ }
163+
164+ running := int32 (FilterPods (pods .Items , v1 .PodRunning ))
165+ podPhases := []v1.PodPhase {v1 .PodRunning , v1 .PodSucceeded }
166+ totalResourcesConsumedForPodPhases := clusterstateapi .EmptyResource ()
167+ for _ , phase := range podPhases {
168+ totalResourcesConsumedForPodPhases .Add (GetPodResourcesByPhase (phase , pods .Items ))
169+ }
170+ pending := int32 (FilterPods (pods .Items , v1 .PodPending ))
171+ succeeded := int32 (FilterPods (pods .Items , v1 .PodSucceeded ))
172+ failed := int32 (FilterPods (pods .Items , v1 .PodFailed ))
173+ podsConditionMap := PendingPodsFailedSchd (pods .Items )
174+ klog .V (10 ).Infof ("[UpdateQueueJobStatus] There are %d pods of AppWrapper %s: pending %d, running %d, succeeded %d, failed %d, pendingpodsfailedschd %d, total resource consumed %v" ,
175+ len (pods .Items ), queuejob .Name , pending , running , succeeded , failed , len (podsConditionMap ), totalResourcesConsumedForPodPhases )
176+
177+ queuejob .Status .Pending = pending
178+ queuejob .Status .Running = running
179+ queuejob .Status .Succeeded = succeeded
180+ queuejob .Status .Failed = failed
181+ // Total resources by all running pods
182+ queuejob .Status .TotalGPU = int32 (totalResourcesConsumedForPodPhases .GPU )
183+ queuejob .Status .TotalCPU = int32 (totalResourcesConsumedForPodPhases .MilliCPU )
184+ queuejob .Status .TotalMemory = int32 (totalResourcesConsumedForPodPhases .Memory )
185+
186+ queuejob .Status .PendingPodConditions = nil
187+ for podName , cond := range podsConditionMap {
188+ podCond := GeneratePodFailedCondition (podName , cond )
189+ queuejob .Status .PendingPodConditions = append (queuejob .Status .PendingPodConditions , podCond )
190+ }
191+
192+ return nil
193+ }
194+
156195//allocatableCapacity calculates the capacity available on each node by subtracting resources
157196//consumed by existing pods.
158197//For a large cluster with thousands of nodes and hundreds of thousands of pods this
@@ -217,20 +256,20 @@ func NewJobController(config *rest.Config, serverOption *options.ServerOption) *
217256
218257 cc .genericresources = genericresource .NewAppWrapperGenericResource (config )
219258
220- cc .qjobResControls = map [arbv1.ResourceType ]queuejobresources.Interface {}
221- RegisterAllQueueJobResourceTypes (& cc .qjobRegisteredResources )
259+ // cc.qjobResControls = map[arbv1.ResourceType]queuejobresources.Interface{}
260+ // RegisterAllQueueJobResourceTypes(&cc.qjobRegisteredResources)
222261
223262 // initialize pod sub-resource control
224- resControlPod , found , err := cc .qjobRegisteredResources .InitQueueJobResource (arbv1 .ResourceTypePod , config )
225- if err != nil {
226- klog .Errorf ("fail to create queuejob resource control" )
227- return nil
228- }
229- if ! found {
230- klog .Errorf ("queuejob resource type Pod not found" )
231- return nil
232- }
233- cc .qjobResControls [arbv1 .ResourceTypePod ] = resControlPod
263+ // resControlPod, found, err := cc.qjobRegisteredResources.InitQueueJobResource(arbv1.ResourceTypePod, config)
264+ // if err != nil {
265+ // klog.Errorf("fail to create queuejob resource control")
266+ // return nil
267+ // }
268+ // if !found {
269+ // klog.Errorf("queuejob resource type Pod not found")
270+ // return nil
271+ // }
272+ // cc.qjobResControls[arbv1.ResourceTypePod] = resControlPod
234273
235274 appWrapperClient , err := clientset .NewForConfig (cc .config )
236275 if err != nil {
@@ -816,7 +855,7 @@ func (qjm *XController) getAggregatedAvailableResourcesPriority(unallocatedClust
816855
817856 }
818857
819- err := qjm .qjobResControls [ arbv1 . ResourceTypePod ]. UpdateQueueJobStatus (value )
858+ err := qjm .UpdateQueueJobStatus (value )
820859 if err != nil {
821860 klog .Warningf ("[getAggAvaiResPri] Error updating pod status counts for AppWrapper job: %s, err=%+v" , value .Name , err )
822861 }
@@ -843,7 +882,7 @@ func (qjm *XController) getAggregatedAvailableResourcesPriority(unallocatedClust
843882 klog .V (10 ).Infof ("[getAggAvaiResPri] Subtract all resources %+v in genericItem=%T for job %s which can-run is set to: %v but state is still pending." , qjv , genericItem , value .Name , value .Status .CanRun )
844883 }
845884
846- err := qjm .qjobResControls [ arbv1 . ResourceTypePod ]. UpdateQueueJobStatus (value )
885+ err := qjm .UpdateQueueJobStatus (value )
847886 if err != nil {
848887 klog .Warningf ("[getAggAvaiResPri] Error updating pod status counts for AppWrapper job: %s, err=%+v" , value .Name , err )
849888 }
@@ -1458,7 +1497,7 @@ func (qjm *XController) backoff(ctx context.Context, q *arbv1.AppWrapper, reason
14581497func (cc * XController ) Run (stopCh <- chan struct {}) {
14591498 go cc .appwrapperInformer .Informer ().Run (stopCh )
14601499
1461- go cc .qjobResControls [arbv1 .ResourceTypePod ].Run (stopCh )
1500+ // go cc.qjobResControls[arbv1.ResourceTypePod].Run(stopCh)
14621501
14631502 cache .WaitForCacheSync (stopCh , cc .appWrapperSynced )
14641503
@@ -1508,7 +1547,7 @@ func (qjm *XController) UpdateQueueJobs() {
15081547 }
15091548 }
15101549 if (newjob .Status .State == arbv1 .AppWrapperStateActive || newjob .Status .State == arbv1 .AppWrapperStateRunningHoldCompletion ) && containsCompletionStatus {
1511- err := qjm .qjobResControls [ arbv1 . ResourceTypePod ]. UpdateQueueJobStatus (newjob )
1550+ err := qjm .UpdateQueueJobStatus (newjob )
15121551 if err != nil {
15131552 klog .Errorf ("[UpdateQueueJobs] Error updating pod status counts for AppWrapper job: %s, err=%+v" , newjob .Name , err )
15141553 continue
@@ -1911,7 +1950,7 @@ func (cc *XController) syncQueueJob(ctx context.Context, qj *arbv1.AppWrapper) e
19111950 awNew := qj .DeepCopy ()
19121951 // we call sync to update pods running, pending,...
19131952 if qj .Status .State == arbv1 .AppWrapperStateActive {
1914- err := cc .qjobResControls [ arbv1 . ResourceTypePod ]. UpdateQueueJobStatus (awNew )
1953+ err := cc .UpdateQueueJobStatus (awNew )
19151954 if err != nil {
19161955 klog .Errorf ("[syncQueueJob] Error updating pod status counts for AppWrapper job: %s, err=%+v" , qj .Name , err )
19171956 return err
0 commit comments