@@ -545,12 +545,13 @@ func (s *GpuAllocator) AdjustAllocation(ctx context.Context, adjustRequest tfv1.
545545}
546546
547547func (s * GpuAllocator ) ListNonUsingNodes () sets.Set [string ] {
548+ <- s .initializedCh
548549 set := sets .New [string ]()
549- for nodeName , gpuNames := range s .nodeWorkerStore {
550+ for nodeName , podNames := range s .nodeWorkerStore {
550551 // If using by TF, the node can not be used by original scheduler
551552 // If using by other scheduler, won't record as TF worker, thus the map is empty
552553 // Return non using nodes can ensure original scheduler not conflict with TF
553- if len (gpuNames ) == 0 {
554+ if len (podNames ) == 0 {
554555 set .Insert (nodeName )
555556 }
556557 }
@@ -564,6 +565,20 @@ func (s *GpuAllocator) DeallocByPodIdentifier(ctx context.Context, podIdentifier
564565 }
565566}
566567
568+ func (s * GpuAllocator ) GetAllocationReqByNodeName (nodeName string ) []* tfv1.AllocRequest {
569+ allocRequests := make ([]* tfv1.AllocRequest , 0 , 8 )
570+ for workerName := range s .nodeWorkerStore [nodeName ] {
571+ podUID := s .podNamespaceNsToPodUID [workerName .String ()]
572+ if podUID == "" {
573+ continue
574+ }
575+ if request , exists := s .uniqueAllocation [podUID ]; exists {
576+ allocRequests = append (allocRequests , request )
577+ }
578+ }
579+ return allocRequests
580+ }
581+
567582func (s * GpuAllocator ) checkGPUCapacityAndQuota (gpu * tfv1.GPU , oldRes , newRes tfv1.Resource ) (tfv1.Resource , error ) {
568583 if gpu .Status .Available == nil {
569584 return tfv1.Resource {}, fmt .Errorf ("GPU available is nil, skip check" )
@@ -870,29 +885,7 @@ func (s *GpuAllocator) handleGPUCreate(ctx context.Context, gpu *tfv1.GPU) {
870885 }
871886 s .gpuStore [key ] = gpuInMem
872887
873- if gpuInMem .Status .NodeSelector != nil {
874- gpuNodeName := gpuInMem .Status .NodeSelector [constants .KubernetesHostNameLabel ]
875- if gpuNodeName != "" {
876- if _ , exists := s .nodeGpuStore [gpuNodeName ]; ! exists {
877- s .nodeGpuStore [gpuNodeName ] = make (map [string ]* tfv1.GPU , 4 )
878- }
879- s.nodeGpuStore [gpuNodeName ][gpuInMem.Name ] = gpuInMem
880- }
881- }
882-
883- if gpuInMem .Labels != nil {
884- pool := gpuInMem .Labels [constants .GpuPoolKey ]
885- if pool != "" {
886- if _ , exists := s .poolGpuStore [pool ]; ! exists {
887- s .poolGpuStore [pool ] = make (map [string ]* tfv1.GPU , 128 )
888- }
889- s.poolGpuStore [pool ][gpuInMem.Name ] = gpuInMem
890- }
891- }
892-
893- if gpu .Status .GPUModel != "" {
894- GPUCapacityMap [gpu .Status .GPUModel ] = * gpu .Status .Capacity
895- }
888+ s .addOrUpdateGPUMaps (gpuInMem )
896889 log .Info ("Added GPU to store" , "name" , key .Name , "phase" , gpu .Status .Phase )
897890}
898891
@@ -942,10 +935,36 @@ func (s *GpuAllocator) handleGPUUpdate(ctx context.Context, gpu *tfv1.GPU) {
942935 log .V (6 ).Info ("Updated GPU in store (new entry)" , "name" , key .Name , "phase" , gpu .Status .Phase )
943936 }
944937
945- if gpu .Status .GPUModel != "" {
946- if _ , exists := GPUCapacityMap [gpu .Status .GPUModel ]; ! exists {
947- GPUCapacityMap [gpu .Status .GPUModel ] = * gpu .Status .Capacity
938+ s .addOrUpdateGPUMaps (gpu )
939+ }
940+
941+ func (s * GpuAllocator ) addOrUpdateGPUMaps (gpuInMem * tfv1.GPU ) {
942+ if gpuInMem .Status .NodeSelector != nil {
943+ gpuNodeName := gpuInMem .Status .NodeSelector [constants .KubernetesHostNameLabel ]
944+ if gpuNodeName != "" {
945+ if _ , exists := s .nodeGpuStore [gpuNodeName ]; ! exists {
946+ s .nodeGpuStore [gpuNodeName ] = make (map [string ]* tfv1.GPU , 4 )
947+ }
948+ s.nodeGpuStore [gpuNodeName ][gpuInMem.Name ] = gpuInMem
949+ if _ , exists := s .nodeWorkerStore [gpuNodeName ]; ! exists {
950+ s .nodeWorkerStore [gpuNodeName ] = make (map [types.NamespacedName ]struct {}, 4 )
951+ }
948952 }
953+
954+ }
955+
956+ if gpuInMem .Labels != nil {
957+ pool := gpuInMem .Labels [constants .GpuPoolKey ]
958+ if pool != "" {
959+ if _ , exists := s .poolGpuStore [pool ]; ! exists {
960+ s .poolGpuStore [pool ] = make (map [string ]* tfv1.GPU , 128 )
961+ }
962+ s.poolGpuStore [pool ][gpuInMem.Name ] = gpuInMem
963+ }
964+ }
965+
966+ if gpuInMem .Status .GPUModel != "" {
967+ GPUCapacityMap [gpuInMem .Status .GPUModel ] = * gpuInMem .Status .Capacity
949968 }
950969}
951970
@@ -1166,7 +1185,7 @@ func (s *GpuAllocator) reconcileAllocationState() {
11661185 // No workers, but node contains GPU, need include into nodeWorkerStore with empty map
11671186 gpuNodeName := gpu .Status .NodeSelector [constants .KubernetesHostNameLabel ]
11681187 if _ , exists := s .nodeWorkerStore [gpuNodeName ]; ! exists {
1169- s .nodeWorkerStore [gpuNodeName ] = map [types.NamespacedName ]struct {}{}
1188+ s .nodeWorkerStore [gpuNodeName ] = make ( map [types.NamespacedName ]struct {}, 4 )
11701189 }
11711190 }
11721191
0 commit comments