@@ -150,7 +150,7 @@ func (s *GpuAllocator) Alloc(ctx context.Context, req AllocRequest) ([]*tfv1.GPU
150150}
151151
152152// Dealloc a request from gpu to release available resources on it.
153- func (s * GpuAllocator ) Dealloc (ctx context.Context , workloadNameNamespace tfv1.NameNamespace , request tfv1.Resource , gpus []types.NamespacedName ) error {
153+ func (s * GpuAllocator ) Dealloc (ctx context.Context , workloadNameNamespace tfv1.NameNamespace , request tfv1.Resource , gpus []types.NamespacedName ) {
154154 log := log .FromContext (ctx )
155155 s .storeMutex .Lock ()
156156 defer s .storeMutex .Unlock ()
@@ -175,7 +175,6 @@ func (s *GpuAllocator) Dealloc(ctx context.Context, workloadNameNamespace tfv1.N
175175 s .markGPUDirty (gpu )
176176 }
177177
178- return nil
179178}
180179
181180func NewGpuAllocator (ctx context.Context , client client.Client , syncInterval time.Duration ) * GpuAllocator {
@@ -452,7 +451,7 @@ func (s *GpuAllocator) syncToK8s(ctx context.Context) {
452451 if node .Annotations == nil {
453452 // Create annotations if they don't exist
454453 patch = []byte (`[{
455- "op": "add",
454+ "op": "add",
456455 "path": "/metadata/annotations",
457456 "value": {
458457 "` + constants .GPULastReportTimeAnnotationKey + `": "` + timeValue + `"
@@ -464,7 +463,7 @@ func (s *GpuAllocator) syncToK8s(ctx context.Context) {
464463 "op": "add",
465464 "path": "/metadata/annotations/` + encodedKey + `",
466465 "value": "` + timeValue + `"
467- }]` )
466+ }]` )
468467 }
469468
470469 err := retry .RetryOnConflict (retry .DefaultBackoff , func () error {
@@ -501,6 +500,10 @@ func (s *GpuAllocator) markGPUDirty(key types.NamespacedName) {
501500 s .dirtyQueue [key ] = struct {}{}
502501}
503502
// markGPUDirtyLoced records key in s.dirtyQueue by storing an empty struct
// value under it. It performs no locking of its own.
// NOTE(review): the name looks like a typo for "markGPUDirtyLocked"; presumably
// the caller must already hold whatever lock guards dirtyQueue — confirm.
503+ func (s * GpuAllocator ) markGPUDirtyLoced (key types.NamespacedName ) {
504+ s .dirtyQueue [key ] = struct {}{}
505+ }
506+
504507// When it is the leader, it should reconcile state based on existing workers.
505508// This function acquires storeMutex itself and holds it while reconciling.
506509func (s * GpuAllocator ) reconcileAllocationState (ctx context.Context ) {
@@ -517,6 +520,9 @@ func (s *GpuAllocator) reconcileAllocationState(ctx context.Context) {
517520 vramCapacityMap := make (map [types.NamespacedName ]resource.Quantity )
518521 gpuMap := make (map [types.NamespacedName ]* tfv1.GPU )
519522
523+ defer s .storeMutex .Unlock ()
524+ s .storeMutex .Lock ()
525+
520526 for gpuKey , gpu := range s .gpuStore {
521527 if gpu .Status .Capacity != nil {
522528 tflopsCapacityMap [gpuKey ] = gpu .Status .Capacity .Tflops
@@ -527,6 +533,9 @@ func (s *GpuAllocator) reconcileAllocationState(ctx context.Context) {
527533 }
528534
529535 for _ , worker := range workers .Items {
536+ if ! worker .DeletionTimestamp .IsZero () {
537+ continue
538+ }
530539 tflopsRequest , _ := resource .ParseQuantity (worker .Annotations [constants .TFLOPSRequestAnnotation ])
531540 vramRequest , _ := resource .ParseQuantity (worker .Annotations [constants .VRAMRequestAnnotation ])
532541 gpuIds := worker .Annotations [constants .GpuKey ]
@@ -559,7 +568,7 @@ func (s *GpuAllocator) reconcileAllocationState(ctx context.Context) {
559568 if ! sameTflops || ! sameVRAM {
560569 gpu .Status .Available .Tflops = tflopsCapacityMap [gpuKey ]
561570 gpu .Status .Available .Vram = vramCapacityMap [gpuKey ]
562- s .markGPUDirty (gpuKey )
571+ s .markGPUDirtyLoced (gpuKey )
563572 log .FromContext (ctx ).Info ("Correcting gpu available resources" , "gpu" , gpuKey .Name , "tflops" , gpu .Status .Available .Tflops .String (), "vram" , gpu .Status .Available .Vram .String ())
564573 }
565574 }
0 commit comments