Skip to content

Commit 4fc9dc9

Browse files
authored
fix: add resource validation in Bind to prevent GPU over-allocation (#365)
- Add double-check for TFLOPs and VRAM availability before allocation
1 parent 5867f3c · commit 4fc9dc9

File tree

1 file changed

+13
-0
lines changed

1 file changed

+13
-0
lines changed

internal/gpuallocator/gpuallocator.go

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -285,6 +285,19 @@ func (s *GpuAllocator) Bind(
285285
gpuNodeName = gpu.Status.NodeSelector[constants.KubernetesHostNameLabel]
286286
}
287287

288+
// Double-check resource availability to prevent over-allocation
289+
if gpu.Status.Available == nil {
290+
return nil, fmt.Errorf("GPU %s has nil available resources", selectedGPU)
291+
}
292+
if gpu.Status.Available.Tflops.Cmp(req.Request.Tflops) < 0 {
293+
return nil, fmt.Errorf("GPU %s insufficient TFLOPs: available %s, requested %s",
294+
selectedGPU, gpu.Status.Available.Tflops.String(), req.Request.Tflops.String())
295+
}
296+
if gpu.Status.Available.Vram.Cmp(req.Request.Vram) < 0 {
297+
return nil, fmt.Errorf("GPU %s insufficient VRAM: available %s, requested %s",
298+
selectedGPU, gpu.Status.Available.Vram.String(), req.Request.Vram.String())
299+
}
300+
288301
// reduce available resource on the GPU status
289302
gpu.Status.Available.Tflops.Sub(req.Request.Tflops)
290303
gpu.Status.Available.Vram.Sub(req.Request.Vram)

0 commit comments

Comments
 (0)