@@ -37,6 +37,7 @@ import (
3737 scheduler "github.com/NexusGPU/tensor-fusion/internal/scheduler"
3838 "github.com/NexusGPU/tensor-fusion/internal/utils"
3939 "github.com/NexusGPU/tensor-fusion/internal/worker"
40+ "github.com/lithammer/shortuuid/v4"
4041 "github.com/prometheus/client_golang/prometheus"
4142 "github.com/samber/lo"
4243)
@@ -116,7 +117,7 @@ func (r *TensorFusionWorkloadReconciler) Reconcile(ctx context.Context, req ctrl
116117 }
117118
118119 if hasDeletion {
119- return ctrl.Result {Requeue : true , RequeueAfter : constants .PendingRequeueDuration }, nil
120+ return ctrl.Result {RequeueAfter : constants .PendingRequeueDuration }, nil
120121 }
121122
122123 // Fetch the GPUPool
@@ -285,15 +286,6 @@ func (r *TensorFusionWorkloadReconciler) handlePodGPUCleanup(ctx context.Context
285286 return true , nil
286287 }
287288
288- if pod .Annotations == nil {
289- pod .Annotations = make (map [string ]string )
290- }
291-
292- if pod .Annotations [constants .GpuReleasedAnnotation ] == constants .TrueStringValue {
293- log .Info ("GPU has been released for this pod" , "pod" , pod .Name )
294- return true , nil
295- }
296-
297289 // Get the GPU
298290 gpu := & tfv1.GPU {}
299291 if err := r .Get (ctx , client.ObjectKey {Name : gpuName }, gpu ); err != nil {
@@ -307,7 +299,10 @@ func (r *TensorFusionWorkloadReconciler) handlePodGPUCleanup(ctx context.Context
307299 return false , err
308300 }
309301
310- pod .Annotations [constants .GpuReleasedAnnotation ] = constants .TrueStringValue
302+ if pod .Annotations == nil {
303+ pod .Annotations = make (map [string ]string )
304+ }
305+ pod .Annotations [constants .GpuReleasedAnnotation ] = shortuuid .New ()
311306
312307 // Update the annotation of the Pod to mark that GPU cleanup has been successfully processed.
313308 // This is a key part of ensuring idempotency for the handlePodGPUCleanup function.
0 commit comments