@@ -30,8 +30,6 @@ import (
3030 "sigs.k8s.io/controller-runtime/pkg/client"
3131 "sigs.k8s.io/controller-runtime/pkg/log"
3232
33- "slices"
34-
3533 tfv1 "github.com/NexusGPU/tensor-fusion/api/v1"
3634 "github.com/NexusGPU/tensor-fusion/internal/config"
3735 "github.com/NexusGPU/tensor-fusion/internal/constants"
@@ -100,7 +98,6 @@ func (r *TensorFusionWorkloadReconciler) Reconcile(ctx context.Context, req ctrl
 	if shouldReturn {
 		return ctrl.Result{}, nil
 	}
-
 	// Handle pods with finalizers that need GPU resource cleanup
 	hasDeletion := false
 	// Process pods with our finalizer
@@ -279,19 +276,24 @@ func (r *TensorFusionWorkloadReconciler) scaleDownWorkers(ctx context.Context, w
 func (r *TensorFusionWorkloadReconciler) handlePodGPUCleanup(ctx context.Context, pod *corev1.Pod, workload *tfv1.TensorFusionWorkload) (bool, error) {
 	log := log.FromContext(ctx)
 
-	// Check if this is our finalizer
-	if !containsFinalizer(pod, constants.Finalizer) {
-		// Not our finalizer, skip processing
-		return true, nil
-	}
 	log.Info("Processing pod with GPU resource cleanup finalizer", "pod", pod.Name)
+
 	// Get GPU name from pod label
 	gpuName, ok := pod.Labels[constants.GpuKey]
 	if !ok {
 		log.Info("Pod has finalizer but no GPU label", "pod", pod.Name)
 		return true, nil
 	}
 
+	if pod.Annotations == nil {
+		pod.Annotations = make(map[string]string)
+	}
+
+	if pod.Annotations[constants.GpuReleasedAnnotation] == constants.TrueStringValue {
+		log.Info("GPU has been released for this pod", "pod", pod.Name)
+		return true, nil
+	}
+
 	// Get the GPU
 	gpu := &tfv1.GPU{}
 	if err := r.Get(ctx, client.ObjectKey{Name: gpuName}, gpu); err != nil {
@@ -305,6 +307,18 @@ func (r *TensorFusionWorkloadReconciler) handlePodGPUCleanup(ctx context.Context
 		return false, err
 	}
 
+	pod.Annotations[constants.GpuReleasedAnnotation] = constants.TrueStringValue
+
+	// Update the Pod annotation to record that GPU cleanup has been processed.
+	// This keeps handlePodGPUCleanup idempotent: if the function runs again for the
+	// same Pod instance (e.g., because the client cache does not yet reflect the
+	// finalizer's removal), this r.Update fails on the stale object's resourceVersion,
+	// so the GPU is not released twice.
+	if err := r.Update(ctx, pod); err != nil {
+		log.Error(err, "Failed to mark GPU cleanup annotation on pod", "gpu", gpuName, "pod", pod.Name)
+		return false, err
+	}
+
 	// Release GPU resources
 	if err := r.Scheduler.Release(ctx, workload.Spec.Resources.Requests, gpu); err != nil {
 		log.Error(err, "Failed to release GPU resources, will retry", "gpu", gpuName, "pod", pod.Name)
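
The comment in the hunk above leans on Kubernetes optimistic concurrency: the guard annotation is written before the GPU is released, so a second reconcile working from a stale cached Pod fails its Update with a conflict instead of releasing twice. The following is only a generic sketch of that annotation-guard pattern, not code from this PR; releasedAnnotation, alreadyMarked, and markThenRelease are hypothetical names, and the actual release call is abstracted behind a callback.

package cleanup

import (
	"context"

	corev1 "k8s.io/api/core/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// releasedAnnotation is a hypothetical key; the PR uses constants.GpuReleasedAnnotation.
const releasedAnnotation = "example.com/gpu-released"

func alreadyMarked(pod *corev1.Pod) bool {
	// Reading from a nil map is safe in Go and returns the zero value.
	return pod.Annotations[releasedAnnotation] == "true"
}

// markThenRelease persists the guard annotation before running the side effect.
// If the Pod object is stale, Update returns a conflict and the side effect is
// skipped; the next reconcile sees either the annotation or the removed finalizer.
func markThenRelease(ctx context.Context, c client.Client, pod *corev1.Pod, release func() error) error {
	if alreadyMarked(pod) {
		return nil
	}
	if pod.Annotations == nil {
		pod.Annotations = make(map[string]string)
	}
	pod.Annotations[releasedAnnotation] = "true"
	if err := c.Update(ctx, pod); err != nil {
		if apierrors.IsConflict(err) {
			// Benign race with a newer copy of the Pod; retry on the next reconcile.
			return err
		}
		return err
	}
	return release()
}
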
@@ -315,11 +329,6 @@ func (r *TensorFusionWorkloadReconciler) handlePodGPUCleanup(ctx context.Context
 	return true, nil
 }
 
-// Helper function to check if a pod has a specific finalizer
-func containsFinalizer(pod *corev1.Pod, finalizer string) bool {
-	return slices.Contains(pod.Finalizers, finalizer)
-}
-
 // deletePod deletes a pod
 func (r *TensorFusionWorkloadReconciler) deletePod(ctx context.Context, pod *corev1.Pod) error {
 	log := log.FromContext(ctx)
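
The removed containsFinalizer helper (and the slices import it pulled in) duplicated a check that controller-runtime already ships as controllerutil.ContainsFinalizer. These hunks do not show what the callers use instead, so the snippet below is only a sketch of the stock helper, assuming a controller-runtime version that exposes it; hasCleanupFinalizer is a hypothetical wrapper name.

package cleanup

import (
	corev1 "k8s.io/api/core/v1"
	"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
)

// hasCleanupFinalizer reports whether the Pod still carries the given finalizer.
// controllerutil.ContainsFinalizer accepts any client.Object, so *corev1.Pod works directly.
func hasCleanupFinalizer(pod *corev1.Pod, finalizer string) bool {
	return controllerutil.ContainsFinalizer(pod, finalizer)
}
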