@@ -314,6 +314,18 @@ func (r *TensorFusionWorkloadReconciler) handlePodGPUCleanup(ctx context.Context

	log.Info("Processing pod with GPU resource cleanup finalizer", "pod", pod.Name)

+	pod.Annotations[constants.GpuReleasedAnnotation] = shortuuid.New()
+
+	// Update the Pod's annotations to mark that GPU cleanup has been processed.
+	// This is a key part of keeping handlePodGPUCleanup idempotent: if the function
+	// is called again for the same Pod instance (e.g. because the client cache does
+	// not yet reflect the finalizer's removal), this r.Update call fails with a
+	// conflict, so the GPUs are not released twice.
+	if err := r.Update(ctx, pod); err != nil {
+		log.Error(err, "Failed to mark pod GPU cleanup as processed")
+		return false, err
+	}
+
	// read the GPU names from the pod annotations
	gpuNamesStr, ok := pod.Annotations[constants.GpuKey]
	if !ok {
@@ -335,17 +347,6 @@ func (r *TensorFusionWorkloadReconciler) handlePodGPUCleanup(ctx context.Context
	if pod.Annotations == nil {
		pod.Annotations = make(map[string]string)
	}
-	pod.Annotations[constants.GpuReleasedAnnotation] = shortuuid.New()
-
-	// Update the annotation of the Pod to mark that GPU cleanup has been successfully processed.
-	// This is a key part of ensuring idempotency for the handlePodGPUCleanup function.
-	// If this function is called again for the same Pod instance (e.g., due to the client cache
-	// not yet reflecting the finalizer's removal), Then this r.Update pod will fail.
-	// Will not cause duplicate releases
-	if err := r.Update(ctx, pod); err != nil {
-		log.Error(err, "Failed to mark that GPU cleanup of pod")
-		return false, err
-	}

	return true, nil
}
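
For reviewers, a minimal self-contained sketch of the optimistic-concurrency pattern the moved block relies on. The `gpuReleasedAnnotation` constant, the `markGPUReleased` helper, and the `shortuuid/v4` module path are illustrative assumptions standing in for this repository's `constants.GpuReleasedAnnotation`, the inlined code above, and its actual dependency version; they are not part of the change.

```go
package controller

import (
	"context"

	"github.com/lithammer/shortuuid/v4" // assumed module path; the repo may pin a different major version
	corev1 "k8s.io/api/core/v1"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// Placeholder key; the real value lives in this repo's constants package
// as constants.GpuReleasedAnnotation.
const gpuReleasedAnnotation = "example.com/gpu-released"

// markGPUReleased persists the release marker before any GPU resources are
// actually freed. Update sends the Pod's cached resourceVersion, so a second
// reconcile that still works from a stale cached Pod (finalizer removal not
// yet observed) gets a conflict error here and never reaches the release
// logic that would follow, which is what makes the cleanup idempotent.
func markGPUReleased(ctx context.Context, c client.Client, pod *corev1.Pod) error {
	if pod.Annotations == nil {
		pod.Annotations = map[string]string{}
	}
	pod.Annotations[gpuReleasedAnnotation] = shortuuid.New()
	return c.Update(ctx, pod)
}
```

If a caller wants to treat the stale-cache case explicitly rather than surfacing it as a generic error, `apierrors.IsConflict` from `k8s.io/apimachinery/pkg/api/errors` distinguishes that conflict from other update failures.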