Skip to content

Commit 488c7e9

Browse files
authored
fix: prevent release gpu failure (#179)
1 parent 6d484d8 commit 488c7e9

File tree

1 file changed

+6
-11
lines changed

1 file changed

+6
-11
lines changed

internal/controller/tensorfusionworkload_controller.go

Lines changed: 6 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -37,6 +37,7 @@ import (
3737
scheduler "github.com/NexusGPU/tensor-fusion/internal/scheduler"
3838
"github.com/NexusGPU/tensor-fusion/internal/utils"
3939
"github.com/NexusGPU/tensor-fusion/internal/worker"
40+
"github.com/lithammer/shortuuid/v4"
4041
"github.com/prometheus/client_golang/prometheus"
4142
"github.com/samber/lo"
4243
)
@@ -116,7 +117,7 @@ func (r *TensorFusionWorkloadReconciler) Reconcile(ctx context.Context, req ctrl
116117
}
117118

118119
if hasDeletion {
119-
return ctrl.Result{Requeue: true, RequeueAfter: constants.PendingRequeueDuration}, nil
120+
return ctrl.Result{RequeueAfter: constants.PendingRequeueDuration}, nil
120121
}
121122

122123
// Fetch the GPUPool
@@ -285,15 +286,6 @@ func (r *TensorFusionWorkloadReconciler) handlePodGPUCleanup(ctx context.Context
285286
return true, nil
286287
}
287288

288-
if pod.Annotations == nil {
289-
pod.Annotations = make(map[string]string)
290-
}
291-
292-
if pod.Annotations[constants.GpuReleasedAnnotation] == constants.TrueStringValue {
293-
log.Info("GPU has been released for this pod", "pod", pod.Name)
294-
return true, nil
295-
}
296-
297289
// Get the GPU
298290
gpu := &tfv1.GPU{}
299291
if err := r.Get(ctx, client.ObjectKey{Name: gpuName}, gpu); err != nil {
@@ -307,7 +299,10 @@ func (r *TensorFusionWorkloadReconciler) handlePodGPUCleanup(ctx context.Context
307299
return false, err
308300
}
309301

310-
pod.Annotations[constants.GpuReleasedAnnotation] = constants.TrueStringValue
302+
if pod.Annotations == nil {
303+
pod.Annotations = make(map[string]string)
304+
}
305+
pod.Annotations[constants.GpuReleasedAnnotation] = shortuuid.New()
311306

312307
// Update the annotation of the Pod to mark that GPU cleanup has been successfully processed.
313308
// This is a key part of ensuring idempotency for the handlePodGPUCleanup function.

0 commit comments

Comments
 (0)