
Commit 6d484d8

fix: ensuring idempotency for the handlePodGPUCleanup function (#178)
* fix: ensuring idempotency for the handlePodGPUCleanup function
* fix lint
1 parent 75253bd


5 files changed: +29, -19 lines


internal/constants/constants.go

Lines changed: 2 additions & 1 deletion
@@ -17,7 +17,7 @@ const (
 	LabelKeyClusterOwner    = Domain + "/cluster"
 	LabelKeyNodeClass       = Domain + "/node-class"
 	LabelKeyPodTemplateHash = Domain + "/pod-template-hash"
-	LabelValueTrue          = "true"
+	TrueStringValue         = "true"
 
 	GPUNodePoolIdentifierLabelPrefix = Domain + "/pool-"
 	GPUNodePoolIdentifierLabelFormat = Domain + "/pool-%s"
@@ -41,6 +41,7 @@ const (
 	InjectContainerAnnotation = Domain + "/inject-container"
 	ReplicasAnnotation        = Domain + "/replicas"
 	GenWorkloadAnnotation     = Domain + "/generate-workload"
+	GpuReleasedAnnotation     = Domain + "/gpu-released"
 
 	TensorFusionPodCounterKeyAnnotation = Domain + "/pod-counter-key"
 	TensorFusionPodCountAnnotation      = Domain + "/tf-pod-count"
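
As an aside, a minimal sketch of how the renamed constant and the new annotation might be read together; the Domain value and the gpuAlreadyReleased helper are illustrative assumptions, not code from this commit:

package sketch

// Assumed domain value; the real one lives elsewhere in internal/constants
// and is not shown in this diff.
const (
	Domain                = "tensor-fusion.ai"
	TrueStringValue       = "true"
	GpuReleasedAnnotation = Domain + "/gpu-released"
)

// gpuAlreadyReleased is a hypothetical helper. Reading from a nil map is
// safe in Go (it yields ""), so only writers need the nil check that the
// workload controller adds below.
func gpuAlreadyReleased(annotations map[string]string) bool {
	return annotations[GpuReleasedAnnotation] == TrueStringValue
}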

internal/controller/gpupool_compaction_controller.go

Lines changed: 3 additions & 3 deletions
@@ -54,13 +54,13 @@ func (r *GPUPoolCompactionReconciler) checkNodeCompaction(ctx context.Context, p
 	// Strategy #1, terminate empty node
 	allNodes := &tfv1.GPUNodeList{}
 	if err := r.List(ctx, allNodes, client.MatchingLabels(map[string]string{
-		fmt.Sprintf(constants.GPUNodePoolIdentifierLabelFormat, pool.Name): constants.LabelValueTrue,
+		fmt.Sprintf(constants.GPUNodePoolIdentifierLabelFormat, pool.Name): constants.TrueStringValue,
 	})); err != nil {
 		return fmt.Errorf("failed to list nodes : %w", err)
 	}
 	for _, gpuNode := range allNodes.Items {
 		// Skip a node that is labeled as NoDisrupt
-		if gpuNode.Labels[constants.SchedulingDoNotDisruptLabel] == constants.LabelValueTrue {
+		if gpuNode.Labels[constants.SchedulingDoNotDisruptLabel] == constants.TrueStringValue {
 			continue
 		}
 
@@ -116,7 +116,7 @@ func (r *GPUPoolCompactionReconciler) checkNodeCompaction(ctx context.Context, p
 		ObjectMeta: metav1.ObjectMeta{
 			Name: gpuNode.Status.KubernetesNodeName,
 			Labels: map[string]string{
-				constants.NodeDeletionMark: constants.LabelValueTrue,
+				constants.NodeDeletionMark: constants.TrueStringValue,
 			},
 		},
 	}, client.Merge)
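
For readers unfamiliar with controller-runtime, a minimal sketch of the label-selector listing pattern used here, with a plain corev1.NodeList standing in for the project's tfv1.GPUNodeList and the tensor-fusion.ai domain assumed:

package sketch

import (
	"context"
	"fmt"

	corev1 "k8s.io/api/core/v1"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// listPoolNodes sketches the List call from checkNodeCompaction. The label
// key mirrors GPUNodePoolIdentifierLabelFormat; the domain prefix is an
// assumption.
func listPoolNodes(ctx context.Context, c client.Client, poolName string) (*corev1.NodeList, error) {
	nodes := &corev1.NodeList{}
	key := fmt.Sprintf("tensor-fusion.ai/pool-%s", poolName)
	// client.MatchingLabels turns the map into a label selector, so only
	// nodes labeled <key>=true are returned.
	if err := c.List(ctx, nodes, client.MatchingLabels(map[string]string{key: "true"})); err != nil {
		return nil, fmt.Errorf("failed to list nodes: %w", err)
	}
	return nodes, nil
}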

internal/controller/node_controller.go

Lines changed: 1 addition & 1 deletion
@@ -68,7 +68,7 @@ func (r *NodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.
 	}
 
 	// Remove deletion mark if updated
-	if node.GetLabels()[constants.NodeDeletionMark] == "true" {
+	if node.GetLabels()[constants.NodeDeletionMark] == constants.TrueStringValue {
 		log.Info("Node should be removed due to GPUNode compaction, but it's not managed by TensorFusion, skip.", "name", node.Name)
 	}

internal/controller/pod_controller.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ func (r *PodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.R
6262
// generate tensor fusion connections and apply to cluster
6363
tfConnection := generateTensorFusionConnection(pod)
6464
if tfConnection == nil {
65-
// not a tf pod skipped
65+
// not a tf client pod skipped
6666
return ctrl.Result{}, nil
6767
}
6868

internal/controller/tensorfusionworkload_controller.go

Lines changed: 22 additions & 13 deletions
@@ -30,8 +30,6 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/client"
 	"sigs.k8s.io/controller-runtime/pkg/log"
 
-	"slices"
-
 	tfv1 "github.com/NexusGPU/tensor-fusion/api/v1"
 	"github.com/NexusGPU/tensor-fusion/internal/config"
 	"github.com/NexusGPU/tensor-fusion/internal/constants"
@@ -100,7 +98,6 @@ func (r *TensorFusionWorkloadReconciler) Reconcile(ctx context.Context, req ctrl
 	if shouldReturn {
 		return ctrl.Result{}, nil
 	}
-
 	// Handle pods with finalizers that need GPU resource cleanup
 	hasDeletion := false
 	// Process pods with our finalizer
@@ -279,19 +276,24 @@ func (r *TensorFusionWorkloadReconciler) scaleDownWorkers(ctx context.Context, w
 func (r *TensorFusionWorkloadReconciler) handlePodGPUCleanup(ctx context.Context, pod *corev1.Pod, workload *tfv1.TensorFusionWorkload) (bool, error) {
 	log := log.FromContext(ctx)
 
-	// Check if this is our finalizer
-	if !containsFinalizer(pod, constants.Finalizer) {
-		// Not our finalizer, skip processing
-		return true, nil
-	}
 	log.Info("Processing pod with GPU resource cleanup finalizer", "pod", pod.Name)
+
 	// Get GPU name from pod label
 	gpuName, ok := pod.Labels[constants.GpuKey]
 	if !ok {
 		log.Info("Pod has finalizer but no GPU label", "pod", pod.Name)
 		return true, nil
 	}
 
+	if pod.Annotations == nil {
+		pod.Annotations = make(map[string]string)
+	}
+
+	if pod.Annotations[constants.GpuReleasedAnnotation] == constants.TrueStringValue {
+		log.Info("GPU has been released for this pod", "pod", pod.Name)
+		return true, nil
+	}
+
 	// Get the GPU
 	gpu := &tfv1.GPU{}
 	if err := r.Get(ctx, client.ObjectKey{Name: gpuName}, gpu); err != nil {
@@ -305,6 +307,18 @@ func (r *TensorFusionWorkloadReconciler) handlePodGPUCleanup(ctx context.Context
 		return false, err
 	}
 
+	pod.Annotations[constants.GpuReleasedAnnotation] = constants.TrueStringValue
+
+	// Update the pod's annotations to mark that GPU cleanup has been processed.
+	// This is the key to making handlePodGPUCleanup idempotent: if the function
+	// is called again for the same pod instance (e.g., because the client cache
+	// does not yet reflect the finalizer's removal), this r.Update fails on the
+	// stale resourceVersion, so the GPU is never released twice.
+	if err := r.Update(ctx, pod); err != nil {
+		log.Error(err, "Failed to mark GPU cleanup of pod", "gpu", gpuName, "pod", pod.Name)
+		return false, err
+	}
+
 	// Release GPU resources
 	if err := r.Scheduler.Release(ctx, workload.Spec.Resources.Requests, gpu); err != nil {
 		log.Error(err, "Failed to release GPU resources, will retry", "gpu", gpuName, "pod", pod.Name)
@@ -315,11 +329,6 @@ func (r *TensorFusionWorkloadReconciler) handlePodGPUCleanup(ctx context.Context
 	return true, nil
 }
 
-// Helper function to check if a pod has a specific finalizer
-func containsFinalizer(pod *corev1.Pod, finalizer string) bool {
-	return slices.Contains(pod.Finalizers, finalizer)
-}
-
 // deletePod deletes a pod
 func (r *TensorFusionWorkloadReconciler) deletePod(ctx context.Context, pod *corev1.Pod) error {
 	log := log.FromContext(ctx)
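
The ordering above is the heart of the fix: the pod is updated before the GPU is released. Kubernetes writes are guarded by the object's resourceVersion, so when two reconcile passes race on the same stale pod object, only one Update succeeds; the loser retries and then hits the gpu-released annotation guard. A minimal sketch of that pattern, assuming a controller-runtime client, with releaseGPU as a hypothetical stand-in for r.Scheduler.Release:

package sketch

import (
	"context"

	corev1 "k8s.io/api/core/v1"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

const (
	// Assumed values mirroring internal/constants.
	trueStringValue       = "true"
	gpuReleasedAnnotation = "tensor-fusion.ai/gpu-released"
)

// releaseGPU is a hypothetical stand-in for r.Scheduler.Release.
func releaseGPU(ctx context.Context, pod *corev1.Pod) error { return nil }

// markThenRelease sketches the mark-before-release ordering from the diff.
func markThenRelease(ctx context.Context, c client.Client, pod *corev1.Pod) error {
	if pod.Annotations == nil {
		pod.Annotations = make(map[string]string)
	}
	// Fast path: an earlier pass already completed the cleanup.
	if pod.Annotations[gpuReleasedAnnotation] == trueStringValue {
		return nil
	}
	pod.Annotations[gpuReleasedAnnotation] = trueStringValue
	// Update fails with a conflict if pod.ResourceVersion is stale, so at
	// most one caller per pod revision gets past this line.
	if err := c.Update(ctx, pod); err != nil {
		return err // the reconcile loop requeues and retries
	}
	// Only the Update winner reaches the actual release.
	return releaseGPU(ctx, pod)
}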
