
Commit c97c914

fix: scheduler missing reserve plugin bug, add logs and fix workload vGPU worker scaling down/up issue (#259)
* fix: add logs and fix workload vGPU worker scaling down/up issue
* fix: scheduler missing reserve plugin bug, add event to pod
* fix: remove not working event record
1 parent 68ae8ba commit c97c914

File tree: 13 files changed, +97 −43 lines

api/v1/tensorfusionworkload_types.go

Lines changed: 4 additions & 4 deletions
@@ -57,11 +57,11 @@ type TensorFusionWorkloadStatus struct {
 	// +optional
 	Conditions []metav1.Condition `json:"conditions,omitempty"`
 
-	// replicas is the number of Pods created by the Workload controller.
-	Replicas int32 `json:"replicas"`
+	// workerCount is the number of vGPU workers
+	WorkerCount int32 `json:"workerCount"`
 
-	// readyReplicas is the number of pods created for this Workload with a Ready Condition.
-	ReadyReplicas int32 `json:"readyReplicas,omitempty"`
+	// readyWorkers is the number of vGPU workers ready
+	ReadyWorkers int32 `json:"readyWorkers,omitempty"`
 
 	// Hash of the pod template used to create worker pods
 	PodTemplateHash string `json:"podTemplateHash,omitempty"`
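Downstream readers of the status switch one-for-one from Replicas/ReadyReplicas to the new names (the test change further down does exactly this). A minimal sketch of a readiness check against the renamed fields, assuming the repo's usual tfv1 alias for api/v1; the helper name is illustrative, not part of this commit:

package example

import tfv1 "github.com/NexusGPU/tensor-fusion/api/v1"

// workersReady reports whether every desired vGPU worker is ready,
// using the renamed status fields (WorkerCount / ReadyWorkers).
func workersReady(w *tfv1.TensorFusionWorkload) bool {
	return w.Status.WorkerCount > 0 && w.Status.ReadyWorkers == w.Status.WorkerCount
}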

charts/tensor-fusion/Chart.yaml

Lines changed: 2 additions & 2 deletions
@@ -15,10 +15,10 @@ type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
 # Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 1.4.3
+version: 1.4.4
 
 # This is the version number of the application being deployed. This version number should be
 # incremented each time you make changes to the application. Versions are not expected to
 # follow Semantic Versioning. They should reflect the version the application is using.
 # It is recommended to use it with quotes.
-appVersion: "1.36.1"
+appVersion: "1.36.3"

charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionworkloads.yaml

Lines changed: 5 additions & 7 deletions
@@ -480,18 +480,16 @@ spec:
             podTemplateHash:
               description: Hash of the pod template used to create worker pods
               type: string
-            readyReplicas:
-              description: readyReplicas is the number of pods created for this
-                Workload with a Ready Condition.
+            readyWorkers:
+              description: readyWorkers is the number of vGPU workers ready
               format: int32
               type: integer
-            replicas:
-              description: replicas is the number of Pods created by the Workload
-                controller.
+            workerCount:
+              description: workerCount is the number of vGPU workers
               format: int32
               type: integer
             required:
-            - replicas
+            - workerCount
             type: object
           type: object
     served: true

charts/tensor-fusion/values.yaml

Lines changed: 3 additions & 0 deletions
@@ -180,6 +180,9 @@ schedulerConfig:
         enabled:
           - name: GPUResourcesFit
             weight: 5
+      reserve:
+        enabled:
+          - name: GPUResourcesFit
       preBind:
         enabled:
           - name: GPUResourcesFit
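Enabling GPUResourcesFit at the reserve extension point is the "missing reserve plugin" half of this fix: without this entry the scheduler framework never calls the plugin between node selection and binding, so GPU allocations cannot be held (or rolled back) there. For reference, a plugin listed under reserve must satisfy the upstream framework's ReservePlugin contract; the sketch below shows only that shape, with placeholder method bodies rather than the real GPUResourcesFit implementation:

package example

import (
	"context"

	v1 "k8s.io/api/core/v1"
	"k8s.io/kubernetes/pkg/scheduler/framework"
)

// gpuResourcesFitSketch mimics the shape a reserve-capable plugin must have.
type gpuResourcesFitSketch struct{}

var _ framework.ReservePlugin = (*gpuResourcesFitSketch)(nil)

func (p *gpuResourcesFitSketch) Name() string { return "GPUResourcesFit" }

// Reserve runs after a node is chosen but before binding; the real plugin
// would hold vGPU capacity for the pod here.
func (p *gpuResourcesFitSketch) Reserve(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) *framework.Status {
	return framework.NewStatus(framework.Success)
}

// Unreserve rolls back whatever Reserve held if a later scheduling phase fails.
func (p *gpuResourcesFitSketch) Unreserve(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) {}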

cmd/sched/setup.go

Lines changed: 4 additions & 1 deletion
@@ -214,7 +214,10 @@ func preHandleConfig(cfgPath string) (string, error) {
 		)
 	}
 
-	// TODO set other fields if needed
+	// Replace to KUBECONFIG path if env var exists
+	if os.Getenv("KUBECONFIG") != "" {
+		cfgRaw[clientConnectionCfgKey].(map[string]interface{})[kubeConfigCfgKey] = os.Getenv("KUBECONFIG")
+	}
 
 	cfgBytes, err = yaml.Marshal(cfgRaw)
 	if err != nil {
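The new override relies on an unchecked type assertion, which panics if the scheduler config file omits the clientConnection block. A more tolerant variant, sketched under the assumption that cfgRaw is the map[string]interface{} parsed from the config file; the two key names are passed as parameters here purely for the sketch:

package example

import "os"

// overrideKubeconfig points clientConnection.kubeconfig at $KUBECONFIG when
// the variable is set, creating the clientConnection map if the config file
// omitted it instead of panicking on a failed type assertion.
func overrideKubeconfig(cfgRaw map[string]interface{}, clientConnectionKey, kubeConfigKey string) {
	kubeconfig := os.Getenv("KUBECONFIG")
	if kubeconfig == "" {
		return
	}
	cc, ok := cfgRaw[clientConnectionKey].(map[string]interface{})
	if !ok {
		cc = map[string]interface{}{}
		cfgRaw[clientConnectionKey] = cc
	}
	cc[kubeConfigKey] = kubeconfig
}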

config/crd/bases/tensor-fusion.ai_tensorfusionworkloads.yaml

Lines changed: 5 additions & 7 deletions
@@ -480,18 +480,16 @@ spec:
             podTemplateHash:
               description: Hash of the pod template used to create worker pods
               type: string
-            readyReplicas:
-              description: readyReplicas is the number of pods created for this
-                Workload with a Ready Condition.
+            readyWorkers:
+              description: readyWorkers is the number of vGPU workers ready
               format: int32
               type: integer
-            replicas:
-              description: replicas is the number of Pods created by the Workload
-                controller.
+            workerCount:
+              description: workerCount is the number of vGPU workers
               format: int32
               type: integer
             required:
-            - replicas
+            - workerCount
             type: object
           type: object
     served: true

config/samples/scheduler-config.yaml

Lines changed: 3 additions & 0 deletions
@@ -17,6 +17,9 @@ profiles:
         enabled:
           - name: GPUResourcesFit
             weight: 5
+      reserve:
+        enabled:
+          - name: GPUResourcesFit
       preBind:
         enabled:
           - name: GPUResourcesFit

internal/constants/constants.go

Lines changed: 1 addition & 1 deletion
@@ -179,7 +179,7 @@ const NvidiaVisibleAllDeviceEnv = "NVIDIA_VISIBLE_DEVICES"
 const NvidiaVisibleAllDeviceValue = "all"
 
 const (
-	LowFrequencyObjFailureInitialDelay = 100 * time.Millisecond
+	LowFrequencyObjFailureInitialDelay = 300 * time.Millisecond
 	LowFrequencyObjFailureMaxDelay     = 1000 * time.Second
 	LowFrequencyObjFailureMaxRPS       = 1
 	LowFrequencyObjFailureMaxBurst     = 1
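Raising the initial delay from 100ms to 300ms slows the first retry for low-frequency objects. These constants read like the knobs of a client-go workqueue rate limiter (per-item exponential backoff capped by a max delay, combined with an overall token bucket); a sketch of how such values are typically wired, assuming that usage (the helper name is illustrative, not from this repo):

package example

import (
	"time"

	"golang.org/x/time/rate"
	"k8s.io/client-go/util/workqueue"
)

// lowFrequencyRateLimiter combines per-item exponential backoff
// (300ms initial, capped at 1000s) with an overall 1 rps / burst-1 bucket,
// mirroring the constants above.
func lowFrequencyRateLimiter() workqueue.RateLimiter {
	return workqueue.NewMaxOfRateLimiter(
		workqueue.NewItemExponentialFailureRateLimiter(300*time.Millisecond, 1000*time.Second),
		&workqueue.BucketRateLimiter{Limiter: rate.NewLimiter(rate.Limit(1), 1)},
	)
}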

internal/controller/tensorfusionconnection_controller_test.go

Lines changed: 2 additions & 2 deletions
@@ -185,8 +185,8 @@ var _ = Describe("TensorFusionConnection Controller", func() {
 					},
 				},
 				Status: tfv1.TensorFusionWorkloadStatus{
-					Replicas:      0,
-					ReadyReplicas: 0,
+					WorkerCount:  0,
+					ReadyWorkers: 0,
 				},
 			}
 			Expect(k8sClient.Create(ctx, workload)).To(Succeed())

internal/controller/tensorfusionworkload_controller.go

Lines changed: 31 additions & 16 deletions
@@ -40,6 +40,7 @@ import (
 	"github.com/NexusGPU/tensor-fusion/internal/portallocator"
 	"github.com/NexusGPU/tensor-fusion/internal/utils"
 	"github.com/NexusGPU/tensor-fusion/internal/worker"
+	"github.com/samber/lo"
 )
 
 // TensorFusionWorkloadReconciler reconciles a TensorFusionWorkload object
@@ -78,6 +79,10 @@ func (r *TensorFusionWorkloadReconciler) Reconcile(ctx context.Context, req ctrl
 		client.MatchingLabels{constants.WorkloadKey: workload.Name}); err != nil {
 		return ctrl.Result{}, fmt.Errorf("list pods: %w", err)
 	}
+	// only calculate state based on not deleted pods, otherwise will cause wrong total replica count
+	podList.Items = lo.Filter(podList.Items, func(pod corev1.Pod, _ int) bool {
+		return pod.DeletionTimestamp.IsZero()
+	})
 
 	// handle finalizer
 	shouldReturn, err := utils.HandleFinalizer(ctx, workload, r.Client, func(ctx context.Context, workload *tfv1.TensorFusionWorkload) (bool, error) {
@@ -130,15 +135,16 @@ func (r *TensorFusionWorkloadReconciler) Reconcile(ctx context.Context, req ctrl
 		if err := r.Status().Update(ctx, workload); err != nil {
 			return ctrl.Result{}, fmt.Errorf("update status: %w", err)
 		}
+		return ctrl.Result{}, nil
 	}
 
 	// When it is not dynamic replica, workload maintains worker replicas by itself,
 	// In this mode, allow any Pod select connection to connect to any worker,
 	// to achieve a sub-pool for lower costs when CPU side scaling frequency is high
 	if !workload.Spec.IsDynamicReplica() {
-		result, err := r.reconcileScaling(ctx, workload, podList, workerGenerator, podTemplateHash)
-		if err != nil || !result.IsZero() {
-			return result, err
+		err := r.reconcileScaling(ctx, workload, podList, workerGenerator, podTemplateHash)
+		if err != nil {
+			return ctrl.Result{}, err
 		}
 	}
 
@@ -156,10 +162,15 @@ func (r *TensorFusionWorkloadReconciler) reconcileScaling(
 	podList *corev1.PodList,
 	workerGenerator *worker.WorkerGenerator,
 	podTemplateHash string,
-) (ctrl.Result, error) {
+) error {
 	log := log.FromContext(ctx)
 	// Check if there are any Pods using the old podTemplateHash and delete them if any
 	if len(podList.Items) > 0 {
+		// make oldest pod first, to delete from oldest to latest outdated pod
+		sort.Slice(podList.Items, func(i, j int) bool {
+			return podList.Items[i].CreationTimestamp.Before(&podList.Items[j].CreationTimestamp)
+		})
+
 		var outdatedPods []corev1.Pod
 		for i := range podList.Items {
 			pod := &podList.Items[i]
@@ -171,10 +182,10 @@ func (r *TensorFusionWorkloadReconciler) reconcileScaling(
 		if len(outdatedPods) > 0 {
 			log.Info("Found outdated pods with different template hash", "count", len(outdatedPods))
 			if err := r.scaleDownWorkers(ctx, workload, outdatedPods); err != nil {
-				return ctrl.Result{}, err
+				return err
 			}
-			// After deletion, requeue, and the next reconcile will create a new pod
-			return ctrl.Result{Requeue: true}, nil
+			// After deletion, requeue will be triggered by deleted Pod
+			return nil
 		}
 	}
 
@@ -189,10 +200,10 @@ func (r *TensorFusionWorkloadReconciler) reconcileScaling(
 	log.Info("Current replicas", "count", currentReplicas, "desired", desiredReplicas)
 
 	// Update workload status
-	if workload.Status.Replicas != currentReplicas {
-		workload.Status.Replicas = currentReplicas
+	if workload.Status.WorkerCount != currentReplicas {
+		workload.Status.WorkerCount = currentReplicas
 		if err := r.Status().Update(ctx, workload); err != nil {
-			return ctrl.Result{}, fmt.Errorf("update status: %w", err)
+			return fmt.Errorf("update status: %w", err)
 		}
 	}
 
@@ -203,7 +214,7 @@ func (r *TensorFusionWorkloadReconciler) reconcileScaling(
 		// Calculate how many pods need to be added
 		podsToAdd := int(desiredReplicas - currentReplicas)
 		if err := r.scaleUpWorkers(ctx, workerGenerator, workload, podsToAdd, podTemplateHash); err != nil {
-			return ctrl.Result{}, fmt.Errorf("scale up workers: %w", err)
+			return fmt.Errorf("scale up workers: %w", err)
 		}
 	} else if currentReplicas > desiredReplicas {
 		log.Info("Scaling down workers", "from", currentReplicas, "to", desiredReplicas)
@@ -216,11 +227,11 @@ func (r *TensorFusionWorkloadReconciler) reconcileScaling(
 		// Calculate how many pods need to be removed
 		podsToRemove := int(currentReplicas - desiredReplicas)
 		if err := r.scaleDownWorkers(ctx, workload, podList.Items[:podsToRemove]); err != nil {
-			return ctrl.Result{}, err
+			return err
 		}
 	}
 
-	return ctrl.Result{}, nil
+	return nil
 }
 
 func handleMetricsRecorder(podList *corev1.PodList, workload *tfv1.TensorFusionWorkload) {
@@ -260,7 +271,6 @@ func (r *TensorFusionWorkloadReconciler) tryStartWorker(
 // scaleDownWorkers handles the scaling down of worker pods
 func (r *TensorFusionWorkloadReconciler) scaleDownWorkers(ctx context.Context, workload *tfv1.TensorFusionWorkload, pods []corev1.Pod) error {
 	log := log.FromContext(ctx)
-
 	for i := range pods {
 		podToDelete := &pods[i]
 		log.Info("Scaling down worker pod", "name", podToDelete.Name, "workload", workload.Name)
@@ -375,15 +385,20 @@ func (r *TensorFusionWorkloadReconciler) updateStatus(
 	conditions = append(conditions, readyCondition)
 
 	// Check if we need to update status
-	statusChanged := workload.Status.ReadyReplicas != readyReplicas ||
+	totalReplicasChangedInDynamicReplicaMode :=
+		workload.Status.WorkerCount != int32(len(pods)) && workload.Spec.IsDynamicReplica()
+	if totalReplicasChangedInDynamicReplicaMode {
+		workload.Status.WorkerCount = int32(len(pods))
+	}
+	statusChanged := totalReplicasChangedInDynamicReplicaMode || workload.Status.ReadyWorkers != readyReplicas ||
 		workload.Status.Phase != phase ||
 		!utils.EqualConditionsDisregardTransitionTime(workload.Status.Conditions, conditions)
 
 	if statusChanged {
 		log.Info("Updating workload status", "phase", phase, "readyReplicas", readyReplicas)
 		workload.Status.Phase = phase
 		workload.Status.Conditions = conditions
-		workload.Status.ReadyReplicas = readyReplicas
+		workload.Status.ReadyWorkers = readyReplicas
 		if err := r.Status().Update(ctx, workload); err != nil {
 			return fmt.Errorf("update workload status: %w", err)
 		}
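Dropping the explicit Requeue after deleting outdated pods only converges if the controller watches the worker Pods it owns, so that each deletion event triggers the next reconcile (which then creates the replacement pods). The commit does not show the manager wiring; a typical controller-runtime setup for that pattern looks like the sketch below (assumed, not taken from this diff):

package example

import (
	corev1 "k8s.io/api/core/v1"
	ctrl "sigs.k8s.io/controller-runtime"

	tfv1 "github.com/NexusGPU/tensor-fusion/api/v1"
)

// setupWithManager: owning the worker Pods means their create/update/delete
// events re-trigger Reconcile, so reconcileScaling can return nil instead of
// asking for an explicit requeue after deleting outdated pods.
func setupWithManager(mgr ctrl.Manager, r ctrl.Reconciler) error {
	return ctrl.NewControllerManagedBy(mgr).
		For(&tfv1.TensorFusionWorkload{}).
		Owns(&corev1.Pod{}).
		Complete(r)
}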
