
Commit 82fc9c8

fix: optimize scheduler event (#299)
* fix: improve GPU resource cleanup and node compaction logic
* fix: add fallback loop to clean GPU resources
* fix: tmp test
* fix: add pod event for gpu scheduler
* fix: optimize scheduler event
* fix: ut typo
1 parent 978546f commit 82fc9c8

4 files changed (+35 -7 lines)


internal/controller/gpupool_compaction_controller.go

Lines changed: 15 additions & 3 deletions
@@ -29,6 +29,8 @@ type GPUPoolCompactionReconciler struct {
     Recorder record.EventRecorder

     Allocator *gpuallocator.GpuAllocator
+
+    markDeletionNodes map[string]struct{}
 }

 var defaultCompactionDuration = 1 * time.Minute
@@ -71,9 +73,16 @@ func (r *GPUPoolCompactionReconciler) checkNodeCompaction(ctx context.Context, p

     for _, gpu := range gpuStore {
         if !gpu.DeletionTimestamp.IsZero() || gpu.Labels[constants.GpuPoolKey] != pool.Name ||
-            gpu.Status.UsedBy != tfv1.UsedByTensorFusion {
+            gpu.Status.UsedBy != tfv1.UsedByTensorFusion || len(gpu.Status.NodeSelector) == 0 {
             continue
         }
+
+        k8sNodeName := gpu.Status.NodeSelector[constants.KubernetesHostNameLabel]
+        if _, ok := r.markDeletionNodes[k8sNodeName]; ok {
+            log.V(4).Info("skip node already marked for deletion when calculation capacity", "node", k8sNodeName)
+            continue
+        }
+
         availableTFlops, _ := gpu.Status.Available.Tflops.AsInt64()
         poolAvailableTFlops += availableTFlops
         availableVRAM, _ := gpu.Status.Available.Vram.AsInt64()
@@ -153,6 +162,7 @@ func (r *GPUPoolCompactionReconciler) checkNodeCompaction(ctx context.Context, p
             poolAvailableVRAM -= nodeCapVRAM
             poolTotalTFlops -= nodeCapTFlops
             poolTotalVRAM -= nodeCapVRAM
+            r.markDeletionNodes[k8sNodeName] = struct{}{}

             log.Info("Empty node can be compacted - provision mode", "node", gpuNode.Name,
                 "availableTFlopsAfterCompact", poolAvailableTFlops,
@@ -185,6 +195,7 @@ func (r *GPUPoolCompactionReconciler) checkNodeCompaction(ctx context.Context, p
             poolAvailableVRAM -= nodeCapVRAM
             poolTotalTFlops -= nodeCapTFlops
             poolTotalVRAM -= nodeCapVRAM
+            r.markDeletionNodes[k8sNodeName] = struct{}{}

             log.Info("Empty node can be compacted - auto-select mode", "node", gpuNode.Name,
                 "availableTFlopsAfterCompact", poolAvailableTFlops,
@@ -227,7 +238,7 @@ func (r *GPUPoolCompactionReconciler) checkNodeCompaction(ctx context.Context, p
 func (r *GPUPoolCompactionReconciler) getCompactionDuration(ctx context.Context, config *tfv1.NodeManagerConfig) time.Duration {
     log := log.FromContext(ctx)
     if config == nil || config.NodeCompaction == nil || config.NodeCompaction.Period == "" {
-        log.Info("empty node compaction config, use default value", "duration", defaultCompactionDuration)
+        log.V(4).Info("empty node compaction config, use default value", "duration", defaultCompactionDuration)
         return defaultCompactionDuration
     }
     duration, err := time.ParseDuration(config.NodeCompaction.Period)
@@ -307,6 +318,7 @@ func (r *GPUPoolCompactionReconciler) Reconcile(ctx context.Context, req ctrl.Re

 // SetupWithManager sets up the controller with the Manager.
 func (r *GPUPoolCompactionReconciler) SetupWithManager(mgr ctrl.Manager) error {
+    r.markDeletionNodes = make(map[string]struct{})
     return ctrl.NewControllerManagedBy(mgr).
         Named("gpupool-compaction").
         WatchesMetadata(&tfv1.GPUPool{}, &handler.EnqueueRequestForObject{}).
@@ -315,5 +327,5 @@ func (r *GPUPoolCompactionReconciler) SetupWithManager(mgr ctrl.Manager) error {

 func SetTestModeCompactionPeriod() {
     defaultCompactionDuration = 700 * time.Millisecond
-    newNodeProtectionDuration = 1200 * time.Millisecond
+    newNodeProtectionDuration = 1000 * time.Millisecond
 }
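
The controller change boils down to a small in-memory set: once a node has been picked for compaction, later reconciles skip its GPUs when summing the pool's spare capacity, so the same headroom is not counted twice while the node's deletion is still in flight. A minimal, self-contained Go sketch of that idea (the gpuInfo type and poolAvailableTflops helper are illustrative stand-ins, not the project's actual types):

package main

import "fmt"

// gpuInfo is an illustrative stand-in for the GPU status objects the
// compaction controller iterates over; only the fields needed here are modelled.
type gpuInfo struct {
    nodeName        string
    availableTflops int64
}

// poolAvailableTflops sums spare TFLOPS across GPUs, ignoring GPUs that sit on
// nodes already marked for deletion -- the same idea as markDeletionNodes above.
func poolAvailableTflops(gpus []gpuInfo, markedForDeletion map[string]struct{}) int64 {
    var total int64
    for _, g := range gpus {
        if _, ok := markedForDeletion[g.nodeName]; ok {
            // The node is being compacted; counting it again would overstate the
            // pool's headroom and could trigger a second, premature compaction.
            continue
        }
        total += g.availableTflops
    }
    return total
}

func main() {
    gpus := []gpuInfo{
        {nodeName: "node-a", availableTflops: 100},
        {nodeName: "node-b", availableTflops: 100},
    }
    marked := map[string]struct{}{}
    fmt.Println(poolAvailableTflops(gpus, marked)) // 200

    // After node-a is selected for compaction it is recorded in the set,
    // so the next reconcile no longer counts its capacity.
    marked["node-a"] = struct{}{}
    fmt.Println(poolAvailableTflops(gpus, marked)) // 100
}

In the diff itself the set is created in SetupWithManager, so the map is non-nil before the first reconcile runs.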

internal/controller/suite_test.go

Lines changed: 1 addition & 1 deletion
@@ -755,7 +755,7 @@ func (b *TensorFusionEnvBuilder) Build() *TensorFusionEnv {
             gpuNode.Status.TotalVRAM = resource.MustParse(fmt.Sprintf("%dGi", 2000*gpuCount))
             gpuNode.Status.AvailableTFlops = gpuNode.Status.TotalTFlops
             gpuNode.Status.AvailableVRAM = gpuNode.Status.TotalVRAM
-            Expect(k8sClient.Status().Update(ctx, gpuNode)).To(Succeed())
+            g.Expect(k8sClient.Status().Update(ctx, gpuNode)).To(Succeed())
         }).Should(Succeed())
     }
 }
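
The one-character fix above matters for Gomega's retry semantics: inside Eventually(func(g Gomega) { ... }), assertions must go through the passed-in g instance so that a transient failure makes Eventually poll again, rather than failing the spec outright via the package-level Expect. A rough sketch of the pattern outside the project's suite (the newFlaky helper is invented purely for illustration):

package example_test

import (
    "errors"
    "testing"

    . "github.com/onsi/gomega"
)

// newFlaky returns a function that fails the given number of times before
// succeeding, standing in for an API update that needs retries (illustrative).
func newFlaky(failures int) func() error {
    return func() error {
        if failures > 0 {
            failures--
            return errors.New("conflict, try again")
        }
        return nil
    }
}

func TestEventuallyWithGomegaInstance(t *testing.T) {
    g := NewWithT(t)
    update := newFlaky(2)

    // Assertions made through the Gomega instance passed into the polled
    // function are evaluated per attempt, so Eventually keeps retrying until
    // they pass; a bare package-level Expect inside the closure fails the
    // spec immediately instead of being retried.
    g.Eventually(func(g Gomega) {
        g.Expect(update()).To(Succeed())
    }).Should(Succeed())
}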

internal/scheduler/gpuresources/gpuresources.go

Lines changed: 15 additions & 1 deletion
@@ -4,6 +4,7 @@ import (
     "context"
     "encoding/json"
     "sort"
+    "strconv"
     "strings"

     tfv1 "github.com/NexusGPU/tensor-fusion/api/v1"
@@ -50,7 +51,7 @@ type GPUSchedulingStateData struct {
     ValidNodeGPUScore map[string]map[string]int

     // In Reserve stage, bind GPUs to pod, update allocator cache
-    // In PreBind stage, fetch final GPUs call Pod patch API to update annotation
+    // In PostBind stage, fetch final GPUs call Pod patch API to update annotation
     FinalGPUs []string
 }

@@ -93,6 +94,9 @@ func (s *GPUFit) PreFilter(ctx context.Context, state *framework.CycleState, pod
     // Handle progressive migration case
     if utils.IsProgressiveMigration() && utils.HasGPUResourceRequest(pod) {
         nodeNames := s.allocator.ListNonUsingNodes()
+        s.fh.EventRecorder().Eventf(pod, pod, v1.EventTypeNormal, "ScheduleWithNativeGPU",
+            "Scheduling non-TF workload for progressive migration",
+            "use native GPU resources, available native GPU nodes: "+strconv.Itoa(len(nodeNames)))
         return &framework.PreFilterResult{
             NodeNames: nodeNames,
         }, framework.NewStatus(framework.Success, "progressive migration for native resources claim")
@@ -123,6 +127,8 @@
     }

     if err != nil {
+        s.fh.EventRecorder().Eventf(pod, pod, v1.EventTypeWarning, "GPUQuotaOrCapacityNotEnough",
+            "check quota and filter", "TensorFusion schedule failed, no enough resource or quotas: "+err.Error())
         s.logger.Error(err, "failed to check quota and filter", "pod", pod.Name)
         return nil, framework.NewStatus(framework.Unschedulable, err.Error())
     }
@@ -139,6 +145,9 @@
     // assign score based on different strategies
     score := s.allocator.Score(ctx, s.cfg, allocRequest, validNodes)

+    s.fh.EventRecorder().Eventf(pod, pod, v1.EventTypeNormal, "PreScheduleDone", "pre filter for TensorFusion workload",
+        "TensorFusion pre schedule done, valid GPU node count: "+strconv.Itoa(nodeNames.Len()))
+
     if s.logger.V(6).Enabled() {
         jsonStr, _ := json.Marshal(validNodes)
         scoreJsonStr, _ := json.Marshal(score)
@@ -298,5 +307,10 @@ func (s *GPUFit) PostBind(ctx context.Context, state *framework.CycleState, pod
     err = s.client.Patch(s.ctx, pod, client.RawPatch(types.JSONPatchType, patch))
     if err != nil {
         s.logger.Error(err, "failed to patch gpu device ids", "pod", pod.Name)
+        s.fh.EventRecorder().Eventf(pod, pod, v1.EventTypeWarning, "GPUDeviceAllocatedFailed",
+            "Attach GPU device ID info failed", "Can not add GPU device IDs: "+gpuIDs)
+    } else {
+        s.fh.EventRecorder().Eventf(pod, pod, v1.EventTypeNormal, "GPUDeviceAllocated",
+            "Attach GPU device ID info", "Attach TensorFusion GPU device IDs to Pod: "+gpuIDs)
     }
 }
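
All of the scheduler-side additions share one call shape: the plugin reports scheduling milestones and failures as Kubernetes Events on the Pod through the recorder obtained from the framework handle (s.fh.EventRecorder()), passing the regarding/related objects, an event type, a reason, an action, and a note. A self-contained sketch of that shape, with a toy recorder standing in for the real one (podEventRecorder and printRecorder are illustrative and not part of client-go or this repository):

package main

import (
    "fmt"
    "strconv"

    v1 "k8s.io/api/core/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/apimachinery/pkg/runtime"
)

// podEventRecorder mirrors the six-argument Eventf shape used in the diff; the
// real recorder comes from the scheduler framework handle and writes Events to
// the API server rather than to stdout.
type podEventRecorder interface {
    Eventf(regarding runtime.Object, related runtime.Object, eventtype, reason, action, note string, args ...interface{})
}

// printRecorder is a toy implementation that just prints what it is given
// (real recorders also format the note with the trailing args).
type printRecorder struct{}

func (printRecorder) Eventf(regarding, related runtime.Object, eventtype, reason, action, note string, _ ...interface{}) {
    fmt.Printf("%s %s %s: %s\n", eventtype, reason, action, note)
}

// recordPreScheduleDone mimics the call made in PreFilter after scoring.
func recordPreScheduleDone(rec podEventRecorder, pod *v1.Pod, validNodeCount int) {
    rec.Eventf(pod, pod, v1.EventTypeNormal, "PreScheduleDone",
        "pre filter for TensorFusion workload",
        "TensorFusion pre schedule done, valid GPU node count: "+strconv.Itoa(validNodeCount))
}

func main() {
    pod := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "demo", Namespace: "default"}}
    recordPreScheduleDone(printRecorder{}, pod, 3)
}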

internal/scheduler/gpuresources/gpuresources_test.go

Lines changed: 4 additions & 2 deletions
@@ -15,6 +15,7 @@ import (
     "k8s.io/apimachinery/pkg/runtime"
     "k8s.io/apimachinery/pkg/types"
     "k8s.io/client-go/kubernetes/scheme"
+    "k8s.io/client-go/tools/events"
     "k8s.io/kubernetes/pkg/scheduler/framework"
     "k8s.io/kubernetes/pkg/scheduler/framework/plugins/defaultbinder"
     "k8s.io/kubernetes/pkg/scheduler/framework/plugins/queuesort"
@@ -236,6 +237,7 @@ func (s *GPUResourcesSuite) SetupTest() {
         s.ctx, registeredPlugins, "",
         frameworkruntime.WithPodNominator(testutil.NewPodNominator(nil)),
         frameworkruntime.WithSnapshotSharedLister(testutil.NewFakeSharedLister(pods, nodes)),
+        frameworkruntime.WithEventRecorder(&events.FakeRecorder{}),
     )
     s.NoError(err)
     s.fwk = fwk
@@ -494,8 +496,8 @@ func (s *GPUResourcesSuite) TestReserveAndUnreserve() {
     s.Len(gpu.Status.RunningApps, 1)
 }

-func (s *GPUResourcesSuite) TestPreBind() {
-    log.FromContext(s.ctx).Info("Running TestPreBind")
+func (s *GPUResourcesSuite) TestPostBind() {
+    log.FromContext(s.ctx).Info("Running TestPostBind")
     state := framework.NewCycleState()
     pod := s.makePod("p1",
         map[string]string{
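
The test setup gains an event recorder because the plugin now calls EventRecorder() during PreFilter and PostBind. Assuming client-go's events.FakeRecorder keeps its documented shape (a plain struct whose optional exported Events channel receives formatted strings, with events silently dropped when the channel is nil), a zero-value recorder is enough to satisfy those calls, and a buffered channel would additionally let a test assert on the emitted reasons. A small sketch under that assumption:

package main

import (
    "fmt"

    v1 "k8s.io/api/core/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/client-go/tools/events"
)

func main() {
    pod := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "demo", Namespace: "default"}}

    // A zero-value FakeRecorder, as wired into the test framework above,
    // quietly discards events; that is enough to keep the plugin's Eventf
    // calls from hitting a nil recorder during tests.
    silent := &events.FakeRecorder{}
    silent.Eventf(pod, pod, v1.EventTypeNormal, "PreScheduleDone",
        "pre filter for TensorFusion workload", "valid GPU node count: 3")

    // With a buffered channel the formatted events are captured, so a test
    // could also assert that a particular reason was recorded.
    capturing := &events.FakeRecorder{Events: make(chan string, 10)}
    capturing.Eventf(pod, pod, v1.EventTypeWarning, "GPUQuotaOrCapacityNotEnough",
        "check quota and filter", "no enough resource or quotas")
    fmt.Println(<-capturing.Events)
}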
