
Commit 0d77024

fix: extract GPU map update logic into separate method and fix webhook domain name, virtual cap calculation (#357)
* fix: virtual tflops/vram not calculated bug
* fix: extract GPU map update logic into separate method and fix webhook domain name
* fix: nvidia device plugin compatible mode state consistent issue
* fix: nvidia device plugin compatible mode issue
1 parent e55e53d commit 0d77024

10 files changed · +98 additions, -46 deletions

.vscode/launch.json

Lines changed: 2 additions & 1 deletion
@@ -61,7 +61,8 @@
         "KUBECONFIG": "~/.kube/config-local-studio",
         "ENABLE_WEBHOOKS": "false",
         "ENABLE_SCHEDULER": "true",
-        "ENABLE_CR_CONTROLLER": "true"
+        "ENABLE_CR_CONTROLLER": "true",
+        "NVIDIA_OPERATOR_PROGRESSIVE_MIGRATION": "true"
       },
       "args": [
         "--metrics-path", "${workspaceFolder}/logs/metrics.log",

charts/tensor-fusion/Chart.yaml

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@ type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
 # Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 1.5.7
+version: 1.5.8
 
 # This is the version number of the application being deployed. This version number should be
 # incremented each time you make changes to the application. Versions are not expected to

charts/tensor-fusion/templates/admission-webhooks/mutating-webhook.yaml

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@ webhooks:
         namespace: {{ include "tensor-fusion.namespace" . }}
         path: /mutate-v1-pod
     failurePolicy: {{ .Values.controller.admissionWebhooks.failurePolicy }}
-    name: mpod-v1.kb.io
+    name: mpod.tensor-fusion.ai
     rules:
       - apiGroups:
           - ""

cmd/main.go

Lines changed: 4 additions & 3 deletions
@@ -364,9 +364,10 @@ func startCustomResourceController(
 	}
 
 	if err = (&controller.GPUNodeReconciler{
-		Client:   mgr.GetClient(),
-		Scheme:   mgr.GetScheme(),
-		Recorder: mgr.GetEventRecorderFor("GPUNode"),
+		Client:    mgr.GetClient(),
+		Scheme:    mgr.GetScheme(),
+		Recorder:  mgr.GetEventRecorderFor("GPUNode"),
+		Allocator: allocator,
 	}).SetupWithManager(mgr); err != nil {
 		setupLog.Error(err, "unable to create controller", "controller", "GPUNode")
 		os.Exit(1)

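For context, the hunk above hands the allocator already in scope to the GPUNode reconciler, so the scheduler and the controller work from one shared in-memory allocation state. A minimal standalone sketch of that injection pattern, using simplified stand-in types rather than the real tensor-fusion or controller-runtime APIs:

package main

import "fmt"

// Simplified stand-ins for the real GpuAllocator and GPUNodeReconciler types;
// they only illustrate the wiring, not the actual project APIs.
type GpuAllocator struct {
	nodeWorkers map[string]int // node name -> number of tracked worker pods
}

type GPUNodeReconciler struct {
	Allocator *GpuAllocator
}

func (r *GPUNodeReconciler) Reconcile(nodeName string) {
	// The reconciler reads the same in-memory state the scheduler writes,
	// so node capacity can be refreshed without re-listing workloads from the API server.
	fmt.Printf("node %s has %d tracked workers\n", nodeName, r.Allocator.nodeWorkers[nodeName])
}

func main() {
	// One allocator instance is created at startup and handed to every component that needs it.
	allocator := &GpuAllocator{nodeWorkers: map[string]int{"node-a": 2}}
	reconciler := &GPUNodeReconciler{Allocator: allocator}
	reconciler.Reconcile("node-a")
}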
internal/controller/gpunode_controller.go

Lines changed: 7 additions & 4 deletions
@@ -47,8 +47,9 @@ import (
 // GPUNodeReconciler reconciles a GPUNode object
 type GPUNodeReconciler struct {
 	client.Client
-	Scheme   *runtime.Scheme
-	Recorder record.EventRecorder
+	Scheme    *runtime.Scheme
+	Recorder  record.EventRecorder
+	Allocator *gpuallocator.GpuAllocator
 }
 
 // +kubebuilder:rbac:groups=tensor-fusion.ai,resources=gpunodes,verbs=get;list;watch;create;update;patch;delete
@@ -158,7 +159,9 @@ func (r *GPUNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
 		return ctrl.Result{}, err
 	}
 
-func (r *GPUNodeReconciler) checkStatusAndUpdateVirtualCapacity(ctx context.Context, hypervisorName string, node *tfv1.GPUNode, poolObj *tfv1.GPUPool) error {
+func (r *GPUNodeReconciler) checkStatusAndUpdateVirtualCapacity(
+	ctx context.Context, hypervisorName string, node *tfv1.GPUNode, poolObj *tfv1.GPUPool,
+) error {
 	pod := &corev1.Pod{}
 	fetchErr := r.Get(ctx, client.ObjectKey{Name: hypervisorName, Namespace: utils.CurrentNamespace()}, pod)
 	if fetchErr != nil {
@@ -183,7 +186,7 @@ func (r *GPUNodeReconciler) checkStatusAndUpdateVirtualCapacity(ctx context.Cont
 
 		return nil
 	} else {
-		gpuModels, err := gpuallocator.RefreshGPUNodeCapacity(ctx, r.Client, node, poolObj)
+		gpuModels, err := gpuallocator.RefreshGPUNodeCapacity(ctx, r.Client, node, poolObj, r.Allocator)
 		if err != nil {
 			return err
 		}

internal/controller/suite_test.go

Lines changed: 4 additions & 3 deletions
@@ -180,9 +180,10 @@ var _ = BeforeSuite(func() {
 	Expect(err).ToNot(HaveOccurred())
 
 	err = (&GPUNodeReconciler{
-		Client:   mgr.GetClient(),
-		Scheme:   mgr.GetScheme(),
-		Recorder: mgr.GetEventRecorderFor("GPUNode"),
+		Client:    mgr.GetClient(),
+		Scheme:    mgr.GetScheme(),
+		Recorder:  mgr.GetEventRecorderFor("GPUNode"),
+		Allocator: allocator,
 	}).SetupWithManager(mgr)
 	Expect(err).ToNot(HaveOccurred())

internal/gpuallocator/gpuallocator.go

Lines changed: 48 additions & 29 deletions
@@ -545,12 +545,13 @@ func (s *GpuAllocator) AdjustAllocation(ctx context.Context, adjustRequest tfv1.
 }
 
 func (s *GpuAllocator) ListNonUsingNodes() sets.Set[string] {
+	<-s.initializedCh
 	set := sets.New[string]()
-	for nodeName, gpuNames := range s.nodeWorkerStore {
+	for nodeName, podNames := range s.nodeWorkerStore {
 		// If using by TF, the node can not be used by original scheduler
 		// If using by other scheduler, won't record as TF worker, thus the map is empty
 		// Return non using nodes can ensure original scheduler not conflict with TF
-		if len(gpuNames) == 0 {
+		if len(podNames) == 0 {
 			set.Insert(nodeName)
 		}
 	}
@@ -564,6 +565,20 @@ func (s *GpuAllocator) DeallocByPodIdentifier(ctx context.Context, podIdentifier
 	}
 }
 
+func (s *GpuAllocator) GetAllocationReqByNodeName(nodeName string) []*tfv1.AllocRequest {
+	allocRequests := make([]*tfv1.AllocRequest, 0, 8)
+	for workerName := range s.nodeWorkerStore[nodeName] {
+		podUID := s.podNamespaceNsToPodUID[workerName.String()]
+		if podUID == "" {
+			continue
+		}
+		if request, exists := s.uniqueAllocation[podUID]; exists {
+			allocRequests = append(allocRequests, request)
+		}
+	}
+	return allocRequests
+}
+
 func (s *GpuAllocator) checkGPUCapacityAndQuota(gpu *tfv1.GPU, oldRes, newRes tfv1.Resource) (tfv1.Resource, error) {
 	if gpu.Status.Available == nil {
 		return tfv1.Resource{}, fmt.Errorf("GPU available is nil, skip check")
@@ -870,29 +885,7 @@ func (s *GpuAllocator) handleGPUCreate(ctx context.Context, gpu *tfv1.GPU) {
 	}
 	s.gpuStore[key] = gpuInMem
 
-	if gpuInMem.Status.NodeSelector != nil {
-		gpuNodeName := gpuInMem.Status.NodeSelector[constants.KubernetesHostNameLabel]
-		if gpuNodeName != "" {
-			if _, exists := s.nodeGpuStore[gpuNodeName]; !exists {
-				s.nodeGpuStore[gpuNodeName] = make(map[string]*tfv1.GPU, 4)
-			}
-			s.nodeGpuStore[gpuNodeName][gpuInMem.Name] = gpuInMem
-		}
-	}
-
-	if gpuInMem.Labels != nil {
-		pool := gpuInMem.Labels[constants.GpuPoolKey]
-		if pool != "" {
-			if _, exists := s.poolGpuStore[pool]; !exists {
-				s.poolGpuStore[pool] = make(map[string]*tfv1.GPU, 128)
-			}
-			s.poolGpuStore[pool][gpuInMem.Name] = gpuInMem
-		}
-	}
-
-	if gpu.Status.GPUModel != "" {
-		GPUCapacityMap[gpu.Status.GPUModel] = *gpu.Status.Capacity
-	}
+	s.addOrUpdateGPUMaps(gpuInMem)
 	log.Info("Added GPU to store", "name", key.Name, "phase", gpu.Status.Phase)
 }
 
@@ -942,10 +935,36 @@ func (s *GpuAllocator) handleGPUUpdate(ctx context.Context, gpu *tfv1.GPU) {
 		log.V(6).Info("Updated GPU in store (new entry)", "name", key.Name, "phase", gpu.Status.Phase)
 	}
 
-	if gpu.Status.GPUModel != "" {
-		if _, exists := GPUCapacityMap[gpu.Status.GPUModel]; !exists {
-			GPUCapacityMap[gpu.Status.GPUModel] = *gpu.Status.Capacity
+	s.addOrUpdateGPUMaps(gpu)
+}
+
+func (s *GpuAllocator) addOrUpdateGPUMaps(gpuInMem *tfv1.GPU) {
+	if gpuInMem.Status.NodeSelector != nil {
+		gpuNodeName := gpuInMem.Status.NodeSelector[constants.KubernetesHostNameLabel]
+		if gpuNodeName != "" {
+			if _, exists := s.nodeGpuStore[gpuNodeName]; !exists {
+				s.nodeGpuStore[gpuNodeName] = make(map[string]*tfv1.GPU, 4)
+			}
+			s.nodeGpuStore[gpuNodeName][gpuInMem.Name] = gpuInMem
+			if _, exists := s.nodeWorkerStore[gpuNodeName]; !exists {
+				s.nodeWorkerStore[gpuNodeName] = make(map[types.NamespacedName]struct{}, 4)
+			}
 		}
+
+	}
+
+	if gpuInMem.Labels != nil {
+		pool := gpuInMem.Labels[constants.GpuPoolKey]
+		if pool != "" {
+			if _, exists := s.poolGpuStore[pool]; !exists {
+				s.poolGpuStore[pool] = make(map[string]*tfv1.GPU, 128)
+			}
+			s.poolGpuStore[pool][gpuInMem.Name] = gpuInMem
+		}
+	}
+
+	if gpuInMem.Status.GPUModel != "" {
+		GPUCapacityMap[gpuInMem.Status.GPUModel] = *gpuInMem.Status.Capacity
 	}
 }
 
@@ -1166,7 +1185,7 @@ func (s *GpuAllocator) reconcileAllocationState() {
 			// No workers, but node contains GPU, need include into nodeWorkerStore with empty map
 			gpuNodeName := gpu.Status.NodeSelector[constants.KubernetesHostNameLabel]
 			if _, exists := s.nodeWorkerStore[gpuNodeName]; !exists {
-				s.nodeWorkerStore[gpuNodeName] = map[types.NamespacedName]struct{}{}
+				s.nodeWorkerStore[gpuNodeName] = make(map[types.NamespacedName]struct{}, 4)
 			}
 		}

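The new GetAllocationReqByNodeName walks three in-memory indexes: node name → worker pod keys, pod key → pod UID, pod UID → allocation request. A self-contained sketch of that lookup chain with simplified stand-in types (field and type names here are illustrative, not the allocator's real ones):

package main

import "fmt"

// AllocRequest is a simplified stand-in for the real allocation request.
type AllocRequest struct {
	PodUID string
	Tflops int64
	VramGB int64
}

type allocator struct {
	nodeWorkerStore  map[string]map[string]struct{} // node name -> worker pod keys
	podKeyToPodUID   map[string]string              // worker pod key -> pod UID
	uniqueAllocation map[string]*AllocRequest       // pod UID -> allocation request
}

// getAllocationReqByNodeName mirrors the lookup added in this commit: resolve each
// worker tracked on the node to its pod UID, then to its allocation request.
func (a *allocator) getAllocationReqByNodeName(node string) []*AllocRequest {
	requests := make([]*AllocRequest, 0, 8)
	for podKey := range a.nodeWorkerStore[node] {
		uid := a.podKeyToPodUID[podKey]
		if uid == "" {
			continue // worker seen but its pod UID is not registered yet
		}
		if req, ok := a.uniqueAllocation[uid]; ok {
			requests = append(requests, req)
		}
	}
	return requests
}

func main() {
	a := &allocator{
		nodeWorkerStore:  map[string]map[string]struct{}{"node-a": {"ns/worker-1": {}}},
		podKeyToPodUID:   map[string]string{"ns/worker-1": "uid-1"},
		uniqueAllocation: map[string]*AllocRequest{"uid-1": {PodUID: "uid-1", Tflops: 50, VramGB: 8}},
	}
	for _, req := range a.getAllocationReqByNodeName("node-a") {
		fmt.Printf("pod %s: %d TFLOPS, %d GiB VRAM\n", req.PodUID, req.Tflops, req.VramGB)
	}
}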
internal/gpuallocator/gpuallocator_test.go

Lines changed: 7 additions & 1 deletion
@@ -97,7 +97,7 @@ var _ = Describe("GPU Allocator", func() {
 			if err := k8sClient.Get(ctx, types.NamespacedName{Name: "test-pool"}, pool); err != nil {
 				Expect(err).NotTo(HaveOccurred())
 			}
-			_, _ = RefreshGPUNodeCapacity(ctx, k8sClient, gpuNode, pool)
+			_, _ = RefreshGPUNodeCapacity(ctx, k8sClient, gpuNode, pool, allocator)
 
 			// Verify resources were reduced on the allocated GPU
 			gpu := getGPU(gpus[0].Name)
@@ -107,8 +107,14 @@
 			node := getGPUNode(gpu)
 			diffTflops := node.Status.TotalTFlops.Value() - node.Status.AvailableTFlops.Value()
 			diffVRAM := node.Status.TotalVRAM.Value() - node.Status.AvailableVRAM.Value()
+
+			diffVirtualTflops := node.Status.VirtualTFlops.Value() - node.Status.VirtualAvailableTFlops.Value()
+			diffVirtualVRAM := node.Status.VirtualVRAM.Value() - node.Status.VirtualAvailableVRAM.Value()
 			Expect(diffTflops).To(BeEquivalentTo(50))
 			Expect(diffVRAM).To(BeEquivalentTo(8 * 1024 * 1024 * 1024))
+
+			Expect(diffVirtualTflops).To(BeEquivalentTo(50))
+			Expect(diffVirtualVRAM).To(BeEquivalentTo(8 * 1024 * 1024 * 1024))
 		})
 
 		It("should allocate multiple GPUs from the same node", func() {

internal/gpuallocator/node_capacity.go

Lines changed: 16 additions & 1 deletion
@@ -11,7 +11,11 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/client"
 )
 
-func RefreshGPUNodeCapacity(ctx context.Context, k8sClient client.Client, node *tfv1.GPUNode, pool *tfv1.GPUPool) ([]string, error) {
+func RefreshGPUNodeCapacity(
+	ctx context.Context, k8sClient client.Client,
+	node *tfv1.GPUNode, pool *tfv1.GPUPool,
+	allocator *GpuAllocator,
+) ([]string, error) {
 	gpuList := &tfv1.GPUList{}
 	if err := k8sClient.List(ctx, gpuList, client.MatchingLabels{constants.LabelKeyOwner: node.Name}); err != nil {
 		return nil, fmt.Errorf("failed to list GPUs: %w", err)
@@ -54,6 +58,17 @@ func RefreshGPUNodeCapacity(ctx context.Context, k8sClient client.Client, node *
 	node.Status.VirtualTFlops = virtualTFlops
 	node.Status.VirtualVRAM = virtualVRAM
 
+	vramAvailable := virtualVRAM.DeepCopy()
+	tflopsAvailable := virtualTFlops.DeepCopy()
+
+	allocRequests := allocator.GetAllocationReqByNodeName(node.Name)
+	for _, allocRequest := range allocRequests {
+		vramAvailable.Sub(allocRequest.Limit.Vram)
+		tflopsAvailable.Sub(allocRequest.Limit.Tflops)
+	}
+	node.Status.VirtualAvailableVRAM = &vramAvailable
+	node.Status.VirtualAvailableTFlops = &tflopsAvailable
+
 	node.Status.Phase = tfv1.TensorFusionGPUNodePhaseRunning
 
 	if !equality.Semantic.DeepEqual(node.Status, statusCopy) {

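The added block is the virtual-capacity fix from the commit message: the node's VirtualAvailable* figures start from the virtual totals and subtract the limits of every allocation the allocator still tracks on that node. A runnable sketch of the same arithmetic using k8s.io/apimachinery resource.Quantity, with hypothetical numbers:

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
)

// limit is an illustrative allocation limit; the real code reads AllocRequest.Limit.
type limit struct {
	Tflops resource.Quantity
	Vram   resource.Quantity
}

func main() {
	// Virtual totals already include the pool's oversubscription factor.
	virtualTFlops := resource.MustParse("200")
	virtualVRAM := resource.MustParse("64Gi")

	// Active allocations on this node (hypothetical values).
	allocs := []limit{
		{Tflops: resource.MustParse("50"), Vram: resource.MustParse("8Gi")},
		{Tflops: resource.MustParse("30"), Vram: resource.MustParse("16Gi")},
	}

	// Available = virtual total minus the sum of every active allocation's limits,
	// which is what RefreshGPUNodeCapacity now stores in VirtualAvailableTFlops/VRAM.
	tflopsAvailable := virtualTFlops.DeepCopy()
	vramAvailable := virtualVRAM.DeepCopy()
	for _, a := range allocs {
		tflopsAvailable.Sub(a.Tflops)
		vramAvailable.Sub(a.Vram)
	}

	fmt.Println("virtual available TFLOPS:", tflopsAvailable.String()) // 120
	fmt.Println("virtual available VRAM:", vramAvailable.String())     // 40Gi
}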
internal/scheduler/gpuresources/gpuresources.go

Lines changed: 8 additions & 2 deletions
@@ -158,11 +158,17 @@ func (s *GPUFit) PreFilter(ctx context.Context, state fwk.CycleState, pod *v1.Po
 			continue
 		}
 
+		preAllocSize := total - matched
+		if preAllocSize <= 0 {
+			s.logger.Error(nil, "Filtering GPU error, unexpected less than 0", "pod",
+				pod.Name, "node", k, "totalGPU count", total, "matchedGPU count", matched)
+			preAllocSize = 2
+		}
 		// range if it's not in validNodesValidGPUs, add to validNodeNonMatchingGPUs
-		validNodeNonMatchingGPUs[k] = make([]*tfv1.GPU, 0, total-matched)
+		validNodeNonMatchingGPUs[k] = make([]*tfv1.GPU, 0, preAllocSize)
 		for gpuName, gpu := range allGPUs {
 			seen := false
-			// just loop because the number always <= 8
+			// just loop because the number always <= 8/16
 			for _, matchedGPU := range matchedGPUs {
 				if gpuName == matchedGPU.Name {
 					seen = true

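The PreFilter change guards the slice pre-allocation size: if the matched-GPU count ever exceeded the total, the old `make` call would receive a negative capacity and panic at runtime, so the new code logs the inconsistency and falls back to a small default. A standalone sketch of that guard; the fallback value of 2 follows the diff, everything else is illustrative:

package main

import "fmt"

// safeCapacity clamps a computed slice capacity the way the PreFilter change does:
// a non-positive value (which would make `make` panic) is reported and replaced
// with a small default.
func safeCapacity(total, matched int) int {
	capacity := total - matched
	if capacity <= 0 {
		fmt.Printf("unexpected capacity %d (total=%d, matched=%d), falling back to 2\n",
			capacity, total, matched)
		capacity = 2
	}
	return capacity
}

func main() {
	gpus := make([]string, 0, safeCapacity(8, 3)) // normal case: capacity 5
	fmt.Println(cap(gpus))

	gpus = make([]string, 0, safeCapacity(3, 5)) // inconsistent counts: falls back to 2
	fmt.Println(cap(gpus))
}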