Skip to content

Commit 688eb9a

Browse files
authored
fix: potential panic when add schema twice (#292)
* fix: potential panic when add schema twice * fix: gpu node claim and resource calc bugs * fix: lint * fix: add compact log * fix: improve GPU resource cleanup and node compaction logic * fix: use post bind to allocate GPU to avoid side effect of preBind * fix: add fallback loop to clean GPU resources * fix: compaction unit test issues * fix: ut and lint issues * fix: ut issue * fix: resolve potential conflict
1 parent d7480ab commit 688eb9a

31 files changed

+374
-205
lines changed

.vscode/launch.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@
6767
"--gpu-info-config", "${workspaceFolder}/config/samples/gpu-info-config.yaml",
6868
"--dynamic-config", "${workspaceFolder}/config/samples/dynamic-config.yaml",
6969
"--scheduler-config", "${workspaceFolder}/config/samples/scheduler-config.yaml",
70+
"-v", "4"
7071
],
7172
"program": "${workspaceFolder}/cmd/main.go",
7273
},

api/v1/gpu_types.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ const (
8282
// +kubebuilder:object:root=true
8383
// +kubebuilder:subresource:status
8484
// +kubebuilder:resource:scope=Cluster
85-
// +kubebuilder:printcolumn:name="GPU Model",type="string",JSONPath=".spec.gpuModel"
85+
// +kubebuilder:printcolumn:name="GPU Model",type="string",JSONPath=".status.gpuModel"
8686
// +kubebuilder:printcolumn:name="Phase",type="string",JSONPath=".status.phase"
8787
// +kubebuilder:printcolumn:name="Total TFlops",type="string",JSONPath=".status.capacity.tflops"
8888
// +kubebuilder:printcolumn:name="Total VRAM",type="string",JSONPath=".status.capacity.vram"

charts/tensor-fusion/Chart.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,10 @@ type: application
1515
# This is the chart version. This version number should be incremented each time you make changes
1616
# to the chart and its templates, including the app version.
1717
# Versions are expected to follow Semantic Versioning (https://semver.org/)
18-
version: 1.4.8
18+
version: 1.5.1
1919

2020
# This is the version number of the application being deployed. This version number should be
2121
# incremented each time you make changes to the application. Versions are not expected to
2222
# follow Semantic Versioning. They should reflect the version the application is using.
2323
# It is recommended to use it with quotes.
24-
appVersion: "1.42.1"
24+
appVersion: "1.43.5"

charts/tensor-fusion/crds/tensor-fusion.ai_gpus.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ spec:
1515
scope: Cluster
1616
versions:
1717
- additionalPrinterColumns:
18-
- jsonPath: .spec.gpuModel
18+
- jsonPath: .status.gpuModel
1919
name: GPU Model
2020
type: string
2121
- jsonPath: .status.phase

charts/tensor-fusion/templates/controller-deployment.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,11 @@ spec:
4343
{{- toYaml .Values.controller.readinessProbe | nindent 12 }}
4444
resources:
4545
{{- toYaml .Values.controller.resources | nindent 12 }}
46+
ports:
47+
- name: http
48+
containerPort: 8080
49+
- name: metrics
50+
containerPort: 8081
4651
env:
4752
- name: OPERATOR_NAMESPACE
4853
valueFrom:

charts/tensor-fusion/templates/rbac.yaml

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,25 @@ rules:
163163
- get
164164
- patch
165165
- update
166+
- apiGroups:
167+
- karpenter.sh
168+
resources:
169+
- nodeclaims
170+
verbs:
171+
- delete
172+
- get
173+
- list
174+
- patch
175+
- update
176+
- watch
177+
- apiGroups:
178+
- karpenter.*
179+
resources:
180+
- "*"
181+
verbs:
182+
- get
183+
- list
184+
- watch
166185

167186
---
168187
apiVersion: rbac.authorization.k8s.io/v1

charts/tensor-fusion/values.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -185,7 +185,7 @@ schedulerConfig:
185185
reserve:
186186
enabled:
187187
- name: GPUResourcesFit
188-
preBind:
188+
postBind:
189189
enabled:
190190
- name: GPUResourcesFit
191191
pluginConfig:

cmd/main.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,6 @@ var schedulerConfigPath string
9797
func init() {
9898
utilruntime.Must(clientgoscheme.AddToScheme(scheme))
9999
utilruntime.Must(tfv1.AddToScheme(scheme))
100-
utilruntime.Must(tfv1.AddToScheme(scheme))
101100
// +kubebuilder:scaffold:scheme
102101
}
103102

cmd/nodediscovery/main.go

Lines changed: 41 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@ import (
3535
"sigs.k8s.io/controller-runtime/pkg/log/zap"
3636
)
3737

38+
const TMP_PATH = "/tmp"
39+
3840
var Scheme = runtime.NewScheme()
3941

4042
func init() {
@@ -167,27 +169,22 @@ func main() {
167169
availableVRAM.Add(gpu.Status.Available.Vram)
168170
}
169171

170-
ns := nodeStatus()
171-
ns.TotalTFlops = totalTFlops
172-
ns.TotalVRAM = totalVRAM
173-
ns.AvailableTFlops = availableTFlops
174-
ns.AvailableVRAM = availableVRAM
175-
ns.TotalGPUs = int32(count)
176-
ns.ManagedGPUs = int32(count)
177-
ns.ManagedGPUDeviceIDs = allDeviceIDs
178-
ns.NodeInfo.RAMSize = *resource.NewQuantity(getTotalHostRAM(), resource.DecimalSI)
179-
ns.NodeInfo.DataDiskSize = *resource.NewQuantity(getDiskInfo(constants.TFDataPath), resource.DecimalSI)
180-
gpunode.Status = *ns
181-
172+
// Use proper patch-based update with retry on conflict
182173
err = retry.RetryOnConflict(retry.DefaultBackoff, func() error {
174+
// Get the latest version of the resource
183175
currentGPUNode := &tfv1.GPUNode{}
184176
if err := k8sClient.Get(ctx, client.ObjectKeyFromObject(gpunode), currentGPUNode); err != nil {
185177
return err
186178
}
187179

188-
currentGPUNode.Status = *ns
180+
// Create a patch from the original to the desired state
181+
patch := client.MergeFrom(currentGPUNode.DeepCopy())
182+
183+
// Update status fields conditionally
184+
updateGPUNodeStatus(&currentGPUNode.Status, totalTFlops, totalVRAM, int32(count), allDeviceIDs)
189185

190-
return k8sClient.Status().Update(ctx, currentGPUNode)
186+
// Apply the patch using the status subresource
187+
return k8sClient.Status().Patch(ctx, currentGPUNode, patch)
191188
})
192189
if err != nil {
193190
ctrl.Log.Error(err, "failed to update status of GPUNode after retries")
@@ -268,6 +265,9 @@ func createOrUpdateTensorFusionGPU(
268265
if gpu.Status.UsedBy == "" {
269266
gpu.Status.UsedBy = tfv1.UsedByTensorFusion
270267
}
268+
if gpu.Status.Phase == "" {
269+
gpu.Status.Phase = tfv1.TensorFusionGPUPhasePending
270+
}
271271
return k8sClient.Status().Patch(ctx, gpu, client.Merge)
272272
})
273273
if err != nil {
@@ -278,12 +278,6 @@ func createOrUpdateTensorFusionGPU(
278278
return gpu
279279
}
280280

281-
func nodeStatus() *tfv1.GPUNodeStatus {
282-
return &tfv1.GPUNodeStatus{
283-
Phase: tfv1.TensorFusionGPUNodePhaseRunning,
284-
}
285-
}
286-
287281
func kubeClient() (client.Client, error) {
288282
kubeConfigEnvVar := os.Getenv("KUBECONFIG")
289283
var config *rest.Config
@@ -316,7 +310,7 @@ func kubeClient() (client.Client, error) {
316310
func getTotalHostRAM() int64 {
317311
v, err := mem.VirtualMemory()
318312
if err != nil {
319-
fmt.Printf("error getting memory info: %v\n", err)
313+
fmt.Printf("[warning] getting memory info failed: %v\n", err)
320314
return 0
321315
}
322316
return int64(v.Total)
@@ -325,7 +319,7 @@ func getTotalHostRAM() int64 {
325319
func getDiskInfo(path string) (total int64) {
326320
absPath, err := filepath.Abs(path)
327321
if err != nil {
328-
fmt.Printf("error getting disk path: %v\n", err)
322+
fmt.Printf("[warning] getting disk path failed: %v\n", err)
329323
return 0
330324
}
331325

@@ -335,20 +329,42 @@ func getDiskInfo(path string) (total int64) {
335329
if errors.Is(err, syscall.ENOENT) {
336330
err = os.MkdirAll(absPath, 0o755)
337331
if err != nil {
338-
fmt.Printf("error creating folder: %s, err: %v\n", absPath, err)
332+
fmt.Printf("[warning] creating folder to discover disk space failed: %s, err: %v\n", absPath, err)
339333
return 0
340334
}
341335
err = syscall.Statfs(absPath, &stat)
342336
if err != nil {
343-
fmt.Printf("error getting disk stats after creation: %v\n", err)
337+
fmt.Printf("[warning] getting disk stats after creation failed: %v\n", err)
344338
return 0
345339
}
346340
} else {
347-
fmt.Printf("error getting disk stats: %v\n", err)
341+
fmt.Printf("[warning] getting disk stats failed: %v\n", err)
348342
return 0
349343
}
350344
}
351345

352346
total = int64(stat.Blocks * uint64(stat.Bsize))
353347
return total
354348
}
349+
350+
// updateGPUNodeStatus conditionally updates GPUNode status fields
351+
// Only updates phase if it's empty, and available resources if they are empty
352+
func updateGPUNodeStatus(
353+
status *tfv1.GPUNodeStatus,
354+
totalTFlops, totalVRAM resource.Quantity,
355+
totalGPUs int32, deviceIDs []string) {
356+
// Always update these fields as they represent current state
357+
status.TotalTFlops = totalTFlops
358+
status.TotalVRAM = totalVRAM
359+
status.TotalGPUs = totalGPUs
360+
status.ManagedGPUs = totalGPUs
361+
status.ManagedGPUDeviceIDs = deviceIDs
362+
status.NodeInfo = tfv1.GPUNodeInfo{
363+
RAMSize: *resource.NewQuantity(getTotalHostRAM(), resource.DecimalSI),
364+
DataDiskSize: *resource.NewQuantity(getDiskInfo(TMP_PATH), resource.DecimalSI),
365+
}
366+
// Only update phase if it's empty (unset)
367+
if status.Phase == "" {
368+
status.Phase = tfv1.TensorFusionGPUNodePhasePending
369+
}
370+
}

cmd/sched/setup.go

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -95,8 +95,6 @@ func SetupScheduler(
9595
recorderFactory := getRecorderFactory(&cc)
9696
completedProfiles := make([]kubeschedulerconfig.KubeSchedulerProfile, 0)
9797

98-
// TODO share the same informer with controller, do not use 'cc' to avoid duplicated watch
99-
10098
sched, err := scheduler.New(ctx,
10199
cc.Client,
102100
cc.InformerFactory,
@@ -145,9 +143,7 @@ func RunScheduler(ctx context.Context,
145143
cz.Set(cc.ComponentConfig)
146144
}
147145

148-
// Start events processing pipeline.
149146
cc.EventBroadcaster.StartRecordingToSink(ctx.Done())
150-
defer cc.EventBroadcaster.Shutdown()
151147

152148
startInformersAndWaitForSync := func(ctx context.Context) {
153149
// Start all informers.
@@ -176,6 +172,7 @@ func RunScheduler(ctx context.Context,
176172
<-mgr.Elected()
177173
logger.Info("Starting scheduling cycle")
178174
sched.Run(ctx)
175+
cc.EventBroadcaster.Shutdown()
179176
}()
180177
return nil
181178
}
@@ -199,24 +196,24 @@ func preHandleConfig(cfgPath string) (string, error) {
199196
if err != nil {
200197
return "", err
201198
}
202-
var cfgRaw map[string]interface{}
199+
var cfgRaw map[string]any
203200
err = yaml.Unmarshal(cfgBytes, &cfgRaw)
204201
if err != nil {
205202
return "", err
206203
}
207204

208205
// Replace $HOME with actual home directory
209-
if cfgRaw[clientConnectionCfgKey].(map[string]interface{})[kubeConfigCfgKey] != "" {
210-
cfgRaw[clientConnectionCfgKey].(map[string]interface{})[kubeConfigCfgKey] = strings.ReplaceAll(
211-
cfgRaw[clientConnectionCfgKey].(map[string]interface{})[kubeConfigCfgKey].(string),
206+
if cfgRaw[clientConnectionCfgKey].(map[string]any)[kubeConfigCfgKey] != "" {
207+
cfgRaw[clientConnectionCfgKey].(map[string]any)[kubeConfigCfgKey] = strings.ReplaceAll(
208+
cfgRaw[clientConnectionCfgKey].(map[string]any)[kubeConfigCfgKey].(string),
212209
"$HOME",
213210
os.Getenv("HOME"),
214211
)
215212
}
216213

217214
// Replace to KUBECONFIG path if env var exists
218215
if os.Getenv("KUBECONFIG") != "" {
219-
cfgRaw[clientConnectionCfgKey].(map[string]interface{})[kubeConfigCfgKey] = os.Getenv("KUBECONFIG")
216+
cfgRaw[clientConnectionCfgKey].(map[string]any)[kubeConfigCfgKey] = os.Getenv("KUBECONFIG")
220217
}
221218

222219
cfgBytes, err = yaml.Marshal(cfgRaw)

0 commit comments

Comments
 (0)