Skip to content

Commit 22e7d24

Browse files
authored
fix: gpu info matching and node discovery issues (#169)
* fix: chart issue
* fix: gpu info auto reload
* fix: backoff retry in node discovery job, fail when no GPU info found
* fix: lint issues, upgrade linter version
1 parent faff97e commit 22e7d24

File tree

22 files changed

+209
-103
lines changed

22 files changed

+209
-103
lines changed

.github/workflows/lint.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,6 @@ jobs:
3434
go-version: '~1.24'
3535

3636
- name: Run linter
37-
uses: golangci/golangci-lint-action@v6
37+
uses: golangci/golangci-lint-action@v7
3838
with:
39-
version: v1.64.8
39+
version: v2.1.5

.golangci.yml

Lines changed: 30 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,15 @@
1+
version: "2"
12
run:
2-
timeout: 5m
33
allow-parallel-runners: true
4-
5-
issues:
6-
# don't skip warning about doc comments
7-
# don't exclude the default set of lint
8-
exclude-use-default: false
9-
# restore some of the defaults
10-
# (fill in the rest as needed)
11-
exclude-rules:
12-
- path: "api/*"
13-
linters:
14-
- lll
15-
- path: "internal/*"
16-
linters:
17-
- dupl
18-
- lll
194
linters:
20-
disable-all: true
5+
default: none
216
enable:
7+
- copyloopvar
228
- dupl
239
- errcheck
24-
- copyloopvar
2510
- ginkgolinter
2611
- goconst
2712
- gocyclo
28-
- gofmt
29-
- goimports
30-
- gosimple
3113
- govet
3214
- ineffassign
3315
- lll
@@ -36,12 +18,34 @@ linters:
3618
- prealloc
3719
- revive
3820
- staticcheck
39-
- typecheck
4021
- unconvert
4122
- unparam
4223
- unused
43-
44-
linters-settings:
45-
revive:
24+
settings:
25+
revive:
26+
rules:
27+
- name: comment-spacings
28+
exclusions:
29+
generated: lax
4630
rules:
47-
- name: comment-spacings
31+
- linters:
32+
- lll
33+
path: api/*
34+
- linters:
35+
- dupl
36+
- lll
37+
path: internal/*
38+
paths:
39+
- third_party$
40+
- builtin$
41+
- examples$
42+
formatters:
43+
enable:
44+
- gofmt
45+
- goimports
46+
exclusions:
47+
generated: lax
48+
paths:
49+
- third_party$
50+
- builtin$
51+
- examples$

.vscode/launch.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
"ENABLE_WEBHOOKS": "false"
1414
},
1515
"program": "${workspaceFolder}/cmd/main.go",
16+
"args": ["--gpu-info-config", "${workspaceFolder}/config/samples/gpu-info-config.yaml"]
1617
},
1718
{
1819
"name": "Debug Discovery",

charts/tensor-fusion/Chart.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ type: application
1515
# This is the chart version. This version number should be incremented each time you make changes
1616
# to the chart and its templates, including the app version.
1717
# Versions are expected to follow Semantic Versioning (https://semver.org/)
18-
version: 1.2.18
18+
version: 1.2.19
1919

2020
# This is the version number of the application being deployed. This version number should be
2121
# incremented each time you make changes to the application. Versions are not expected to

charts/tensor-fusion/templates/gpu-public-gpu-info.yaml

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -32,42 +32,54 @@ data:
3232
fp16TFlops: 65
3333
3434
# Ampere Architecture Series
35-
- model: A100_SXM
35+
- model: A100_SXM_80G
3636
fullModelName: "NVIDIA A100-SXM4-80GB"
3737
vendor: NVIDIA
3838
costPerHour: 1.89
3939
fp16TFlops: 312
4040
41-
- model: A100_PCIe
41+
- model: A100_PCIe_80G
4242
fullModelName: "NVIDIA A100 80GB PCIe"
4343
vendor: NVIDIA
4444
costPerHour: 1.64
4545
fp16TFlops: 312
4646
47-
- model: A100_40G_SXM
48-
fullModelName: "NVIDIA A100-SXM4-40G"
47+
- model: A100_SXM_40G
48+
fullModelName: "NVIDIA A100-SXM4-40GB"
4949
vendor: NVIDIA
5050
costPerHour: 1.4
5151
fp16TFlops: 312
5252
53-
- model: A100_40G_PCIe
53+
- model: A100_PCIe_40G
5454
fullModelName: "NVIDIA A100 40GB PCIe"
5555
vendor: NVIDIA
5656
costPerHour: 1.2
5757
fp16TFlops: 312
5858
59-
- model: A800_SXM
60-
fullModelName: "NVIDIA A800-SXM4-40G"
59+
- model: A800_SXM_40G
60+
fullModelName: "NVIDIA A800-SXM4-40GB"
61+
vendor: NVIDIA
62+
costPerHour: 1.89
63+
fp16TFlops: 312
64+
65+
- model: A800_SXM_80G
66+
fullModelName: "NVIDIA A800-SXM4-80GB"
6167
vendor: NVIDIA
6268
costPerHour: 1.89
6369
fp16TFlops: 312
6470
65-
- model: A800_PCIe
71+
- model: A800_PCIe_80G
6672
fullModelName: "NVIDIA A800 80GB PCIe"
6773
vendor: NVIDIA
6874
costPerHour: 1.64
6975
fp16TFlops: 312
7076
77+
- model: A800_PCIe_40G
78+
fullModelName: "NVIDIA A800 40GB PCIe"
79+
vendor: NVIDIA
80+
costPerHour: 1.64
81+
fp16TFlops: 312
82+
7183
- model: A10
7284
fullModelName: "NVIDIA A10"
7385
vendor: NVIDIA

cmd/main.go

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323
"fmt"
2424
"os"
2525
"strings"
26+
"time"
2627

2728
// Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.)
2829
// to ensure that exec-entrypoint and run can make use of them.
@@ -116,6 +117,9 @@ func main() {
116117
gpuInfos = make([]config.GpuInfo, 0)
117118
}
118119

120+
// Watch configMap change with interval, check lastModifiedTime to reload gpuInfoConfig
121+
watchGPUInfoChanges(gpuInfoConfig, &gpuInfos)
122+
119123
// Metrics endpoint is enabled in 'config/default/kustomization.yaml'. The Metrics options configure the server.
120124
// More info:
121125
// - https://pkg.go.dev/sigs.k8s.io/[email protected]/pkg/metrics/server
@@ -269,7 +273,7 @@ func main() {
269273
Scheme: mgr.GetScheme(),
270274
Scheduler: scheduler,
271275
Recorder: mgr.GetEventRecorderFor("tensorfusionworkload"),
272-
GpuInfos: gpuInfos,
276+
GpuInfos: &gpuInfos,
273277
}).SetupWithManager(mgr); err != nil {
274278
setupLog.Error(err, "unable to create controller", "controller", "TensorFusionWorkload")
275279
os.Exit(1)
@@ -311,6 +315,41 @@ func main() {
311315
}
312316
}
313317

318+
func watchGPUInfoChanges(gpuInfoConfig string, gpuInfos *[]config.GpuInfo) {
319+
var lastModTime time.Time
320+
if fileInfo, err := os.Stat(gpuInfoConfig); err == nil {
321+
lastModTime = fileInfo.ModTime()
322+
}
323+
324+
go func() {
325+
ticker := time.NewTicker(15 * time.Second)
326+
defer ticker.Stop()
327+
328+
for range ticker.C {
329+
// Check if file has been modified
330+
fileInfo, err := os.Stat(gpuInfoConfig)
331+
if err != nil {
332+
ctrl.Log.Error(err, "unable to stat gpuInfo file", "gpuInfoConfig", gpuInfoConfig)
333+
continue
334+
}
335+
336+
currentModTime := fileInfo.ModTime()
337+
if currentModTime.After(lastModTime) {
338+
ctrl.Log.Info("gpuInfo file modified, reloading.")
339+
updatedGpuInfos, err := config.LoadGpuInfoFromFile(gpuInfoConfig)
340+
if err != nil {
341+
ctrl.Log.Error(err, "unable to reload gpuInfo file", "gpuInfoConfig", gpuInfoConfig)
342+
continue
343+
}
344+
345+
*gpuInfos = updatedGpuInfos
346+
lastModTime = currentModTime
347+
ctrl.Log.Info("gpuInfo reloaded successfully.", "gpuInfoConfig", gpuInfoConfig)
348+
}
349+
}
350+
}()
351+
}
352+
314353
// only for local development, won't set KUBECONFIG env var in none local environments
315354
func normalizeKubeConfigEnv() {
316355
cfgPath := os.Getenv("KUBECONFIG")

cmd/nodediscovery/main.go

Lines changed: 58 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
2525
"k8s.io/client-go/rest"
2626
"k8s.io/client-go/tools/clientcmd"
27+
"k8s.io/client-go/util/retry"
2728
ctrl "sigs.k8s.io/controller-runtime"
2829
"sigs.k8s.io/controller-runtime/pkg/client"
2930
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
@@ -140,28 +141,23 @@ func main() {
140141
})
141142
tflops := info.Fp16TFlops
142143
if !ok {
143-
tflops = resource.Quantity{}
144-
ctrl.Log.Info("unable to find GPU info from config", "deviceName", deviceName, "uuid", uuid)
144+
ctrl.Log.Info(
145+
"[Error] Unknown GPU model, please update `gpu-public-gpu-info` configMap "+
146+
" to match your GPU model name in `nvidia-smi`, this may cause you workload stuck, "+
147+
"refer this doc to resolve it in detail: "+
148+
"https://tensor-fusion.ai/guide/troubleshooting/handbook"+
149+
"#pod-stuck-in-starting-status-after-enabling-tensorfusion",
150+
"deviceName", deviceName, "uuid", uuid)
151+
os.Exit(1)
145152
} else {
146-
ctrl.Log.Info("found GPU info from config", "deviceName", deviceName, "baseline FP16 TFlops", tflops, "uuid", uuid)
153+
ctrl.Log.Info("found GPU info from config", "deviceName", deviceName, "FP16 TFlops", tflops, "uuid", uuid)
147154
}
148155
gpu := &tfv1.GPU{
149156
ObjectMeta: metav1.ObjectMeta{
150157
Name: uuid,
151-
Labels: map[string]string{
152-
constants.LabelKeyOwner: gpunode.Name,
153-
},
154-
Annotations: map[string]string{
155-
constants.GPULastReportTimeAnnotationKey: time.Now().Format(time.RFC3339),
156-
},
157158
},
158159
}
159160

160-
if err := controllerutil.SetControllerReference(gpunode, gpu, Scheme); err != nil {
161-
ctrl.Log.Error(err, "failed to set controller reference")
162-
os.Exit(1)
163-
}
164-
165161
gpuStatus := tfv1.GPUStatus{
166162
Phase: tfv1.TensorFusionGPUPhaseRunning,
167163
Capacity: &tfv1.Resource{
@@ -174,11 +170,30 @@ func main() {
174170
"kubernetes.io/hostname": k8sNodeName,
175171
},
176172
}
177-
_, err = controllerutil.CreateOrUpdate(ctx, k8sClient, gpu, func() error { return nil })
173+
174+
err = retry.OnError(retry.DefaultBackoff, func(err error) bool {
175+
return true // Retry on all errors for now
176+
}, func() error {
177+
_, err := controllerutil.CreateOrUpdate(ctx, k8sClient, gpu, func() error {
178+
// Set metadata fields
179+
gpu.Labels = map[string]string{
180+
constants.LabelKeyOwner: gpunode.Name,
181+
}
182+
gpu.Annotations = map[string]string{
183+
constants.GPULastReportTimeAnnotationKey: time.Now().Format(time.RFC3339),
184+
}
185+
186+
// Set controller reference
187+
return controllerutil.SetControllerReference(gpunode, gpu, Scheme)
188+
})
189+
return err
190+
})
191+
178192
if err != nil {
179-
ctrl.Log.Error(err, "failed to create GPU", "gpu", gpu)
193+
ctrl.Log.Error(err, "failed to create or update GPU after retries", "gpu", gpu)
180194
os.Exit(1)
181195
}
196+
182197
available := gpuStatus.Available
183198
gpu.Status = gpuStatus
184199
if available == nil {
@@ -187,8 +202,19 @@ func main() {
187202
gpu.Status.Available = available
188203
}
189204

190-
if err := k8sClient.Status().Patch(ctx, gpu, client.Merge); err != nil {
191-
ctrl.Log.Error(err, "failed to update status of GPU", "gpu", gpu)
205+
err = retry.RetryOnConflict(retry.DefaultBackoff, func() error {
206+
currentGPU := &tfv1.GPU{}
207+
if err := k8sClient.Get(ctx, client.ObjectKeyFromObject(gpu), currentGPU); err != nil {
208+
return err
209+
}
210+
211+
currentGPU.Status = gpu.Status
212+
213+
return k8sClient.Status().Update(ctx, currentGPU)
214+
})
215+
216+
if err != nil {
217+
ctrl.Log.Error(err, "failed to update status of GPU after retries", "gpu", gpu)
192218
os.Exit(1)
193219
}
194220

@@ -209,8 +235,20 @@ func main() {
209235
ns.NodeInfo.RAMSize = *resource.NewQuantity(getTotalHostRAM(), resource.DecimalSI)
210236
ns.NodeInfo.DataDiskSize = *resource.NewQuantity(getDiskInfo(constants.TFDataPath), resource.DecimalSI)
211237
gpunode.Status = *ns
212-
if err := k8sClient.Status().Patch(ctx, gpunode, client.Merge); err != nil {
213-
ctrl.Log.Error(err, "failed to update status of GPUNode")
238+
239+
err = retry.RetryOnConflict(retry.DefaultBackoff, func() error {
240+
currentGPUNode := &tfv1.GPUNode{}
241+
if err := k8sClient.Get(ctx, client.ObjectKeyFromObject(gpunode), currentGPUNode); err != nil {
242+
return err
243+
}
244+
245+
currentGPUNode.Status = *ns
246+
247+
return k8sClient.Status().Update(ctx, currentGPUNode)
248+
})
249+
250+
if err != nil {
251+
ctrl.Log.Error(err, "failed to update status of GPUNode after retries")
214252
os.Exit(1)
215253
}
216254
}
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
- model: T4
2+
fullModelName: "Tesla T4"
3+
vendor: NVIDIA
4+
costPerHour: 0.53
5+
fp16TFlops: 65
6+
7+
- model: A100_SXM
8+
fullModelName: "NVIDIA A100-SXM4-80GB"
9+
vendor: NVIDIA
10+
costPerHour: 1.89
11+
fp16TFlops: 312

internal/cloudprovider/alibaba/ecs.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ func (p AlibabaGPUNodeProvider) TestConnection() error {
7272
request := ecs.CreateDescribeRegionsRequest()
7373
_, err := p.client.DescribeRegions(request)
7474
if err != nil {
75-
return fmt.Errorf("Can not connect to Aliyun ECS API: %v", err)
75+
return fmt.Errorf("can not connect to Aliyun ECS API: %v", err)
7676
}
7777
fmt.Printf("Successfully connected to Aliyun ECS. Available regions got")
7878
return nil

internal/config/gpu_info.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,8 @@ func LoadGpuInfoFromFile(filename string) ([]GpuInfo, error) {
2828
return infos, nil
2929
}
3030

31-
func MockGpuInfo() []GpuInfo {
32-
return []GpuInfo{
31+
func MockGpuInfo() *[]GpuInfo {
32+
return &[]GpuInfo{
3333
{
3434
Model: "mock",
3535
Vendor: "mock",

0 commit comments

Comments
 (0)