Fix cluster info

Dev Agent · QinYuuuu · commit 67475e89e4d5 · 2025-12-09T15:37:40.000+08:00
diff --git a/builder/deploy/cluster/cluster_manager.go b/builder/deploy/cluster/cluster_manager.go
@@ -12,6 +12,7 @@ import (
 	"time"
 
 	argo "github.com/argoproj/argo-workflows/v3/pkg/client/clientset/versioned"
+	units "github.com/dustin/go-humanize"
 	authorizationv1 "k8s.io/api/authorization/v1"
 	v1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/resource"
@@ -338,12 +339,20 @@ func (cluster *Cluster) GetResourcesInCluster(config *config.Config) (map[string
 		allocatableCPU := node.Status.Allocatable.Cpu().DeepCopy()
 		totalXPU := resource.Quantity{}
 		allocatableXPU := resource.Quantity{}
-		xpuCapacityLabel, xpuTypeLabel := getXPULabel(node.Labels, config)
+		xpuCapacityLabel, xpuTypeLabel, xpuMemLabel := getXPULabel(node.Labels, config)
 		if xpuCapacityLabel != "" {
 			totalXPU = node.Status.Capacity[v1.ResourceName(xpuCapacityLabel)]
 			allocatableXPU = node.Status.Allocatable[v1.ResourceName(xpuCapacityLabel)]
 		}
 
+		bigXPUMem := ""
+		if xpuMemLabel != "" {
+			ulimit, err := units.ParseBigBytes(node.Labels[xpuMemLabel])
+			if err == nil {
+				bigXPUMem = units.BigIBytes(ulimit)
+			}
+		}
+
 		gpuModelVendor, gpuModel := getGpuTypeAndVendor(node.Labels[xpuTypeLabel], xpuCapacityLabel)
 		nodeResourcesMap[node.Name] = types.NodeResourceInfo{
 			NodeName:     node.Name,
@@ -357,6 +366,7 @@ func (cluster *Cluster) GetResourcesInCluster(config *config.Config) (map[string
 			AvailableXPU: parseQuantityToInt64(allocatableXPU),
 
 			XPUCapacityLabel: xpuCapacityLabel,
+			XPUMem:           bigXPUMem,
 		}
 	}
 
@@ -417,66 +427,70 @@ func getGpuTypeAndVendor(vendorType string, label string) (string, string) {
 }
 
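-// the first label is the xpu capacity label, the second is the gpu model label
+// the first label is the xpu capacity label, the second is the gpu model label, the third is the xpu memory label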
-func getXPULabel(labels map[string]string, config *config.Config) (string, string) {
+func getXPULabel(labels map[string]string, config *config.Config) (string, string, string) {
 	if _, found := labels["aliyun.accelerator/nvidia_name"]; found {
 		//for default cluster
-		return "nvidia.com/gpu", "aliyun.accelerator/nvidia_name"
+		return "nvidia.com/gpu", "aliyun.accelerator/nvidia_name", "aliyun.accelerator/nvidia_mem"
 	}
 	if _, found := labels["machine.cluster.vke.volcengine.com/gpu-name"]; found {
 		//for volcano cluster
-		return "nvidia.com/gpu", "machine.cluster.vke.volcengine.com/gpu-name"
+		return "nvidia.com/gpu", "machine.cluster.vke.volcengine.com/gpu-name", "machine.cluster.vke.volcengine.com/gpu-mem"
 	}
 	if _, found := labels["eks.tke.cloud.tencent.com/gpu-type"]; found {
 		//for tencent cluster
-		return "nvidia.com/gpu", "eks.tke.cloud.tencent.com/gpu-type"
+		return "nvidia.com/gpu", "eks.tke.cloud.tencent.com/gpu-type", "eks.tke.cloud.tencent.com/gpu-mem"
 	}
 	if _, found := labels["nvidia.com/nvidia_name"]; found {
 		//for k3s cluster
-		return "nvidia.com/gpu", "nvidia.com/nvidia_name"
+		return "nvidia.com/gpu", "nvidia.com/nvidia_name", "nvidia.com/nvidia_mem"
+	}
+	if _, found := labels["nvidia.com/gpu.product"]; found {
+		//for nvidia gpu product label
+		return "nvidia.com/gpu", "nvidia.com/gpu.product", "nvidia.com/gpu.mem"
 	}
 	if _, found := labels["nvidia.com/gpu.product"]; found {
 		//for nvidia gpu product label
-		return "nvidia.com/gpu", "nvidia.com/gpu.product"
+		return "nvidia.com/gpu", "nvidia.com/gpu.product", "nvidia.com/gpu.mem"
 	}
 	if _, found := labels["kubemore_xpu_type"]; found {
 		//for huawei gpu
-		return "huawei.com/Ascend910", "kubemore_xpu_type"
+		return "huawei.com/Ascend910", "kubemore_xpu_type", "kubemore_xpu_mem"
 	}
 	if _, found := labels["huawei.accelerator"]; found {
 		//for huawei gpu
-		return "huawei.com/Ascend910", "huawei.accelerator"
+		return "huawei.com/Ascend910", "huawei.accelerator", "huawei.accelerator.mem"
 	}
 	if _, found := labels["accelerator/huawei-npu"]; found {
 		//for huawei gpu
-		return "huawei.com/Ascend910", "accelerator/huawei-npu"
+		return "huawei.com/Ascend910", "accelerator/huawei-npu", "accelerator/huawei-npu.mem"
 	}
 	if _, found := labels["hygon.com/dcu.name"]; found {
 		//for hy dcu
-		return "hygon.com/dcu", "hygon.com/dcu.name"
+		return "hygon.com/dcu", "hygon.com/dcu.name", "hygon.com/dcu.mem"
 	}
 	if _, found := labels["enflame.com/gcu"]; found {
 		//for enflame gcu
-		return "enflame.com/gcu", "enflame.com/gcu.model"
+		return "enflame.com/gcu", "enflame.com/gcu.model", "enflame.com/gcu.mem"
 	}
 	if _, found := labels["enflame.com/gcu.count"]; found {
 		//for enflame gcu
-		return "enflame.com/gcu.count", "enflame.com/gcu.model"
+		return "enflame.com/gcu.count", "enflame.com/gcu.model", "enflame.com/gcu.mem"
 	}
 	//check custom gpu model label
 	if config.Space.GPUModelLabel != "" {
 		var gpuLabels []types.GPUModel
 		err := json.Unmarshal([]byte(config.Space.GPUModelLabel), &gpuLabels)
 		if err != nil {
 			slog.Error("failed to parse GPUModelLabel", "error", err)
-			return "", ""
+			return "", "", ""
 		}
 		for _, gpuModel := range gpuLabels {
 			if _, found := labels[gpuModel.TypeLabel]; found {
-				return gpuModel.CapacityLabel, gpuModel.TypeLabel
+				return gpuModel.CapacityLabel, gpuModel.TypeLabel, gpuModel.MemLabel
 			}
 		}
 	}
-	return "", ""
+	return "", "", ""
 }
 
 // convert memory in bytes to GB
diff --git a/builder/deploy/cluster/cluster_manager_test.go b/builder/deploy/cluster/cluster_manager_test.go
@@ -231,7 +231,7 @@ func TestGetXPULabel(t *testing.T) {
 
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
-			gotCapacity, gotTypeLabel := getXPULabel(tt.labels, tt.config)
+			gotCapacity, gotTypeLabel, _ := getXPULabel(tt.labels, tt.config)
 			assert.Equal(t, tt.wantCapacity, gotCapacity)
 			assert.Equal(t, tt.wantTypeLabel, gotTypeLabel)
 		})
diff --git a/builder/deploy/cluster/resource_namespace.go b/builder/deploy/cluster/resource_namespace.go
@@ -39,7 +39,7 @@ func (cluster *Cluster) GetResourceInNamespace(namespace string, quotaName strin
 		available.Sub(usedAmount)
 		return available
 	}
-	xpuCapacityLabel, xpuTypeLabel := getXPULabel(quota.Labels, config)
+	xpuCapacityLabel, xpuTypeLabel, _ := getXPULabel(quota.Labels, config)
 	gpuModelVendor, gpuModel := getGpuTypeAndVendor(quota.Labels[xpuTypeLabel], xpuCapacityLabel)
 	var totalXPU int64 = 0
 	var availableXPU int64 = 0
diff --git a/builder/deploy/cluster/resource_namespace_test.go b/builder/deploy/cluster/resource_namespace_test.go
@@ -236,3 +236,61 @@ func TestGetNameSpaceResourcesQuota(t *testing.T) {
 		})
 	}
 }
+
+func TestGetXPUMem(t *testing.T) {
+	node1 := &v1.Node{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: "node1",
+			Labels: map[string]string{
+				"nvidia.com/gpu":                 "true",
+				"aliyun.accelerator/nvidia_name": "T4",
+				"aliyun.accelerator/nvidia_mem":  "16GiB",
+			},
+		},
+		Status: v1.NodeStatus{
+			Conditions: []v1.NodeCondition{
+				{Type: v1.NodeReady, Status: v1.ConditionTrue},
+			},
+			Capacity: v1.ResourceList{
+				v1.ResourceCPU:    resource.MustParse("4"),
+				v1.ResourceMemory: resource.MustParse("16Gi"),
+				"nvidia.com/gpu":  resource.MustParse("2"),
+			},
+			Allocatable: v1.ResourceList{
+				v1.ResourceCPU:    resource.MustParse("3"),
+				v1.ResourceMemory: resource.MustParse("14Gi"),
+				"nvidia.com/gpu":  resource.MustParse("2"),
+			},
+		},
+	}
+
+	node2 := &v1.Node{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: "node2",
+			Labels: map[string]string{
+				"nvidia.com/gpu":                 "true",
+				"aliyun.accelerator/nvidia_name": "NVIDIA-A10",
+				"aliyun.accelerator/nvidia_mem":  "23028MiB",
+			},
+		},
+		Status: v1.NodeStatus{
+			Conditions: []v1.NodeCondition{
+				{Type: v1.NodeReady, Status: v1.ConditionTrue},
+			},
+		},
+	}
+
+	clientset := fake.NewSimpleClientset(node1, node2)
+	cluster := &Cluster{
+		Client: clientset,
+	}
+
+	config := &config.Config{}
+
+	resources, err := cluster.GetResourcesInCluster(config)
+	assert.NoError(t, err)
+	assert.Len(t, resources, 2)
+
+	assert.Equal(t, resources["node1"].XPUMem, "16 GiB")
+	assert.Equal(t, resources["node2"].XPUMem, "22 GiB")
+}
diff --git a/builder/deploy/deployer.go b/builder/deploy/deployer.go
@@ -641,15 +641,39 @@ func (d *deployer) GetClusterUsageById(ctx context.Context, clusterId string) (*
 		Provider:  resp.Provider,
 		Status:    types.ClusterStatusRunning,
 	}
+	var vendorSet = make(map[string]struct{}, 0)
+	var modelsSet = make(map[string]struct{}, 0)
 	for _, node := range resp.Nodes {
 		res.TotalCPU += node.TotalCPU
 		res.AvailableCPU += node.AvailableCPU
 		res.TotalMem += float64(node.TotalMem)
 		res.AvailableMem += float64(node.AvailableMem)
 		res.TotalGPU += node.TotalXPU
 		res.AvailableGPU += node.AvailableXPU
+		if node.GPUVendor != "" {
+			vendorSet[node.GPUVendor] = struct{}{}
+			modelsSet[fmt.Sprintf("%s(%s)", node.XPUModel, node.XPUMem)] = struct{}{}
+		}
+	}
 
+	var vendor string
+	for k := range vendorSet {
+		vendor += k + ", "
 	}
+	if vendor != "" {
+		vendor = vendor[:len(vendor)-2]
+	}
+
+	var models string
+	for k := range modelsSet {
+		models += k + ", "
+	}
+	if models != "" {
+		models = models[:len(models)-2]
+	}
+
+	res.XPUVendors = vendor
+	res.XPUModels = models
 	res.AvailableCPU = math.Floor(res.AvailableCPU)
 	res.TotalMem = math.Floor(res.TotalMem)
 	res.AvailableMem = math.Floor(res.AvailableMem)
diff --git a/common/types/cluster.go b/common/types/cluster.go
@@ -64,9 +64,10 @@ type ClusterRes struct {
 
 	ResourceStatus ResourceStatus `json:"resource_status"`
 
-	LastUpdateTime int64 `json:"last_update_time"`
+	LastUpdateTime int64  `json:"last_update_time"`
+	XPUVendors     string `json:"xpu_vendors"` // NVIDIA, AMD
+	XPUModels      string `json:"xpu_models"`  // A10(22 GiB), H100(80 GiB)
 }
-
 type DeployRes struct {
 	ClusterID       string    `json:"cluster_id"`
 	ClusterRegion   string    `json:"cluster_region"`
@@ -91,6 +92,7 @@ type NodeResourceInfo struct {
 	AvailableMem     float32 `json:"available_mem"` //in GB
 	XPUCapacityLabel string  `json:"xpu_capacity_label"`
 	ReservedXPU      int64   `json:"reserved_xpu"`
+	XPUMem           string  `json:"xpu_mem"`
 }
 
 type UpdateClusterResponse struct {
@@ -100,6 +102,7 @@ type UpdateClusterResponse struct {
 type GPUModel struct {
 	TypeLabel     string `json:"type_label"`
 	CapacityLabel string `json:"capacity_label"`
+	MemLabel      string `json:"mem_label"`
 }
 
 type ClusterStatus string

Original file line number	Diff line number	Diff line change
`@@ -39,7 +39,7 @@ func (cluster *Cluster) GetResourceInNamespace(namespace string, quotaName strin`
`39`	`39`	`available.Sub(usedAmount)`
`40`	`40`	`return available`
`41`	`41`	`}`
`42`		`- xpuCapacityLabel, xpuTypeLabel := getXPULabel(quota.Labels, config)`
	`42`	`+ xpuCapacityLabel, xpuTypeLabel, _ := getXPULabel(quota.Labels, config)`
`43`	`43`	`gpuModelVendor, gpuModel := getGpuTypeAndVendor(quota.Labels[xpuTypeLabel], xpuCapacityLabel)`
`44`	`44`	`var totalXPU int64 = 0`
`45`	`45`	`var availableXPU int64 = 0`