diff --git a/builder/deploy/cluster/cluster_manager.go b/builder/deploy/cluster/cluster_manager.go index 755b2dc1c..37884d509 100644 --- a/builder/deploy/cluster/cluster_manager.go +++ b/builder/deploy/cluster/cluster_manager.go @@ -12,6 +12,7 @@ import ( "time" argo "github.com/argoproj/argo-workflows/v3/pkg/client/clientset/versioned" + units "github.com/dustin/go-humanize" authorizationv1 "k8s.io/api/authorization/v1" v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" @@ -338,12 +339,20 @@ func (cluster *Cluster) GetResourcesInCluster(config *config.Config) (map[string allocatableCPU := node.Status.Allocatable.Cpu().DeepCopy() totalXPU := resource.Quantity{} allocatableXPU := resource.Quantity{} - xpuCapacityLabel, xpuTypeLabel := getXPULabel(node.Labels, config) + xpuCapacityLabel, xpuTypeLabel, xpuMemLabel := getXPULabel(node.Labels, config) if xpuCapacityLabel != "" { totalXPU = node.Status.Capacity[v1.ResourceName(xpuCapacityLabel)] allocatableXPU = node.Status.Allocatable[v1.ResourceName(xpuCapacityLabel)] } + bigXPUMem := "" + if xpuMemLabel != "" { + ulimit, err := units.ParseBigBytes(node.Labels[xpuMemLabel]) + if err == nil { + bigXPUMem = units.BigIBytes(ulimit) + } + } + gpuModelVendor, gpuModel := getGpuTypeAndVendor(node.Labels[xpuTypeLabel], xpuCapacityLabel) nodeResourcesMap[node.Name] = types.NodeResourceInfo{ NodeName: node.Name, @@ -357,6 +366,7 @@ func (cluster *Cluster) GetResourcesInCluster(config *config.Config) (map[string AvailableXPU: parseQuantityToInt64(allocatableXPU), XPUCapacityLabel: xpuCapacityLabel, + XPUMem: bigXPUMem, } } @@ -417,50 +427,50 @@ func getGpuTypeAndVendor(vendorType string, label string) (string, string) { } // the first label is the xpu capacity label, the second is the gpu model label -func getXPULabel(labels map[string]string, config *config.Config) (string, string) { +func getXPULabel(labels map[string]string, config *config.Config) (string, string, string) { if _, found := labels["aliyun.accelerator/nvidia_name"]; found { //for default cluster - return "nvidia.com/gpu", "aliyun.accelerator/nvidia_name" + return "nvidia.com/gpu", "aliyun.accelerator/nvidia_name", "aliyun.accelerator/nvidia_mem" } if _, found := labels["machine.cluster.vke.volcengine.com/gpu-name"]; found { //for volcano cluster - return "nvidia.com/gpu", "machine.cluster.vke.volcengine.com/gpu-name" + return "nvidia.com/gpu", "machine.cluster.vke.volcengine.com/gpu-name", "machine.cluster.vke.volcengine.com/gpu-mem" } if _, found := labels["eks.tke.cloud.tencent.com/gpu-type"]; found { //for tencent cluster - return "nvidia.com/gpu", "eks.tke.cloud.tencent.com/gpu-type" + return "nvidia.com/gpu", "eks.tke.cloud.tencent.com/gpu-type", "eks.tke.cloud.tencent.com/gpu-mem" } if _, found := labels["nvidia.com/nvidia_name"]; found { //for k3s cluster - return "nvidia.com/gpu", "nvidia.com/nvidia_name" + return "nvidia.com/gpu", "nvidia.com/nvidia_name", "nvidia.com/nvidia_mem" } if _, found := labels["nvidia.com/gpu.product"]; found { //for nvidia gpu product label - return "nvidia.com/gpu", "nvidia.com/gpu.product" + return "nvidia.com/gpu", "nvidia.com/gpu.product", "nvidia.com/gpu.mem" } if _, found := labels["kubemore_xpu_type"]; found { //for huawei gpu - return "huawei.com/Ascend910", "kubemore_xpu_type" + return "huawei.com/Ascend910", "kubemore_xpu_type", "kubemore_xpu_mem" } if _, found := labels["huawei.accelerator"]; found { //for huawei gpu - return "huawei.com/Ascend910", "huawei.accelerator" + return "huawei.com/Ascend910", "huawei.accelerator", "huawei.accelerator.mem" } if _, found := labels["accelerator/huawei-npu"]; found { //for huawei gpu - return "huawei.com/Ascend910", "accelerator/huawei-npu" + return "huawei.com/Ascend910", "accelerator/huawei-npu", "accelerator/huawei-npu.mem" } if _, found := labels["hygon.com/dcu.name"]; found { //for hy dcu - return "hygon.com/dcu", "hygon.com/dcu.name" + return "hygon.com/dcu", "hygon.com/dcu.name", "hygon.com/dcu.mem" } if _, found := labels["enflame.com/gcu"]; found { //for enflame gcu - return "enflame.com/gcu", "enflame.com/gcu.model" + return "enflame.com/gcu", "enflame.com/gcu.model", "enflame.com/gcu.mem" } if _, found := labels["enflame.com/gcu.count"]; found { //for enflame gcu - return "enflame.com/gcu.count", "enflame.com/gcu.model" + return "enflame.com/gcu.count", "enflame.com/gcu.model", "enflame.com/gcu.mem" } //check custom gpu model label if config.Space.GPUModelLabel != "" { @@ -468,15 +478,15 @@ func getXPULabel(labels map[string]string, config *config.Config) (string, strin err := json.Unmarshal([]byte(config.Space.GPUModelLabel), &gpuLabels) if err != nil { slog.Error("failed to parse GPUModelLabel", "error", err) - return "", "" + return "", "", "" } for _, gpuModel := range gpuLabels { if _, found := labels[gpuModel.TypeLabel]; found { - return gpuModel.CapacityLabel, gpuModel.TypeLabel + return gpuModel.CapacityLabel, gpuModel.TypeLabel, gpuModel.MemLabel } } } - return "", "" + return "", "", "" } // convert memory in bytes to GB diff --git a/builder/deploy/cluster/cluster_manager_test.go b/builder/deploy/cluster/cluster_manager_test.go index 17cf772e8..3251a09ed 100644 --- a/builder/deploy/cluster/cluster_manager_test.go +++ b/builder/deploy/cluster/cluster_manager_test.go @@ -231,7 +231,7 @@ func TestGetXPULabel(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - gotCapacity, gotTypeLabel := getXPULabel(tt.labels, tt.config) + gotCapacity, gotTypeLabel, _ := getXPULabel(tt.labels, tt.config) assert.Equal(t, tt.wantCapacity, gotCapacity) assert.Equal(t, tt.wantTypeLabel, gotTypeLabel) }) diff --git a/builder/deploy/cluster/resource_namespace.go b/builder/deploy/cluster/resource_namespace.go index 7d26ec06a..6f6105d24 100644 --- a/builder/deploy/cluster/resource_namespace.go +++ b/builder/deploy/cluster/resource_namespace.go @@ -39,7 +39,7 @@ func (cluster *Cluster) GetResourceInNamespace(namespace string, quotaName strin available.Sub(usedAmount) return available } - xpuCapacityLabel, xpuTypeLabel := getXPULabel(quota.Labels, config) + xpuCapacityLabel, xpuTypeLabel, _ := getXPULabel(quota.Labels, config) gpuModelVendor, gpuModel := getGpuTypeAndVendor(quota.Labels[xpuTypeLabel], xpuCapacityLabel) var totalXPU int64 = 0 var availableXPU int64 = 0 diff --git a/builder/deploy/cluster/resource_namespace_test.go b/builder/deploy/cluster/resource_namespace_test.go index ff8ae2fad..018381d89 100644 --- a/builder/deploy/cluster/resource_namespace_test.go +++ b/builder/deploy/cluster/resource_namespace_test.go @@ -236,3 +236,61 @@ func TestGetNameSpaceResourcesQuota(t *testing.T) { }) } } + +func TestGetXPUMem(t *testing.T) { + node1 := &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node1", + Labels: map[string]string{ + "nvidia.com/gpu": "true", + "aliyun.accelerator/nvidia_name": "T4", + "aliyun.accelerator/nvidia_mem": "16GiB", + }, + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + {Type: v1.NodeReady, Status: v1.ConditionTrue}, + }, + Capacity: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("4"), + v1.ResourceMemory: resource.MustParse("16Gi"), + "nvidia.com/gpu": resource.MustParse("2"), + }, + Allocatable: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("3"), + v1.ResourceMemory: resource.MustParse("14Gi"), + "nvidia.com/gpu": resource.MustParse("2"), + }, + }, + } + + node2 := &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node2", + Labels: map[string]string{ + "nvidia.com/gpu": "true", + "aliyun.accelerator/nvidia_name": "NVIDIA-A10", + "aliyun.accelerator/nvidia_mem": "23028MiB", + }, + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + {Type: v1.NodeReady, Status: v1.ConditionTrue}, + }, + }, + } + + clientset := fake.NewSimpleClientset(node1, node2) + cluster := &Cluster{ + Client: clientset, + } + + config := &config.Config{} + + resources, err := cluster.GetResourcesInCluster(config) + assert.NoError(t, err) + assert.Len(t, resources, 2) + + assert.Equal(t, resources["node1"].XPUMem, "16 GiB") + assert.Equal(t, resources["node2"].XPUMem, "22 GiB") +} diff --git a/builder/deploy/deployer.go b/builder/deploy/deployer.go index afdd74984..96a0d034b 100644 --- a/builder/deploy/deployer.go +++ b/builder/deploy/deployer.go @@ -641,6 +641,8 @@ func (d *deployer) GetClusterUsageById(ctx context.Context, clusterId string) (* Provider: resp.Provider, Status: types.ClusterStatusRunning, } + var vendorSet = make(map[string]struct{}, 0) + var modelsSet = make(map[string]struct{}, 0) for _, node := range resp.Nodes { res.TotalCPU += node.TotalCPU res.AvailableCPU += node.AvailableCPU @@ -648,8 +650,30 @@ func (d *deployer) GetClusterUsageById(ctx context.Context, clusterId string) (* res.AvailableMem += float64(node.AvailableMem) res.TotalGPU += node.TotalXPU res.AvailableGPU += node.AvailableXPU + if node.GPUVendor != "" { + vendorSet[node.GPUVendor] = struct{}{} + modelsSet[fmt.Sprintf("%s(%s)", node.XPUModel, node.XPUMem)] = struct{}{} + } + } + var vendor string + for k := range vendorSet { + vendor += k + ", " } + if vendor != "" { + vendor = vendor[:len(vendor)-2] + } + + var models string + for k := range modelsSet { + models += k + ", " + } + if models != "" { + models = models[:len(models)-2] + } + + res.XPUVendors = vendor + res.XPUModels = models res.AvailableCPU = math.Floor(res.AvailableCPU) res.TotalMem = math.Floor(res.TotalMem) res.AvailableMem = math.Floor(res.AvailableMem) diff --git a/common/types/cluster.go b/common/types/cluster.go index 6de71dbdf..7b57a47cf 100644 --- a/common/types/cluster.go +++ b/common/types/cluster.go @@ -64,9 +64,10 @@ type ClusterRes struct { ResourceStatus ResourceStatus `json:"resource_status"` - LastUpdateTime int64 `json:"last_update_time"` + LastUpdateTime int64 `json:"last_update_time"` + XPUVendors string `json:"xpu_vendors"` // NVIDIA, AMD + XPUModels string `json:"xpu_models"` // A10(32 GB),H100(80 GB) } - type DeployRes struct { ClusterID string `json:"cluster_id"` ClusterRegion string `json:"cluster_region"` @@ -91,6 +92,7 @@ type NodeResourceInfo struct { AvailableMem float32 `json:"available_mem"` //in GB XPUCapacityLabel string `json:"xpu_capacity_label"` ReservedXPU int64 `json:"reserved_xpu"` + XPUMem string `json:"xpu_mem"` } type UpdateClusterResponse struct { @@ -100,6 +102,7 @@ type UpdateClusterResponse struct { type GPUModel struct { TypeLabel string `json:"type_label"` CapacityLabel string `json:"capacity_label"` + MemLabel string `json:"mem_label"` } type ClusterStatus string