Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 26 additions & 16 deletions builder/deploy/cluster/cluster_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"time"

argo "github.com/argoproj/argo-workflows/v3/pkg/client/clientset/versioned"
units "github.com/dustin/go-humanize"
authorizationv1 "k8s.io/api/authorization/v1"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
Expand Down Expand Up @@ -338,12 +339,20 @@ func (cluster *Cluster) GetResourcesInCluster(config *config.Config) (map[string
allocatableCPU := node.Status.Allocatable.Cpu().DeepCopy()
totalXPU := resource.Quantity{}
allocatableXPU := resource.Quantity{}
xpuCapacityLabel, xpuTypeLabel := getXPULabel(node.Labels, config)
xpuCapacityLabel, xpuTypeLabel, xpuMemLabel := getXPULabel(node.Labels, config)
if xpuCapacityLabel != "" {
totalXPU = node.Status.Capacity[v1.ResourceName(xpuCapacityLabel)]
allocatableXPU = node.Status.Allocatable[v1.ResourceName(xpuCapacityLabel)]
}

bigXPUMem := ""
if xpuMemLabel != "" {
ulimit, err := units.ParseBigBytes(node.Labels[xpuMemLabel])
if err == nil {
bigXPUMem = units.BigIBytes(ulimit)
}
}

gpuModelVendor, gpuModel := getGpuTypeAndVendor(node.Labels[xpuTypeLabel], xpuCapacityLabel)
nodeResourcesMap[node.Name] = types.NodeResourceInfo{
NodeName: node.Name,
Expand All @@ -357,6 +366,7 @@ func (cluster *Cluster) GetResourcesInCluster(config *config.Config) (map[string
AvailableXPU: parseQuantityToInt64(allocatableXPU),

XPUCapacityLabel: xpuCapacityLabel,
XPUMem: bigXPUMem,
}
}

Expand Down Expand Up @@ -417,66 +427,66 @@ func getGpuTypeAndVendor(vendorType string, label string) (string, string) {
}

// getXPULabel picks the accelerator label set that applies to the given labels.
// It probes a fixed list of well-known type labels in order and, failing that,
// the custom entries decoded from config.Space.GPUModelLabel (a JSON array of
// types.GPUModel). The first match wins.
//
// Return values, in order:
//  1. the xpu capacity label (callers use it as a resource name),
//  2. the gpu model/type label key,
//  3. the xpu memory label key (three-value signature only).
//
// Every result is an empty string when nothing matches or the custom JSON
// cannot be parsed (the parse error is logged, not returned).
//
// NOTE(review): this hunk is a rendered diff, so the previous two-value
// signature and returns appear directly above their three-value replacements.
// NOTE(review): the built-in memory label keys below follow each vendor's
// naming pattern; confirm that the clusters actually set them.
func getXPULabel(labels map[string]string, config *config.Config) (string, string) {
func getXPULabel(labels map[string]string, config *config.Config) (string, string, string) {
if _, found := labels["aliyun.accelerator/nvidia_name"]; found {
// default cluster (aliyun accelerator labels)
return "nvidia.com/gpu", "aliyun.accelerator/nvidia_name"
return "nvidia.com/gpu", "aliyun.accelerator/nvidia_name", "aliyun.accelerator/nvidia_mem"
}
if _, found := labels["machine.cluster.vke.volcengine.com/gpu-name"]; found {
// volcengine (vke) cluster
return "nvidia.com/gpu", "machine.cluster.vke.volcengine.com/gpu-name"
return "nvidia.com/gpu", "machine.cluster.vke.volcengine.com/gpu-name", "machine.cluster.vke.volcengine.com/gpu-mem"
}
if _, found := labels["eks.tke.cloud.tencent.com/gpu-type"]; found {
// tencent cloud cluster
return "nvidia.com/gpu", "eks.tke.cloud.tencent.com/gpu-type"
return "nvidia.com/gpu", "eks.tke.cloud.tencent.com/gpu-type", "eks.tke.cloud.tencent.com/gpu-mem"
}
if _, found := labels["nvidia.com/nvidia_name"]; found {
// k3s cluster
return "nvidia.com/gpu", "nvidia.com/nvidia_name"
return "nvidia.com/gpu", "nvidia.com/nvidia_name", "nvidia.com/nvidia_mem"
}
if _, found := labels["nvidia.com/gpu.product"]; found {
// nvidia gpu product label
return "nvidia.com/gpu", "nvidia.com/gpu.product"
return "nvidia.com/gpu", "nvidia.com/gpu.product", "nvidia.com/gpu.mem"
}
if _, found := labels["kubemore_xpu_type"]; found {
// huawei Ascend910, identified by the kubemore_xpu_type label
return "huawei.com/Ascend910", "kubemore_xpu_type"
return "huawei.com/Ascend910", "kubemore_xpu_type", "kubemore_xpu_mem"
}
if _, found := labels["huawei.accelerator"]; found {
// huawei Ascend910, identified by the huawei.accelerator label
return "huawei.com/Ascend910", "huawei.accelerator"
return "huawei.com/Ascend910", "huawei.accelerator", "huawei.accelerator.mem"
}
if _, found := labels["accelerator/huawei-npu"]; found {
// huawei Ascend910, identified by the accelerator/huawei-npu label
return "huawei.com/Ascend910", "accelerator/huawei-npu"
return "huawei.com/Ascend910", "accelerator/huawei-npu", "accelerator/huawei-npu.mem"
}
if _, found := labels["hygon.com/dcu.name"]; found {
// hygon dcu
return "hygon.com/dcu", "hygon.com/dcu.name"
return "hygon.com/dcu", "hygon.com/dcu.name", "hygon.com/dcu.mem"
}
if _, found := labels["enflame.com/gcu"]; found {
// enflame gcu: matched on the enflame.com/gcu label itself; the model is read from enflame.com/gcu.model
return "enflame.com/gcu", "enflame.com/gcu.model"
return "enflame.com/gcu", "enflame.com/gcu.model", "enflame.com/gcu.mem"
}
if _, found := labels["enflame.com/gcu.count"]; found {
// enflame gcu exposed through the enflame.com/gcu.count label
return "enflame.com/gcu.count", "enflame.com/gcu.model"
return "enflame.com/gcu.count", "enflame.com/gcu.model", "enflame.com/gcu.mem"
}
// no built-in label matched: fall back to the custom labels configured in config.Space.GPUModelLabel
if config.Space.GPUModelLabel != "" {
var gpuLabels []types.GPUModel
err := json.Unmarshal([]byte(config.Space.GPUModelLabel), &gpuLabels)
if err != nil {
slog.Error("failed to parse GPUModelLabel", "error", err)
return "", ""
return "", "", ""
}
for _, gpuModel := range gpuLabels {
// an entry applies when its type label key is present in labels
if _, found := labels[gpuModel.TypeLabel]; found {
return gpuModel.CapacityLabel, gpuModel.TypeLabel
return gpuModel.CapacityLabel, gpuModel.TypeLabel, gpuModel.MemLabel
}
}
}
return "", ""
return "", "", ""
}

// convert memory in bytes to GB
Expand Down
2 changes: 1 addition & 1 deletion builder/deploy/cluster/cluster_manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,7 @@ func TestGetXPULabel(t *testing.T) {

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
gotCapacity, gotTypeLabel := getXPULabel(tt.labels, tt.config)
gotCapacity, gotTypeLabel, _ := getXPULabel(tt.labels, tt.config)
assert.Equal(t, tt.wantCapacity, gotCapacity)
assert.Equal(t, tt.wantTypeLabel, gotTypeLabel)
})
Expand Down
2 changes: 1 addition & 1 deletion builder/deploy/cluster/resource_namespace.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ func (cluster *Cluster) GetResourceInNamespace(namespace string, quotaName strin
available.Sub(usedAmount)
return available
}
xpuCapacityLabel, xpuTypeLabel := getXPULabel(quota.Labels, config)
xpuCapacityLabel, xpuTypeLabel, _ := getXPULabel(quota.Labels, config)
gpuModelVendor, gpuModel := getGpuTypeAndVendor(quota.Labels[xpuTypeLabel], xpuCapacityLabel)
var totalXPU int64 = 0
var availableXPU int64 = 0
Expand Down
58 changes: 58 additions & 0 deletions builder/deploy/cluster/resource_namespace_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -236,3 +236,61 @@ func TestGetNameSpaceResourcesQuota(t *testing.T) {
})
}
}

// TestGetXPUMem verifies that GetResourcesInCluster reports per-node
// accelerator memory, taken from the aliyun.accelerator/nvidia_mem label,
// as a human-readable IEC string in NodeResourceInfo.XPUMem.
func TestGetXPUMem(t *testing.T) {
	// node1: the memory label is already a whole number of GiB.
	node1 := &v1.Node{
		ObjectMeta: metav1.ObjectMeta{
			Name: "node1",
			Labels: map[string]string{
				"nvidia.com/gpu":                 "true",
				"aliyun.accelerator/nvidia_name": "T4",
				"aliyun.accelerator/nvidia_mem":  "16GiB",
			},
		},
		Status: v1.NodeStatus{
			Conditions: []v1.NodeCondition{
				{Type: v1.NodeReady, Status: v1.ConditionTrue},
			},
			Capacity: v1.ResourceList{
				v1.ResourceCPU:    resource.MustParse("4"),
				v1.ResourceMemory: resource.MustParse("16Gi"),
				"nvidia.com/gpu":  resource.MustParse("2"),
			},
			Allocatable: v1.ResourceList{
				v1.ResourceCPU:    resource.MustParse("3"),
				v1.ResourceMemory: resource.MustParse("14Gi"),
				"nvidia.com/gpu":  resource.MustParse("2"),
			},
		},
	}

	// node2: the memory label is given in MiB (23028MiB is roughly 22.49 GiB)
	// and the node declares no capacity or allocatable resources at all.
	node2 := &v1.Node{
		ObjectMeta: metav1.ObjectMeta{
			Name: "node2",
			Labels: map[string]string{
				"nvidia.com/gpu":                 "true",
				"aliyun.accelerator/nvidia_name": "NVIDIA-A10",
				"aliyun.accelerator/nvidia_mem":  "23028MiB",
			},
		},
		Status: v1.NodeStatus{
			Conditions: []v1.NodeCondition{
				{Type: v1.NodeReady, Status: v1.ConditionTrue},
			},
		},
	}

	clientset := fake.NewSimpleClientset(node1, node2)
	cluster := &Cluster{
		Client: clientset,
	}

	// Named cfg so the imported config package is not shadowed by a local.
	cfg := &config.Config{}

	resources, err := cluster.GetResourcesInCluster(cfg)
	assert.NoError(t, err)
	assert.Len(t, resources, 2)

	// testify's Equal takes (expected, actual); keeping that order makes
	// failure messages label the two values correctly.
	assert.Equal(t, "16 GiB", resources["node1"].XPUMem)
	assert.Equal(t, "22 GiB", resources["node2"].XPUMem)
}
24 changes: 24 additions & 0 deletions builder/deploy/deployer.go
Original file line number Diff line number Diff line change
Expand Up @@ -641,15 +641,39 @@ func (d *deployer) GetClusterUsageById(ctx context.Context, clusterId string) (*
Provider: resp.Provider,
Status: types.ClusterStatusRunning,
}
var vendorSet = make(map[string]struct{}, 0)
var modelsSet = make(map[string]struct{}, 0)
for _, node := range resp.Nodes {
res.TotalCPU += node.TotalCPU
res.AvailableCPU += node.AvailableCPU
res.TotalMem += float64(node.TotalMem)
res.AvailableMem += float64(node.AvailableMem)
res.TotalGPU += node.TotalXPU
res.AvailableGPU += node.AvailableXPU
if node.GPUVendor != "" {
vendorSet[node.GPUVendor] = struct{}{}
modelsSet[fmt.Sprintf("%s(%s)", node.XPUModel, node.XPUMem)] = struct{}{}
}
}

var vendor string
for k := range vendorSet {
vendor += k + ", "
}
if vendor != "" {
vendor = vendor[:len(vendor)-2]
}

var models string
for k := range modelsSet {
models += k + ", "
}
if models != "" {
models = models[:len(models)-2]
}

res.XPUVendors = vendor
res.XPUModels = models
res.AvailableCPU = math.Floor(res.AvailableCPU)
res.TotalMem = math.Floor(res.TotalMem)
res.AvailableMem = math.Floor(res.AvailableMem)
Expand Down
7 changes: 5 additions & 2 deletions common/types/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,10 @@ type ClusterRes struct {

ResourceStatus ResourceStatus `json:"resource_status"`

LastUpdateTime int64 `json:"last_update_time"`
LastUpdateTime int64 `json:"last_update_time"`
XPUVendors string `json:"xpu_vendors"` // NVIDIA, AMD
XPUModels string `json:"xpu_models"` // A10(32 GB),H100(80 GB)
}

type DeployRes struct {
ClusterID string `json:"cluster_id"`
ClusterRegion string `json:"cluster_region"`
Expand All @@ -91,6 +92,7 @@ type NodeResourceInfo struct {
AvailableMem float32 `json:"available_mem"` //in GB
XPUCapacityLabel string `json:"xpu_capacity_label"`
ReservedXPU int64 `json:"reserved_xpu"`
XPUMem string `json:"xpu_mem"`
}

type UpdateClusterResponse struct {
Expand All @@ -100,6 +102,7 @@ type UpdateClusterResponse struct {
// GPUModel describes one custom accelerator label mapping. getXPULabel decodes
// a JSON array of these from config.Space.GPUModelLabel and uses the first
// entry whose TypeLabel key is present on the inspected labels.
type GPUModel struct {
// TypeLabel is the label key whose presence selects this entry; it is returned as the model/type label.
TypeLabel string `json:"type_label"`
// CapacityLabel is returned as the xpu capacity label (callers look it up as a resource name).
CapacityLabel string `json:"capacity_label"`
// MemLabel is returned as the xpu memory label; it stays empty when the JSON omits mem_label.
MemLabel string `json:"mem_label"`
}

type ClusterStatus string
Expand Down
Loading