Skip to content

Commit 67475e8

Browse files
Dev Agent authored and QinYuuuu committed
Fix cluster info
1 parent faf9510 commit 67475e8

File tree

6 files changed

+118
-19
lines changed

6 files changed

+118
-19
lines changed

builder/deploy/cluster/cluster_manager.go

Lines changed: 29 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ import (
1212
"time"
1313

1414
argo "github.com/argoproj/argo-workflows/v3/pkg/client/clientset/versioned"
15+
units "github.com/dustin/go-humanize"
1516
authorizationv1 "k8s.io/api/authorization/v1"
1617
v1 "k8s.io/api/core/v1"
1718
"k8s.io/apimachinery/pkg/api/resource"
@@ -338,12 +339,20 @@ func (cluster *Cluster) GetResourcesInCluster(config *config.Config) (map[string
338339
allocatableCPU := node.Status.Allocatable.Cpu().DeepCopy()
339340
totalXPU := resource.Quantity{}
340341
allocatableXPU := resource.Quantity{}
341-
xpuCapacityLabel, xpuTypeLabel := getXPULabel(node.Labels, config)
342+
xpuCapacityLabel, xpuTypeLabel, xpuMemLabel := getXPULabel(node.Labels, config)
342343
if xpuCapacityLabel != "" {
343344
totalXPU = node.Status.Capacity[v1.ResourceName(xpuCapacityLabel)]
344345
allocatableXPU = node.Status.Allocatable[v1.ResourceName(xpuCapacityLabel)]
345346
}
346347

348+
bigXPUMem := ""
349+
if xpuMemLabel != "" {
350+
ulimit, err := units.ParseBigBytes(node.Labels[xpuMemLabel])
351+
if err == nil {
352+
bigXPUMem = units.BigIBytes(ulimit)
353+
}
354+
}
355+
347356
gpuModelVendor, gpuModel := getGpuTypeAndVendor(node.Labels[xpuTypeLabel], xpuCapacityLabel)
348357
nodeResourcesMap[node.Name] = types.NodeResourceInfo{
349358
NodeName: node.Name,
@@ -357,6 +366,7 @@ func (cluster *Cluster) GetResourcesInCluster(config *config.Config) (map[string
357366
AvailableXPU: parseQuantityToInt64(allocatableXPU),
358367

359368
XPUCapacityLabel: xpuCapacityLabel,
369+
XPUMem: bigXPUMem,
360370
}
361371
}
362372

@@ -417,66 +427,70 @@ func getGpuTypeAndVendor(vendorType string, label string) (string, string) {
417427
}
418428

419429
// the first label is the xpu capacity label, the second is the gpu model label
420-
func getXPULabel(labels map[string]string, config *config.Config) (string, string) {
430+
func getXPULabel(labels map[string]string, config *config.Config) (string, string, string) {
421431
if _, found := labels["aliyun.accelerator/nvidia_name"]; found {
422432
//for default cluster
423-
return "nvidia.com/gpu", "aliyun.accelerator/nvidia_name"
433+
return "nvidia.com/gpu", "aliyun.accelerator/nvidia_name", "aliyun.accelerator/nvidia_mem"
424434
}
425435
if _, found := labels["machine.cluster.vke.volcengine.com/gpu-name"]; found {
426436
//for volcano cluster
427-
return "nvidia.com/gpu", "machine.cluster.vke.volcengine.com/gpu-name"
437+
return "nvidia.com/gpu", "machine.cluster.vke.volcengine.com/gpu-name", "machine.cluster.vke.volcengine.com/gpu-mem"
428438
}
429439
if _, found := labels["eks.tke.cloud.tencent.com/gpu-type"]; found {
430440
//for tencent cluster
431-
return "nvidia.com/gpu", "eks.tke.cloud.tencent.com/gpu-type"
441+
return "nvidia.com/gpu", "eks.tke.cloud.tencent.com/gpu-type", "eks.tke.cloud.tencent.com/gpu-mem"
432442
}
433443
if _, found := labels["nvidia.com/nvidia_name"]; found {
434444
//for k3s cluster
435-
return "nvidia.com/gpu", "nvidia.com/nvidia_name"
445+
return "nvidia.com/gpu", "nvidia.com/nvidia_name", "nvidia.com/nvidia_mem"
446+
}
447+
if _, found := labels["nvidia.com/gpu.product"]; found {
448+
//for nvidia gpu product label
449+
return "nvidia.com/gpu", "nvidia.com/gpu.product", "nvidia.com/gpu.mem"
436450
}
437451
if _, found := labels["nvidia.com/gpu.product"]; found {
438452
//for nvidia gpu product label
439453
return "nvidia.com/gpu", "nvidia.com/gpu.product"
440454
}
441455
if _, found := labels["kubemore_xpu_type"]; found {
442456
//for huawei gpu
443-
return "huawei.com/Ascend910", "kubemore_xpu_type"
457+
return "huawei.com/Ascend910", "kubemore_xpu_type", "kubemore_xpu_mem"
444458
}
445459
if _, found := labels["huawei.accelerator"]; found {
446460
//for huawei gpu
447-
return "huawei.com/Ascend910", "huawei.accelerator"
461+
return "huawei.com/Ascend910", "huawei.accelerator", "huawei.accelerator.mem"
448462
}
449463
if _, found := labels["accelerator/huawei-npu"]; found {
450464
//for huawei gpu
451-
return "huawei.com/Ascend910", "accelerator/huawei-npu"
465+
return "huawei.com/Ascend910", "accelerator/huawei-npu", "accelerator/huawei-npu.mem"
452466
}
453467
if _, found := labels["hygon.com/dcu.name"]; found {
454468
//for hy dcu
455-
return "hygon.com/dcu", "hygon.com/dcu.name"
469+
return "hygon.com/dcu", "hygon.com/dcu.name", "hygon.com/dcu.mem"
456470
}
457471
if _, found := labels["enflame.com/gcu"]; found {
458472
//for enflame gcu
459-
return "enflame.com/gcu", "enflame.com/gcu.model"
473+
return "enflame.com/gcu", "enflame.com/gcu.model", "enflame.com/gcu.mem"
460474
}
461475
if _, found := labels["enflame.com/gcu.count"]; found {
462476
//for enflame gcu
463-
return "enflame.com/gcu.count", "enflame.com/gcu.model"
477+
return "enflame.com/gcu.count", "enflame.com/gcu.model", "enflame.com/gcu.mem"
464478
}
465479
//check custom gpu model label
466480
if config.Space.GPUModelLabel != "" {
467481
var gpuLabels []types.GPUModel
468482
err := json.Unmarshal([]byte(config.Space.GPUModelLabel), &gpuLabels)
469483
if err != nil {
470484
slog.Error("failed to parse GPUModelLabel", "error", err)
471-
return "", ""
485+
return "", "", ""
472486
}
473487
for _, gpuModel := range gpuLabels {
474488
if _, found := labels[gpuModel.TypeLabel]; found {
475-
return gpuModel.CapacityLabel, gpuModel.TypeLabel
489+
return gpuModel.CapacityLabel, gpuModel.TypeLabel, gpuModel.MemLabel
476490
}
477491
}
478492
}
479-
return "", ""
493+
return "", "", ""
480494
}
481495

482496
// convert memory in bytes to GB

builder/deploy/cluster/cluster_manager_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -231,7 +231,7 @@ func TestGetXPULabel(t *testing.T) {
231231

232232
for _, tt := range tests {
233233
t.Run(tt.name, func(t *testing.T) {
234-
gotCapacity, gotTypeLabel := getXPULabel(tt.labels, tt.config)
234+
gotCapacity, gotTypeLabel, _ := getXPULabel(tt.labels, tt.config)
235235
assert.Equal(t, tt.wantCapacity, gotCapacity)
236236
assert.Equal(t, tt.wantTypeLabel, gotTypeLabel)
237237
})

builder/deploy/cluster/resource_namespace.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ func (cluster *Cluster) GetResourceInNamespace(namespace string, quotaName strin
3939
available.Sub(usedAmount)
4040
return available
4141
}
42-
xpuCapacityLabel, xpuTypeLabel := getXPULabel(quota.Labels, config)
42+
xpuCapacityLabel, xpuTypeLabel, _ := getXPULabel(quota.Labels, config)
4343
gpuModelVendor, gpuModel := getGpuTypeAndVendor(quota.Labels[xpuTypeLabel], xpuCapacityLabel)
4444
var totalXPU int64 = 0
4545
var availableXPU int64 = 0

builder/deploy/cluster/resource_namespace_test.go

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -236,3 +236,61 @@ func TestGetNameSpaceResourcesQuota(t *testing.T) {
236236
})
237237
}
238238
}
239+
240+
func TestGetXPUMem(t *testing.T) {
241+
node1 := &v1.Node{
242+
ObjectMeta: metav1.ObjectMeta{
243+
Name: "node1",
244+
Labels: map[string]string{
245+
"nvidia.com/gpu": "true",
246+
"aliyun.accelerator/nvidia_name": "T4",
247+
"aliyun.accelerator/nvidia_mem": "16GiB",
248+
},
249+
},
250+
Status: v1.NodeStatus{
251+
Conditions: []v1.NodeCondition{
252+
{Type: v1.NodeReady, Status: v1.ConditionTrue},
253+
},
254+
Capacity: v1.ResourceList{
255+
v1.ResourceCPU: resource.MustParse("4"),
256+
v1.ResourceMemory: resource.MustParse("16Gi"),
257+
"nvidia.com/gpu": resource.MustParse("2"),
258+
},
259+
Allocatable: v1.ResourceList{
260+
v1.ResourceCPU: resource.MustParse("3"),
261+
v1.ResourceMemory: resource.MustParse("14Gi"),
262+
"nvidia.com/gpu": resource.MustParse("2"),
263+
},
264+
},
265+
}
266+
267+
node2 := &v1.Node{
268+
ObjectMeta: metav1.ObjectMeta{
269+
Name: "node2",
270+
Labels: map[string]string{
271+
"nvidia.com/gpu": "true",
272+
"aliyun.accelerator/nvidia_name": "NVIDIA-A10",
273+
"aliyun.accelerator/nvidia_mem": "23028MiB",
274+
},
275+
},
276+
Status: v1.NodeStatus{
277+
Conditions: []v1.NodeCondition{
278+
{Type: v1.NodeReady, Status: v1.ConditionTrue},
279+
},
280+
},
281+
}
282+
283+
clientset := fake.NewSimpleClientset(node1, node2)
284+
cluster := &Cluster{
285+
Client: clientset,
286+
}
287+
288+
config := &config.Config{}
289+
290+
resources, err := cluster.GetResourcesInCluster(config)
291+
assert.NoError(t, err)
292+
assert.Len(t, resources, 2)
293+
294+
assert.Equal(t, resources["node1"].XPUMem, "16 GiB")
295+
assert.Equal(t, resources["node2"].XPUMem, "22 GiB")
296+
}

builder/deploy/deployer.go

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -641,15 +641,39 @@ func (d *deployer) GetClusterUsageById(ctx context.Context, clusterId string) (*
641641
Provider: resp.Provider,
642642
Status: types.ClusterStatusRunning,
643643
}
644+
var vendorSet = make(map[string]struct{}, 0)
645+
var modelsSet = make(map[string]struct{}, 0)
644646
for _, node := range resp.Nodes {
645647
res.TotalCPU += node.TotalCPU
646648
res.AvailableCPU += node.AvailableCPU
647649
res.TotalMem += float64(node.TotalMem)
648650
res.AvailableMem += float64(node.AvailableMem)
649651
res.TotalGPU += node.TotalXPU
650652
res.AvailableGPU += node.AvailableXPU
653+
if node.GPUVendor != "" {
654+
vendorSet[node.GPUVendor] = struct{}{}
655+
modelsSet[fmt.Sprintf("%s(%s)", node.XPUModel, node.XPUMem)] = struct{}{}
656+
}
657+
}
651658

659+
var vendor string
660+
for k := range vendorSet {
661+
vendor += k + ", "
652662
}
663+
if vendor != "" {
664+
vendor = vendor[:len(vendor)-2]
665+
}
666+
667+
var models string
668+
for k := range modelsSet {
669+
models += k + ", "
670+
}
671+
if models != "" {
672+
models = models[:len(models)-2]
673+
}
674+
675+
res.XPUVendors = vendor
676+
res.XPUModels = models
653677
res.AvailableCPU = math.Floor(res.AvailableCPU)
654678
res.TotalMem = math.Floor(res.TotalMem)
655679
res.AvailableMem = math.Floor(res.AvailableMem)

common/types/cluster.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,9 +64,10 @@ type ClusterRes struct {
6464

6565
ResourceStatus ResourceStatus `json:"resource_status"`
6666

67-
LastUpdateTime int64 `json:"last_update_time"`
67+
LastUpdateTime int64 `json:"last_update_time"`
68+
XPUVendors string `json:"xpu_vendors"` // NVIDIA, AMD
69+
XPUModels string `json:"xpu_models"` // A10(32 GB),H100(80 GB)
6870
}
69-
7071
type DeployRes struct {
7172
ClusterID string `json:"cluster_id"`
7273
ClusterRegion string `json:"cluster_region"`
@@ -91,6 +92,7 @@ type NodeResourceInfo struct {
9192
AvailableMem float32 `json:"available_mem"` //in GB
9293
XPUCapacityLabel string `json:"xpu_capacity_label"`
9394
ReservedXPU int64 `json:"reserved_xpu"`
95+
XPUMem string `json:"xpu_mem"`
9496
}
9597

9698
type UpdateClusterResponse struct {
@@ -100,6 +102,7 @@ type UpdateClusterResponse struct {
100102
// GPUModel names the labels that describe one custom accelerator type. A list
// of these is decoded from JSON when built-in label schemes do not match.
type GPUModel struct {
	// TypeLabel is the label whose presence identifies the accelerator;
	// its value is passed on as the accelerator type.
	TypeLabel string `json:"type_label"`
	// CapacityLabel is the resource name used to look up the accelerator
	// count in a node's capacity and allocatable lists.
	CapacityLabel string `json:"capacity_label"`
	// MemLabel is the label whose value is parsed as the accelerator's
	// memory size.
	MemLabel string `json:"mem_label"`
}
104107

105108
type ClusterStatus string

0 commit comments

Comments
 (0)