@@ -12,6 +12,7 @@ import (
1212 "time"
1313
1414 argo "github.com/argoproj/argo-workflows/v3/pkg/client/clientset/versioned"
15+ units "github.com/dustin/go-humanize"
1516 authorizationv1 "k8s.io/api/authorization/v1"
1617 v1 "k8s.io/api/core/v1"
1718 "k8s.io/apimachinery/pkg/api/resource"
@@ -338,12 +339,20 @@ func (cluster *Cluster) GetResourcesInCluster(config *config.Config) (map[string
338339 allocatableCPU := node .Status .Allocatable .Cpu ().DeepCopy ()
339340 totalXPU := resource.Quantity {}
340341 allocatableXPU := resource.Quantity {}
341- xpuCapacityLabel , xpuTypeLabel := getXPULabel (node .Labels , config )
342+ xpuCapacityLabel , xpuTypeLabel , xpuMemLabel := getXPULabel (node .Labels , config )
342343 if xpuCapacityLabel != "" {
343344 totalXPU = node .Status .Capacity [v1 .ResourceName (xpuCapacityLabel )]
344345 allocatableXPU = node .Status .Allocatable [v1 .ResourceName (xpuCapacityLabel )]
345346 }
346347
348+ bigXPUMem := ""
349+ if xpuMemLabel != "" {
350+ ulimit , err := units .ParseBigBytes (node .Labels [xpuMemLabel ])
351+ if err == nil {
352+ bigXPUMem = units .BigIBytes (ulimit )
353+ }
354+ }
355+
347356 gpuModelVendor , gpuModel := getGpuTypeAndVendor (node .Labels [xpuTypeLabel ], xpuCapacityLabel )
348357 nodeResourcesMap [node .Name ] = types.NodeResourceInfo {
349358 NodeName : node .Name ,
@@ -357,6 +366,7 @@ func (cluster *Cluster) GetResourcesInCluster(config *config.Config) (map[string
357366 AvailableXPU : parseQuantityToInt64 (allocatableXPU ),
358367
359368 XPUCapacityLabel : xpuCapacityLabel ,
369+ XPUMem : bigXPUMem ,
360370 }
361371 }
362372
@@ -417,66 +427,70 @@ func getGpuTypeAndVendor(vendorType string, label string) (string, string) {
417427}
418428
419429// the first label is the xpu capacity label, the second is the gpu model label
420- func getXPULabel (labels map [string ]string , config * config.Config ) (string , string ) {
430+ func getXPULabel (labels map [string ]string , config * config.Config ) (string , string , string ) {
421431 if _ , found := labels ["aliyun.accelerator/nvidia_name" ]; found {
422432 //for default cluster
423- return "nvidia.com/gpu" , "aliyun.accelerator/nvidia_name"
433+ return "nvidia.com/gpu" , "aliyun.accelerator/nvidia_name" , "aliyun.accelerator/nvidia_mem"
424434 }
425435 if _ , found := labels ["machine.cluster.vke.volcengine.com/gpu-name" ]; found {
426436 //for volcano cluster
427- return "nvidia.com/gpu" , "machine.cluster.vke.volcengine.com/gpu-name"
437+ return "nvidia.com/gpu" , "machine.cluster.vke.volcengine.com/gpu-name" , "machine.cluster.vke.volcengine.com/gpu-mem"
428438 }
429439 if _ , found := labels ["eks.tke.cloud.tencent.com/gpu-type" ]; found {
430440 //for tencent cluster
431- return "nvidia.com/gpu" , "eks.tke.cloud.tencent.com/gpu-type"
441+ return "nvidia.com/gpu" , "eks.tke.cloud.tencent.com/gpu-type" , "eks.tke.cloud.tencent.com/gpu-mem"
432442 }
433443 if _ , found := labels ["nvidia.com/nvidia_name" ]; found {
434444 //for k3s cluster
435- return "nvidia.com/gpu" , "nvidia.com/nvidia_name"
445+ return "nvidia.com/gpu" , "nvidia.com/nvidia_name" , "nvidia.com/nvidia_mem"
446+ }
447+ if _ , found := labels ["nvidia.com/gpu.product" ]; found {
448+ //for nvidia gpu product label
449+ return "nvidia.com/gpu" , "nvidia.com/gpu.product" , "nvidia.com/gpu.mem"
436450 }
437451 if _ , found := labels ["nvidia.com/gpu.product" ]; found {
438452 //for nvidia gpu product label
439453 return "nvidia.com/gpu" , "nvidia.com/gpu.product"
440454 }
441455 if _ , found := labels ["kubemore_xpu_type" ]; found {
442456 //for huawei gpu
443- return "huawei.com/Ascend910" , "kubemore_xpu_type"
457+ return "huawei.com/Ascend910" , "kubemore_xpu_type" , "kubemore_xpu_mem"
444458 }
445459 if _ , found := labels ["huawei.accelerator" ]; found {
446460 //for huawei gpu
447- return "huawei.com/Ascend910" , "huawei.accelerator"
461+ return "huawei.com/Ascend910" , "huawei.accelerator" , "huawei.accelerator.mem"
448462 }
449463 if _ , found := labels ["accelerator/huawei-npu" ]; found {
450464 //for huawei gpu
451- return "huawei.com/Ascend910" , "accelerator/huawei-npu"
465+ return "huawei.com/Ascend910" , "accelerator/huawei-npu" , "accelerator/huawei-npu.mem"
452466 }
453467 if _ , found := labels ["hygon.com/dcu.name" ]; found {
454468 //for hy dcu
455- return "hygon.com/dcu" , "hygon.com/dcu.name"
469+ return "hygon.com/dcu" , "hygon.com/dcu.name" , "hygon.com/dcu.mem"
456470 }
457471 if _ , found := labels ["enflame.com/gcu" ]; found {
458472 //for enflame gcu
459- return "enflame.com/gcu" , "enflame.com/gcu.model"
473+ return "enflame.com/gcu" , "enflame.com/gcu.model" , "enflame.com/gcu.mem"
460474 }
461475 if _ , found := labels ["enflame.com/gcu.count" ]; found {
462476 //for enflame gcu
463- return "enflame.com/gcu.count" , "enflame.com/gcu.model"
477+ return "enflame.com/gcu.count" , "enflame.com/gcu.model" , "enflame.com/gcu.mem"
464478 }
465479 //check custom gpu model label
466480 if config .Space .GPUModelLabel != "" {
467481 var gpuLabels []types.GPUModel
468482 err := json .Unmarshal ([]byte (config .Space .GPUModelLabel ), & gpuLabels )
469483 if err != nil {
470484 slog .Error ("failed to parse GPUModelLabel" , "error" , err )
471- return "" , ""
485+ return "" , "" , ""
472486 }
473487 for _ , gpuModel := range gpuLabels {
474488 if _ , found := labels [gpuModel .TypeLabel ]; found {
475- return gpuModel .CapacityLabel , gpuModel .TypeLabel
489+ return gpuModel .CapacityLabel , gpuModel .TypeLabel , gpuModel . MemLabel
476490 }
477491 }
478492 }
479- return "" , ""
493+ return "" , "" , ""
480494}
481495
482496// convert memory in bytes to GB
0 commit comments