Skip to content

Commit 92c9bab

Browse files
Kyrie336 and Lei Guo authored
Support Metax sGPU topology aware (#1193)
Signed-off-by: Lei Guo <[email protected]> Co-authored-by: Lei Guo <[email protected]>
1 parent 006ca44 commit 92c9bab

File tree

10 files changed

+1112
-66
lines changed

10 files changed

+1112
-66
lines changed

charts/hami/templates/scheduler/device-configmap.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,7 @@ data:
9797
resourceVCountName: {{ .Values.metaxResourceName }}
9898
resourceVMemoryName: {{ .Values.metaxResourceMem }}
9999
resourceVCoreName: {{ .Values.metaxResourceCore }}
100+
sgpuTopologyAware: {{ .Values.metaxsGPUTopologyAware }}
100101
enflame:
101102
resourceCountName: "enflame.com/vgcu"
102103
resourcePercentageName: "enflame.com/vgcu-percentage"

charts/hami/values.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,10 +43,11 @@ iluvatarResourceName: "iluvatar.ai/vgpu"
4343
iluvatarResourceMem: "iluvatar.ai/vcuda-memory"
4444
iluvatarResourceCore: "iluvatar.ai/vcuda-core"
4545

46-
#Metax SGPU Parameters
46+
#Metax sGPU Parameters
4747
metaxResourceName: "metax-tech.com/sgpu"
4848
metaxResourceCore: "metax-tech.com/vcore"
4949
metaxResourceMem: "metax-tech.com/vmemory"
50+
metaxsGPUTopologyAware: "false"
5051

5152
#Kunlun XPU Parameters
5253
kunlunResourceName: "kunlunxin.com/xpu"

examples/metax/sgpu/allocate_specific_gpu.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ kind: Pod
33
metadata:
44
name: gpu-pod
55
annotations:
6-
metax-tech.com/use-gpuuuid: "36beae85-c835-6b14-6ab2-02671837a59c" # allocate specific gpu
6+
metax-tech.com/use-gpuuuid: "GPU-36beae85-c835-6b14-6ab2-02671837a59c" # allocate specific gpu
77
spec:
88
containers:
99
- name: ubuntu-container
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
apiVersion: v1
2+
kind: Pod
3+
metadata:
4+
name: gpu-pod
5+
annotations:
6+
metax-tech.com/sgpu-topology-aware: "true" # enable topology aware scheduling
7+
spec:
8+
containers:
9+
- name: ubuntu-container
10+
image: ubuntu:22.04
11+
imagePullPolicy: IfNotPresent
12+
command: ["sleep","infinity"]
13+
resources:
14+
limits:
15+
metax-tech.com/sgpu: 4 # requesting 4 exclusive GPUs

pkg/device/metax/config.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ type MetaxConfig struct {
2626
ResourceVCountName string `yaml:"resourceVCountName"`
2727
ResourceVMemoryName string `yaml:"resourceVMemoryName"`
2828
ResourceVCoreName string `yaml:"resourceVCoreName"`
29+
TopologyAware bool `yaml:"sgpuTopologyAware"`
2930
}
3031

3132
func ParseConfig(fs *flag.FlagSet) {
@@ -36,4 +37,5 @@ func ParseConfig(fs *flag.FlagSet) {
3637
fs.StringVar(&MetaxResourceNameVCount, "metax-vcount", "metax-tech.com/sgpu", "metax vcount name")
3738
fs.StringVar(&MetaxResourceNameVCore, "metax-vcore", "metax-tech.com/vcore", "metax vcore name")
3839
fs.StringVar(&MetaxResourceNameVMemory, "metax-vmemory", "metax-tech.com/vmemory", "metax vmemory name")
40+
fs.BoolVar(&MetaxTopologyAware, "sgpu-topology-aware", false, "sGPU topology aware enable")
3941
}

pkg/device/metax/protocol.go

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,8 @@ const (
3030
MetaxUseUUID = "metax-tech.com/use-gpuuuid"
3131
MetaxNoUseUUID = "metax-tech.com/nouse-gpuuuid"
3232

33-
MetaxSGPUQosPolicy = "metax-tech.com/sgpu-qos-policy"
33+
MetaxSGPUQosPolicy = "metax-tech.com/sgpu-qos-policy"
34+
MetaxSGPUTopologyAware = "metax-tech.com/sgpu-topology-aware"
3435
)
3536

3637
const (
@@ -52,6 +53,7 @@ type MetaxSDeviceInfo struct {
5253
Numa int32 `json:"numa,omitempty"`
5354
Healthy bool `json:"healthy,omitempty"`
5455
QosPolicy string `json:"qosPolicy,omitempty"`
56+
LinkZone int32 `json:"linkZone,omitempty"`
5557
}
5658
type NodeMetaxSDeviceInfo []*MetaxSDeviceInfo
5759

@@ -67,8 +69,8 @@ func (ni NodeMetaxSDeviceInfo) String() string {
6769
str := "\n"
6870

6971
for _, i := range ni {
70-
str += fmt.Sprintf("MetaxSDeviceInfo[%s]: TotalDevCount=%d, TotalCompute=%d, TotalVRam=%d, Numa=%d, Healthy=%t, QosPolicy=%s\n",
71-
i.UUID, i.TotalDevCount, i.TotalCompute, i.TotalVRam, i.Numa, i.Healthy, i.QosPolicy)
72+
str += fmt.Sprintf("MetaxSDeviceInfo[%s]: TotalDevCount=%d, TotalCompute=%d, TotalVRam=%d, Numa=%d, Healthy=%t, QosPolicy=%s, LinkZone=%d\n",
73+
i.UUID, i.TotalDevCount, i.TotalCompute, i.TotalVRam, i.Numa, i.Healthy, i.QosPolicy, i.LinkZone)
7274
}
7375

7476
return str
@@ -108,6 +110,7 @@ func convertMetaxSDeviceToHAMIDevice(metaxSDevices []*MetaxSDeviceInfo) []*util.
108110
CustomInfo: map[string]any{
109111
"QosPolicy": sdevice.QosPolicy,
110112
"Model": sdevice.Model,
113+
"LinkZone": sdevice.LinkZone,
111114
},
112115
}
113116
}

pkg/device/metax/protocol_test.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ func TestConvertMetaxSDeviceToHAMIDevice(t *testing.T) {
4646
Numa: 0,
4747
Healthy: true,
4848
QosPolicy: BestEffort,
49+
LinkZone: 1,
4950
},
5051
{
5152
UUID: "GPU-a16ac188-0592-5c8f-2b6e-8bd8e7a604a8",
@@ -60,6 +61,7 @@ func TestConvertMetaxSDeviceToHAMIDevice(t *testing.T) {
6061
Numa: -1,
6162
Healthy: false,
6263
QosPolicy: BurstShare,
64+
LinkZone: 2,
6365
},
6466
},
6567
expected: []*util.DeviceInfo{
@@ -78,6 +80,7 @@ func TestConvertMetaxSDeviceToHAMIDevice(t *testing.T) {
7880
CustomInfo: map[string]any{
7981
"QosPolicy": BestEffort,
8082
"Model": "native",
83+
"LinkZone": int32(1),
8184
},
8285
},
8386
{
@@ -95,6 +98,7 @@ func TestConvertMetaxSDeviceToHAMIDevice(t *testing.T) {
9598
CustomInfo: map[string]any{
9699
"QosPolicy": BurstShare,
97100
"Model": "sgpu",
101+
"LinkZone": int32(2),
98102
},
99103
},
100104
},

pkg/device/metax/score.go

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
/*
2+
Copyright 2025 The HAMi Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package metax
18+
19+
import "fmt"
20+
21+
// DirectLinkScore is the score contributed by one pair of distinct devices
// that share the same non-zero link zone.
const DirectLinkScore = 10

// LinkDevice is the minimal view of a device needed for link scoring.
type LinkDevice struct {
	// uuid identifies the device; two entries with the same uuid are treated
	// as the same device and never score against each other.
	uuid string
	// linkZone is the device's link zone id. score treats zero as "no link".
	// NOTE(review): presumably zero means the zone was not reported — confirm.
	linkZone int32
}

// score returns DirectLinkScore when from and to are different devices that
// share the same non-zero link zone, and 0 in every other case.
//
// A nil receiver or nil argument scores 0 instead of panicking, so a stray
// nil entry in a device list cannot crash the caller.
func (from *LinkDevice) score(to *LinkDevice) int {
	if from == nil || to == nil {
		return 0
	}

	// A device has no link with itself.
	if from.uuid == to.uuid {
		return 0
	}

	// Zone 0 on either side never counts as a direct link, even if both are 0.
	if from.linkZone == 0 || to.linkZone == 0 {
		return 0
	}

	if from.linkZone != to.linkZone {
		return 0
	}

	return DirectLinkScore
}
43+
44+
type LinkDevices []*LinkDevice
45+
46+
func (devs LinkDevices) Score() int {
47+
score := 0
48+
49+
for i := 0; i < len(devs); i++ {
50+
for j := i + 1; j < len(devs); j++ {
51+
score += devs[i].score(devs[j])
52+
}
53+
}
54+
55+
return score
56+
}
57+
58+
func (devs LinkDevices) String() string {
59+
str := "["
60+
for _, dev := range devs {
61+
str += fmt.Sprintf("%v", *dev)
62+
}
63+
str += "]"
64+
65+
return str
66+
}

0 commit comments

Comments
 (0)