Skip to content

Commit a90081d

Browse files
authored
Support topology-awareness for Kunlunxin device (#1121)
Support topology-awareness for Kunlunxin device

Signed-off-by: limengxuan <[email protected]>
1 parent 3802f2f commit a90081d

File tree

12 files changed

+691
-12
lines changed

12 files changed

+691
-12
lines changed

charts/hami/templates/scheduler/configmap.yaml

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -49,6 +49,14 @@ data:
4949
},
5050
{{- end }}
5151
{{- end }}
52+
{{- if .Values.devices.kunlun.enabled }}
53+
{{- range .Values.devices.kunlun.customresources }}
54+
{
55+
"name": "{{ . }}",
56+
"ignoredByScheduler": true
57+
},
58+
{{- end }}
59+
{{- end }}
5260
{
5361
"name": "{{ .Values.resourceName }}",
5462
"ignoredByScheduler": true

charts/hami/templates/scheduler/configmapnew.yaml

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -76,4 +76,10 @@ data:
7676
ignoredByScheduler: true
7777
{{- end }}
7878
{{- end }}
79+
{{- if .Values.devices.kunlun.enabled }}
80+
{{- range .Values.devices.kunlun.customresources }}
81+
- name: {{ . }}
82+
ignoredByScheduler: true
83+
{{- end }}
84+
{{- end }}
7985
{{- end }}

charts/hami/values.yaml

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -204,13 +204,17 @@ devicePlugin:
204204
# memory: 100Mi
205205

206206
devices:
207+
kunlun:
208+
enabled: true
209+
customresources:
210+
- kunlunxin.com/xpu
207211
enflame:
208-
enabled: false
212+
enabled: true
209213
customresources:
210214
- enflame.com/vgcu
211215
- enflame.com/vgcu-percentage
212216
mthreads:
213-
enabled: false
217+
enabled: true
214218
customresources:
215219
- mthreads.com/vgpu
216220
nvidia:

pkg/device/cambricon/device.go

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -184,7 +184,10 @@ func (dev *CambriconDevices) CheckHealth(devType string, n *corev1.Node) (bool,
184184
func (dev *CambriconDevices) GetNodeDevices(n corev1.Node) ([]*util.DeviceInfo, error) {
185185
nodedevices := []*util.DeviceInfo{}
186186
i := 0
187-
cards, _ := n.Status.Capacity.Name(corev1.ResourceName(MLUResourceCores), resource.DecimalSI).AsInt64()
187+
cards, ok := n.Status.Capacity.Name(corev1.ResourceName(MLUResourceCores), resource.DecimalSI).AsInt64()
188+
if !ok || cards == 0 {
189+
return []*util.DeviceInfo{}, fmt.Errorf("device not found %s", MLUResourceCores)
190+
}
188191
memoryTotal, _ := n.Status.Capacity.Name(corev1.ResourceName(MLUResourceMemory), resource.DecimalSI).AsInt64()
189192
for int64(i)*100 < cards {
190193
nodedevices = append(nodedevices, &util.DeviceInfo{

pkg/device/enflame/device.go

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -118,7 +118,7 @@ func (dev *EnflameDevices) GetNodeDevices(n corev1.Node) ([]*util.DeviceInfo, er
118118
i := 0
119119
cards, ok := n.Status.Capacity.Name(corev1.ResourceName(CountNoSharedName), resource.DecimalSI).AsInt64()
120120
if !ok || cards == 0 {
121-
return nodedevices, nil
121+
return []*util.DeviceInfo{}, fmt.Errorf("device not found %s", CountNoSharedName)
122122
}
123123
shared, _ := n.Status.Capacity.Name(corev1.ResourceName(SharedResourceName), resource.DecimalSI).AsInt64()
124124
dev.factor = int(shared / cards)

pkg/device/iluvatar/device.go

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -89,7 +89,10 @@ func (dev *IluvatarDevices) MutateAdmission(ctr *corev1.Container, p *corev1.Pod
8989
func (dev *IluvatarDevices) GetNodeDevices(n corev1.Node) ([]*util.DeviceInfo, error) {
9090
nodedevices := []*util.DeviceInfo{}
9191
i := 0
92-
cards, _ := n.Status.Capacity.Name(corev1.ResourceName(IluvatarResourceCores), resource.DecimalSI).AsInt64()
92+
cards, ok := n.Status.Capacity.Name(corev1.ResourceName(IluvatarResourceCores), resource.DecimalSI).AsInt64()
93+
if !ok || cards == 0 {
94+
return []*util.DeviceInfo{}, fmt.Errorf("device not found %s", IluvatarResourceCores)
95+
}
9396
memoryTotal, _ := n.Status.Capacity.Name(corev1.ResourceName(IluvatarResourceMemory), resource.DecimalSI).AsInt64()
9497
for int64(i)*100 < cards {
9598
nodedevices = append(nodedevices, &util.DeviceInfo{

0 commit comments

Comments
 (0)