
Commit ced2900

peizhaoyou authored and wawa0210 committed
feat: support huawei ascend310p
Signed-off-by: peizhaoyou <[email protected]>
1 parent 1d9faf4 commit ced2900

File tree: 7 files changed (+271 lines, −0 lines)


.github/workflows/dev-image-build.yaml

Lines changed: 2 additions & 0 deletions
@@ -15,6 +15,8 @@ jobs:
   build:
     name: build-dev-image
     runs-on: ubuntu-latest
+    env:
+      IMAGE: ${{ secrets.IMAGE || 'projecthami/hami' }}
     steps:
       - name: Checkout
         uses: actions/checkout@v4

charts/hami/templates/scheduler/configmap.yaml

Lines changed: 8 additions & 0 deletions
@@ -71,6 +71,14 @@ data:
         {
           "name": "{{ .Values.ascendResourceName }}",
           "ignoredByScheduler": true
+        },
+        {
+          "name": "{{ .Values.ascend310PResourceMem }}",
+          "ignoredByScheduler": true
+        },
+        {
+          "name": "{{ .Values.ascend310PResourceName }}",
+          "ignoredByScheduler": true
         }
       ],
       "ignoreable": false

charts/hami/templates/scheduler/configmapnew.yaml

Lines changed: 4 additions & 0 deletions
@@ -53,4 +53,8 @@ data:
         ignoredByScheduler: true
       - name: {{ .Values.ascendResourceName }}
         ignoredByScheduler: true
+      - name: {{ .Values.ascend310PResourceMem }}
+        ignoredByScheduler: true
+      - name: {{ .Values.ascend310PResourceName }}
+        ignoredByScheduler: true
 {{- end }}

charts/hami/templates/scheduler/deployment.yaml

Lines changed: 2 additions & 0 deletions
@@ -89,6 +89,8 @@ spec:
             - --cambricon-mlu-cores={{ .Values.mluResourceCores }}
             - --ascend-name={{ .Values.ascendResourceName }}
             - --ascend-memory={{ .Values.ascendResourceMem }}
+            - --ascend310p-name={{ .Values.ascend310PResourceName }}
+            - --ascend310p-memory={{ .Values.ascend310PResourceMem }}
             - --overwrite-env={{ .Values.scheduler.overwriteEnv }}
            - --node-scheduler-policy={{ .Values.scheduler.defaultSchedulerPolicy.nodeSchedulerPolicy }}
            - --gpu-scheduler-policy={{ .Values.scheduler.defaultSchedulerPolicy.gpuSchedulerPolicy }}
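Editor's note: for orientation, a minimal sketch (not part of this commit) of how these two new container arguments could reach the Ascend 310P handler added later in this commit. ParseConfig is defined in pkg/device/ascend/ascend310p.go below; the standalone main() and the flag-set name are illustrative assumptions.

package main

import (
    "flag"
    "fmt"

    "github.com/Project-HAMi/HAMi/pkg/device/ascend"
)

func main() {
    // Hypothetical wiring: the scheduler's flag set receives the args rendered
    // by deployment.yaml above; ParseConfig binds them to the exported
    // ascend.Ascend310PResourceCount / ascend.Ascend310PResourceMemory names.
    fs := flag.NewFlagSet("hami-scheduler", flag.ExitOnError)
    dev := &ascend.Ascend310P{}
    dev.ParseConfig(fs)
    _ = fs.Parse([]string{
        "--ascend310p-name=huawei.com/Ascend310P",
        "--ascend310p-memory=huawei.com/Ascend310P-memory",
    })
    fmt.Println(ascend.Ascend310PResourceCount, ascend.Ascend310PResourceMemory)
}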

charts/hami/values.yaml

Lines changed: 5 additions & 0 deletions
@@ -31,6 +31,11 @@ iluvatarResourceCore: "iluvatar.ai/vcuda-core"
 ascendResourceName: "huawei.com/Ascend910"
 ascendResourceMem: "huawei.com/Ascend910-memory"
 
+# Ascend 310P Parameters
+ascend310PResourceName: "huawei.com/Ascend310P"
+ascend310PResourceMem: "huawei.com/Ascend310P-memory"
+
+
 schedulerName: "hami-scheduler"
 
 podSecurityPolicy:

pkg/device/ascend/ascend310p.go

Lines changed: 248 additions & 0 deletions
@@ -0,0 +1,248 @@ (new file)

/*
Copyright 2024 The HAMi Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package ascend

import (
    "errors"
    "flag"
    "fmt"
    "strconv"
    "strings"
    "time"

    "github.com/Project-HAMi/HAMi/pkg/api"
    "github.com/Project-HAMi/HAMi/pkg/util"

    corev1 "k8s.io/api/core/v1"
    "k8s.io/apimachinery/pkg/api/resource"
    "k8s.io/klog/v2"
)

type Ascend310P struct {
}

const (
    Ascend310PName      = "Ascend310P"
    Ascend310PSelection = "huawei.com/predicate-ascend310p-idx-"
    Ascend310PUseUUID   = "huawei.com/use-ascend310p-uuid"
    Ascend310PNoUseUUID = "huawei.com/no-use-ascend310p-uuid"
    // Requests above the largest vNPU template (12G) fall back to the whole card.
    Ascend310PMaxMemory      = 21 * 1024
    Ascend310PMemoryCapacity = 24 * 1024
)

var (
    Ascend310PResourceCount  string
    Ascend310PResourceMemory string
    Ascend310PResourceCores  string
)

type virTemplate struct {
    name   string
    aiCore int
    aiCPU  int
    memory int64
}

var virAscend310PTemplates = []virTemplate{
    {"vir01", 1, 1, 3 * 1024},
    {"vir02", 2, 2, 6 * 1024},
    {"vir04", 4, 4, 12 * 1024},
}

// trimAscend310PMemory rounds a memory request up to the smallest vNPU template
// that can hold it; requests above the largest template use the whole card, and
// requests above the card capacity are rejected.
func trimAscend310PMemory(m int64) (int64, string) {
    for i := 0; i < len(virAscend310PTemplates); i++ {
        if m <= virAscend310PTemplates[i].memory {
            return virAscend310PTemplates[i].memory, virAscend310PTemplates[i].name
        }
    }
    if m <= Ascend310PMemoryCapacity {
        // use the whole card
        return Ascend310PMaxMemory, ""
    }
    return 0, ""
}

func InitAscend310P() *Ascend310P {
    util.InRequestDevices[Ascend310PName] = "hami.io/ascend310p-devices-to-allocate"
    util.SupportDevices[Ascend310PName] = "hami.io/ascend310p-devices-allocated"
    return &Ascend310P{}
}

func (dev *Ascend310P) ParseConfig(fs *flag.FlagSet) {
    fs.StringVar(&Ascend310PResourceCount, "ascend310p-name", "huawei.com/Ascend310P", "Ascend310P resource count")
    fs.StringVar(&Ascend310PResourceMemory, "ascend310p-memory", "huawei.com/Ascend310P-memory", "Ascend310P memory resource")
}

func (dev *Ascend310P) MutateAdmission(ctr *corev1.Container) (bool, error) {
    count, ok := ctr.Resources.Limits[corev1.ResourceName(Ascend310PResourceCount)]
    if !ok {
        return false, nil
    }
    trimMem := int64(Ascend310PMaxMemory)
    memory, ok := ctr.Resources.Limits[corev1.ResourceName(Ascend310PResourceMemory)]
    if ok {
        trimMem, _ = trimAscend310PMemory(memory.Value())
        if trimMem <= 0 {
            return false, fmt.Errorf("ascend310p memory %d is invalid", memory.Value())
        }
    }
    if count.Value() > 1 {
        if trimMem != int64(Ascend310PMaxMemory) {
            return true, errors.New("vNPU not supported for multiple devices")
        }
    }
    ctr.Resources.Limits[corev1.ResourceName(Ascend310PResourceMemory)] = resource.MustParse(fmt.Sprint(trimMem))
    ctr.Resources.Requests[corev1.ResourceName(Ascend310PResourceMemory)] = resource.MustParse(fmt.Sprint(trimMem))
    return true, nil
}

func (dev *Ascend310P) GetNodeDevices(n corev1.Node) ([]*api.DeviceInfo, error) {
    nodedevices := []*api.DeviceInfo{}
    i := 0
    cards, _ := n.Status.Capacity.Name(corev1.ResourceName(Ascend310PResourceCount), resource.DecimalSI).AsInt64()
    for int64(i)*10 < cards {
        nodedevices = append(nodedevices, &api.DeviceInfo{
            Index:   i,
            Id:      n.Name + "-Ascend310P-" + fmt.Sprint(i),
            Count:   100,
            Devmem:  Ascend310PMaxMemory,
            Devcore: 100,
            Type:    Ascend310PName,
            Numa:    0,
            Health:  true,
        })
        i++
    }
    return nodedevices, nil
}

func (dev *Ascend310P) PatchAnnotations(annoinput *map[string]string, pd util.PodDevices) map[string]string {
    devlist, ok := pd[Ascend310PName]
    if ok && len(devlist) > 0 {
        (*annoinput)[util.InRequestDevices[Ascend310PName]] = util.EncodePodSingleDevice(devlist)
        (*annoinput)[util.SupportDevices[Ascend310PName]] = util.EncodePodSingleDevice(devlist)
        (*annoinput)["predicate-time"] = strconv.FormatInt(time.Now().Unix(), 10)
        allocateStr := "huawei.com/Ascend310P"
        for _, dp := range devlist {
            value := ""
            for _, val := range dp {
                value = value + "Ascend310P-"
                _, temp := trimAscend310PMemory(int64(val.Usedmem))
                value = value + temp + "-"
                value = value + fmt.Sprint(val.Idx) + ","
            }
            if len(value) > 0 {
                (*annoinput)[allocateStr] = strings.TrimRight(value, ",")
            }
        }
    }
    return *annoinput
}

func (dev *Ascend310P) LockNode(n *corev1.Node, p *corev1.Pod) error {
    return nil
}

func (dev *Ascend310P) ReleaseNodeLock(n *corev1.Node, p *corev1.Pod) error {
    return nil
}

func (dev *Ascend310P) NodeCleanUp(nn string) error {
    return nil
}

func (dev *Ascend310P) CheckType(annos map[string]string, d util.DeviceUsage, n util.ContainerDeviceRequest) (bool, bool, bool) {
    if strings.Compare(n.Type, Ascend310PName) == 0 {
        return true, true, false
    }
    return false, false, false
}

func (dev *Ascend310P) CheckUUID(annos map[string]string, d util.DeviceUsage) bool {
    userUUID, ok := annos[Ascend310PUseUUID]
    if ok {
        klog.V(5).Infof("check uuid for Ascend310P user uuid [%s], device id is %s", userUUID, d.ID)
        // multiple UUIDs are joined with commas
        userUUIDs := strings.Split(userUUID, ",")
        for _, uuid := range userUUIDs {
            if d.ID == uuid {
                return true
            }
        }
        return false
    }

    noUserUUID, ok := annos[Ascend310PNoUseUUID]
    if ok {
        klog.V(5).Infof("check uuid for Ascend310P no-use uuid [%s], device id is %s", noUserUUID, d.ID)
        // multiple UUIDs are joined with commas
        noUserUUIDs := strings.Split(noUserUUID, ",")
        for _, uuid := range noUserUUIDs {
            if d.ID == uuid {
                return false
            }
        }
        return true
    }
    return true
}

func (dev *Ascend310P) CheckHealth(devType string, n *corev1.Node) (bool, bool) {
    return true, true
}

func (dev *Ascend310P) GenerateResourceRequests(ctr *corev1.Container) util.ContainerDeviceRequest {
    klog.Info("Counting Ascend310P devices")
    ascendResourceCount := corev1.ResourceName(Ascend310PResourceCount)
    ascendResourceMem := corev1.ResourceName(Ascend310PResourceMemory)
    v, ok := ctr.Resources.Limits[ascendResourceCount]
    if !ok {
        v, ok = ctr.Resources.Requests[ascendResourceCount]
    }
    if ok {
        if n, ok := v.AsInt64(); ok {
            klog.Info("Found Ascend310P devices")
            memnum := 0
            mem, ok := ctr.Resources.Limits[ascendResourceMem]
            if !ok {
                mem, ok = ctr.Resources.Requests[ascendResourceMem]
            }
            if ok {
                memnums, ok := mem.AsInt64()
                if ok {
                    m, _ := trimAscend310PMemory(memnums)
                    memnum = int(m)
                }
            }
            corenum := int32(0)

            // No explicit memory request: fall back to a 100% memory-percentage request.
            mempnum := 0
            if memnum == 0 {
                mempnum = 100
            }

            return util.ContainerDeviceRequest{
                Nums:             int32(n),
                Type:             Ascend310PName,
                Memreq:           int32(memnum),
                MemPercentagereq: int32(mempnum),
                Coresreq:         corenum,
            }
        }
    }
    return util.ContainerDeviceRequest{}
}
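Editor's note: a minimal, test-style sketch (not part of this commit) of how trimAscend310PMemory maps memory requests onto the vNPU templates above. It would live in package ascend (the file name is hypothetical) so it can reach the unexported helper; the expected values follow directly from virAscend310PTemplates, Ascend310PMaxMemory, and Ascend310PMemoryCapacity.

// ascend310p_trim_sketch_test.go — illustrative only.
package ascend

import "testing"

func TestTrimAscend310PMemorySketch(t *testing.T) {
    cases := []struct {
        request  int64  // requested memory in MiB
        wantMem  int64  // memory the request is rounded up to
        wantTmpl string // vNPU template chosen ("" means whole card)
    }{
        {2 * 1024, 3 * 1024, "vir01"},   // small request -> smallest template
        {5 * 1024, 6 * 1024, "vir02"},   // rounds up to the next template
        {12 * 1024, 12 * 1024, "vir04"}, // exact template boundary
        {15 * 1024, 21 * 1024, ""},      // above largest template -> whole card
        {30 * 1024, 0, ""},              // above card capacity -> rejected
    }
    for _, c := range cases {
        gotMem, gotTmpl := trimAscend310PMemory(c.request)
        if gotMem != c.wantMem || gotTmpl != c.wantTmpl {
            t.Errorf("trimAscend310PMemory(%d) = (%d, %q), want (%d, %q)",
                c.request, gotMem, gotTmpl, c.wantMem, c.wantTmpl)
        }
    }
}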

pkg/device/devices.go

Lines changed: 2 additions & 0 deletions
@@ -72,12 +72,14 @@ func init() {
 	devices[hygon.HygonDCUDevice] = hygon.InitDCUDevice()
 	devices[iluvatar.IluvatarGPUDevice] = iluvatar.InitIluvatarDevice()
 	devices[ascend.AscendDevice] = ascend.InitDevice()
+	devices[ascend.Ascend310PName] = ascend.InitAscend310P()
 	DevicesToHandle = []string{}
 	DevicesToHandle = append(DevicesToHandle, nvidia.NvidiaGPUCommonWord)
 	DevicesToHandle = append(DevicesToHandle, cambricon.CambriconMLUCommonWord)
 	DevicesToHandle = append(DevicesToHandle, hygon.HygonDCUCommonWord)
 	DevicesToHandle = append(DevicesToHandle, iluvatar.IluvatarGPUCommonWord)
 	DevicesToHandle = append(DevicesToHandle, ascend.AscendDevice)
+	DevicesToHandle = append(DevicesToHandle, ascend.Ascend310PName)
 }
 
 func PodAllocationTrySuccess(nodeName string, devName string, lockName string, pod *corev1.Pod) {
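Editor's note: to see the new device end to end, a hedged usage sketch (not part of this commit) of how a container requesting one Ascend 310P card with a 4096 MiB memory limit is turned into a scheduling request by GenerateResourceRequests. The resource names are assigned directly here instead of going through flag parsing, and the standalone main() is illustrative only.

package main

import (
    "fmt"

    "github.com/Project-HAMi/HAMi/pkg/device/ascend"

    corev1 "k8s.io/api/core/v1"
    "k8s.io/apimachinery/pkg/api/resource"
)

func main() {
    // Normally set by ParseConfig from the scheduler flags; assigned directly
    // here to keep the sketch self-contained.
    ascend.Ascend310PResourceCount = "huawei.com/Ascend310P"
    ascend.Ascend310PResourceMemory = "huawei.com/Ascend310P-memory"

    dev := &ascend.Ascend310P{}
    ctr := corev1.Container{
        Resources: corev1.ResourceRequirements{
            Limits: corev1.ResourceList{
                "huawei.com/Ascend310P":        resource.MustParse("1"),
                "huawei.com/Ascend310P-memory": resource.MustParse("4096"),
            },
        },
    }
    // A 4096 MiB request is rounded up to the vir02 template (6144 MiB),
    // so the result carries Nums=1, Memreq=6144, MemPercentagereq=0.
    req := dev.GenerateResourceRequests(&ctr)
    fmt.Printf("%+v\n", req)
}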
