|
| 1 | +/* |
| 2 | +Copyright 2024 The HAMi Authors. |
| 3 | +
|
| 4 | +Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | +you may not use this file except in compliance with the License. |
| 6 | +You may obtain a copy of the License at |
| 7 | +
|
| 8 | + http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | +
|
| 10 | +Unless required by applicable law or agreed to in writing, software |
| 11 | +distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | +See the License for the specific language governing permissions and |
| 14 | +limitations under the License. |
| 15 | +*/ |
| 16 | + |
| 17 | +package ascend |
| 18 | + |
| 19 | +import ( |
| 20 | + "errors" |
| 21 | + "flag" |
| 22 | + "fmt" |
| 23 | + "strconv" |
| 24 | + "strings" |
| 25 | + "time" |
| 26 | + |
| 27 | + "github.com/Project-HAMi/HAMi/pkg/api" |
| 28 | + "github.com/Project-HAMi/HAMi/pkg/util" |
| 29 | + |
| 30 | + corev1 "k8s.io/api/core/v1" |
| 31 | + "k8s.io/apimachinery/pkg/api/resource" |
| 32 | + "k8s.io/klog/v2" |
| 33 | +) |
| 34 | + |
// Ascend310P is the device handler for Ascend 310P NPUs. The type itself holds
// no state; its configuration lives in the package-level Ascend310PResource*
// variables.
type Ascend310P struct{}
| 37 | + |
const (
	// Ascend310PName is the device type name. It keys the util registries
	// populated by InitAscend310P and is the Type of every device and request
	// produced by this handler.
	Ascend310PName = "Ascend310P"
	// Ascend310PSelection is an annotation key prefix.
	// NOTE(review): its consumer is not visible in this file — confirm usage.
	Ascend310PSelection = "huawei.com/predicate-ascend310p-idx-"
	// Ascend310PUseUUID is the annotation holding a comma-separated list of
	// device IDs that CheckUUID accepts exclusively.
	Ascend310PUseUUID = "huawei.com/use-ascend310p-uuid"
	// Ascend310PNoUseUUID is the annotation holding a comma-separated list of
	// device IDs that CheckUUID rejects.
	Ascend310PNoUseUUID = "huawei.com/no-use-ascend310p-uuid"
	// Ascend310PMaxMemory is the memory value reported for a whole card. It
	// exists only so that a card can be split: a request exceeding the largest
	// template (12G) is given the whole card.
	Ascend310PMaxMemory = 21 * 1024
	// Ascend310PMemoryCapacity is the largest memory request that
	// trimAscend310PMemory accepts; anything above it is treated as invalid.
	Ascend310PMemoryCapacity = 24 * 1024
)
| 46 | + |
var (
	// Ascend310PResourceCount is the resource name used to request a number of
	// Ascend310P devices. It is set from the "ascend310p-name" flag.
	Ascend310PResourceCount string
	// Ascend310PResourceMemory is the resource name used to request device
	// memory. It is set from the "ascend310p-memory" flag.
	Ascend310PResourceMemory string
	// Ascend310PResourceCores is declared for a cores resource name.
	// NOTE(review): no flag binds it in ParseConfig, so it stays empty unless
	// assigned elsewhere.
	Ascend310PResourceCores string
)
| 52 | + |
// virTemplate describes one virtual-device template a card can be split into.
type virTemplate struct {
	// name is the template identifier embedded in the allocation annotation.
	name string
	// aiCore is the AI core count of the template.
	aiCore int
	// aiCPU is the AI CPU count of the template.
	aiCPU int
	// memory is the template memory size, in the same unit as
	// Ascend310PMaxMemory.
	memory int64
}
| 59 | + |
| 60 | +var virAscend310PTemplates = []virTemplate{ |
| 61 | + {"vir01", 1, 1, 3 * 1024}, |
| 62 | + {"vir02", 2, 2, 6 * 1024}, |
| 63 | + {"vir04", 4, 4, 12 * 1024}, |
| 64 | +} |
| 65 | + |
| 66 | +func trimAscend310PMemory(m int64) (int64, string) { |
| 67 | + for i := 0; i < len(virAscend310PTemplates); i++ { |
| 68 | + if m <= virAscend310PTemplates[i].memory { |
| 69 | + return virAscend310PTemplates[i].memory, virAscend310PTemplates[i].name |
| 70 | + } |
| 71 | + } |
| 72 | + if m <= Ascend310PMemoryCapacity { |
| 73 | + // use the whole card |
| 74 | + return Ascend310PMaxMemory, "" |
| 75 | + } |
| 76 | + return 0, "" |
| 77 | +} |
| 78 | + |
| 79 | +func InitAscend310P() *Ascend310P { |
| 80 | + util.InRequestDevices[Ascend310PName] = "hami.io/ascend310p-devices-to-allocate" |
| 81 | + util.SupportDevices[Ascend310PName] = "hami.io/ascend310p-devices-allocated" |
| 82 | + return &Ascend310P{} |
| 83 | +} |
| 84 | + |
| 85 | +func (dev *Ascend310P) ParseConfig(fs *flag.FlagSet) { |
| 86 | + fs.StringVar(&Ascend310PResourceCount, "ascend310p-name", "huawei.com/Ascend310P", "Ascend310P resource count") |
| 87 | + fs.StringVar(&Ascend310PResourceMemory, "ascend310p-memory", "huawei.com/Ascend310P-memory", "Ascend310P memory resource") |
| 88 | +} |
| 89 | + |
| 90 | +func (dev *Ascend310P) MutateAdmission(ctr *corev1.Container) (bool, error) { |
| 91 | + count, ok := ctr.Resources.Limits[corev1.ResourceName(Ascend310PResourceCount)] |
| 92 | + if !ok { |
| 93 | + return false, nil |
| 94 | + } |
| 95 | + trimMem := int64(Ascend310PMaxMemory) |
| 96 | + memory, ok := ctr.Resources.Limits[corev1.ResourceName(Ascend310PResourceMemory)] |
| 97 | + if ok { |
| 98 | + trimMem, _ = trimAscend310PMemory(memory.Value()) |
| 99 | + if trimMem <= 0 { |
| 100 | + return false, fmt.Errorf("ascend310p memory %d is invalid", memory.Value()) |
| 101 | + } |
| 102 | + } |
| 103 | + if count.Value() > 1 { |
| 104 | + if trimMem != int64(Ascend310PMaxMemory) { |
| 105 | + return true, errors.New("vNPU nor supported for multiple devices") |
| 106 | + } |
| 107 | + } |
| 108 | + ctr.Resources.Limits[corev1.ResourceName(Ascend310PResourceMemory)] = resource.MustParse(fmt.Sprint(trimMem)) |
| 109 | + ctr.Resources.Requests[corev1.ResourceName(Ascend310PResourceMemory)] = resource.MustParse(fmt.Sprint(trimMem)) |
| 110 | + return true, nil |
| 111 | +} |
| 112 | + |
| 113 | +func (dev *Ascend310P) GetNodeDevices(n corev1.Node) ([]*api.DeviceInfo, error) { |
| 114 | + nodedevices := []*api.DeviceInfo{} |
| 115 | + i := 0 |
| 116 | + cards, _ := n.Status.Capacity.Name(corev1.ResourceName(Ascend310PResourceCount), resource.DecimalSI).AsInt64() |
| 117 | + for int64(i)*10 < cards { |
| 118 | + nodedevices = append(nodedevices, &api.DeviceInfo{ |
| 119 | + Index: i, |
| 120 | + Id: n.Name + "-Ascend310P-" + fmt.Sprint(i), |
| 121 | + Count: 100, |
| 122 | + Devmem: Ascend310PMaxMemory, |
| 123 | + Devcore: 100, |
| 124 | + Type: Ascend310PName, |
| 125 | + Numa: 0, |
| 126 | + Health: true, |
| 127 | + }) |
| 128 | + i++ |
| 129 | + } |
| 130 | + return nodedevices, nil |
| 131 | +} |
| 132 | + |
| 133 | +func (dev *Ascend310P) PatchAnnotations(annoinput *map[string]string, pd util.PodDevices) map[string]string { |
| 134 | + devlist, ok := pd[Ascend310PName] |
| 135 | + if ok && len(devlist) > 0 { |
| 136 | + (*annoinput)[util.InRequestDevices[Ascend310PName]] = util.EncodePodSingleDevice(devlist) |
| 137 | + (*annoinput)[util.SupportDevices[Ascend310PName]] = util.EncodePodSingleDevice(devlist) |
| 138 | + (*annoinput)["predicate-time"] = strconv.FormatInt(time.Now().Unix(), 10) |
| 139 | + allocateStr := "huawei.com/Ascend310P" |
| 140 | + for _, dp := range devlist { |
| 141 | + value := "" |
| 142 | + for _, val := range dp { |
| 143 | + value = value + "Ascend310P-" |
| 144 | + _, temp := trimAscend310PMemory(int64(val.Usedmem)) |
| 145 | + value = value + temp + "-" |
| 146 | + value = value + fmt.Sprint(val.Idx) + "," |
| 147 | + } |
| 148 | + if len(value) > 0 { |
| 149 | + (*annoinput)[allocateStr] = strings.TrimRight(value, ",") |
| 150 | + } |
| 151 | + } |
| 152 | + } |
| 153 | + return *annoinput |
| 154 | +} |
| 155 | + |
| 156 | +func (dev *Ascend310P) LockNode(n *corev1.Node, p *corev1.Pod) error { |
| 157 | + return nil |
| 158 | +} |
| 159 | + |
| 160 | +func (dev *Ascend310P) ReleaseNodeLock(n *corev1.Node, p *corev1.Pod) error { |
| 161 | + return nil |
| 162 | +} |
| 163 | + |
| 164 | +func (dev *Ascend310P) NodeCleanUp(nn string) error { |
| 165 | + return nil |
| 166 | +} |
| 167 | + |
| 168 | +func (dev *Ascend310P) CheckType(annos map[string]string, d util.DeviceUsage, n util.ContainerDeviceRequest) (bool, bool, bool) { |
| 169 | + if strings.Compare(n.Type, Ascend310PName) == 0 { |
| 170 | + return true, true, false |
| 171 | + } |
| 172 | + return false, false, false |
| 173 | +} |
| 174 | + |
| 175 | +func (dev *Ascend310P) CheckUUID(annos map[string]string, d util.DeviceUsage) bool { |
| 176 | + userUUID, ok := annos[Ascend310PUseUUID] |
| 177 | + if ok { |
| 178 | + klog.V(5).Infof("check uuid for Iluvatar user uuid [%s], device id is %s", userUUID, d.ID) |
| 179 | + // use , symbol to connect multiple uuid |
| 180 | + userUUIDs := strings.Split(userUUID, ",") |
| 181 | + for _, uuid := range userUUIDs { |
| 182 | + if d.ID == uuid { |
| 183 | + return true |
| 184 | + } |
| 185 | + } |
| 186 | + return false |
| 187 | + } |
| 188 | + |
| 189 | + noUserUUID, ok := annos[Ascend310PNoUseUUID] |
| 190 | + if ok { |
| 191 | + klog.V(5).Infof("check uuid for Iluvatar not user uuid [%s], device id is %s", noUserUUID, d.ID) |
| 192 | + // use , symbol to connect multiple uuid |
| 193 | + noUserUUIDs := strings.Split(noUserUUID, ",") |
| 194 | + for _, uuid := range noUserUUIDs { |
| 195 | + if d.ID == uuid { |
| 196 | + return false |
| 197 | + } |
| 198 | + } |
| 199 | + return true |
| 200 | + } |
| 201 | + return true |
| 202 | +} |
| 203 | + |
| 204 | +func (dev *Ascend310P) CheckHealth(devType string, n *corev1.Node) (bool, bool) { |
| 205 | + return true, true |
| 206 | +} |
| 207 | + |
| 208 | +func (dev *Ascend310P) GenerateResourceRequests(ctr *corev1.Container) util.ContainerDeviceRequest { |
| 209 | + klog.Info("Counting Ascend310P devices") |
| 210 | + ascendResourceCount := corev1.ResourceName(Ascend310PResourceCount) |
| 211 | + ascendResourceMem := corev1.ResourceName(Ascend310PResourceMemory) |
| 212 | + v, ok := ctr.Resources.Limits[ascendResourceCount] |
| 213 | + if !ok { |
| 214 | + v, ok = ctr.Resources.Requests[ascendResourceCount] |
| 215 | + } |
| 216 | + if ok { |
| 217 | + if n, ok := v.AsInt64(); ok { |
| 218 | + klog.Info("Found Ascend310P devices") |
| 219 | + memnum := 0 |
| 220 | + mem, ok := ctr.Resources.Limits[ascendResourceMem] |
| 221 | + if !ok { |
| 222 | + mem, ok = ctr.Resources.Requests[ascendResourceMem] |
| 223 | + } |
| 224 | + if ok { |
| 225 | + memnums, ok := mem.AsInt64() |
| 226 | + if ok { |
| 227 | + m, _ := trimAscend310PMemory(memnums) |
| 228 | + memnum = int(m) |
| 229 | + } |
| 230 | + } |
| 231 | + corenum := int32(0) |
| 232 | + |
| 233 | + mempnum := 0 |
| 234 | + if memnum == 0 { |
| 235 | + mempnum = 100 |
| 236 | + } |
| 237 | + |
| 238 | + return util.ContainerDeviceRequest{ |
| 239 | + Nums: int32(n), |
| 240 | + Type: Ascend310PName, |
| 241 | + Memreq: int32(memnum), |
| 242 | + MemPercentagereq: int32(mempnum), |
| 243 | + Coresreq: corenum, |
| 244 | + } |
| 245 | + } |
| 246 | + } |
| 247 | + return util.ContainerDeviceRequest{} |
| 248 | +} |
0 commit comments