
Commit b7fd975

kubectl-inspect-gpushare supports gpushare2.0 (#33)
1 parent 3be84e3 commit b7fd975

2 files changed (+57, -31 lines)

cmd/inspect/main.go

Lines changed: 5 additions & 4 deletions
@@ -5,7 +5,7 @@ import (
 	"fmt"
 	"os"
 
-	"k8s.io/api/core/v1"
+	v1 "k8s.io/api/core/v1"
 )
 
 const (
@@ -17,9 +17,10 @@ const (
 	pluginComponentKey   = "component"
 	pluginComponentValue = "gpushare-device-plugin"
 
-	envNVGPUID        = "ALIYUN_COM_GPU_MEM_IDX"
-	envPodGPUMemory   = "ALIYUN_COM_GPU_MEM_POD"
-	envTOTALGPUMEMORY = "ALIYUN_COM_GPU_MEM_DEV"
+	envNVGPUID             = "ALIYUN_COM_GPU_MEM_IDX"
+	envPodGPUMemory        = "ALIYUN_COM_GPU_MEM_POD"
+	envTOTALGPUMEMORY      = "ALIYUN_COM_GPU_MEM_DEV"
+	gpushareAllocationFlag = "scheduler.framework.gpushare.allocation"
 )
 
 func init() {
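The new gpushareAllocationFlag constant is the pod annotation key under which the gpushare 2.0 scheduler framework records its per-container GPU assignments. Going by GetAllocation in nodeinfo.go below, the value is a JSON object keyed by container index, mapping GPU index (as a string) to allocated GPU memory. A minimal sketch of decoding such a value; the sample payload is illustrative, not taken from a real pod:

package main

import (
	"encoding/json"
	"fmt"
)

func main() {
	// Illustrative annotation value under "scheduler.framework.gpushare.allocation":
	// container 0 gets 2 units of GPU memory on GPU 0, container 1 gets 4 on GPU 1.
	raw := `{"0":{"0":2},"1":{"1":4}}`

	// Same shape GetAllocation unmarshals into: container index -> GPU index -> GPU memory.
	var allocation map[int]map[string]int
	if err := json.Unmarshal([]byte(raw), &allocation); err != nil {
		panic(err)
	}
	fmt.Println(allocation) // map[0:map[0:2] 1:map[1:4]]
}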

cmd/inspect/nodeinfo.go

Lines changed: 52 additions & 27 deletions
@@ -1,13 +1,14 @@
 package main
 
 import (
+	"encoding/json"
 	"fmt"
 	"strconv"
 
 	log "github.com/golang/glog"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 
-	"k8s.io/api/core/v1"
+	v1 "k8s.io/api/core/v1"
 )
 
 type DeviceInfo struct {
@@ -139,43 +140,39 @@ func (n *NodeInfo) hasPendingGPUMemory() bool {
 
 // Get used GPUs in checkpoint
 func (n *NodeInfo) buildDeviceInfo() error {
-
+	totalGPUMem := 0
+	if n.gpuCount > 0 {
+		totalGPUMem = n.gpuTotalMemory / n.gpuCount
+	}
 GPUSearchLoop:
 	for _, pod := range n.pods {
 		if gpuMemoryInPod(pod) <= 0 {
 			continue GPUSearchLoop
 		}
-
-		devID, usedGPUMem := n.getDeivceInfo(pod)
-
-		var dev *DeviceInfo
-		ok := false
-		if dev, ok = n.devs[devID]; !ok {
-			totalGPUMem := 0
-			if n.gpuCount > 0 {
-				totalGPUMem = n.gpuTotalMemory / n.gpuCount
-			}
-
-			dev = &DeviceInfo{
-				pods:        []v1.Pod{},
-				idx:         devID,
-				totalGPUMem: totalGPUMem,
-				node:        n.node,
+		for devID, usedGPUMem := range n.getDeivceInfo(pod) {
+			if n.devs[devID] == nil {
+				n.devs[devID] = &DeviceInfo{
+					pods:        []v1.Pod{},
+					idx:         devID,
+					totalGPUMem: totalGPUMem,
+					node:        n.node,
+				}
 			}
-			n.devs[devID] = dev
+			n.devs[devID].usedGPUMem += usedGPUMem
+			n.devs[devID].pods = append(n.devs[devID].pods, pod)
 		}
-
-		dev.usedGPUMem = dev.usedGPUMem + usedGPUMem
-		dev.pods = append(dev.pods, pod)
 	}
-
 	return nil
 }
 
-func (n *NodeInfo) getDeivceInfo(pod v1.Pod) (devIdx int, gpuMemory int) {
+func (n *NodeInfo) getDeivceInfo(pod v1.Pod) map[int]int {
 	var err error
 	id := -1
-
+	allocation := map[int]int{}
+	allocation = GetAllocation(&pod)
+	if len(allocation) != 0 {
+		return allocation
+	}
 	if len(pod.ObjectMeta.Annotations) > 0 {
 		value, found := pod.ObjectMeta.Annotations[envNVGPUID]
 		if found {
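The rewritten buildDeviceInfo no longer assumes one GPU per pod: getDeivceInfo now returns a map of GPU index to used memory, and the loop folds each pod's map into n.devs, creating an entry the first time an index is seen. A self-contained sketch of that fold, using a simplified stand-in struct (deviceUsage is hypothetical, not the real DeviceInfo):

package main

import "fmt"

// deviceUsage is a simplified stand-in for DeviceInfo.
type deviceUsage struct {
	idx        int
	usedGPUMem int
}

func main() {
	// Per-pod allocations as a getDeivceInfo-style lookup would return them:
	// GPU index -> GPU memory used by that pod.
	podAllocations := []map[int]int{
		{0: 2},
		{0: 3, 1: 4},
	}

	devs := map[int]*deviceUsage{}
	for _, alloc := range podAllocations {
		for devID, usedGPUMem := range alloc {
			if devs[devID] == nil {
				devs[devID] = &deviceUsage{idx: devID}
			}
			devs[devID].usedGPUMem += usedGPUMem
		}
	}
	fmt.Println(devs[0].usedGPUMem, devs[1].usedGPUMem) // 5 4
}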
@@ -194,8 +191,8 @@ func (n *NodeInfo) getDeivceInfo(pod v1.Pod) (devIdx int, gpuMemory int) {
 				pod.Namespace)
 		}
 	}
-
-	return id, gpuMemoryInPod(pod)
+	allocation[id] = gpuMemoryInPod(pod)
+	return allocation
 }
 
 func hasPendingGPUMemory(nodeInfos []*NodeInfo) (found bool) {
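When the gpushare 2.0 annotation is absent, getDeivceInfo keeps the pre-2.0 behaviour: the device index is read from the ALIYUN_COM_GPU_MEM_IDX annotation and the whole pod's GPU memory (gpuMemoryInPod) is attributed to that single index, returned as a one-entry map. A rough standalone sketch of that fallback; the annotation value and the hard-coded memory figure are illustrative:

package main

import (
	"fmt"
	"strconv"
)

func main() {
	// Legacy-style annotation as written by gpushare 1.x (value is illustrative).
	annotations := map[string]string{
		"ALIYUN_COM_GPU_MEM_IDX": "1",
	}
	podGPUMemory := 3 // stand-in for what gpuMemoryInPod(pod) would report

	id := -1
	if value, found := annotations["ALIYUN_COM_GPU_MEM_IDX"]; found {
		if devID, err := strconv.Atoi(value); err == nil {
			id = devID
		}
	}

	// Whole-pod memory attributed to the single legacy device index.
	allocation := map[int]int{id: podGPUMemory}
	fmt.Println(allocation) // map[1:3]
}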
@@ -244,3 +241,31 @@ func setUnit(gpuMemory, gpuCount int) {
 		memoryUnit = "GiB"
 	}
 }
+func GetAllocation(pod *v1.Pod) map[int]int {
+	podGPUMems := map[int]int{}
+	allocationString := ""
+	if pod.ObjectMeta.Annotations == nil {
+		return podGPUMems
+	}
+	value, ok := pod.ObjectMeta.Annotations[gpushareAllocationFlag]
+	if !ok {
+		return podGPUMems
+	}
+	allocationString = value
+	var allocation map[int]map[string]int
+	err := json.Unmarshal([]byte(allocationString), &allocation)
+	if err != nil {
+		return podGPUMems
+	}
+	for _, containerAllocation := range allocation {
+		for id, gpuMem := range containerAllocation {
+			gpuIndex, err := strconv.Atoi(id)
+			if err != nil {
+				log.Errorf("failed to get gpu memory from pod annotation,reason: %v", err)
+				return map[int]int{}
+			}
+			podGPUMems[gpuIndex] += gpuMem
+		}
+	}
+	return podGPUMems
+}
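To see the new annotation flow end to end, something like the following could live in a _test.go file of the same package and call GetAllocation directly. The pod and its annotation value are made up for illustration, and demoGetAllocation is not part of the commit:

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// demoGetAllocation shows GetAllocation summing per-GPU memory across containers.
func demoGetAllocation() {
	pod := v1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Annotations: map[string]string{
				gpushareAllocationFlag: `{"0":{"0":2},"1":{"0":1,"1":4}}`, // values are made up
			},
		},
	}
	// GPU 0 gets 2+1 across containers 0 and 1; GPU 1 gets 4.
	fmt.Println(GetAllocation(&pod)) // map[0:3 1:4]
}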
