 package main
 
 import (
+	"encoding/json"
 	"fmt"
 	"strconv"
 
 	log "github.com/golang/glog"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 
-	"k8s.io/api/core/v1"
+	v1 "k8s.io/api/core/v1"
 )
 
 type DeviceInfo struct {
@@ -139,43 +140,39 @@ func (n *NodeInfo) hasPendingGPUMemory() bool {
 
 // Get used GPUs in checkpoint
 func (n *NodeInfo) buildDeviceInfo() error {
-
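+	// Per-device capacity assumes a homogeneous node: the node's total
+	// GPU memory is split evenly across its GPUs.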
+	totalGPUMem := 0
+	if n.gpuCount > 0 {
+		totalGPUMem = n.gpuTotalMemory / n.gpuCount
+	}
 GPUSearchLoop:
 	for _, pod := range n.pods {
 		if gpuMemoryInPod(pod) <= 0 {
 			continue GPUSearchLoop
 		}
-
-		devID, usedGPUMem := n.getDeivceInfo(pod)
-
-		var dev *DeviceInfo
-		ok := false
-		if dev, ok = n.devs[devID]; !ok {
-			totalGPUMem := 0
-			if n.gpuCount > 0 {
-				totalGPUMem = n.gpuTotalMemory / n.gpuCount
-			}
-
-			dev = &DeviceInfo{
-				pods:        []v1.Pod{},
-				idx:         devID,
-				totalGPUMem: totalGPUMem,
-				node:        n.node,
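+		// A pod may hold memory on several GPUs, so accumulate its usage per device.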
+		for devID, usedGPUMem := range n.getDeivceInfo(pod) {
+			if n.devs[devID] == nil {
+				n.devs[devID] = &DeviceInfo{
+					pods:        []v1.Pod{},
+					idx:         devID,
+					totalGPUMem: totalGPUMem,
+					node:        n.node,
+				}
 			}
-			n.devs[devID] = dev
+			n.devs[devID].usedGPUMem += usedGPUMem
+			n.devs[devID].pods = append(n.devs[devID].pods, pod)
 		}
-
-		dev.usedGPUMem = dev.usedGPUMem + usedGPUMem
-		dev.pods = append(dev.pods, pod)
 	}
-
 	return nil
 }
 
-func (n *NodeInfo) getDeivceInfo(pod v1.Pod) (devIdx int, gpuMemory int) {
+func (n *NodeInfo) getDeivceInfo(pod v1.Pod) map[int]int {
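+	// If the pod carries a full per-GPU allocation annotation, use it directly;
+	// otherwise fall back to the single device index in envNVGPUID below.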
 	var err error
 	id := -1
-
+	allocation := map[int]int{}
+	allocation = GetAllocation(&pod)
+	if len(allocation) != 0 {
+		return allocation
+	}
 	if len(pod.ObjectMeta.Annotations) > 0 {
 		value, found := pod.ObjectMeta.Annotations[envNVGPUID]
 		if found {
@@ -194,8 +191,8 @@ func (n *NodeInfo) getDeivceInfo(pod v1.Pod) (devIdx int, gpuMemory int) {
 				pod.Namespace)
 		}
 	}
-
-	return id, gpuMemoryInPod(pod)
+	allocation[id] = gpuMemoryInPod(pod)
+	return allocation
 }
 
 func hasPendingGPUMemory(nodeInfos []*NodeInfo) (found bool) {
@@ -244,3 +241,31 @@ func setUnit(gpuMemory, gpuCount int) {
 		memoryUnit = "GiB"
 	}
 }
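+
+// GetAllocation aggregates the pod's gpushareAllocationFlag annotation, whose
+// value is JSON of the form {"<container index>": {"<GPU index>": <GPU memory>}},
+// into a map from GPU index to the total GPU memory the pod occupies on that
+// device. It returns an empty map if the annotation is missing or malformed.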
+func GetAllocation(pod *v1.Pod) map[int]int {
+	podGPUMems := map[int]int{}
+	allocationString := ""
+	if pod.ObjectMeta.Annotations == nil {
+		return podGPUMems
+	}
+	value, ok := pod.ObjectMeta.Annotations[gpushareAllocationFlag]
+	if !ok {
+		return podGPUMems
+	}
+	allocationString = value
+	var allocation map[int]map[string]int
+	err := json.Unmarshal([]byte(allocationString), &allocation)
+	if err != nil {
+		return podGPUMems
+	}
+	for _, containerAllocation := range allocation {
+		for id, gpuMem := range containerAllocation {
+			gpuIndex, err := strconv.Atoi(id)
+			if err != nil {
+				log.Errorf("failed to get gpu memory from pod annotation, reason: %v", err)
+				return map[int]int{}
+			}
+			podGPUMems[gpuIndex] += gpuMem
+		}
+	}
+	return podGPUMems
+}
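
For illustration, a minimal standalone sketch of the annotation shape GetAllocation consumes and the aggregation it performs; the annotation value, container indices, GPU indices, and memory amounts below are made up for the example, not taken from this change.

package main

import (
	"encoding/json"
	"fmt"
	"strconv"
)

func main() {
	// Hypothetical annotation value: container index -> GPU index -> GPU memory.
	raw := `{"0": {"1": 2048}, "1": {"1": 1024, "2": 512}}`

	var allocation map[int]map[string]int
	if err := json.Unmarshal([]byte(raw), &allocation); err != nil {
		panic(err)
	}

	// Sum per GPU across containers, mirroring the aggregation in GetAllocation above.
	podGPUMems := map[int]int{}
	for _, containerAllocation := range allocation {
		for id, gpuMem := range containerAllocation {
			gpuIndex, err := strconv.Atoi(id)
			if err != nil {
				panic(err)
			}
			podGPUMems[gpuIndex] += gpuMem
		}
	}
	fmt.Println(podGPUMems) // map[1:3072 2:512]
}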