Skip to content

Commit dcd90b4

Browse files
authored
fix bug: update pod error when allocate (#38)
Co-authored-by: tzzcfrank <[email protected]>
1 parent ec4ff6e commit dcd90b4

File tree

2 files changed

+24
-16
lines changed

2 files changed

+24
-16
lines changed

pkg/gpu/nvidia/allocate.go

Lines changed: 12 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@ import (
66

77
log "github.com/golang/glog"
88
"golang.org/x/net/context"
9-
"k8s.io/api/core/v1"
10-
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
9+
v1 "k8s.io/api/core/v1"
10+
"k8s.io/apimachinery/pkg/types"
1111
pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1beta1"
1212
)
1313

@@ -26,7 +26,7 @@ func buildErrResponse(reqs *pluginapi.AllocateRequest, podReqGPU uint) *pluginap
2626
for _, req := range reqs.ContainerRequests {
2727
response := pluginapi.ContainerAllocateResponse{
2828
Envs: map[string]string{
29-
envNVGPU: fmt.Sprintf("no-gpu-has-%dMiB-to-run", podReqGPU),
29+
envNVGPU: fmt.Sprintf("no-gpu-has-%d%s-to-run", podReqGPU, metric),
3030
EnvResourceIndex: fmt.Sprintf("-1"),
3131
EnvResourceByPod: fmt.Sprintf("%d", podReqGPU),
3232
EnvResourceByContainer: fmt.Sprintf("%d", uint(len(req.DevicesIDs))),
@@ -121,26 +121,23 @@ func (m *NvidiaDevicePlugin) Allocate(ctx context.Context,
121121
EnvResourceByDev: fmt.Sprintf("%d", getGPUMemory()),
122122
},
123123
}
124-
if m.disableCGPUIsolation {
125-
response.Envs["CGPU_DISABLE"] = "true"
126-
}
124+
if m.disableCGPUIsolation {
125+
response.Envs["CGPU_DISABLE"] = "true"
126+
}
127127
responses.ContainerResponses = append(responses.ContainerResponses, &response)
128128
}
129129

130130
// 2. Update Pod spec
131-
newPod := updatePodAnnotations(assumePod)
132-
_, err = clientset.CoreV1().Pods(newPod.Namespace).Update(newPod)
131+
patchedAnnotationBytes, err := patchPodAnnotationSpecAssigned()
132+
if err != nil {
133+
return buildErrResponse(reqs, podReqGPU), nil
134+
}
135+
_, err = clientset.CoreV1().Pods(assumePod.Namespace).Patch(assumePod.Name, types.StrategicMergePatchType, patchedAnnotationBytes)
133136
if err != nil {
134137
// the object has been modified; please apply your changes to the latest version and try again
135138
if err.Error() == OptimisticLockErrorMsg {
136139
// retry
137-
pod, err := clientset.CoreV1().Pods(assumePod.Namespace).Get(assumePod.Name, metav1.GetOptions{})
138-
if err != nil {
139-
log.Warningf("Failed due to %v", err)
140-
return buildErrResponse(reqs, podReqGPU), nil
141-
}
142-
newPod = updatePodAnnotations(pod)
143-
_, err = clientset.CoreV1().Pods(newPod.Namespace).Update(newPod)
140+
_, err = clientset.CoreV1().Pods(assumePod.Namespace).Patch(assumePod.Name, types.StrategicMergePatchType, patchedAnnotationBytes)
144141
if err != nil {
145142
log.Warningf("Failed due to %v", err)
146143
return buildErrResponse(reqs, podReqGPU), nil

pkg/gpu/nvidia/podutils.go

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
11
package nvidia
22

33
import (
4+
"encoding/json"
45
"fmt"
56
"strconv"
67
"time"
78

89
log "github.com/golang/glog"
9-
"k8s.io/api/core/v1"
10+
v1 "k8s.io/api/core/v1"
1011
)
1112

1213
// update pod env with assigned status
@@ -23,6 +24,16 @@ func updatePodAnnotations(oldPod *v1.Pod) (newPod *v1.Pod) {
2324
return newPod
2425
}
2526

27+
func patchPodAnnotationSpecAssigned() ([]byte, error) {
28+
now := time.Now()
29+
patchAnnotations := map[string]interface{}{
30+
"metadata": map[string]map[string]string{"annotations": {
31+
EnvAssignedFlag: "true",
32+
EnvResourceAssumeTime: fmt.Sprintf("%d", now.UnixNano()),
33+
}}}
34+
return json.Marshal(patchAnnotations)
35+
}
36+
2637
func getGPUIDFromPodAnnotation(pod *v1.Pod) (id int) {
2738
var err error
2839
id = -1

0 commit comments

Comments
 (0)