Skip to content
This repository was archived by the owner on Jul 30, 2021. It is now read-only.

Commit c4ec0bb

Browse files
author
Yifan Gu
committed
Remove all checkpoints when the pod checkpointer is unscheduled.
1 parent fc04e43 commit c4ec0bb

File tree

2 files changed

+95
-7
lines changed

2 files changed

+95
-7
lines changed

cmd/checkpoint/main.go

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -240,10 +240,10 @@ func process(localRunningPods, localParentPods, apiParentPods, activeCheckpoints
240240
if _, ok := apiParentPods[id]; !ok {
241241
glog.V(4).Infof("API GC: should remove inactive checkpoint %s", id)
242242

243+
removeMap[id] = struct{}{}
243244
if isPodCheckpointer(inactiveCheckpoints[id]) {
244245
podCheckpointerID = id
245-
} else {
246-
removeMap[id] = struct{}{}
246+
break
247247
}
248248

249249
delete(inactiveCheckpoints, id)
@@ -256,17 +256,29 @@ func process(localRunningPods, localParentPods, apiParentPods, activeCheckpoints
256256
if _, ok := apiParentPods[id]; !ok {
257257
glog.V(4).Infof("API GC: should remove active checkpoint %s", id)
258258

259+
removeMap[id] = struct{}{}
259260
if isPodCheckpointer(activeCheckpoints[id]) {
260261
podCheckpointerID = id
261-
} else {
262-
removeMap[id] = struct{}{}
262+
break
263263
}
264264

265265
delete(activeCheckpoints, id)
266266
}
267267
}
268268
}
269269

270+
// Remove all checkpoints if we need to remove the pod checkpointer itself.
271+
if podCheckpointerID != "" {
272+
for id := range inactiveCheckpoints {
273+
removeMap[id] = struct{}{}
274+
delete(inactiveCheckpoints, id)
275+
}
276+
for id := range activeCheckpoints {
277+
removeMap[id] = struct{}{}
278+
delete(activeCheckpoints, id)
279+
}
280+
}
281+
270282
// Can make decisions about starting/stopping checkpoints just with local state.
271283
//
272284
// If there is an inactive checkpoint, and no parent pod is running, or the checkpoint
@@ -293,8 +305,12 @@ func process(localRunningPods, localParentPods, apiParentPods, activeCheckpoints
293305

294306
// De-duped checkpoints to remove. If we decide to GC a checkpoint, we will clean up both inactive/active.
295307
for k := range removeMap {
308+
if k == podCheckpointerID {
309+
continue
310+
}
296311
remove = append(remove, k)
297312
}
313+
// Put the pod checkpointer at the end of the removal queue.
298314
if podCheckpointerID != "" {
299315
remove = append(remove, podCheckpointerID)
300316
}
@@ -359,6 +375,11 @@ func writeCheckpointManifest(pod *v1.Pod) error {
359375
return err
360376
}
361377
path := filepath.Join(inactiveCheckpointPath, pod.Namespace+"-"+pod.Name+".json")
378+
// Make sure the inactive checkpoint path exists.
379+
if err := os.MkdirAll(filepath.Dir(path), 0600); err != nil {
380+
return err
381+
}
382+
362383
oldb, err := ioutil.ReadFile(path)
363384
if err != nil && !os.IsNotExist(err) {
364385
return err

cmd/checkpoint/main_test.go

Lines changed: 70 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,39 @@ func TestProcess(t *testing.T) {
150150
expectRemove: []string{"AA", "kube-system/pod-checkpointer"},
151151
},
152152
{
153-
desc: "In child checkpoint: Inactive pod-checkpointer, local parent, local running, api parent: should start",
153+
desc: "Inactive pod-checkpointer, no local parent, no api parent: should remove all",
154+
localRunning: map[string]*v1.Pod{"kube-system/pod-checkpointer": {}, "AA": {}},
155+
localParents: map[string]*v1.Pod{"AA": {}},
156+
apiParents: map[string]*v1.Pod{"AA": {}},
157+
inactiveCheckpoints: map[string]*v1.Pod{
158+
"kube-system/pod-checkpointer": {
159+
ObjectMeta: v1.ObjectMeta{
160+
Namespace: "kube-system",
161+
Name: "pod-checkpointer",
162+
},
163+
},
164+
"AA": {},
165+
},
166+
expectRemove: []string{"AA", "kube-system/pod-checkpointer"},
167+
},
168+
{
169+
desc: "Active pod-checkpointer, no local parent, no api parent: should remove all",
170+
localRunning: map[string]*v1.Pod{"kube-system/pod-checkpointer": {}, "AA": {}},
171+
localParents: map[string]*v1.Pod{"AA": {}},
172+
apiParents: map[string]*v1.Pod{"AA": {}},
173+
activeCheckpoints: map[string]*v1.Pod{
174+
"kube-system/pod-checkpointer": {
175+
ObjectMeta: v1.ObjectMeta{
176+
Namespace: "kube-system",
177+
Name: "pod-checkpointer",
178+
},
179+
},
180+
"AA": {},
181+
},
182+
expectRemove: []string{"AA", "kube-system/pod-checkpointer"},
183+
},
184+
{
185+
desc: "Running as an on-disk checkpointer: Inactive pod-checkpointer, local parent, local running, api parent: should start",
154186
podName: "pod-checkpointer-mynode",
155187
localRunning: map[string]*v1.Pod{"kube-system/pod-checkpointer": {}},
156188
localParents: map[string]*v1.Pod{"kube-system/pod-checkpointer": {}},
@@ -166,7 +198,8 @@ func TestProcess(t *testing.T) {
166198
expectStart: []string{"kube-system/pod-checkpointer"},
167199
},
168200
{
169-
desc: "In child checkpoint: Inactive pod-checkpointer, local parent, no local running, api not reachable: should start",
201+
desc: "Running as an on-disk checkpointer: Inactive pod-checkpointer, local parent, no local running, api not reachable: should start",
202+
podName: "pod-checkpointer-mynode",
170203
localParents: map[string]*v1.Pod{"kube-system/pod-checkpointer": {}},
171204
inactiveCheckpoints: map[string]*v1.Pod{
172205
"kube-system/pod-checkpointer": {
@@ -179,7 +212,7 @@ func TestProcess(t *testing.T) {
179212
expectStart: []string{"kube-system/pod-checkpointer"},
180213
},
181214
{
182-
desc: "In child checkpoint: Inactive pod-checkpointer, no local parent, no api parent: should remove in the last",
215+
desc: "Running as an on-disk checkpointer: Inactive pod-checkpointer, no local parent, no api parent: should remove in the last",
183216
podName: "pod-checkpointer-mynode",
184217
localRunning: map[string]*v1.Pod{"kube-system/pod-checkpointer": {}, "AA": {}},
185218
localParents: map[string]*v1.Pod{"BB": {}},
@@ -195,6 +228,40 @@ func TestProcess(t *testing.T) {
195228
},
196229
expectRemove: []string{"AA", "kube-system/pod-checkpointer"},
197230
},
231+
{
232+
desc: "Running as an on-disk checkpointer: Inactive pod-checkpointer, no local parent, no api parent: should remove all",
233+
podName: "pod-checkpointer-mynode",
234+
localRunning: map[string]*v1.Pod{"kube-system/pod-checkpointer": {}, "AA": {}},
235+
localParents: map[string]*v1.Pod{"AA": {}},
236+
apiParents: map[string]*v1.Pod{"AA": {}},
237+
inactiveCheckpoints: map[string]*v1.Pod{
238+
"kube-system/pod-checkpointer": {
239+
ObjectMeta: v1.ObjectMeta{
240+
Namespace: "kube-system",
241+
Name: "pod-checkpointer",
242+
},
243+
},
244+
"AA": {},
245+
},
246+
expectRemove: []string{"AA", "kube-system/pod-checkpointer"},
247+
},
248+
{
249+
desc: "Running as an on-disk checkpointer: Active pod-checkpointer, no local parent, no api parent: should remove all",
250+
podName: "pod-checkpointer-mynode",
251+
localRunning: map[string]*v1.Pod{"kube-system/pod-checkpointer": {}, "AA": {}},
252+
localParents: map[string]*v1.Pod{"AA": {}},
253+
apiParents: map[string]*v1.Pod{"AA": {}},
254+
activeCheckpoints: map[string]*v1.Pod{
255+
"kube-system/pod-checkpointer": {
256+
ObjectMeta: v1.ObjectMeta{
257+
Namespace: "kube-system",
258+
Name: "pod-checkpointer",
259+
},
260+
},
261+
"AA": {},
262+
},
263+
expectRemove: []string{"AA", "kube-system/pod-checkpointer"},
264+
},
198265
}
199266

200267
for _, tc := range cases {

0 commit comments

Comments
 (0)