
Commit 2f3da71

Merge pull request kubernetes#121206 from mochizuki875/fix_startup_probe_117153
Stop StartupProbe explicitly when successThreshold is reached
2 parents a28f140 + 632f162 commit 2f3da71

File tree

3 files changed (+96 −2 lines)

pkg/kubelet/prober/common_test.go

Lines changed: 4 additions & 0 deletions
@@ -46,6 +46,10 @@ func getTestRunningStatus() v1.PodStatus {
 	return getTestRunningStatusWithStarted(true)
 }
 
+func getTestNotRunningStatus() v1.PodStatus {
+	return getTestRunningStatusWithStarted(false)
+}
+
 func getTestRunningStatusWithStarted(started bool) v1.PodStatus {
 	containerStatus := v1.ContainerStatus{
 		Name: testContainerName,
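
For context, the diff only shows the first lines of the shared helper that the new getTestNotRunningStatus wraps. The body below is a sketch of what it plausibly does, under the assumption that it builds a running v1.PodStatus whose single container status carries the Started flag; the ContainerID value, the Running state, and the metav1 import are assumptions, not part of this diff.

// Sketch only: signature and first fields are visible in the diff above;
// the remaining fields are assumed for illustration.
func getTestRunningStatusWithStarted(started bool) v1.PodStatus {
	containerStatus := v1.ContainerStatus{
		Name:        testContainerName,
		ContainerID: testContainerID.String(), // assumed value
	}
	containerStatus.State.Running = &v1.ContainerStateRunning{StartedAt: metav1.Now()}
	containerStatus.Started = &started // startup-probe state keys off this flag
	return v1.PodStatus{
		Phase:             v1.PodRunning,
		ContainerStatuses: []v1.ContainerStatus{containerStatus},
	}
}

Passing started=false therefore yields a pod status for a container that is running but has not yet been marked as started, which is exactly the state a startup probe operates in.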

pkg/kubelet/prober/worker.go

Lines changed: 4 additions & 1 deletion
@@ -321,11 +321,14 @@ func (w *worker) doProbe(ctx context.Context) (keepGoing bool) {
 
 	w.resultsManager.Set(w.containerID, result, w.pod)
 
-	if (w.probeType == liveness || w.probeType == startup) && result == results.Failure {
+	if (w.probeType == liveness && result == results.Failure) || w.probeType == startup {
 		// The container fails a liveness/startup check, it will need to be restarted.
 		// Stop probing until we see a new container ID. This is to reduce the
 		// chance of hitting #21751, where running `docker exec` when a
 		// container is being stopped may lead to corrupted container state.
+		// In addition, if the threshold for each result of a startup probe is exceeded, we should stop probing
+		// until the container is restarted.
+		// This is to prevent extra Probe executions #117153.
 		w.onHold = true
 		w.resultRun = 0
 	}
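
To see why the condition changed, it helps to look at the bookkeeping that runs just before this block. The sketch below is a simplified reading of the tail of doProbe around the changed lines, not the full function; the resultRun/lastResult accumulation it assumes is the behavior the tests below exercise.

// Simplified sketch (error handling and other probe paths omitted).
if w.lastResult == result {
	w.resultRun++
} else {
	w.lastResult = result
	w.resultRun = 1
}

// Until a threshold is met, keep probing and leave the cached result alone.
// This is why the failure test below sees results.Unknown on the first two probes.
if (result == results.Failure && w.resultRun < int(w.spec.FailureThreshold)) ||
	(result == results.Success && w.resultRun < int(w.spec.SuccessThreshold)) {
	return true
}

w.resultsManager.Set(w.containerID, result, w.pod)

// After this commit, a startup probe goes on hold on *any* finalized result,
// not only Failure: a finalized Success means the container has started, so
// there is nothing left for the startup probe to do until a restart.
if (w.probeType == liveness && result == results.Failure) || w.probeType == startup {
	w.onHold = true
	w.resultRun = 0
}

Before the fix, a startup probe that reached successThreshold kept running even though its result could never change again; the reordered condition puts it on hold as soon as either threshold produces a finalized result.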

pkg/kubelet/prober/worker_test.go

Lines changed: 88 additions & 1 deletion
@@ -268,6 +268,91 @@ func TestSuccessThreshold(t *testing.T) {
 	}
 }
 
+func TestStartupProbeSuccessThreshold(t *testing.T) {
+	ctx := context.Background()
+	m := newTestManager()
+	successThreshold := 1
+	failureThreshold := 3
+	w := newTestWorker(m, startup, v1.Probe{SuccessThreshold: int32(successThreshold), FailureThreshold: int32(failureThreshold)})
+	m.statusManager.SetPodStatus(w.pod, getTestNotRunningStatus())
+	m.prober.exec = fakeExecProber{probe.Success, nil}
+
+	for i := 0; i < successThreshold+1; i++ {
+		if i < successThreshold {
+			// Probe should not be on hold and will continue to be executed
+			// until successThreshold is met
+			if w.onHold {
+				t.Errorf("Prober should not be on hold")
+			}
+			msg := fmt.Sprintf("%d success", i+1)
+			expectContinue(t, w, w.doProbe(ctx), msg)
+			expectResult(t, w, results.Success, msg)
+		} else {
+			// Probe should be on hold and will not be executed anymore
+			// when successThreshold is met
+			if !w.onHold {
+				t.Errorf("Prober should be on hold because successThreshold is exceeded")
+			}
+			// Meeting or exceeding successThreshold should cause resultRun to reset to 0
+			if w.resultRun != 0 {
+				t.Errorf("Prober resultRun should be 0, but %d", w.resultRun)
+			}
+		}
+	}
+}
+
+func TestStartupProbeFailureThreshold(t *testing.T) {
+	ctx := context.Background()
+	m := newTestManager()
+	successThreshold := 1
+	failureThreshold := 3
+	w := newTestWorker(m, startup, v1.Probe{SuccessThreshold: int32(successThreshold), FailureThreshold: int32(failureThreshold)})
+	m.statusManager.SetPodStatus(w.pod, getTestNotRunningStatus())
+	m.prober.exec = fakeExecProber{probe.Failure, nil}
+
+	for i := 0; i < failureThreshold+1; i++ {
+		if i < failureThreshold {
+			// Probe should not be on hold and will continue to be executed
+			// until failureThreshold is met
+			if w.onHold {
+				t.Errorf("Prober should not be on hold")
+			}
+			msg := fmt.Sprintf("%d failure", i+1)
+			expectContinue(t, w, w.doProbe(ctx), msg)
+			switch i {
+			case 0, 1:
+				// At this point, the expected result is Unknown
+				// because w.resultsManager.Set() will be called after failureThreshold is reached
+				expectResult(t, w, results.Unknown, msg)
+				// resultRun should be incremented until failureThreshold is met
+				if w.resultRun != i+1 {
+					t.Errorf("Prober resultRun should be %d, but %d", i+1, w.resultRun)
+				}
+			case 2:
+				// The expected result is Failure
+				// because w.resultsManager.Set() will be called due to resultRun reaching failureThreshold,
+				// updating the cached result to Failure.
+				// After that, resultRun will be reset to 0.
+				expectResult(t, w, results.Failure, msg)
+				// Meeting failureThreshold should cause resultRun to reset to 0
+				if w.resultRun != 0 {
+					t.Errorf("Prober resultRun should be 0, but %d", w.resultRun)
+				}
+			}
+		} else {
+			// Probe should be on hold and will not be executed anymore
+			// when failureThreshold is met
+			if !w.onHold {
+				t.Errorf("Prober should be on hold because failureThreshold is exceeded")
+			}
+			// Exceeding failureThreshold should cause resultRun to reset to 0
+			if w.resultRun != 0 {
+				t.Errorf("Prober resultRun should be 0, but %d", w.resultRun)
+			}
+		}
+	}
+}
+
 func TestCleanUp(t *testing.T) {
 	m := newTestManager()
 
@@ -366,8 +451,10 @@ func TestOnHoldOnLivenessOrStartupCheckFailure(t *testing.T) {
 	msg = "hold lifted"
 	expectContinue(t, w, w.doProbe(ctx), msg)
 	expectResult(t, w, results.Success, msg)
-	if w.onHold {
+	if probeType == liveness && w.onHold {
 		t.Errorf("Prober should not be on hold anymore")
+	} else if probeType == startup && !w.onHold {
+		t.Errorf("Prober should be on hold due to %s check success", probeType)
 	}
 }
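
The assertions above lean on two small helpers defined elsewhere in worker_test.go; their bodies are not part of this diff. The sketch below shows the behavior the tests rely on, with the resultsManager lookup assumed: expectResult reads the worker's cached result (the value last written by w.resultsManager.Set), and expectContinue checks the keepGoing flag returned by doProbe.

// Assumed shape of the helpers, reconstructed from how they are called above.
func expectResult(t *testing.T, w *worker, expected results.Result, msg string) {
	result, ok := resultsManager(w.probeManager, w.probeType).Get(w.containerID) // assumed lookup
	if !ok {
		t.Errorf("[%s - %s] Expected result to be set, but it was not", w.probeType, msg)
	} else if result != expected {
		t.Errorf("[%s - %s] Expected result %v, got %v", w.probeType, msg, expected, result)
	}
}

func expectContinue(t *testing.T, w *worker, keepGoing bool, msg string) {
	if !keepGoing {
		t.Errorf("[%s - %s] Expected to continue probing, but did not", w.probeType, msg)
	}
}

Read together with the worker.go change, the tests pin down the new contract: a startup probe accumulates results.Unknown until a threshold finalizes the result, publishes Success or Failure exactly once, then holds (w.onHold == true, w.resultRun == 0) until the container is restarted, whereas a liveness probe still lifts the hold after a successful check.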
