Skip to content

Commit e44961e

Browse files
committed
tests: Solve backoff tests flakiness
The container status is not constant; it can change over time, in the following order: - Running: when kubelet reports the Pod as running. This state can be missed if the container finishes its command before kubelet gets to report it. - Terminated: after the container finishes its command, it enters the Terminated state, in which it remains for a short period of time before kubelet tries to restart it. - Waiting: when kubelet has to wait for the backoff period to expire before actually restarting the container. Handling each of these states when calculating the backoff period between container restarts makes the tests more reliable.
1 parent 5716127 commit e44961e

File tree

1 file changed

+33
-5
lines changed

1 file changed

+33
-5
lines changed

test/e2e/common/pods.go

Lines changed: 33 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,8 @@ func startPodAndGetBackOffs(podClient *framework.PodClient, pod *v1.Pod, sleepAm
109109

110110
func getRestartDelay(podClient *framework.PodClient, podName string, containerName string) (time.Duration, error) {
111111
beginTime := time.Now()
112+
var previousRestartCount int32 = -1
113+
var previousFinishedAt time.Time
112114
for time.Since(beginTime) < (2 * maxBackOffTolerance) { // may just miss the 1st MaxContainerBackOff delay
113115
time.Sleep(time.Second)
114116
pod, err := podClient.Get(podName, metav1.GetOptions{})
@@ -119,11 +121,37 @@ func getRestartDelay(podClient *framework.PodClient, podName string, containerNa
119121
continue
120122
}
121123

122-
if status.State.Waiting == nil && status.State.Terminated != nil && status.LastTerminationState.Terminated != nil && status.State.Terminated.StartedAt.Time.After(beginTime) {
123-
startedAt := status.State.Terminated.StartedAt.Time
124-
finishedAt := status.LastTerminationState.Terminated.FinishedAt.Time
125-
framework.Logf("getRestartDelay: restartCount = %d, finishedAt=%s restartedAt=%s (%s)", status.RestartCount, finishedAt, startedAt, startedAt.Sub(finishedAt))
126-
return startedAt.Sub(finishedAt), nil
124+
// This only happens when the Pod is running for the first time and there is no "Last State" yet.
125+
if status.LastTerminationState.Terminated == nil {
126+
framework.Logf("Container's last state is not \"Terminated\".")
127+
continue
128+
}
129+
130+
if previousRestartCount == -1 {
131+
if status.State.Running != nil {
132+
// container is still Running, there is no "FinishedAt" time.
133+
continue
134+
} else if status.State.Terminated != nil {
135+
previousFinishedAt = status.State.Terminated.FinishedAt.Time
136+
} else {
137+
previousFinishedAt = status.LastTerminationState.Terminated.FinishedAt.Time
138+
}
139+
previousRestartCount = status.RestartCount
140+
}
141+
142+
// when the RestartCount is changed, the Containers will be in one of the following states:
143+
// Running, Terminated, or Waiting (in which case it is already waiting for the backoff period to expire, and the last state details have been stored in status.LastTerminationState).
144+
if status.RestartCount > previousRestartCount {
145+
var startedAt time.Time
146+
if status.State.Running != nil {
147+
startedAt = status.State.Running.StartedAt.Time
148+
} else if status.State.Terminated != nil {
149+
startedAt = status.State.Terminated.StartedAt.Time
150+
} else {
151+
startedAt = status.LastTerminationState.Terminated.StartedAt.Time
152+
}
153+
framework.Logf("getRestartDelay: restartCount = %d, finishedAt=%s restartedAt=%s (%s)", status.RestartCount, previousFinishedAt, startedAt, startedAt.Sub(previousFinishedAt))
154+
return startedAt.Sub(previousFinishedAt), nil
127155
}
128156
}
129157
return 0, fmt.Errorf("timeout getting pod restart delay")

0 commit comments

Comments
 (0)