Commit 6d98b0a

Test that an always-fail container can't report the pod Succeeded
The kubelet can race when a pod is deleted and report that a container succeeded when it instead failed, and thus the pod is reported as succeeded. Create an e2e test that demonstrates this failure.
1 parent b5b6754 commit 6d98b0a
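
In short, the test added below creates pods whose only container runs /bin/false, watches each pod's status while deleting it after a random delay, and fails if the kubelet ever reports success. The following is a minimal, standalone sketch of that invariant written against plain client-go rather than the e2e framework; the package name, function name, and the caller-supplied clientset, namespace, pod name, and resourceVersion are illustrative assumptions, not part of this commit.

// Package podstatuscheck is a hypothetical illustration, not part of this commit.
package podstatuscheck

import (
	"context"
	"fmt"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/watch"
	"k8s.io/client-go/kubernetes"
)

// verifyNeverSucceeds watches a pod whose only container runs /bin/false and
// returns an error if any status update reports the pod as Succeeded or the
// container as terminated with exit code 0. It stops once the pod is deleted.
func verifyNeverSucceeds(ctx context.Context, c kubernetes.Interface, ns, name, resourceVersion string) error {
	w, err := c.CoreV1().Pods(ns).Watch(ctx, metav1.ListOptions{
		ResourceVersion: resourceVersion,
		FieldSelector:   fmt.Sprintf("metadata.name=%s", name),
	})
	if err != nil {
		return err
	}
	defer w.Stop()
	for event := range w.ResultChan() {
		pod, ok := event.Object.(*v1.Pod)
		if !ok {
			continue
		}
		// The container always exits 1, so a Succeeded phase can only come from the race.
		if pod.Status.Phase == v1.PodSucceeded {
			return fmt.Errorf("pod %s was reported Succeeded despite an always-failing container", pod.Name)
		}
		for _, status := range pod.Status.ContainerStatuses {
			if t := status.State.Terminated; t != nil && t.ExitCode == 0 {
				return fmt.Errorf("container %s in pod %s reported exit code 0", status.Name, pod.Name)
			}
		}
		if event.Type == watch.Deleted {
			break
		}
	}
	return nil
}

The actual test in the diff additionally tolerates exit code 128 with the rootfs error message from kubernetes#88766, since pod volume teardown can race with container start in CRI.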

2 files changed, +230 -0 lines changed

test/e2e/node/BUILD

Lines changed: 1 addition & 0 deletions
@@ -37,6 +37,7 @@ go_library(
         "//staging/src/k8s.io/apimachinery/pkg/util/sets:go_default_library",
         "//staging/src/k8s.io/apimachinery/pkg/util/uuid:go_default_library",
         "//staging/src/k8s.io/apimachinery/pkg/util/wait:go_default_library",
+        "//staging/src/k8s.io/apimachinery/pkg/watch:go_default_library",
         "//staging/src/k8s.io/client-go/kubernetes:go_default_library",
         "//test/e2e/framework:go_default_library",
         "//test/e2e/framework/job:go_default_library",

test/e2e/node/pods.go

Lines changed: 229 additions & 0 deletions
@@ -19,8 +19,13 @@ package node
 import (
 	"context"
 	"encoding/json"
+	"fmt"
+	"math/rand"
 	"net/http"
+	"regexp"
 	"strconv"
+	"strings"
+	"sync"
 	"time"
 
 	v1 "k8s.io/api/core/v1"
@@ -29,6 +34,7 @@ import (
 	"k8s.io/apimachinery/pkg/labels"
 	"k8s.io/apimachinery/pkg/util/uuid"
 	"k8s.io/apimachinery/pkg/util/wait"
+	"k8s.io/apimachinery/pkg/watch"
 	"k8s.io/kubernetes/test/e2e/framework"
 	e2ekubelet "k8s.io/kubernetes/test/e2e/framework/kubelet"
 	imageutils "k8s.io/kubernetes/test/utils/image"
@@ -197,4 +203,227 @@ var _ = SIGDescribe("Pods Extended", func() {
 			framework.ExpectEqual(pod.Status.QOSClass, v1.PodQOSGuaranteed)
 		})
 	})
+
+	framework.KubeDescribe("Pod Container Status", func() {
+		var podClient *framework.PodClient
+		ginkgo.BeforeEach(func() {
+			podClient = f.PodClient()
+		})
+
+		ginkgo.It("should never report success for a pending container", func() {
+			ginkgo.By("creating pods that should always exit 1 and terminating the pod after a random delay")
+
+			var reBug88766 = regexp.MustCompile(`ContainerCannotRun.*rootfs_linux\.go.*kubernetes\.io~secret.*no such file or directory`)
+
+			var (
+				lock sync.Mutex
+				errs []error
+
+				wg sync.WaitGroup
+			)
+
+			const delay = 2000
+			const workers = 3
+			const pods = 15
+			var min, max time.Duration
+			for i := 0; i < workers; i++ {
+				wg.Add(1)
+				go func(i int) {
+					defer wg.Done()
+					for retries := 0; retries < pods; retries++ {
+						name := fmt.Sprintf("pod-submit-status-%d-%d", i, retries)
+						value := strconv.Itoa(time.Now().Nanosecond())
+						one := int64(1)
+						pod := &v1.Pod{
+							ObjectMeta: metav1.ObjectMeta{
+								Name: name,
+								Labels: map[string]string{
+									"name": "foo",
+									"time": value,
+								},
+							},
+							Spec: v1.PodSpec{
+								RestartPolicy:                 v1.RestartPolicyNever,
+								TerminationGracePeriodSeconds: &one,
+								Containers: []v1.Container{
+									{
+										Name:  "busybox",
+										Image: imageutils.GetE2EImage(imageutils.BusyBox),
+										Command: []string{
+											"/bin/false",
+										},
+										Resources: v1.ResourceRequirements{
+											Requests: v1.ResourceList{
+												v1.ResourceCPU:    resource.MustParse("5m"),
+												v1.ResourceMemory: resource.MustParse("10Mi"),
+											},
+										},
+									},
+								},
+							},
+						}
+
+						// create the pod, capture the change events, then delete the pod
+						start := time.Now()
+						created := podClient.Create(pod)
+						ch := make(chan []watch.Event)
+						go func() {
+							defer close(ch)
+							w, err := podClient.Watch(context.TODO(), metav1.ListOptions{
+								ResourceVersion: created.ResourceVersion,
+								FieldSelector:   fmt.Sprintf("metadata.name=%s", pod.Name),
+							})
+							if err != nil {
+								framework.Logf("Unable to watch pod %s: %v", pod.Name, err)
+								return
+							}
+							defer w.Stop()
+							events := []watch.Event{
+								{Type: watch.Added, Object: created},
+							}
+							for event := range w.ResultChan() {
+								events = append(events, event)
+								if event.Type == watch.Deleted {
+									break
+								}
+							}
+							ch <- events
+						}()
+
+						t := time.Duration(rand.Intn(delay)) * time.Millisecond
+						time.Sleep(t)
+						err := podClient.Delete(context.TODO(), pod.Name, nil)
+						framework.ExpectNoError(err, "failed to delete pod")
+
+						events, ok := <-ch
+						if !ok {
+							continue
+						}
+						if len(events) < 2 {
+							framework.Fail("only got a single event")
+						}
+
+						end := time.Now()
+
+						// check the returned events for consistency
+						var duration, completeDuration time.Duration
+						var hasContainers, hasTerminated, hasTerminalPhase, hasRunningContainers bool
+						verifyFn := func(event watch.Event) error {
+							var ok bool
+							pod, ok = event.Object.(*v1.Pod)
+							if !ok {
+								framework.Logf("Unexpected event object: %s %#v", event.Type, event.Object)
+								return nil
+							}
+
+							if len(pod.Status.InitContainerStatuses) != 0 {
+								return fmt.Errorf("pod %s on node %s had incorrect init containers: %#v", pod.Name, pod.Spec.NodeName, pod.Status.InitContainerStatuses)
+							}
+							if len(pod.Status.ContainerStatuses) == 0 {
+								if hasContainers {
+									return fmt.Errorf("pod %s on node %s had incorrect containers: %#v", pod.Name, pod.Spec.NodeName, pod.Status.ContainerStatuses)
+								}
+								return nil
+							}
+							hasContainers = true
+							if len(pod.Status.ContainerStatuses) != 1 {
+								return fmt.Errorf("pod %s on node %s had incorrect containers: %#v", pod.Name, pod.Spec.NodeName, pod.Status.ContainerStatuses)
+							}
+							status := pod.Status.ContainerStatuses[0]
+							t := status.State.Terminated
+							if hasTerminated {
+								if status.State.Waiting != nil || status.State.Running != nil {
+									return fmt.Errorf("pod %s on node %s was terminated and then changed state: %#v", pod.Name, pod.Spec.NodeName, status)
+								}
+								if t == nil {
+									return fmt.Errorf("pod %s on node %s was terminated and then had termination cleared: %#v", pod.Name, pod.Spec.NodeName, status)
+								}
+							}
+							hasRunningContainers = status.State.Waiting == nil && status.State.Terminated == nil
+							if t != nil {
+								if !t.FinishedAt.Time.IsZero() {
+									duration = t.FinishedAt.Sub(t.StartedAt.Time)
+									completeDuration = t.FinishedAt.Sub(pod.CreationTimestamp.Time)
+								}
+
+								defer func() { hasTerminated = true }()
+								switch {
+								case t.ExitCode == 1:
+									// expected
+								case t.ExitCode == 128 && reBug88766.MatchString(t.Message):
+									// pod volume teardown races with container start in CRI, which reports a failure
+									framework.Logf("pod %s on node %s failed with the symptoms of https://github.com/kubernetes/kubernetes/issues/88766", pod.Name, pod.Spec.NodeName)
+								default:
+									return fmt.Errorf("pod %s on node %s container unexpected exit code %d: start=%s end=%s reason=%s message=%s", pod.Name, pod.Spec.NodeName, t.ExitCode, t.StartedAt, t.FinishedAt, t.Reason, t.Message)
+								}
+							}
+							if pod.Status.Phase == v1.PodFailed || pod.Status.Phase == v1.PodSucceeded {
+								hasTerminalPhase = true
+							} else {
+								if hasTerminalPhase {
+									return fmt.Errorf("pod %s on node %s was in a terminal phase and then reverted: %#v", pod.Name, pod.Spec.NodeName, pod.Status)
+								}
+							}
+							return nil
+						}
+
+						var eventErr error
+						for _, event := range events[1:] {
+							if err := verifyFn(event); err != nil {
+								eventErr = err
+								break
+							}
+						}
+						func() {
+							lock.Lock()
+							defer lock.Unlock()
+
+							if eventErr != nil {
+								errs = append(errs, eventErr)
+								return
+							}
+
+							if !hasTerminalPhase {
+								var names []string
+								for _, status := range pod.Status.ContainerStatuses {
+									if status.State.Terminated != nil || status.State.Running != nil {
+										names = append(names, status.Name)
+									}
+								}
+								switch {
+								case len(names) > 0:
+									errs = append(errs, fmt.Errorf("pod %s on node %s did not reach a terminal phase before being deleted but had running containers: phase=%s, running-containers=%s", pod.Name, pod.Spec.NodeName, pod.Status.Phase, strings.Join(names, ",")))
+								case pod.Status.Phase != v1.PodPending:
+									errs = append(errs, fmt.Errorf("pod %s on node %s was not Pending but has no running containers: phase=%s", pod.Name, pod.Spec.NodeName, pod.Status.Phase))
+								}
+							}
+							if hasRunningContainers {
+								data, _ := json.MarshalIndent(pod.Status.ContainerStatuses, "", " ")
+								errs = append(errs, fmt.Errorf("pod %s on node %s had running or unknown container status before being deleted:\n%s", pod.Name, pod.Spec.NodeName, string(data)))
+							}
+						}()
+
+						if duration < min {
+							min = duration
+						}
+						if duration > max || max == 0 {
+							max = duration
+						}
+						framework.Logf("Pod %s on node %s timings total=%s t=%s run=%s execute=%s", pod.Name, pod.Spec.NodeName, end.Sub(start), t, completeDuration, duration)
+					}
+
+				}(i)
+			}
+
+			wg.Wait()
+
+			if len(errs) > 0 {
+				var messages []string
+				for _, err := range errs {
+					messages = append(messages, err.Error())
+				}
+				framework.Failf("%d errors:\n%v", len(errs), strings.Join(messages, "\n"))
+			}
+		})
+	})
 })
