
Commit c84a371

Fix a flaky e2e test of Job completion
The test was flaky because it required the job to succeed 3 times within 15 minutes, with a pseudorandom 50% failure chance per run, while failed pods are recreated only after an exponential back-off delay (10s, 20s, 40s, …) capped at 6 minutes. Since 7 consecutive failures (a 1/128 chance) could take 20+ minutes, exceeding the timeout, the test intermittently failed with "timed out waiting for the condition". This PR forces the Pods of a Job to be scheduled to a single node and uses a hostPath volume instead of an emptyDir to persist data across new Pods.
1 parent c33bbbc commit c84a371
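To make the timing claim concrete, here is a rough back-of-the-envelope sketch (not part of the commit; the 30-second per-attempt runtime is an assumed, illustrative value) that sums the recreation delays for 7 consecutive failures:

// backoff_estimate.go - a quick sanity check of the worst case described above.
// The recreation delay starts at 10s, doubles after each failure, and is capped
// at 6 minutes, as stated in the commit message.
package main

import (
    "fmt"
    "time"
)

func main() {
    const (
        failures    = 7                // pseudorandom 50% failure => (1/2)^7 = 1/128 chance
        backoffCap  = 6 * time.Minute  // maximum delay before recreating a failed pod
        podRuntime  = 30 * time.Second // assumed time to schedule, run, and fail one pod
        testTimeout = 15 * time.Minute // the e2e wait that was timing out
    )

    total := time.Duration(0)
    delay := 10 * time.Second
    for i := 0; i < failures; i++ {
        total += podRuntime + delay
        delay *= 2
        if delay > backoffCap {
            delay = backoffCap
        }
    }
    // With these numbers: 7*30s + (10+20+40+80+160+320+360)s = 20m, past the 15m timeout.
    fmt.Printf("worst case for %d straight failures: %v (timeout: %v)\n", failures, total, testTimeout)
}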

File tree

3 files changed: 42 additions, 20 deletions


test/e2e/apps/job.go

Lines changed: 18 additions & 17 deletions
@@ -26,6 +26,7 @@ import (
     batchinternal "k8s.io/kubernetes/pkg/apis/batch"
     "k8s.io/kubernetes/test/e2e/framework"
     jobutil "k8s.io/kubernetes/test/e2e/framework/job"
+    e2enode "k8s.io/kubernetes/test/e2e/framework/node"
     e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
 
     "github.com/onsi/ginkgo"
@@ -94,12 +95,10 @@ var _ = SIGDescribe("Job", func() {
     framework.ConformanceIt("should run a job to completion when tasks sometimes fail and are locally restarted", func() {
         ginkgo.By("Creating a job")
         // One failure, then a success, local restarts.
-        // We can't use the random failure approach used by the
-        // non-local test below, because kubelet will throttle
-        // frequently failing containers in a given pod, ramping
-        // up to 5 minutes between restarts, making test timeouts
-        // due to successive failures too likely with a reasonable
-        // test timeout.
+        // We can't use the random failure approach, because kubelet will
+        // throttle frequently failing containers in a given pod, ramping
+        // up to 5 minutes between restarts, making test timeout due to
+        // successive failures too likely with a reasonable test timeout.
         job := jobutil.NewTestJob("failOnce", "fail-once-local", v1.RestartPolicyOnFailure, parallelism, completions, nil, backoffLimit)
         job, err := jobutil.CreateJob(f.ClientSet, f.Namespace.Name, job)
         framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
@@ -111,18 +110,20 @@ var _ = SIGDescribe("Job", func() {
 
     // Pods sometimes fail, but eventually succeed, after pod restarts
     ginkgo.It("should run a job to completion when tasks sometimes fail and are not locally restarted", func() {
+        // One failure, then a success, no local restarts.
+        // We can't use the random failure approach, because JobController
+        // will throttle frequently failing Pods of a given Job, ramping
+        // up to 6 minutes between restarts, making test timeout due to
+        // successive failures.
+        // Instead, we force the Job's Pods to be scheduled to a single Node
+        // and use a hostPath volume to persist data across new Pods.
+        ginkgo.By("Looking for a node to schedule job pod")
+        node, err := e2enode.GetRandomReadySchedulableNode(f.ClientSet)
+        framework.ExpectNoError(err)
+
         ginkgo.By("Creating a job")
-        // 50% chance of container success, local restarts.
-        // Can't use the failOnce approach because that relies
-        // on an emptyDir, which is not preserved across new pods.
-        // Worst case analysis: 15 failures, each taking 1 minute to
-        // run due to some slowness, 1 in 2^15 chance of happening,
-        // causing test flake. Should be very rare.
-        // With the introduction of backoff limit and high failure rate this
-        // is hitting its timeout, the 3 is a reasonable that should make this
-        // test less flaky, for now.
-        job := jobutil.NewTestJob("randomlySucceedOrFail", "rand-non-local", v1.RestartPolicyNever, parallelism, 3, nil, 999)
-        job, err := jobutil.CreateJob(f.ClientSet, f.Namespace.Name, job)
+        job := jobutil.NewTestJobOnNode("failOnce", "fail-once-non-local", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit, node.Name)
+        job, err = jobutil.CreateJob(f.ClientSet, f.Namespace.Name, job)
         framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
 
         ginkgo.By("Ensuring job reaches completions")

test/e2e/framework/job/BUILD

Lines changed: 1 addition & 0 deletions
@@ -17,6 +17,7 @@ go_library(
         "//staging/src/k8s.io/apimachinery/pkg/api/errors:go_default_library",
         "//staging/src/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
         "//staging/src/k8s.io/apimachinery/pkg/labels:go_default_library",
+        "//staging/src/k8s.io/apimachinery/pkg/util/rand:go_default_library",
         "//staging/src/k8s.io/apimachinery/pkg/util/wait:go_default_library",
         "//staging/src/k8s.io/client-go/kubernetes:go_default_library",
         "//test/e2e/framework:go_default_library",

test/e2e/framework/job/fixtures.go

Lines changed: 23 additions & 3 deletions
@@ -20,6 +20,7 @@ import (
     batchv1 "k8s.io/api/batch/v1"
     "k8s.io/api/core/v1"
     metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+    "k8s.io/apimachinery/pkg/util/rand"
     "k8s.io/kubernetes/test/e2e/framework"
 )
 
@@ -30,6 +31,13 @@ import (
 // policy of the containers in which the Pod is running. Parallelism is the Job's parallelism, and completions is the
 // Job's required number of completions.
 func NewTestJob(behavior, name string, rPol v1.RestartPolicy, parallelism, completions int32, activeDeadlineSeconds *int64, backoffLimit int32) *batchv1.Job {
+    anyNode := ""
+    return NewTestJobOnNode(behavior, name, rPol, parallelism, completions, activeDeadlineSeconds, backoffLimit, anyNode)
+}
+
+// NewTestJobOnNode is similar to NewTestJob but supports specifying a Node on which the Job's Pods will run.
+// Empty nodeName means no node selection constraints.
+func NewTestJobOnNode(behavior, name string, rPol v1.RestartPolicy, parallelism, completions int32, activeDeadlineSeconds *int64, backoffLimit int32, nodeName string) *batchv1.Job {
     manualSelector := false
     job := &batchv1.Job{
         ObjectMeta: metav1.ObjectMeta{
@@ -72,6 +80,7 @@ func NewTestJob(behavior, name string, rPol v1.RestartPolicy, parallelism, compl
                             SecurityContext: &v1.SecurityContext{},
                         },
                     },
+                    NodeName: nodeName,
                 },
             },
         },
@@ -89,10 +98,21 @@ func NewTestJob(behavior, name string, rPol v1.RestartPolicy, parallelism, compl
         job.Spec.Template.Spec.Containers[0].Command = []string{"/bin/sh", "-c", "exit $(( $RANDOM / 16384 ))"}
     case "failOnce":
         // Fail the first the container of the pod is run, and
-        // succeed the second time. Checks for file on emptydir.
+        // succeed the second time. Checks for file on a data volume.
         // If present, succeed. If not, create but fail.
-        // Note that this cannot be used with RestartNever because
-        // it always fails the first time for a pod.
+        // If RestartPolicy is Never, the nodeName should be set to
+        // ensure all job pods run on a single node and the volume
+        // will be mounted from a hostPath instead.
+        if len(nodeName) > 0 {
+            randomDir := "/tmp/job-e2e/" + rand.String(10)
+            hostPathType := v1.HostPathDirectoryOrCreate
+            job.Spec.Template.Spec.Volumes[0].VolumeSource = v1.VolumeSource{HostPath: &v1.HostPathVolumeSource{Path: randomDir, Type: &hostPathType}}
+            // Tests involving r/w operations on hostPath volume needs to run in
+            // privileged mode for SELinux enabled distro, while Windows platform
+            // neither supports nor needs privileged mode.
+            privileged := !framework.NodeOSDistroIs("windows")
+            job.Spec.Template.Spec.Containers[0].SecurityContext.Privileged = &privileged
+        }
         job.Spec.Template.Spec.Containers[0].Command = []string{"/bin/sh", "-c", "if [[ -r /data/foo ]] ; then exit 0 ; else touch /data/foo ; exit 1 ; fi"}
     }
     return job
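As a hedged usage sketch (not part of the commit), this is roughly how the new fixture behaves when a node name is supplied; the node name "worker-0" and the numeric parallelism/completions/backoffLimit values are illustrative assumptions only.

// fixture_usage.go - illustrative only; names and values here are assumptions.
package main

import (
    "fmt"

    v1 "k8s.io/api/core/v1"
    jobutil "k8s.io/kubernetes/test/e2e/framework/job"
)

func main() {
    // behavior="failOnce" plus a non-empty node name triggers the hostPath swap.
    job := jobutil.NewTestJobOnNode("failOnce", "fail-once-non-local",
        v1.RestartPolicyNever, 2, 4, nil, 6, "worker-0")

    spec := job.Spec.Template.Spec
    // All of the Job's pods are pinned to the same node...
    fmt.Println(spec.NodeName) // "worker-0"
    // ...and the data volume is a hostPath under /tmp/job-e2e/<random>, so the
    // /data/foo marker created by a failed pod is still there for its replacement.
    fmt.Println(spec.Volumes[0].VolumeSource.HostPath.Path)
    // On non-Windows distros the container also runs privileged so it can write
    // to the hostPath on SELinux-enabled hosts.
    fmt.Println(*spec.Containers[0].SecurityContext.Privileged)
}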
