Skip to content

Commit 9d1eb45

Browse files
authored
Merge pull request kubernetes#86627 from tnqn/flaky-e2e-job
Fix a flaky e2e test of Job completion
2 parents 50dcac4 + c84a371 commit 9d1eb45

File tree

3 files changed

+42
-20
lines changed

3 files changed

+42
-20
lines changed

test/e2e/apps/job.go

Lines changed: 18 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ import (
2929
batchinternal "k8s.io/kubernetes/pkg/apis/batch"
3030
"k8s.io/kubernetes/test/e2e/framework"
3131
jobutil "k8s.io/kubernetes/test/e2e/framework/job"
32+
e2enode "k8s.io/kubernetes/test/e2e/framework/node"
3233
e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
3334

3435
"github.com/onsi/ginkgo"
@@ -97,12 +98,10 @@ var _ = SIGDescribe("Job", func() {
9798
framework.ConformanceIt("should run a job to completion when tasks sometimes fail and are locally restarted", func() {
9899
ginkgo.By("Creating a job")
99100
// One failure, then a success, local restarts.
100-
// We can't use the random failure approach used by the
101-
// non-local test below, because kubelet will throttle
102-
// frequently failing containers in a given pod, ramping
103-
// up to 5 minutes between restarts, making test timeouts
104-
// due to successive failures too likely with a reasonable
105-
// test timeout.
101+
// We can't use the random failure approach, because kubelet will
102+
// throttle frequently failing containers in a given pod, ramping
103+
// up to 5 minutes between restarts, making test timeout due to
104+
// successive failures too likely with a reasonable test timeout.
106105
job := jobutil.NewTestJob("failOnce", "fail-once-local", v1.RestartPolicyOnFailure, parallelism, completions, nil, backoffLimit)
107106
job, err := jobutil.CreateJob(f.ClientSet, f.Namespace.Name, job)
108107
framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
@@ -114,18 +113,20 @@ var _ = SIGDescribe("Job", func() {
114113

115114
// Pods sometimes fail, but eventually succeed, after pod restarts
116115
ginkgo.It("should run a job to completion when tasks sometimes fail and are not locally restarted", func() {
116+
// One failure, then a success, no local restarts.
117+
// We can't use the random failure approach, because JobController
118+
// will throttle frequently failing Pods of a given Job, ramping
119+
// up to 6 minutes between restarts, making test timeout due to
120+
// successive failures.
121+
// Instead, we force the Job's Pods to be scheduled to a single Node
122+
// and use a hostPath volume to persist data across new Pods.
123+
ginkgo.By("Looking for a node to schedule job pod")
124+
node, err := e2enode.GetRandomReadySchedulableNode(f.ClientSet)
125+
framework.ExpectNoError(err)
126+
117127
ginkgo.By("Creating a job")
118-
// 50% chance of container success, local restarts.
119-
// Can't use the failOnce approach because that relies
120-
// on an emptyDir, which is not preserved across new pods.
121-
// Worst case analysis: 15 failures, each taking 1 minute to
122-
// run due to some slowness, 1 in 2^15 chance of happening,
123-
// causing test flake. Should be very rare.
124-
// With the introduction of backoff limit and high failure rate this
125-
// is hitting its timeout, the 3 is a reasonable that should make this
126-
// test less flaky, for now.
127-
job := jobutil.NewTestJob("randomlySucceedOrFail", "rand-non-local", v1.RestartPolicyNever, parallelism, 3, nil, 999)
128-
job, err := jobutil.CreateJob(f.ClientSet, f.Namespace.Name, job)
128+
job := jobutil.NewTestJobOnNode("failOnce", "fail-once-non-local", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit, node.Name)
129+
job, err = jobutil.CreateJob(f.ClientSet, f.Namespace.Name, job)
129130
framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
130131

131132
ginkgo.By("Ensuring job reaches completions")

test/e2e/framework/job/BUILD

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ go_library(
1717
"//staging/src/k8s.io/apimachinery/pkg/api/errors:go_default_library",
1818
"//staging/src/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
1919
"//staging/src/k8s.io/apimachinery/pkg/labels:go_default_library",
20+
"//staging/src/k8s.io/apimachinery/pkg/util/rand:go_default_library",
2021
"//staging/src/k8s.io/apimachinery/pkg/util/wait:go_default_library",
2122
"//staging/src/k8s.io/client-go/kubernetes:go_default_library",
2223
"//test/e2e/framework:go_default_library",

test/e2e/framework/job/fixtures.go

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020
batchv1 "k8s.io/api/batch/v1"
2121
"k8s.io/api/core/v1"
2222
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
23+
"k8s.io/apimachinery/pkg/util/rand"
2324
"k8s.io/kubernetes/test/e2e/framework"
2425
)
2526

@@ -30,6 +31,13 @@ import (
3031
// policy of the containers in which the Pod is running. Parallelism is the Job's parallelism, and completions is the
3132
// Job's required number of completions.
3233
func NewTestJob(behavior, name string, rPol v1.RestartPolicy, parallelism, completions int32, activeDeadlineSeconds *int64, backoffLimit int32) *batchv1.Job {
34+
anyNode := ""
35+
return NewTestJobOnNode(behavior, name, rPol, parallelism, completions, activeDeadlineSeconds, backoffLimit, anyNode)
36+
}
37+
38+
// NewTestJobOnNode is similar to NewTestJob but supports specifying a Node on which the Job's Pods will run.
39+
// Empty nodeName means no node selection constraints.
40+
func NewTestJobOnNode(behavior, name string, rPol v1.RestartPolicy, parallelism, completions int32, activeDeadlineSeconds *int64, backoffLimit int32, nodeName string) *batchv1.Job {
3341
manualSelector := false
3442
job := &batchv1.Job{
3543
ObjectMeta: metav1.ObjectMeta{
@@ -72,6 +80,7 @@ func NewTestJob(behavior, name string, rPol v1.RestartPolicy, parallelism, compl
7280
SecurityContext: &v1.SecurityContext{},
7381
},
7482
},
83+
NodeName: nodeName,
7584
},
7685
},
7786
},
@@ -89,10 +98,21 @@ func NewTestJob(behavior, name string, rPol v1.RestartPolicy, parallelism, compl
8998
job.Spec.Template.Spec.Containers[0].Command = []string{"/bin/sh", "-c", "exit $(( $RANDOM / 16384 ))"}
9099
case "failOnce":
91100
// Fail the first the container of the pod is run, and
92-
// succeed the second time. Checks for file on emptydir.
101+
// succeed the second time. Checks for file on a data volume.
93102
// If present, succeed. If not, create but fail.
94-
// Note that this cannot be used with RestartNever because
95-
// it always fails the first time for a pod.
103+
// If RestartPolicy is Never, the nodeName should be set to
104+
// ensure all job pods run on a single node and the volume
105+
// will be mounted from a hostPath instead.
106+
if len(nodeName) > 0 {
107+
randomDir := "/tmp/job-e2e/" + rand.String(10)
108+
hostPathType := v1.HostPathDirectoryOrCreate
109+
job.Spec.Template.Spec.Volumes[0].VolumeSource = v1.VolumeSource{HostPath: &v1.HostPathVolumeSource{Path: randomDir, Type: &hostPathType}}
110+
// Tests involving r/w operations on hostPath volume needs to run in
111+
// privileged mode for SELinux enabled distro, while Windows platform
112+
// neither supports nor needs privileged mode.
113+
privileged := !framework.NodeOSDistroIs("windows")
114+
job.Spec.Template.Spec.Containers[0].SecurityContext.Privileged = &privileged
115+
}
96116
job.Spec.Template.Spec.Containers[0].Command = []string{"/bin/sh", "-c", "if [[ -r /data/foo ]] ; then exit 0 ; else touch /data/foo ; exit 1 ; fi"}
97117
}
98118
return job

0 commit comments

Comments (0)