@@ -29,6 +29,7 @@ import (
29
29
batchinternal "k8s.io/kubernetes/pkg/apis/batch"
30
30
"k8s.io/kubernetes/test/e2e/framework"
31
31
jobutil "k8s.io/kubernetes/test/e2e/framework/job"
32
+ e2enode "k8s.io/kubernetes/test/e2e/framework/node"
32
33
e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
33
34
34
35
"github.com/onsi/ginkgo"
@@ -97,12 +98,10 @@ var _ = SIGDescribe("Job", func() {
97
98
framework .ConformanceIt ("should run a job to completion when tasks sometimes fail and are locally restarted" , func () {
98
99
ginkgo .By ("Creating a job" )
99
100
// One failure, then a success, local restarts.
100
- // We can't use the random failure approach used by the
101
- // non-local test below, because kubelet will throttle
102
- // frequently failing containers in a given pod, ramping
103
- // up to 5 minutes between restarts, making test timeouts
104
- // due to successive failures too likely with a reasonable
105
- // test timeout.
101
+ // We can't use the random failure approach, because kubelet will
102
+ // throttle frequently failing containers in a given pod, ramping
103
+ // up to 5 minutes between restarts, making test timeouts due to
104
+ // successive failures too likely with a reasonable test timeout.
106
105
job := jobutil .NewTestJob ("failOnce" , "fail-once-local" , v1 .RestartPolicyOnFailure , parallelism , completions , nil , backoffLimit )
107
106
job , err := jobutil .CreateJob (f .ClientSet , f .Namespace .Name , job )
108
107
framework .ExpectNoError (err , "failed to create job in namespace: %s" , f .Namespace .Name )
@@ -114,18 +113,20 @@ var _ = SIGDescribe("Job", func() {
114
113
115
114
// Pods sometimes fail, but eventually succeed, after pod restarts
116
115
ginkgo .It ("should run a job to completion when tasks sometimes fail and are not locally restarted" , func () {
116
+ // One failure, then a success, no local restarts.
117
+ // We can't use the random failure approach, because JobController
118
+ // will throttle frequently failing Pods of a given Job, ramping
119
+ // up to 6 minutes between restarts, making the test time out due to
120
+ // successive failures.
121
+ // Instead, we force the Job's Pods to be scheduled to a single Node
122
+ // and use a hostPath volume to persist data across new Pods.
123
+ ginkgo .By ("Looking for a node to schedule job pod" )
124
+ node , err := e2enode .GetRandomReadySchedulableNode (f .ClientSet )
125
+ framework .ExpectNoError (err )
126
+
117
127
ginkgo .By ("Creating a job" )
118
- // 50% chance of container success, local restarts.
119
- // Can't use the failOnce approach because that relies
120
- // on an emptyDir, which is not preserved across new pods.
121
- // Worst case analysis: 15 failures, each taking 1 minute to
122
- // run due to some slowness, 1 in 2^15 chance of happening,
123
- // causing test flake. Should be very rare.
124
- // With the introduction of backoff limit and high failure rate this
125
- // is hitting its timeout, the 3 is a reasonable that should make this
126
- // test less flaky, for now.
127
- job := jobutil .NewTestJob ("randomlySucceedOrFail" , "rand-non-local" , v1 .RestartPolicyNever , parallelism , 3 , nil , 999 )
128
- job , err := jobutil .CreateJob (f .ClientSet , f .Namespace .Name , job )
128
+ job := jobutil .NewTestJobOnNode ("failOnce" , "fail-once-non-local" , v1 .RestartPolicyNever , parallelism , completions , nil , backoffLimit , node .Name )
129
+ job , err = jobutil .CreateJob (f .ClientSet , f .Namespace .Name , job )
129
130
framework .ExpectNoError (err , "failed to create job in namespace: %s" , f .Namespace .Name )
130
131
131
132
ginkgo .By ("Ensuring job reaches completions" )
0 commit comments