Skip to content

Commit 900beff

Browse files
authored
improve e2e testing of failures (#236)
1. Add test for Job with stuckInit. 2. Merge two failing job scenarios into one to reduce CI time.
1 parent 65cd689 commit 900beff

File tree

2 files changed

+57
-14
lines changed

2 files changed

+57
-14
lines changed

test/e2e/appwrapper_test.go

Lines changed: 21 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -280,20 +280,7 @@ var _ = Describe("AppWrapper E2E Test", func() {
280280
Eventually(AppWrapperPhase(ctx, aw), 60*time.Second).Should(Equal(workloadv1beta2.AppWrapperSucceeded))
281281
})
282282

283-
It("A failed Batch Job yields a failed AppWrapper", func() {
284-
aw := toAppWrapper(failingBatchjob(500))
285-
if aw.Annotations == nil {
286-
aw.Annotations = make(map[string]string)
287-
}
288-
aw.Annotations[workloadv1beta2.FailureGracePeriodDurationAnnotation] = "0s"
289-
aw.Annotations[workloadv1beta2.RetryLimitAnnotation] = "0"
290-
Expect(getClient(ctx).Create(ctx, aw)).To(Succeed())
291-
appwrappers = append(appwrappers, aw)
292-
Expect(waitAWPodsReady(ctx, aw)).Should(Succeed())
293-
Eventually(AppWrapperPhase(ctx, aw), 90*time.Second).Should(Equal(workloadv1beta2.AppWrapperFailed))
294-
})
295-
296-
It("Failed Jobs will be retried up to retryLimit", func() {
283+
It("A failed Batch Job will be Reset up to retryLimit and then Failed", func() {
297284
aw := toAppWrapper(failingBatchjob(500))
298285
if aw.Annotations == nil {
299286
aw.Annotations = make(map[string]string)
@@ -348,6 +335,26 @@ var _ = Describe("AppWrapper E2E Test", func() {
348335
})
349336
})
350337

338+
Describe("Detection of Startup Failures", Label("slow"), Label("Kueue", "Standalone"), func() {
339+
It("Job with stuck init is detected and Failed", func() {
340+
aw := toAppWrapper(stuckInitBatchjob(100))
341+
if aw.Annotations == nil {
342+
aw.Annotations = make(map[string]string)
343+
}
344+
aw.Annotations[workloadv1beta2.FailureGracePeriodDurationAnnotation] = "10s"
345+
aw.Annotations[workloadv1beta2.WarmupGracePeriodDurationAnnotation] = "10s"
346+
aw.Annotations[workloadv1beta2.RetryLimitAnnotation] = "1"
347+
aw.Annotations[workloadv1beta2.RetryPausePeriodDurationAnnotation] = "0s"
348+
Expect(getClient(ctx).Create(ctx, aw)).To(Succeed())
349+
appwrappers = append(appwrappers, aw)
350+
Eventually(AppWrapperPhase(ctx, aw), 30*time.Second).Should(Equal(workloadv1beta2.AppWrapperRunning))
351+
Eventually(AppWrapperPhase(ctx, aw), 30*time.Second).Should(Equal(workloadv1beta2.AppWrapperResetting))
352+
Eventually(AppWrapperPhase(ctx, aw), 180*time.Second).Should(Equal(workloadv1beta2.AppWrapperFailed))
353+
aw = getAppWrapper(ctx, types.NamespacedName{Name: aw.Name, Namespace: aw.Namespace})
354+
Expect(aw.Status.Retries).Should(Equal(int32(1)))
355+
})
356+
})
357+
351358
Describe("Load Testing", Label("slow"), Label("Kueue", "Standalone"), func() {
352359
It("Create 50 AppWrappers", func() {
353360
const (

test/e2e/fixtures_test.go

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -300,6 +300,42 @@ func succeedingBatchjob(milliCPU int64) workloadv1beta2.AppWrapperComponent {
300300
}
301301
}
302302

303+
const stuckInitBatchJobYAML = `
304+
apiVersion: batch/v1
305+
kind: Job
306+
metadata:
307+
name: %v
308+
spec:
309+
template:
310+
spec:
311+
restartPolicy: Never
312+
terminationGracePeriodSeconds: 0
313+
initContainers:
314+
- name: stuck-init
315+
image: quay.io/project-codeflare/busybox:1.36
316+
command: ["sh", "-c", "sleep 100000; exit 1"]
317+
containers:
318+
- name: busybox
319+
image: quay.io/project-codeflare/busybox:1.36
320+
command: ["sh", "-c", "sleep 10; exit 1"]
321+
resources:
322+
requests:
323+
cpu: %v
324+
`
325+
326+
func stuckInitBatchjob(milliCPU int64) workloadv1beta2.AppWrapperComponent {
327+
yamlString := fmt.Sprintf(stuckInitBatchJobYAML,
328+
randName("batchjob"),
329+
resource.NewMilliQuantity(milliCPU, resource.DecimalSI))
330+
331+
jsonBytes, err := yaml.YAMLToJSON([]byte(yamlString))
332+
Expect(err).NotTo(HaveOccurred())
333+
return workloadv1beta2.AppWrapperComponent{
334+
DeclaredPodSets: []workloadv1beta2.AppWrapperPodSet{{Path: "template.spec.template"}},
335+
Template: runtime.RawExtension{Raw: jsonBytes},
336+
}
337+
}
338+
303339
// This is not a useful PyTorchJob:
304340
// 1. Using a dummy busybox image to avoid pulling a large & rate-limited image from dockerhub
305341
// 2. We avoid needing the injected sidecar (alpine:3.10 from dockerhub) by not specifying a Master

0 commit comments

Comments
 (0)