Skip to content

Commit d66928b

Browse files
authored
Add Job e2e for tracking failure count per index (kubernetes#130390)
* Add Job e2e for tracking failure count per index * Review remarks
1 parent 49f419e commit d66928b

File tree

2 files changed

+165
-0
lines changed

2 files changed

+165
-0
lines changed

test/e2e/apps/job.go

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -660,6 +660,57 @@ done`}
660660
gomega.Expect(job.Status.Failed).Should(gomega.Equal(int32(1)))
661661
})
662662

663+
/*
664+
Testname: Track the failure count per index in Pod annotation when backoffLimitPerIndex is used
665+
Description: Create an indexed job and ensure that the Pods are
666+
re-created with the failure-count Pod annotation set properly to
667+
indicate the number of so-far failures per index.
668+
*/
669+
ginkgo.It("should record the failure-count in the Pod annotation when using backoffLimitPerIndex", func(ctx context.Context) {
670+
jobName := "e2e-backofflimitperindex-" + utilrand.String(5)
671+
label := map[string]string{batchv1.JobNameLabel: jobName}
672+
labelSelector := labels.SelectorFromSet(label).String()
673+
674+
parallelism := int32(2)
675+
completions := int32(2)
676+
backoffLimit := int32(6) // default value
677+
678+
job := e2ejob.NewTestJob("fail", jobName, v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit)
679+
job.Spec.BackoffLimit = nil
680+
job.Spec.BackoffLimitPerIndex = ptr.To[int32](1)
681+
job.Spec.CompletionMode = ptr.To(batchv1.IndexedCompletion)
682+
683+
tracker := NewIndexedPodAnnotationTracker(jobName, f.Namespace.Name, labelSelector, batchv1.JobCompletionIndexAnnotation, batchv1.JobIndexFailureCountAnnotation)
684+
trackerCancel := tracker.Start(ctx, f.ClientSet)
685+
ginkgo.DeferCleanup(trackerCancel)
686+
687+
ginkgo.By("Creating an indexed job with backoffLimit per index and failing pods")
688+
job, err := e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
689+
framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
690+
691+
ginkgo.By("Awaiting for the job to fail as there are failed indexes")
692+
err = e2ejob.WaitForJobFailed(ctx, f.ClientSet, f.Namespace.Name, job.Name)
693+
framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", f.Namespace.Name)
694+
695+
ginkgo.By("Verify the failure-count annotation on Pods")
696+
// Since the Job is already failed all the relevant Pod events are
697+
// already being distributed. Still, there might be a little bit of lag
698+
// between the events being receiced by the Job controller and the test
699+
// code so we need to wait a little bit.
700+
gomega.Eventually(ctx, tracker.cloneTrackedAnnotations).
701+
WithTimeout(15 * time.Second).
702+
WithPolling(500 * time.Millisecond).
703+
Should(gomega.Equal(map[int][]string{0: {"0", "1"}, 1: {"0", "1"}}))
704+
705+
ginkgo.By("Verifying the Job status fields")
706+
job, err = e2ejob.GetJob(ctx, f.ClientSet, f.Namespace.Name, job.Name)
707+
framework.ExpectNoError(err, "failed to retrieve latest job object")
708+
gomega.Expect(job.Status.FailedIndexes).Should(gomega.HaveValue(gomega.Equal("0,1")))
709+
gomega.Expect(job.Status.CompletedIndexes).Should(gomega.Equal(""))
710+
gomega.Expect(job.Status.Failed).Should(gomega.Equal(int32(4)))
711+
gomega.Expect(job.Status.Succeeded).Should(gomega.Equal(int32(0)))
712+
})
713+
663714
/*
664715
Testcase: Mark indexes as failed when the FailIndex action is matched in podFailurePolicy
665716
Description: Create an indexed job with backoffLimitPerIndex, and podFailurePolicy

test/e2e/apps/util.go

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
/*
2+
Copyright 2025 The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package apps
18+
19+
import (
20+
"context"
21+
"maps"
22+
"strconv"
23+
"sync"
24+
25+
"github.com/onsi/ginkgo/v2"
26+
v1 "k8s.io/api/core/v1"
27+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
28+
"k8s.io/apimachinery/pkg/runtime"
29+
"k8s.io/apimachinery/pkg/watch"
30+
clientset "k8s.io/client-go/kubernetes"
31+
"k8s.io/client-go/tools/cache"
32+
"k8s.io/klog/v2"
33+
"k8s.io/kubernetes/test/e2e/framework"
34+
)
35+
36+
type IndexedPodAnnotationTracker struct {
37+
sync.Mutex
38+
ownerName string
39+
ownerNs string
40+
labelSelector string
41+
podIndexAnnotation string
42+
podTrackedAnnotation string
43+
trackedAnnotations map[int][]string
44+
}
45+
46+
func NewIndexedPodAnnotationTracker(ownerName, ownerNs, labelSelector, podIndexAnnotation, podTrackedAnnotation string) *IndexedPodAnnotationTracker {
47+
return &IndexedPodAnnotationTracker{
48+
ownerName: ownerName,
49+
ownerNs: ownerNs,
50+
labelSelector: labelSelector,
51+
podIndexAnnotation: podIndexAnnotation,
52+
podTrackedAnnotation: podTrackedAnnotation,
53+
trackedAnnotations: make(map[int][]string),
54+
}
55+
}
56+
57+
func (t *IndexedPodAnnotationTracker) Start(ctx context.Context, c clientset.Interface) context.CancelFunc {
58+
trackerCtx, trackerCancel := context.WithCancel(ctx)
59+
_, podTracker := cache.NewInformerWithOptions(cache.InformerOptions{
60+
ListerWatcher: &cache.ListWatch{
61+
ListWithContextFunc: func(ctx context.Context, options metav1.ListOptions) (runtime.Object, error) {
62+
options.LabelSelector = t.labelSelector
63+
obj, err := c.CoreV1().Pods(t.ownerNs).List(ctx, options)
64+
return runtime.Object(obj), err
65+
},
66+
WatchFuncWithContext: func(ctx context.Context, options metav1.ListOptions) (watch.Interface, error) {
67+
options.LabelSelector = t.labelSelector
68+
return c.CoreV1().Pods(t.ownerNs).Watch(ctx, options)
69+
},
70+
},
71+
ObjectType: &v1.Pod{},
72+
Handler: cache.ResourceEventHandlerFuncs{
73+
AddFunc: func(obj interface{}) {
74+
defer ginkgo.GinkgoRecover()
75+
if pod, ok := obj.(*v1.Pod); ok {
76+
framework.Logf("Observed event for Pod %q with index=%v, annotation value=%v",
77+
klog.KObj(pod), pod.Annotations[t.podIndexAnnotation], pod.Annotations[t.podTrackedAnnotation])
78+
podIndex, err := strconv.Atoi(pod.Annotations[t.podIndexAnnotation])
79+
if err != nil {
80+
framework.Failf("failed to parse pod index for Pod %q: %v", klog.KObj(pod), err.Error())
81+
} else {
82+
t.Lock()
83+
defer t.Unlock()
84+
t.trackedAnnotations[podIndex] = append(t.trackedAnnotations[podIndex], pod.Annotations[t.podTrackedAnnotation])
85+
}
86+
}
87+
},
88+
UpdateFunc: func(old, new interface{}) {
89+
defer ginkgo.GinkgoRecover()
90+
oldPod, oldOk := old.(*v1.Pod)
91+
newPod, newOk := new.(*v1.Pod)
92+
if !oldOk || !newOk {
93+
return
94+
}
95+
if oldPod.Annotations[t.podTrackedAnnotation] != newPod.Annotations[t.podTrackedAnnotation] {
96+
framework.Failf("Unexepected mutation of the annotation %q for Pod %q, old=%q, new=%q",
97+
t.podTrackedAnnotation,
98+
klog.KObj(newPod),
99+
oldPod.Annotations[t.podTrackedAnnotation],
100+
newPod.Annotations[t.podTrackedAnnotation],
101+
)
102+
}
103+
},
104+
},
105+
})
106+
go podTracker.RunWithContext(trackerCtx)
107+
return trackerCancel
108+
}
109+
110+
func (t *IndexedPodAnnotationTracker) cloneTrackedAnnotations() map[int][]string {
111+
t.Lock()
112+
defer t.Unlock()
113+
return maps.Clone(t.trackedAnnotations)
114+
}

0 commit comments

Comments
 (0)