Skip to content

Commit 77a3de3

Browse files
Improve performance of Job controller delete event handler
1 parent af2bf2d commit 77a3de3

File tree

2 files changed

+114
-24
lines changed

2 files changed

+114
-24
lines changed

pkg/controller/job/job_controller.go

Lines changed: 78 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ type Controller struct {
113113
queue workqueue.TypedRateLimitingInterface[string]
114114

115115
// Orphan deleted pods that still have a Job tracking finalizer to be removed
116-
orphanQueue workqueue.TypedRateLimitingInterface[string]
116+
orphanQueue workqueue.TypedRateLimitingInterface[orphanPodKey]
117117

118118
broadcaster record.EventBroadcaster
119119
recorder record.EventRecorder
@@ -143,6 +143,23 @@ type syncJobCtx struct {
143143
ready int32
144144
}
145145

146+
type orphanPodKeyKind int
147+
148+
const (
149+
// "key"
150+
OrphanPodKeyKindName orphanPodKeyKind = iota
151+
// "selector"
152+
OrphanPodKeyKindSelector
153+
)
154+
155+
type orphanPodKey struct {
156+
// Either "name" or "selector"
157+
kind orphanPodKeyKind
158+
namespace string
159+
// Either "pod name" or "pod selector"
160+
value string
161+
}
162+
146163
// NewController creates a new Job controller that keeps the relevant pods
147164
// in sync with their corresponding Job objects.
148165
func NewController(ctx context.Context, podInformer coreinformers.PodInformer, jobInformer batchinformers.JobInformer, kubeClient clientset.Interface) (*Controller, error) {
@@ -162,7 +179,7 @@ func newControllerWithClock(ctx context.Context, podInformer coreinformers.PodIn
162179
expectations: controller.NewControllerExpectations(),
163180
finalizerExpectations: newUIDTrackingExpectations(),
164181
queue: workqueue.NewTypedRateLimitingQueueWithConfig(workqueue.NewTypedItemExponentialFailureRateLimiter[string](DefaultJobApiBackOff, MaxJobApiBackOff), workqueue.TypedRateLimitingQueueConfig[string]{Name: "job", Clock: clock}),
165-
orphanQueue: workqueue.NewTypedRateLimitingQueueWithConfig(workqueue.NewTypedItemExponentialFailureRateLimiter[string](DefaultJobApiBackOff, MaxJobApiBackOff), workqueue.TypedRateLimitingQueueConfig[string]{Name: "job_orphan_pod", Clock: clock}),
182+
orphanQueue: workqueue.NewTypedRateLimitingQueueWithConfig(workqueue.NewTypedItemExponentialFailureRateLimiter[orphanPodKey](DefaultJobApiBackOff, MaxJobApiBackOff), workqueue.TypedRateLimitingQueueConfig[orphanPodKey]{Name: "job_orphan_pod", Clock: clock}),
166183
broadcaster: eventBroadcaster,
167184
recorder: eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "job-controller"}),
168185
clock: clock,
@@ -513,7 +530,17 @@ func (jm *Controller) deleteJob(logger klog.Logger, obj interface{}) {
513530
return
514531
}
515532
}
516-
jm.cleanupPodFinalizers(jobObj)
533+
selector, err := metav1.LabelSelectorAsSelector(jobObj.Spec.Selector)
534+
if err != nil {
535+
utilruntime.HandleError(fmt.Errorf("job %s/%s has invalid label selector: %w", jobObj.Namespace, jobObj.Name, err))
536+
return
537+
}
538+
orphanPodKey := orphanPodKey{
539+
kind: OrphanPodKeyKindSelector,
540+
namespace: jobObj.Namespace,
541+
value: selector.String(),
542+
}
543+
jm.orphanQueue.Add(orphanPodKey)
517544
}
518545

519546
// enqueueSyncJobImmediately tells the Job controller to invoke syncJob
@@ -563,12 +590,12 @@ func (jm *Controller) enqueueSyncJobInternal(logger klog.Logger, obj interface{}
563590
}
564591

565592
func (jm *Controller) enqueueOrphanPod(obj *v1.Pod) {
566-
key, err := controller.KeyFunc(obj)
567-
if err != nil {
568-
utilruntime.HandleError(fmt.Errorf("couldn't get key for object %+v: %v", obj, err))
569-
return
593+
orphanPodKey := orphanPodKey{
594+
kind: OrphanPodKeyKindName,
595+
namespace: obj.Namespace,
596+
value: obj.Name,
570597
}
571-
jm.orphanQueue.Add(key)
598+
jm.orphanQueue.Add(orphanPodKey)
572599
}
573600

574601
// worker runs a worker thread that just dequeues items, processes them, and marks them done.
@@ -620,37 +647,70 @@ func (jm *Controller) processNextOrphanPod(ctx context.Context) bool {
620647
}
621648

622649
// syncOrphanPod removes the tracking finalizer from an orphan pod if found.
623-
func (jm *Controller) syncOrphanPod(ctx context.Context, key string) error {
650+
func (jm *Controller) syncOrphanPod(ctx context.Context, key orphanPodKey) error {
624651
startTime := jm.clock.Now()
625652
logger := klog.FromContext(ctx)
626653
defer func() {
627654
logger.V(4).Info("Finished syncing orphan pod", "pod", key, "elapsed", jm.clock.Since(startTime))
628655
}()
629656

630-
ns, name, err := cache.SplitMetaNamespaceKey(key)
657+
switch key.kind {
658+
case OrphanPodKeyKindName:
659+
pod, err := jm.podStore.Pods(key.namespace).Get(key.value)
660+
if err != nil {
661+
if apierrors.IsNotFound(err) {
662+
logger.V(4).Info("Orphan pod has been deleted", "pod", klog.KRef(key.namespace, key.value))
663+
return nil
664+
}
665+
return err
666+
}
667+
return jm.handleSingleOrphanPod(ctx, pod)
668+
case OrphanPodKeyKindSelector:
669+
logger.V(8).Info("syncing all pods matching the label selector", "namespace", key.namespace, "labelSelector", key.value)
670+
return jm.syncOrphanPodsBySelector(ctx, key.namespace, key.value)
671+
default:
672+
return fmt.Errorf("unknown key type: %d", key.kind)
673+
}
674+
}
675+
676+
// syncOrphanPodsBySelector fetches and processes all pods matching the given label selector.
677+
func (jm *Controller) syncOrphanPodsBySelector(ctx context.Context, namespace string, labelSelector string) error {
678+
logger := klog.FromContext(ctx)
679+
selector, err := labels.Parse(labelSelector)
631680
if err != nil {
632-
return err
681+
return fmt.Errorf("invalid label selector: %w", err)
633682
}
634683

635-
sharedPod, err := jm.podStore.Pods(ns).Get(name)
684+
// Fetch all pods that match the label selector.
685+
// relatively expensive operation but it is called only from the orphan reconciler
686+
pods, err := jm.podStore.Pods(namespace).List(selector)
636687
if err != nil {
637-
if apierrors.IsNotFound(err) {
638-
logger.V(4).Info("Orphan pod has been deleted", "pod", key)
639-
return nil
640-
}
641688
return err
642689
}
690+
for _, pod := range pods {
691+
if err := jm.handleSingleOrphanPod(ctx, pod); err != nil {
692+
logger.Error(err, "syncing orphan pod failed", "pod", klog.KObj(pod))
693+
}
694+
}
695+
return nil
696+
}
697+
698+
// handleSingleOrphanPod processes a single orphan pod.
699+
func (jm *Controller) handleSingleOrphanPod(ctx context.Context, sharedPod *v1.Pod) error {
700+
logger := klog.FromContext(ctx)
701+
ns := sharedPod.Namespace
702+
name := sharedPod.Name
643703
// Make sure the pod is still orphaned.
644704
if controllerRef := metav1.GetControllerOf(sharedPod); controllerRef != nil {
645705
if controllerRef.Kind != controllerKind.Kind || controllerRef.APIVersion != batch.SchemeGroupVersion.String() {
646706
// The pod is controlled by an owner that is not a batch/v1 Job. Do not remove finalizer.
647707
return nil
648708
}
649-
job := jm.resolveControllerRef(sharedPod.Namespace, controllerRef)
709+
job := jm.resolveControllerRef(ns, controllerRef)
650710
if job != nil {
651711
// Skip cleanup of finalizers for pods owned by a job managed by an external controller
652712
if controllerName := managedByExternalController(job); controllerName != nil {
653-
logger.V(2).Info("Skip cleanup of the job finalizer for a pod owned by a job that is managed by an external controller", "key", key, "podUID", sharedPod.UID, "jobUID", job.UID, "controllerName", controllerName)
713+
logger.V(2).Info("Skip cleanup of the job finalizer for a pod owned by a job that is managed by an external controller", "namespace", ns, "name", name, "podUID", sharedPod.UID, "jobUID", job.UID, "controllerName", controllerName)
654714
return nil
655715
}
656716
}

pkg/controller/job/job_controller_test.go

Lines changed: 36 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6641,6 +6641,7 @@ func TestSyncOrphanPod(t *testing.T) {
66416641
job *batch.Job
66426642
inCache bool
66436643
wantFinalizerRemoved bool
6644+
podSelector *metav1.LabelSelector
66446645
}{
66456646
"controlled_by_existing_running_job": {
66466647
owner: &metav1.OwnerReference{
@@ -6743,6 +6744,14 @@ func TestSyncOrphanPod(t *testing.T) {
67436744
},
67446745
wantFinalizerRemoved: true,
67456746
},
6747+
"orphan_pods_by_label_selector": {
6748+
podSelector: &metav1.LabelSelector{
6749+
MatchLabels: map[string]string{
6750+
"app": "test",
6751+
},
6752+
},
6753+
wantFinalizerRemoved: true,
6754+
},
67466755
}
67476756
for name, tc := range cases {
67486757
t.Run(name, func(t *testing.T) {
@@ -6756,19 +6765,35 @@ func TestSyncOrphanPod(t *testing.T) {
67566765
}
67576766
})
67586767
}
6759-
67606768
podBuilder := buildPod().name(name).deletionTimestamp().trackingFinalizer()
67616769
if tc.owner != nil {
67626770
podBuilder = podBuilder.owner(*tc.owner)
67636771
}
6764-
orphanPod := podBuilder.Pod
6765-
orphanPod, err := clientset.CoreV1().Pods("default").Create(ctx, orphanPod, metav1.CreateOptions{})
6772+
orphanKey := orphanPodKey{
6773+
kind: OrphanPodKeyKindName,
6774+
namespace: podBuilder.Pod.Namespace,
6775+
value: podBuilder.Pod.Name,
6776+
}
6777+
if tc.podSelector != nil {
6778+
podBuilder = podBuilder.labels(tc.podSelector.MatchLabels)
6779+
selector, err := metav1.LabelSelectorAsSelector(tc.podSelector)
6780+
if err != nil {
6781+
t.Fatalf("Error parsing pod label selector: %v", err)
6782+
}
6783+
orphanKey = orphanPodKey{
6784+
kind: OrphanPodKeyKindSelector,
6785+
namespace: podBuilder.Pod.Namespace,
6786+
value: selector.String(),
6787+
}
6788+
}
6789+
orphanPod, err := clientset.CoreV1().Pods("default").Create(ctx, podBuilder.Pod, metav1.CreateOptions{})
67666790
if err != nil {
67676791
t.Fatalf("Creating orphan pod: %v", err)
67686792
}
6769-
err = manager.syncOrphanPod(ctx, cache.MetaObjectToName(orphanPod).String())
6793+
// Sync orphan pod by name or selector
6794+
err = manager.syncOrphanPod(ctx, orphanKey)
67706795
if err != nil {
6771-
t.Fatalf("Failed sync orphan pod: %v", err)
6796+
t.Fatalf("Failed to sync orphan pod: %v", err)
67726797
}
67736798
if tc.wantFinalizerRemoved {
67746799
if err := wait.PollUntilContextTimeout(ctx, 10*time.Millisecond, wait.ForeverTestTimeout, false, func(ctx context.Context) (bool, error) {
@@ -6786,7 +6811,7 @@ func TestSyncOrphanPod(t *testing.T) {
67866811
time.Sleep(time.Millisecond)
67876812
orphanPod, err := clientset.CoreV1().Pods(orphanPod.Namespace).Get(ctx, orphanPod.Name, metav1.GetOptions{})
67886813
if err != nil {
6789-
t.Fatalf("Failed to the latest pod: %v", err)
6814+
t.Fatalf("Failed to retrieve the latest pod: %v", err)
67906815
}
67916816
if !hasJobTrackingFinalizer(orphanPod) {
67926817
t.Errorf("Unexpected removal of the Job's finalizer")
@@ -7796,6 +7821,11 @@ func (pb podBuilder) uid(u string) podBuilder {
77967821
return pb
77977822
}
77987823

7824+
func (pb podBuilder) labels(labels map[string]string) podBuilder {
7825+
pb.Labels = labels
7826+
return pb
7827+
}
7828+
77997829
func (pb podBuilder) job(j *batch.Job) podBuilder {
78007830
pb.Labels = j.Spec.Selector.MatchLabels
78017831
pb.Namespace = j.Namespace

0 commit comments

Comments
 (0)