Skip to content

Commit 1020fed

Browse files
TomerNewmank8s-ci-robot
authored andcommitted
deleting orphan worker pods
Previously, when worker pods failed for any reason and the user deleted the associated Module CR, the failing pods would remain in the cluster instead of being removed. This commit introduces a garbage collection mechanism to ensure that orphaned worker pods are properly deleted when the corresponding Module CR is removed.
1 parent efe36a0 commit 1020fed

File tree

3 files changed

+142
-0
lines changed

3 files changed

+142
-0
lines changed

internal/controllers/mock_nmc_reconciler.go

Lines changed: 14 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

internal/controllers/nmc_reconciler.go

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,10 @@ func (r *NMCReconciler) Reconcile(ctx context.Context, req reconcile.Request) (r
159159
errs = append(errs, fmt.Errorf("failed to GC in-use labels for NMC %s: %v", req.NamespacedName, err))
160160
}
161161

162+
if err := r.helper.GarbageCollectWorkerPods(ctx, &nmcObj); err != nil {
163+
errs = append(errs, fmt.Errorf("failed to GC orphan worker pods for NMC %s: %v", req.NamespacedName, err))
164+
}
165+
162166
if loaded, unloaded, err := r.helper.UpdateNodeLabels(ctx, &nmcObj, &node); err != nil {
163167
errs = append(errs, fmt.Errorf("could not update node's labels for NMC %s: %v", req.NamespacedName, err))
164168
} else {
@@ -212,6 +216,7 @@ func GetContainerStatus(statuses []v1.ContainerStatus, name string) v1.Container
212216

213217
type nmcReconcilerHelper interface {
214218
GarbageCollectInUseLabels(ctx context.Context, nmc *kmmv1beta1.NodeModulesConfig) error
219+
GarbageCollectWorkerPods(ctx context.Context, nmc *kmmv1beta1.NodeModulesConfig) error
215220
ProcessModuleSpec(ctx context.Context, nmc *kmmv1beta1.NodeModulesConfig, spec *kmmv1beta1.NodeModuleSpec, status *kmmv1beta1.NodeModuleStatus, node *v1.Node) error
216221
ProcessUnconfiguredModuleStatus(ctx context.Context, nmc *kmmv1beta1.NodeModulesConfig, status *kmmv1beta1.NodeModuleStatus, node *v1.Node) error
217222
RemovePodFinalizers(ctx context.Context, nodeName string) error
@@ -238,6 +243,41 @@ func newNMCReconcilerHelper(client client.Client, podManager pod.WorkerPodManage
238243
}
239244
}
240245

246+
func (h *nmcReconcilerHelperImpl) GarbageCollectWorkerPods(ctx context.Context, nmcObj *kmmv1beta1.NodeModulesConfig) error {
247+
podsList, err := h.podManager.ListWorkerPodsOnNode(ctx, nmcObj.Name)
248+
if err != nil {
249+
return fmt.Errorf("could not list worker Pods: %v", err)
250+
}
251+
252+
modulePresentInNMC := sets.New[string]()
253+
for _, module := range nmcObj.Spec.Modules {
254+
modulePresentInNMC.Insert(module.Name)
255+
}
256+
for _, module := range nmcObj.Status.Modules {
257+
modulePresentInNMC.Insert(module.Name)
258+
}
259+
260+
var errs []error
261+
for _, workerPod := range podsList {
262+
podModuleName := workerPod.Labels[constants.ModuleNameLabel]
263+
if !modulePresentInNMC.Has(podModuleName) {
264+
mergeFrom := client.MergeFrom(workerPod.DeepCopy())
265+
if controllerutil.RemoveFinalizer(&workerPod, pod.NodeModulesConfigFinalizer) {
266+
if err = h.client.Patch(ctx, &workerPod, mergeFrom); err != nil {
267+
errs = append(
268+
errs, fmt.Errorf("could not patch Pod %s/%s: %v", workerPod.Namespace, workerPod.Name, err),
269+
)
270+
}
271+
}
272+
273+
if err = h.podManager.DeletePod(ctx, &workerPod); err != nil {
274+
errs = append(errs, fmt.Errorf("could not delete pod %s: %v", workerPod.Name, err))
275+
}
276+
}
277+
}
278+
return errors.Join(errs...)
279+
}
280+
241281
// GarbageCollectInUseLabels removes all module-in-use labels for which there is no corresponding entry either in
242282
// spec.modules or in status.modules.
243283
func (h *nmcReconcilerHelperImpl) GarbageCollectInUseLabels(ctx context.Context, nmcObj *kmmv1beta1.NodeModulesConfig) error {

internal/controllers/nmc_reconciler_test.go

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -303,6 +303,7 @@ var _ = Describe("NodeModulesConfigReconciler_Reconcile", func() {
303303
wh.EXPECT().ProcessModuleSpec(contextWithValueMatch, nmc, &spec1, nil, &node),
304304
wh.EXPECT().ProcessUnconfiguredModuleStatus(contextWithValueMatch, nmc, &status2, &node),
305305
wh.EXPECT().GarbageCollectInUseLabels(ctx, nmc),
306+
wh.EXPECT().GarbageCollectWorkerPods(ctx, nmc),
306307
wh.EXPECT().UpdateNodeLabels(ctx, nmc, &node).Return(loaded, unloaded, err),
307308
wh.EXPECT().RecordEvents(&node, loaded, unloaded),
308309
)
@@ -330,6 +331,7 @@ var _ = Describe("NodeModulesConfigReconciler_Reconcile", func() {
330331
fmt.Errorf("error processing Module %s: %v", namespace+"/"+mod0Name, errorMeassge),
331332
fmt.Errorf("error processing orphan status for Module %s: %v", namespace+"/"+mod2Name, errorMeassge),
332333
fmt.Errorf("failed to GC in-use labels for NMC %s: %v", types.NamespacedName{Name: nmcName}, errorMeassge),
334+
fmt.Errorf("failed to GC orphan worker pods for NMC %s: %v", types.NamespacedName{Name: nmcName}, errorMeassge),
333335
fmt.Errorf("could not update node's labels for NMC %s: %v", types.NamespacedName{Name: nmcName}, errorMeassge),
334336
}
335337

@@ -381,6 +383,7 @@ var _ = Describe("NodeModulesConfigReconciler_Reconcile", func() {
381383
wh.EXPECT().ProcessModuleSpec(contextWithValueMatch, nmc, &spec0, &status0, &node).Return(errors.New(errorMeassge)),
382384
wh.EXPECT().ProcessUnconfiguredModuleStatus(contextWithValueMatch, nmc, &status2, &node).Return(errors.New(errorMeassge)),
383385
wh.EXPECT().GarbageCollectInUseLabels(ctx, nmc).Return(errors.New(errorMeassge)),
386+
wh.EXPECT().GarbageCollectWorkerPods(ctx, nmc).Return(errors.New(errorMeassge)),
384387
wh.EXPECT().UpdateNodeLabels(ctx, nmc, &node).Return(nil, nil, errors.New(errorMeassge)),
385388
)
386389

@@ -389,6 +392,91 @@ var _ = Describe("NodeModulesConfigReconciler_Reconcile", func() {
389392
})
390393
})
391394

395+
var _ = Describe("nmcReconcilerHelperImpl_GarbageCollectWorkerPods", func() {
396+
var (
397+
ctx = context.TODO()
398+
client *testclient.MockClient
399+
pm *pod.MockWorkerPodManager
400+
nrh nmcReconcilerHelper
401+
)
402+
403+
BeforeEach(func() {
404+
ctrl := gomock.NewController(GinkgoT())
405+
client = testclient.NewMockClient(ctrl)
406+
pm = pod.NewMockWorkerPodManager(ctrl)
407+
nrh = newNMCReconcilerHelper(client, pm, nil, nil)
408+
})
409+
410+
It("should delete orphaned worker pod", func() {
411+
nmcSpec := kmmv1beta1.NodeModulesConfigSpec{
412+
Modules: []kmmv1beta1.NodeModuleSpec{
413+
{
414+
ModuleItem: kmmv1beta1.ModuleItem{Name: nameSecond},
415+
},
416+
},
417+
}
418+
nmcObj := &kmmv1beta1.NodeModulesConfig{
419+
ObjectMeta: metav1.ObjectMeta{Name: nmcName},
420+
Spec: nmcSpec,
421+
}
422+
pod := v1.Pod{
423+
ObjectMeta: metav1.ObjectMeta{
424+
Name: fmt.Sprintf("kmm-worker-%s-%s", nmcObj.Name, nameFirst),
425+
Namespace: "d",
426+
Labels: map[string]string{
427+
constants.ModuleNameLabel: nameFirst,
428+
},
429+
Finalizers: []string{pod.NodeModulesConfigFinalizer},
430+
},
431+
}
432+
433+
gomock.InOrder(
434+
pm.EXPECT().ListWorkerPodsOnNode(ctx, nmcName).Return([]v1.Pod{pod}, nil),
435+
client.EXPECT().Patch(ctx, gomock.Any(), gomock.Any()).Return(nil),
436+
pm.EXPECT().DeletePod(ctx, gomock.Any()).Return(nil),
437+
)
438+
439+
Expect(
440+
nrh.GarbageCollectWorkerPods(ctx, nmcObj),
441+
).NotTo(
442+
HaveOccurred(),
443+
)
444+
})
445+
446+
It("should do nothing because there are not orphaned worker pods", func() {
447+
nmcSpec := kmmv1beta1.NodeModulesConfigSpec{
448+
Modules: []kmmv1beta1.NodeModuleSpec{
449+
{
450+
ModuleItem: kmmv1beta1.ModuleItem{Name: nameFirst},
451+
},
452+
},
453+
}
454+
nmcObj := &kmmv1beta1.NodeModulesConfig{
455+
ObjectMeta: metav1.ObjectMeta{Name: nmcName},
456+
Spec: nmcSpec,
457+
}
458+
pod := v1.Pod{
459+
ObjectMeta: metav1.ObjectMeta{
460+
Name: fmt.Sprintf("kmm-worker-%s-%s", nmcObj.Name, nameFirst),
461+
Namespace: "d",
462+
Labels: map[string]string{
463+
constants.ModuleNameLabel: nameFirst,
464+
},
465+
},
466+
}
467+
468+
gomock.InOrder(
469+
pm.EXPECT().ListWorkerPodsOnNode(ctx, nmcName).Return([]v1.Pod{pod}, nil),
470+
)
471+
472+
Expect(
473+
nrh.GarbageCollectWorkerPods(ctx, nmcObj),
474+
).NotTo(
475+
HaveOccurred(),
476+
)
477+
})
478+
})
479+
392480
var _ = Describe("nmcReconcilerHelperImpl_GarbageCollectInUseLabels", func() {
393481
var (
394482
ctx = context.TODO()

0 commit comments

Comments
 (0)