Only remove finalizer if NodePool CR in workload cluster is gone #267

Closed · wants to merge 2 commits into from
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

+ ### Fixed
+
+ - Only remove `finalizer` after the karpenter NodePool CR in the workload cluster has been removed.
+
## [0.20.0] - 2025-06-23

### Changed
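In outline, the fix gates finalizer removal on the karpenter NodePool CR being gone from the workload cluster, instead of on the list of just-terminated instance IDs. A minimal, self-contained sketch of that gate (the helper name and signature are illustrative, not the controller's actual API):

```go
package sketch

import (
	"context"

	apierrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
	"k8s.io/apimachinery/pkg/runtime/schema"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// nodePoolGone reports whether the karpenter NodePool named `name` is absent from
// the cluster reachable through c. Only when it returns true is it safe to drop
// the finalizer; while the CR exists, karpenter may still launch instances.
func nodePoolGone(ctx context.Context, c client.Client, name string) (bool, error) {
	nodePool := &unstructured.Unstructured{}
	nodePool.SetGroupVersionKind(schema.GroupVersionKind{
		Group:   "karpenter.sh",
		Kind:    "NodePool",
		Version: "v1",
	})

	err := c.Get(ctx, client.ObjectKey{Name: name}, nodePool)
	switch {
	case apierrors.IsNotFound(err):
		return true, nil // CR is gone: safe to remove the finalizer
	case err != nil:
		return false, err // unexpected error: let the reconciler retry
	default:
		return false, nil // CR still exists: requeue
	}
}
```

The controller diff below writes the same check inline with `client.IgnoreNotFound`, which folds the first two cases into a single branch.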
43 changes: 29 additions & 14 deletions controllers/karpentermachinepool_controller.go
@@ -178,11 +178,12 @@ func (r *KarpenterMachinePoolReconciler) Reconcile(ctx context.Context, req reco
karpenterMachinePool.Status.Replicas = numberOfNodeClaims
karpenterMachinePool.Status.Ready = true

- logger.Info("Found NodeClaims in workload cluster, patching KarpenterMachinePool", "numberOfNodeClaims", numberOfNodeClaims)
-
- if err := r.client.Status().Patch(ctx, karpenterMachinePool, client.MergeFrom(karpenterMachinePoolCopy), client.FieldOwner("karpentermachinepool-controller")); err != nil {
- logger.Error(err, "failed to patch karpenterMachinePool.status.Replicas")
- return reconcile.Result{}, err
+ if !karpenterMachinePoolCopy.Status.Ready || karpenterMachinePoolCopy.Status.Replicas != numberOfNodeClaims {
+ logger.Info("Found NodeClaims in workload cluster, patching KarpenterMachinePool.status field", "numberOfNodeClaims", numberOfNodeClaims, "currentNumberOfReplicas", karpenterMachinePoolCopy.Status.Replicas)
+ if err := r.client.Status().Patch(ctx, karpenterMachinePool, client.MergeFrom(karpenterMachinePoolCopy), client.FieldOwner("karpentermachinepool-controller")); err != nil {
+ logger.Error(err, "failed to patch karpenterMachinePool.status.Replicas")
+ return reconcile.Result{}, err
+ }
}

karpenterMachinePool.Spec.ProviderIDList = providerIDList
@@ -193,6 +194,7 @@ func (r *KarpenterMachinePoolReconciler) Reconcile(ctx context.Context, req reco
}

if machinePool.Spec.Replicas == nil || *machinePool.Spec.Replicas != numberOfNodeClaims {
+ logger.Info("Patching MachinePool.spec.replicas field", "numberOfNodeClaims", numberOfNodeClaims)
machinePoolCopy := machinePool.DeepCopy()
machinePool.Spec.Replicas = &numberOfNodeClaims
if err := r.client.Patch(ctx, machinePool, client.MergeFrom(machinePoolCopy), client.FieldOwner("karpenter-machinepool-controller")); err != nil {
@@ -212,25 +214,38 @@ func (r *KarpenterMachinePoolReconciler) reconcileDelete(ctx context.Context, lo
}

// Terminate EC2 instances with the karpenter.sh/nodepool tag matching the KarpenterMachinePool name
logger.Info("Terminating EC2 instances for KarpenterMachinePool", "karpenterMachinePoolName", karpenterMachinePool.Name)
- instanceIDs, err := ec2Client.TerminateInstancesByTag(ctx, logger, "karpenter.sh/nodepool", karpenterMachinePool.Name)
+ _, err = ec2Client.TerminateInstancesByTag(ctx, logger, "karpenter.sh/nodepool", karpenterMachinePool.Name)
if err != nil {
return reconcile.Result{}, fmt.Errorf("failed to terminate EC2 instances: %w", err)
}
}

+ workloadClusterClient, err := r.clusterClientGetter(ctx, "", r.client, client.ObjectKeyFromObject(cluster))
+ if err != nil {
+ return reconcile.Result{}, err
+ }

- logger.Info("Found instances", "instanceIDs", instanceIDs)
+ nodePool := &unstructured.Unstructured{}
+ nodePool.SetGroupVersionKind(schema.GroupVersionKind{
+ Group: "karpenter.sh",
+ Kind: "NodePool",
+ Version: "v1",
+ })
+ err = workloadClusterClient.Get(ctx, client.ObjectKey{Name: karpenterMachinePool.Name}, nodePool)
+ if client.IgnoreNotFound(err) != nil {
+ return reconcile.Result{}, err
+ }

- // Requeue if we find instances to terminate. Once there are no instances to terminate, we proceed to remove the finalizer.
- // We do this when the cluster is being deleted, to avoid removing the finalizer before karpenter launches a new instance that would be left over.
- if len(instanceIDs) > 0 {
- return reconcile.Result{RequeueAfter: 30 * time.Second}, nil
- }
+ // Requeue if we find the karpenter NodePool CR, otherwise `karpenter` may launch new instances for it
+ if err == nil {
+ return reconcile.Result{RequeueAfter: 30 * time.Second}, nil
+ }

- // Create deep copy of the reconciled object so we can change it
+ // Create a deep copy of the reconciled object so we can change it
karpenterMachinePoolCopy := karpenterMachinePool.DeepCopy()

controllerutil.RemoveFinalizer(karpenterMachinePool, KarpenterFinalizer)
logger.Info("Removing finalizer", "finalizer", KarpenterFinalizer)
if err := r.client.Patch(ctx, karpenterMachinePool, client.MergeFrom(karpenterMachinePoolCopy)); err != nil {
logger.Error(err, "failed to remove finalizer", "finalizer", KarpenterFinalizer)
return reconcile.Result{}, err
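The other change in this file avoids no-op status patches: take a deep copy, mutate the live object, and only send the merge patch when the copy shows the stored value actually differs. A standalone sketch of that guard, using `appsv1.Deployment` as a stand-in for the KarpenterMachinePool type (which is assumed here, not imported):

```go
package sketch

import (
	"context"

	appsv1 "k8s.io/api/apps/v1"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// patchReadyReplicasIfChanged mirrors the guard around the status patch: copy
// the object before mutating it, then patch the status subresource only when
// the copy shows the stored value differs from the desired one.
func patchReadyReplicasIfChanged(ctx context.Context, c client.Client, d *appsv1.Deployment, replicas int32) error {
	before := d.DeepCopy()
	d.Status.ReadyReplicas = replicas

	if before.Status.ReadyReplicas == replicas {
		return nil // nothing changed: skip the API call entirely
	}
	return c.Status().Patch(ctx, d, client.MergeFrom(before), client.FieldOwner("sketch-controller"))
}
```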
41 changes: 34 additions & 7 deletions controllers/karpentermachinepool_controller_test.go
@@ -261,14 +261,29 @@ var _ = Describe("KarpenterMachinePool reconciler", func() {
err = fakeCtrlClient.Delete(ctx, cluster)
Expect(err).NotTo(HaveOccurred())
})
- // This test is a bit cumbersome because we are deleting CRs, so we can't use different `It` blocks or the CRs would be gone.
- // We first mock the call to `TerminateInstancesByTag` to return some instances so that we can test
- // the behavior when there are pending instances to remove.
- // Then we manually/explicitly call the reconciler inside the test again, to be able to test the behavior
- // when there are no instances to remove.
+ // This test is a bit cumbersome because we are deleting CRs, so we can't use different `It` blocks, or the CRs would be gone.
+ // We first create a karpenter NodePool CR in the workload cluster, so that we can test that we won't remove
+ // the finalizer until the NodePool is removed, to avoid removing the `KarpenterMachinePool` while karpenter may still be launching instances.
+ // Then we manually/explicitly call the reconciler inside the test again, to be able to test the behavior when the NodePool
+ // is gone (so we know karpenter won't launch new instances).
When("there are ec2 instances from karpenter", func() {
BeforeEach(func() {
- ec2Client.TerminateInstancesByTagReturnsOnCall(0, []string{"i-abc123", "i-def456"}, nil)
+ nodePool := &unstructured.Unstructured{}
+ nodePool.Object = map[string]interface{}{
+ "metadata": map[string]interface{}{
+ "name": KarpenterMachinePoolName,
+ },
+ "spec": map[string]interface{}{},
+ }
+ nodePool.SetGroupVersionKind(schema.GroupVersionKind{
+ Group: "karpenter.sh",
+ Kind: "NodePool",
+ Version: "v1",
+ })
+ // We want the NodePool CR on the WC.
+ // For the tests, the WC client is the same client we use for the reconciler
+ err := fakeCtrlClient.Create(ctx, nodePool)
+ Expect(err).NotTo(HaveOccurred())
})

It("deletes KarpenterMachinePool ec2 instances and finalizer", func() {
@@ -282,7 +297,19 @@ var _ = Describe("KarpenterMachinePool reconciler", func() {
// Finalizer should be there blocking the deletion of the CR
Expect(karpenterMachinePoolList.Items).To(HaveLen(1))

- ec2Client.TerminateInstancesByTagReturnsOnCall(0, nil, nil)
+ nodePool := &unstructured.Unstructured{}
+ nodePool.Object = map[string]interface{}{
+ "metadata": map[string]interface{}{
+ "name": KarpenterMachinePoolName,
+ },
+ }
+ nodePool.SetGroupVersionKind(schema.GroupVersionKind{
+ Group: "karpenter.sh",
+ Kind: "NodePool",
+ Version: "v1",
+ })
+ err = fakeCtrlClient.Delete(ctx, nodePool)
+ Expect(err).NotTo(HaveOccurred())

reconcileResult, reconcileErr = reconciler.Reconcile(ctx, ctrl.Request{
NamespacedName: types.NamespacedName{
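For reference, the test builds the NodePool fixture by filling `nodePool.Object` with a metadata map; using the typed setters is equivalent. A small, cluster-free illustration (the function name is hypothetical):

```go
package sketch

import (
	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
	"k8s.io/apimachinery/pkg/runtime/schema"
)

// newNodePool builds the same unstructured karpenter NodePool the test creates,
// using SetName instead of a hand-written metadata map.
func newNodePool(name string) *unstructured.Unstructured {
	nodePool := &unstructured.Unstructured{}
	nodePool.SetGroupVersionKind(schema.GroupVersionKind{
		Group:   "karpenter.sh",
		Kind:    "NodePool",
		Version: "v1",
	})
	nodePool.SetName(name)
	return nodePool
}
```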