Commit a70eef4

Only remove finalizer if NodePool CR in workload cluster is gone
1 parent 2dd5849 commit a70eef4

File tree

3 files changed: +66 -21 lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Fixed
+
+- Only remove `finalizer` after the karpenter NodePool CR in the workload cluster has been removed.
+
 ## [0.20.0] - 2025-06-23
 
 ### Changed

controllers/karpentermachinepool_controller.go

Lines changed: 28 additions & 14 deletions
@@ -178,11 +178,12 @@ func (r *KarpenterMachinePoolReconciler) Reconcile(ctx context.Context, req reco
     karpenterMachinePool.Status.Replicas = numberOfNodeClaims
     karpenterMachinePool.Status.Ready = true
 
-    logger.Info("Found NodeClaims in workload cluster, patching KarpenterMachinePool", "numberOfNodeClaims", numberOfNodeClaims)
-
-    if err := r.client.Status().Patch(ctx, karpenterMachinePool, client.MergeFrom(karpenterMachinePoolCopy), client.FieldOwner("karpentermachinepool-controller")); err != nil {
-        logger.Error(err, "failed to patch karpenterMachinePool.status.Replicas")
-        return reconcile.Result{}, err
+    if !karpenterMachinePoolCopy.Status.Ready || karpenterMachinePoolCopy.Status.Replicas != numberOfNodeClaims {
+        logger.Info("Found NodeClaims in workload cluster, patching KarpenterMachinePool.status field", "numberOfNodeClaims", numberOfNodeClaims)
+        if err := r.client.Status().Patch(ctx, karpenterMachinePool, client.MergeFrom(karpenterMachinePoolCopy), client.FieldOwner("karpentermachinepool-controller")); err != nil {
+            logger.Error(err, "failed to patch karpenterMachinePool.status.Replicas")
+            return reconcile.Result{}, err
+        }
     }
 
     karpenterMachinePool.Spec.ProviderIDList = providerIDList
@@ -193,6 +194,7 @@ func (r *KarpenterMachinePoolReconciler) Reconcile(ctx context.Context, req reco
     }
 
     if machinePool.Spec.Replicas == nil || *machinePool.Spec.Replicas != numberOfNodeClaims {
+        logger.Info("Patching MachinePool.spec.replicas field", "numberOfNodeClaims", numberOfNodeClaims)
         machinePoolCopy := machinePool.DeepCopy()
         machinePool.Spec.Replicas = &numberOfNodeClaims
         if err := r.client.Patch(ctx, machinePool, client.MergeFrom(machinePoolCopy), client.FieldOwner("karpenter-machinepool-controller")); err != nil {
@@ -212,22 +214,34 @@ func (r *KarpenterMachinePoolReconciler) reconcileDelete(ctx context.Context, lo
         }
 
         // Terminate EC2 instances with the karpenter.sh/nodepool tag matching the KarpenterMachinePool name
-        logger.Info("Terminating EC2 instances for KarpenterMachinePool", "karpenterMachinePoolName", karpenterMachinePool.Name)
-        instanceIDs, err := ec2Client.TerminateInstancesByTag(ctx, logger, "karpenter.sh/nodepool", karpenterMachinePool.Name)
+        _, err = ec2Client.TerminateInstancesByTag(ctx, logger, "karpenter.sh/nodepool", karpenterMachinePool.Name)
         if err != nil {
             return reconcile.Result{}, fmt.Errorf("failed to terminate EC2 instances: %w", err)
         }
+    }
+
+    workloadClusterClient, err := r.clusterClientGetter(ctx, "", r.client, client.ObjectKeyFromObject(cluster))
+    if err != nil {
+        return reconcile.Result{}, err
+    }
 
-        logger.Info("Found instances", "instanceIDs", instanceIDs)
+    nodePool := &unstructured.Unstructured{}
+    nodePool.SetGroupVersionKind(schema.GroupVersionKind{
+        Group:   "karpenter.sh",
+        Kind:    "NodePool",
+        Version: "v1",
+    })
+    err = workloadClusterClient.Get(ctx, client.ObjectKey{Name: karpenterMachinePool.Name}, nodePool)
+    if client.IgnoreNotFound(err) != nil {
+        return reconcile.Result{}, err
+    }
 
-        // Requeue if we find instances to terminate. Once there are no instances to terminate, we proceed to remove the finalizer.
-        // We do this when the cluster is being deleted, to avoid removing the finalizer before karpenter launches a new instance that would be left over.
-        if len(instanceIDs) > 0 {
-            return reconcile.Result{RequeueAfter: 30 * time.Second}, nil
-        }
+    // Requeue if we find the karpenter NodePool CR, otherwise `karpenter` may launch new instances for it
+    if err == nil {
+        return reconcile.Result{RequeueAfter: 30 * time.Second}, nil
     }
 
-    // Create deep copy of the reconciled object so we can change it
+    // Create a deep copy of the reconciled object so we can change it
    karpenterMachinePoolCopy := karpenterMachinePool.DeepCopy()
 
    controllerutil.RemoveFinalizer(karpenterMachinePool, KarpenterFinalizer)
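
Read together, the reconcileDelete hunk replaces the old "requeue while instances remain" gate with a "requeue while the NodePool CR remains" gate: the controller builds a workload-cluster client, looks up the cluster-scoped karpenter.sh/v1 NodePool named after the KarpenterMachinePool, and only falls through to RemoveFinalizer once that Get returns NotFound. Below is a minimal, self-contained sketch of that check; the nodePoolStillExists helper is illustrative only (not how the controller factors the code), and the import paths are the standard apimachinery/controller-runtime ones:

package controllers

import (
	"context"

	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
	"k8s.io/apimachinery/pkg/runtime/schema"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// nodePoolStillExists mirrors the check added above: it asks the workload
// cluster for the cluster-scoped karpenter.sh/v1 NodePool that shares the
// KarpenterMachinePool's name and reports whether it is still there.
func nodePoolStillExists(ctx context.Context, workloadClusterClient client.Client, name string) (bool, error) {
	nodePool := &unstructured.Unstructured{}
	nodePool.SetGroupVersionKind(schema.GroupVersionKind{
		Group:   "karpenter.sh",
		Kind:    "NodePool",
		Version: "v1",
	})
	err := workloadClusterClient.Get(ctx, client.ObjectKey{Name: name}, nodePool)
	if err != nil {
		// IgnoreNotFound turns a NotFound error into nil, so only unexpected
		// errors are propagated; NotFound simply means the CR is gone.
		return false, client.IgnoreNotFound(err)
	}
	return true, nil
}

While the NodePool is still present, the reconciler requeues after 30 seconds instead of removing the finalizer, so karpenter cannot launch instances that would be orphaned once the KarpenterMachinePool disappears.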

controllers/karpentermachinepool_controller_test.go

Lines changed: 34 additions & 7 deletions
@@ -261,14 +261,29 @@ var _ = Describe("KarpenterMachinePool reconciler", func() {
             err = fakeCtrlClient.Delete(ctx, cluster)
             Expect(err).NotTo(HaveOccurred())
         })
-        // This test is a bit cumbersome because we are deleting CRs, so we can't use different `It` blocks or the CRs would be gone.
-        // We first mock the call to `TerminateInstancesByTag` to return some instances so that we can test
-        // the behavior when there are pending instances to remove.
-        // Then we manually/explicitly call the reconciler inside the test again, to be able to test the behavior
-        // when there are no instances to remove.
+        // This test is a bit cumbersome because we are deleting CRs, so we can't use different `It` blocks, or the CRs would be gone.
+        // We first create a karpenter NodePool CR in the workload cluster, so that we can test that we won't remove
+        // the finalizer until the NodePool is removed, to avoid removing the `KarpenterMachinePool` before karpenter keeps launching instances.
+        // Then we manually/explicitly call the reconciler inside the test again, to be able to test the behavior when the NodePool
+        // is gone (so we know karpenter won't launch new instances).
         When("there are ec2 instances from karpenter", func() {
             BeforeEach(func() {
-                ec2Client.TerminateInstancesByTagReturnsOnCall(0, []string{"i-abc123", "i-def456"}, nil)
+                nodePool := &unstructured.Unstructured{}
+                nodePool.Object = map[string]interface{}{
+                    "metadata": map[string]interface{}{
+                        "name": KarpenterMachinePoolName,
+                    },
+                    "spec": map[string]interface{}{},
+                }
+                nodePool.SetGroupVersionKind(schema.GroupVersionKind{
+                    Group:   "karpenter.sh",
+                    Kind:    "NodePool",
+                    Version: "v1",
+                })
+                // We want the NodePool CR on the WC.
+                // For the tests, the WC client is the same client we use for the reconciler
+                err := fakeCtrlClient.Create(ctx, nodePool)
+                Expect(err).NotTo(HaveOccurred())
             })
 
             It("deletes KarpenterMachinePool ec2 instances and finalizer", func() {
@@ -282,7 +297,19 @@ var _ = Describe("KarpenterMachinePool reconciler", func() {
                 // Finalizer should be there blocking the deletion of the CR
                 Expect(karpenterMachinePoolList.Items).To(HaveLen(1))
 
-                ec2Client.TerminateInstancesByTagReturnsOnCall(0, nil, nil)
+                nodePool := &unstructured.Unstructured{}
+                nodePool.Object = map[string]interface{}{
+                    "metadata": map[string]interface{}{
+                        "name": KarpenterMachinePoolName,
+                    },
+                }
+                nodePool.SetGroupVersionKind(schema.GroupVersionKind{
+                    Group:   "karpenter.sh",
+                    Kind:    "NodePool",
+                    Version: "v1",
+                })
+                err = fakeCtrlClient.Delete(ctx, nodePool)
+                Expect(err).NotTo(HaveOccurred())
 
                 reconcileResult, reconcileErr = reconciler.Reconcile(ctx, ctrl.Request{
                     NamespacedName: types.NamespacedName{
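
The BeforeEach comment above relies on the test wiring handing the reconciler's own fake client back as the workload-cluster client, so creating or deleting the NodePool through fakeCtrlClient is equivalent to doing it "in the workload cluster". A hypothetical stub with the same call shape as r.clusterClientGetter in the controller diff could look like the fragment below (the actual test setup may wire this differently; imports are assumed from the surrounding test file):

// Hypothetical: a cluster client getter that ignores its arguments and always
// returns the same fake client the reconciler uses for the management cluster.
clusterClientGetter := func(_ context.Context, _ string, _ client.Client, _ client.ObjectKey) (client.Client, error) {
	return fakeCtrlClient, nil
}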
