diff --git a/CHANGELOG.md b/CHANGELOG.md
index 67895d3b..c2ecb0b3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Fixed
+
+- Only remove the finalizer after the karpenter `NodePool` CR in the workload cluster has been removed.
+
 ## [0.20.0] - 2025-06-23
 
 ### Changed
diff --git a/controllers/karpentermachinepool_controller.go b/controllers/karpentermachinepool_controller.go
index b087f1a7..ea2938a6 100644
--- a/controllers/karpentermachinepool_controller.go
+++ b/controllers/karpentermachinepool_controller.go
@@ -178,11 +178,12 @@ func (r *KarpenterMachinePoolReconciler) Reconcile(ctx context.Context, req reco
 	karpenterMachinePool.Status.Replicas = numberOfNodeClaims
 	karpenterMachinePool.Status.Ready = true
 
-	logger.Info("Found NodeClaims in workload cluster, patching KarpenterMachinePool", "numberOfNodeClaims", numberOfNodeClaims)
-
-	if err := r.client.Status().Patch(ctx, karpenterMachinePool, client.MergeFrom(karpenterMachinePoolCopy), client.FieldOwner("karpentermachinepool-controller")); err != nil {
-		logger.Error(err, "failed to patch karpenterMachinePool.status.Replicas")
-		return reconcile.Result{}, err
+	if !karpenterMachinePoolCopy.Status.Ready || karpenterMachinePoolCopy.Status.Replicas != numberOfNodeClaims {
+		logger.Info("Found NodeClaims in workload cluster, patching KarpenterMachinePool.status field", "numberOfNodeClaims", numberOfNodeClaims, "currentNumberOfReplicas", karpenterMachinePoolCopy.Status.Replicas)
+		if err := r.client.Status().Patch(ctx, karpenterMachinePool, client.MergeFrom(karpenterMachinePoolCopy), client.FieldOwner("karpentermachinepool-controller")); err != nil {
+			logger.Error(err, "failed to patch karpenterMachinePool.status.Replicas")
+			return reconcile.Result{}, err
+		}
 	}
 
 	karpenterMachinePool.Spec.ProviderIDList = providerIDList
@@ -193,6 +194,7 @@ func (r *KarpenterMachinePoolReconciler) Reconcile(ctx context.Context, req reco
 	}
 
 	if machinePool.Spec.Replicas == nil || *machinePool.Spec.Replicas != numberOfNodeClaims {
+		logger.Info("Patching MachinePool.spec.replicas field", "numberOfNodeClaims", numberOfNodeClaims)
 		machinePoolCopy := machinePool.DeepCopy()
 		machinePool.Spec.Replicas = &numberOfNodeClaims
 		if err := r.client.Patch(ctx, machinePool, client.MergeFrom(machinePoolCopy), client.FieldOwner("karpenter-machinepool-controller")); err != nil {
@@ -212,25 +214,38 @@ func (r *KarpenterMachinePoolReconciler) reconcileDelete(ctx context.Context, lo
 		}
 
 		// Terminate EC2 instances with the karpenter.sh/nodepool tag matching the KarpenterMachinePool name
-		logger.Info("Terminating EC2 instances for KarpenterMachinePool", "karpenterMachinePoolName", karpenterMachinePool.Name)
-		instanceIDs, err := ec2Client.TerminateInstancesByTag(ctx, logger, "karpenter.sh/nodepool", karpenterMachinePool.Name)
+		_, err = ec2Client.TerminateInstancesByTag(ctx, logger, "karpenter.sh/nodepool", karpenterMachinePool.Name)
 		if err != nil {
 			return reconcile.Result{}, fmt.Errorf("failed to terminate EC2 instances: %w", err)
 		}
+	}
+
+	workloadClusterClient, err := r.clusterClientGetter(ctx, "", r.client, client.ObjectKeyFromObject(cluster))
+	if err != nil {
+		return reconcile.Result{}, err
+	}
 
-		logger.Info("Found instances", "instanceIDs", instanceIDs)
+	nodePool := &unstructured.Unstructured{}
+	nodePool.SetGroupVersionKind(schema.GroupVersionKind{
+		Group:   "karpenter.sh",
+		Kind:    "NodePool",
+		Version: "v1",
+	})
+	err = workloadClusterClient.Get(ctx, client.ObjectKey{Name: karpenterMachinePool.Name}, nodePool)
+	if client.IgnoreNotFound(err) != nil {
+		return reconcile.Result{}, err
+	}
 
-		// Requeue if we find instances to terminate. Once there are no instances to terminate, we proceed to remove the finalizer.
-		// We do this when the cluster is being deleted, to avoid removing the finalizer before karpenter launches a new instance that would be left over.
-		if len(instanceIDs) > 0 {
-			return reconcile.Result{RequeueAfter: 30 * time.Second}, nil
-		}
+	// Requeue while the karpenter NodePool CR still exists, because karpenter may keep launching new instances for it
+	if err == nil {
+		return reconcile.Result{RequeueAfter: 30 * time.Second}, nil
 	}
 
-	// Create deep copy of the reconciled object so we can change it
+	// Create a deep copy of the reconciled object so we can change it
 	karpenterMachinePoolCopy := karpenterMachinePool.DeepCopy()
 	controllerutil.RemoveFinalizer(karpenterMachinePool, KarpenterFinalizer)
+	logger.Info("Removing finalizer", "finalizer", KarpenterFinalizer)
 	if err := r.client.Patch(ctx, karpenterMachinePool, client.MergeFrom(karpenterMachinePoolCopy)); err != nil {
 		logger.Error(err, "failed to remove finalizer", "finalizer", KarpenterFinalizer)
 		return reconcile.Result{}, err
diff --git a/controllers/karpentermachinepool_controller_test.go b/controllers/karpentermachinepool_controller_test.go
index 4403aa5a..6788d1a4 100644
--- a/controllers/karpentermachinepool_controller_test.go
+++ b/controllers/karpentermachinepool_controller_test.go
@@ -261,14 +261,29 @@ var _ = Describe("KarpenterMachinePool reconciler", func() {
 				err = fakeCtrlClient.Delete(ctx, cluster)
 				Expect(err).NotTo(HaveOccurred())
 			})
-			// This test is a bit cumbersome because we are deleting CRs, so we can't use different `It` blocks or the CRs would be gone.
-			// We first mock the call to `TerminateInstancesByTag` to return some instances so that we can test
-			// the behavior when there are pending instances to remove.
-			// Then we manually/explicitly call the reconciler inside the test again, to be able to test the behavior
-			// when there are no instances to remove.
+			// This test is a bit cumbersome because we are deleting CRs, so we can't use different `It` blocks, or the CRs would be gone.
+			// We first create a karpenter NodePool CR in the workload cluster, so that we can test that we won't remove
+			// the finalizer until the NodePool is removed, which avoids deleting the `KarpenterMachinePool` while karpenter may still launch instances.
+			// Then we manually/explicitly call the reconciler inside the test again, to be able to test the behavior when the NodePool
+			// is gone (so we know karpenter won't launch new instances).
 			When("there are ec2 instances from karpenter", func() {
 				BeforeEach(func() {
-					ec2Client.TerminateInstancesByTagReturnsOnCall(0, []string{"i-abc123", "i-def456"}, nil)
+					nodePool := &unstructured.Unstructured{}
+					nodePool.Object = map[string]interface{}{
+						"metadata": map[string]interface{}{
+							"name": KarpenterMachinePoolName,
+						},
+						"spec": map[string]interface{}{},
+					}
+					nodePool.SetGroupVersionKind(schema.GroupVersionKind{
+						Group:   "karpenter.sh",
+						Kind:    "NodePool",
+						Version: "v1",
+					})
+					// We want the NodePool CR on the WC.
+					// For the tests, the WC client is the same client we use for the reconciler.
+					err := fakeCtrlClient.Create(ctx, nodePool)
+					Expect(err).NotTo(HaveOccurred())
 				})
 
 				It("deletes KarpenterMachinePool ec2 instances and finalizer", func() {
@@ -282,7 +297,19 @@ var _ = Describe("KarpenterMachinePool reconciler", func() {
 					// Finalizer should be there blocking the deletion of the CR
 					Expect(karpenterMachinePoolList.Items).To(HaveLen(1))
 
-					ec2Client.TerminateInstancesByTagReturnsOnCall(0, nil, nil)
+					nodePool := &unstructured.Unstructured{}
+					nodePool.Object = map[string]interface{}{
+						"metadata": map[string]interface{}{
+							"name": KarpenterMachinePoolName,
+						},
+					}
+					nodePool.SetGroupVersionKind(schema.GroupVersionKind{
+						Group:   "karpenter.sh",
+						Kind:    "NodePool",
+						Version: "v1",
+					})
+					err = fakeCtrlClient.Delete(ctx, nodePool)
+					Expect(err).NotTo(HaveOccurred())
 
 					reconcileResult, reconcileErr = reconciler.Reconcile(ctx, ctrl.Request{
 						NamespacedName: types.NamespacedName{
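
For reference, the core of the new deletion gate is the existence check against the workload cluster in the `reconcileDelete` hunk above. Below is a minimal sketch of that check using the same controller-runtime calls as the diff; the helper name `nodePoolExists`, the `wcClient` parameter, and the package name are illustrative only and not part of this PR.

```go
package controllers

import (
	"context"

	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
	"k8s.io/apimachinery/pkg/runtime/schema"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// nodePoolExists reports whether the karpenter NodePool CR named after the
// KarpenterMachinePool is still present in the workload cluster. While it
// exists, the reconciler keeps requeueing instead of removing the finalizer.
func nodePoolExists(ctx context.Context, wcClient client.Client, name string) (bool, error) {
	nodePool := &unstructured.Unstructured{}
	nodePool.SetGroupVersionKind(schema.GroupVersionKind{
		Group:   "karpenter.sh",
		Kind:    "NodePool",
		Version: "v1",
	})

	err := wcClient.Get(ctx, client.ObjectKey{Name: name}, nodePool)
	if client.IgnoreNotFound(err) != nil {
		// A real error (anything other than NotFound) is surfaced to the caller.
		return false, err
	}
	// err == nil means the NodePool was found; a NotFound error means it is gone.
	return err == nil, nil
}
```

In the PR itself this logic is inlined in `reconcileDelete`: while the NodePool is still present the reconciler returns `RequeueAfter: 30 * time.Second`, and only once the Get reports NotFound does it remove the finalizer.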