
Commit 49a734f

Improve pause handling implementation on the controllers (#460)
### Description

Cluster API (>= 1.9.5) incorporated a [fix](kubernetes-sigs/cluster-api#11814) for a [race condition](kubernetes-sigs/cluster-api#11812) in its clusterctl move logic. Currently, during a cluster move operation, the CAPT controllers interpret the temporary absence of TinkerbellMachine CRs in the source cluster as a deletion event. This triggers power-off jobs, with potentially catastrophic effects for users, when in reality the resources are only being moved from the source cluster to the target cluster.

This PR implements proper pause handling in both the cluster and machine controllers to prevent unwanted reconciliation during cluster move operations. When a CAPI cluster is paused:

- Controllers check for pause annotations before proceeding with reconciliation
- Reconciliation is halted if a pause is detected

## Why is this needed

Fixes: #

## How Has This Been Tested?

Tested with a custom-built controller, moving the CRs back and forth using clusterctl move.

## How are existing users impacted? What migration steps/scripts do we need?

## Checklist:

I have:

- [ ] updated the documentation and/or roadmap (if required)
- [ ] added unit or e2e tests
- [ ] provided instructions on how to upgrade
2 parents 0200620 + b86f532 commit 49a734f
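
The description above refers to an annotation-based pause check; the following is a minimal, self-contained sketch of that gate, not the provider's actual code: resolve the owning CAPI Cluster, then skip reconciliation when Cluster API's pause semantics apply. The `pauseGate` name and `example` package are illustrative assumptions; `util.GetClusterFromMetadata` and `annotations.IsPaused` are the cluster-api helpers the commit itself uses.

```go
// A minimal sketch only: the package and function names are illustrative,
// not part of the CAPT codebase.
package example

import (
	"context"
	"fmt"

	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"sigs.k8s.io/cluster-api/util"
	"sigs.k8s.io/cluster-api/util/annotations"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// pauseGate reports whether reconciliation of an infrastructure object should
// be skipped. ownerMeta is the metadata of the owning CAPI Machine (used to
// resolve the owning Cluster); obj is the infrastructure object whose own
// paused annotation is also honored.
func pauseGate(ctx context.Context, c client.Client, ownerMeta metav1.ObjectMeta, obj metav1.Object) (bool, error) {
	cluster, err := util.GetClusterFromMetadata(ctx, c, ownerMeta)
	if err != nil {
		if !apierrors.IsNotFound(err) {
			return false, fmt.Errorf("getting cluster from metadata: %w", err)
		}

		// No owning Cluster found: nothing to gate on, let reconciliation
		// proceed (mirrors the nil check in the machine controller diff below).
		return false, nil
	}

	// IsPaused is true when cluster.Spec.Paused is set or obj carries the
	// cluster.x-k8s.io/paused annotation.
	return annotations.IsPaused(cluster, obj), nil
}
```

The gate deliberately runs before any deletion handling, which is exactly the reordering the machine controller diff below performs, so a paused move never reaches `DeleteMachineWithDependencies`.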

File tree

2 files changed: +25 −4 lines changed


controller/cluster/tinkerbellcluster.go

Lines changed: 3 additions & 0 deletions
@@ -243,6 +243,9 @@ func (tcr *TinkerbellClusterReconciler) Reconcile(ctx context.Context, req ctrl.
 		return ctrl.Result{}, nil
 	}
 
+	// TODO(enhancement): Currently using simple annotation-based pause checking. Need to implement
+	// proper pause handling using paused.EnsurePausedCondition() as per:
+	// https://cluster-api.sigs.k8s.io/developer/providers/contracts/infra-cluster#infracluster-pausing
 	if annotations.IsPaused(crc.cluster, crc.tinkerbellCluster) {
 		crc.log.Info("TinkerbellCluster is marked as paused. Won't reconcile")
 
controller/machine/tinkerbellmachine.go

Lines changed: 22 additions & 4 deletions
@@ -25,6 +25,7 @@ import (
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
 	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
 	"sigs.k8s.io/cluster-api/util"
+	"sigs.k8s.io/cluster-api/util/annotations"
 	"sigs.k8s.io/cluster-api/util/collections"
 	"sigs.k8s.io/cluster-api/util/patch"
 	"sigs.k8s.io/cluster-api/util/predicates"
@@ -94,10 +95,6 @@ func (r *TinkerbellMachineReconciler) Reconcile(ctx context.Context, req ctrl.Re
 
 	scope.patchHelper = patchHelper
 
-	if scope.MachineScheduledForDeletion() {
-		return ctrl.Result{}, scope.DeleteMachineWithDependencies()
-	}
-
 	// We must be bound to a CAPI Machine object before we can continue.
 	machine, err := scope.getReadyMachine()
 	if err != nil {
@@ -108,6 +105,27 @@ func (r *TinkerbellMachineReconciler) Reconcile(ctx context.Context, req ctrl.Re
 		return ctrl.Result{}, nil
 	}
 
+	// Fetch the CAPI cluster owning the machine and check whether the cluster is paused.
+	cluster, err := util.GetClusterFromMetadata(ctx, scope.client, machine.ObjectMeta)
+	if err != nil {
+		if !apierrors.IsNotFound(err) {
+			return ctrl.Result{}, fmt.Errorf("getting cluster from metadata: %w", err)
+		}
+	}
+
+	// TODO(enhancement): Currently using simple annotation-based pause checking. Need to implement
+	// proper pause handling using paused.EnsurePausedCondition() as per:
+	// https://cluster-api.sigs.k8s.io/developer/providers/contracts/infra-cluster#infracluster-pausing
+	if cluster != nil && annotations.IsPaused(cluster, scope.tinkerbellMachine) {
+		log.Info("TinkerbellMachine is paused, skipping reconciliation")
+
+		return ctrl.Result{}, nil
+	}
+
+	if scope.MachineScheduledForDeletion() {
+		return ctrl.Result{}, scope.DeleteMachineWithDependencies()
+	}
+
 	// We need a bootstrap cloud config secret to bootstrap the node so we can't proceed without it.
 	// Typically, this is something akin to cloud-init user-data.
 	bootstrapCloudConfig, err := scope.getReadyBootstrapCloudConfig(machine)
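
The checklist above leaves unit tests unchecked; below is one hedged way the gating could be exercised with controller-runtime's fake client against the `pauseGate` sketch shown earlier. The names and the use of a CAPI `Machine` as a stand-in `metav1.Object` are illustrative assumptions, not part of this commit.

```go
// A hedged test sketch, assumed to live next to the pauseGate helper above;
// it is not part of this commit.
package example

import (
	"context"
	"testing"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime"
	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
	"sigs.k8s.io/controller-runtime/pkg/client/fake"
)

func TestPauseGateSkipsPausedCluster(t *testing.T) {
	scheme := runtime.NewScheme()
	if err := clusterv1.AddToScheme(scheme); err != nil {
		t.Fatal(err)
	}

	// A paused CAPI Cluster, as clusterctl move would leave it in the source
	// cluster while objects are being transferred.
	cluster := &clusterv1.Cluster{
		ObjectMeta: metav1.ObjectMeta{Name: "demo", Namespace: "default"},
		Spec:       clusterv1.ClusterSpec{Paused: true},
	}

	c := fake.NewClientBuilder().WithScheme(scheme).WithObjects(cluster).Build()

	// The owner metadata must carry the cluster-name label so
	// util.GetClusterFromMetadata can resolve the Cluster.
	ownerMeta := metav1.ObjectMeta{
		Namespace: "default",
		Labels:    map[string]string{clusterv1.ClusterNameLabel: "demo"},
	}

	// Any metav1.Object works here; a CAPI Machine stands in for a
	// TinkerbellMachine purely for the annotation check.
	obj := &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m0", Namespace: "default"}}

	paused, err := pauseGate(context.Background(), c, ownerMeta, obj)
	if err != nil {
		t.Fatal(err)
	}

	if !paused {
		t.Fatal("expected reconciliation to be gated while the Cluster is paused")
	}
}
```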
