
Commit 49a734f

Improve pause handling implementation on the controllers (#460)
### Description

Cluster API (>= 1.9.5) incorporated a [fix](kubernetes-sigs/cluster-api#11814) for a [race condition](kubernetes-sigs/cluster-api#11812) in its clusterctl move logic. Currently, during a cluster move operation, the CAPT controllers interpret the temporary absence of TinkerbellMachine CRs in the source cluster as a deletion event. This triggers power-off jobs, with potentially catastrophic effects for users, when in reality the resources are only being moved from the source cluster to the target cluster.

This PR implements proper pause handling in both the cluster and machine controllers to prevent unwanted reconciliation during cluster move operations. When a CAPI cluster is paused:

- Controllers check for pause annotations before proceeding with reconciliation
- Reconciliation is halted if a pause is detected

## Why is this needed

Fixes: #

## How Has This Been Tested?

Tested with a custom-built controller, moving the CRs back and forth using clusterctl move.

## How are existing users impacted? What migration steps/scripts do we need?

## Checklist:

I have:

- [ ] updated the documentation and/or roadmap (if required)
- [ ] added unit or e2e tests
- [ ] provided instructions on how to upgrade
2 parents 0200620 + b86f532 commit 49a734f
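
The description above refers to an annotation-based pause check; the following is a minimal, self-contained sketch of that gate, not the provider's actual code: resolve the owning CAPI Cluster, then skip reconciliation when Cluster API's pause semantics apply. The `pauseGate` name and `example` package are illustrative assumptions; `util.GetClusterFromMetadata` and `annotations.IsPaused` are the cluster-api helpers the commit itself uses.

```go
// A minimal sketch only: the package and function names are illustrative,
// not part of the CAPT codebase.
package example

import (
	"context"
	"fmt"

	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"sigs.k8s.io/cluster-api/util"
	"sigs.k8s.io/cluster-api/util/annotations"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// pauseGate reports whether reconciliation of an infrastructure object should
// be skipped. ownerMeta is the metadata of the owning CAPI Machine (used to
// resolve the owning Cluster); obj is the infrastructure object whose own
// paused annotation is also honored.
func pauseGate(ctx context.Context, c client.Client, ownerMeta metav1.ObjectMeta, obj metav1.Object) (bool, error) {
	cluster, err := util.GetClusterFromMetadata(ctx, c, ownerMeta)
	if err != nil {
		if !apierrors.IsNotFound(err) {
			return false, fmt.Errorf("getting cluster from metadata: %w", err)
		}

		// No owning Cluster found: nothing to gate on, let reconciliation
		// proceed (mirrors the nil check in the machine controller diff below).
		return false, nil
	}

	// IsPaused is true when cluster.Spec.Paused is set or obj carries the
	// cluster.x-k8s.io/paused annotation.
	return annotations.IsPaused(cluster, obj), nil
}
```

The gate deliberately runs before any deletion handling, which is exactly the reordering the machine controller diff below performs, so a paused move never reaches `DeleteMachineWithDependencies`.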

File tree

2 files changed: +25 −4 lines changed


controller/cluster/tinkerbellcluster.go

Lines changed: 3 additions & 0 deletions
@@ -243,6 +243,9 @@ func (tcr *TinkerbellClusterReconciler) Reconcile(ctx context.Context, req ctrl.
 		return ctrl.Result{}, nil
 	}
 
+	// TODO(enhancement): Currently using simple annotation-based pause checking. Need to implement
+	// proper pause handling using paused.EnsurePausedCondition() as per:
+	// https://cluster-api.sigs.k8s.io/developer/providers/contracts/infra-cluster#infracluster-pausing
 	if annotations.IsPaused(crc.cluster, crc.tinkerbellCluster) {
 		crc.log.Info("TinkerbellCluster is marked as paused. Won't reconcile")
 
controller/machine/tinkerbellmachine.go

Lines changed: 22 additions & 4 deletions
@@ -25,6 +25,7 @@ import (
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
 	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
 	"sigs.k8s.io/cluster-api/util"
+	"sigs.k8s.io/cluster-api/util/annotations"
 	"sigs.k8s.io/cluster-api/util/collections"
 	"sigs.k8s.io/cluster-api/util/patch"
 	"sigs.k8s.io/cluster-api/util/predicates"
@@ -94,10 +95,6 @@ func (r *TinkerbellMachineReconciler) Reconcile(ctx context.Context, req ctrl.Re
 
 	scope.patchHelper = patchHelper
 
-	if scope.MachineScheduledForDeletion() {
-		return ctrl.Result{}, scope.DeleteMachineWithDependencies()
-	}
-
 	// We must be bound to a CAPI Machine object before we can continue.
 	machine, err := scope.getReadyMachine()
 	if err != nil {
@@ -108,6 +105,27 @@ func (r *TinkerbellMachineReconciler) Reconcile(ctx context.Context, req ctrl.Re
 		return ctrl.Result{}, nil
 	}
 
+	// Fetch the CAPI cluster owning the machine and check whether the cluster is paused.
+	cluster, err := util.GetClusterFromMetadata(ctx, scope.client, machine.ObjectMeta)
+	if err != nil {
+		if !apierrors.IsNotFound(err) {
+			return ctrl.Result{}, fmt.Errorf("getting cluster from metadata: %w", err)
+		}
+	}
+
+	// TODO(enhancement): Currently using simple annotation-based pause checking. Need to implement
+	// proper pause handling using paused.EnsurePausedCondition() as per:
+	// https://cluster-api.sigs.k8s.io/developer/providers/contracts/infra-cluster#infracluster-pausing
+	if cluster != nil && annotations.IsPaused(cluster, scope.tinkerbellMachine) {
+		log.Info("TinkerbellMachine is paused, skipping reconciliation")
+
+		return ctrl.Result{}, nil
+	}
+
+	if scope.MachineScheduledForDeletion() {
+		return ctrl.Result{}, scope.DeleteMachineWithDependencies()
+	}
+
 	// We need a bootstrap cloud config secret to bootstrap the node so we can't proceed without it.
 	// Typically, this is something akin to cloud-init user-data.
 	bootstrapCloudConfig, err := scope.getReadyBootstrapCloudConfig(machine)
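
The checklist above leaves unit tests unchecked; below is one hedged way the gating could be exercised with controller-runtime's fake client against the `pauseGate` sketch shown earlier. The names and the use of a CAPI `Machine` as a stand-in `metav1.Object` are illustrative assumptions, not part of this commit.

```go
// A hedged test sketch, assumed to live next to the pauseGate helper above;
// it is not part of this commit.
package example

import (
	"context"
	"testing"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime"
	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
	"sigs.k8s.io/controller-runtime/pkg/client/fake"
)

func TestPauseGateSkipsPausedCluster(t *testing.T) {
	scheme := runtime.NewScheme()
	if err := clusterv1.AddToScheme(scheme); err != nil {
		t.Fatal(err)
	}

	// A paused CAPI Cluster, as clusterctl move would leave it in the source
	// cluster while objects are being transferred.
	cluster := &clusterv1.Cluster{
		ObjectMeta: metav1.ObjectMeta{Name: "demo", Namespace: "default"},
		Spec:       clusterv1.ClusterSpec{Paused: true},
	}

	c := fake.NewClientBuilder().WithScheme(scheme).WithObjects(cluster).Build()

	// The owner metadata must carry the cluster-name label so
	// util.GetClusterFromMetadata can resolve the Cluster.
	ownerMeta := metav1.ObjectMeta{
		Namespace: "default",
		Labels:    map[string]string{clusterv1.ClusterNameLabel: "demo"},
	}

	// Any metav1.Object works here; a CAPI Machine stands in for a
	// TinkerbellMachine purely for the annotation check.
	obj := &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m0", Namespace: "default"}}

	paused, err := pauseGate(context.Background(), c, ownerMeta, obj)
	if err != nil {
		t.Fatal(err)
	}

	if !paused {
		t.Fatal("expected reconciliation to be gated while the Cluster is paused")
	}
}
```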
