Skip to content

Commit a026502

Browse files
authored
Merge pull request #6183 from ykakarap/kcp-machinetemplate-cleanup
🐛 prevent blocking of KCP and DockerMachine controllers
2 parents f008f26 + f73a277 commit a026502

File tree

8 files changed

+29
-16
lines changed

8 files changed

+29
-16
lines changed

controlplane/kubeadm/internal/controllers/controller.go

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -149,12 +149,6 @@ func (r *KubeadmControlPlaneReconciler) Reconcile(ctx context.Context, req ctrl.
149149
return ctrl.Result{}, nil
150150
}
151151

152-
// Wait for the cluster infrastructure to be ready before creating machines
153-
if !cluster.Status.InfrastructureReady {
154-
log.Info("Cluster infrastructure is not ready yet")
155-
return ctrl.Result{}, nil
156-
}
157-
158152
// Initialize the patch helper.
159153
patchHelper, err := patch.NewHelper(kcp, r.Client)
160154
if err != nil {
@@ -255,6 +249,12 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, cluster *
255249
return ctrl.Result{}, err
256250
}
257251

252+
// Wait for the cluster infrastructure to be ready before creating machines
253+
if !cluster.Status.InfrastructureReady {
254+
log.Info("Cluster infrastructure is not ready yet")
255+
return ctrl.Result{}, nil
256+
}
257+
258258
// Generate Cluster Certificates if needed
259259
config := kcp.Spec.KubeadmConfigSpec.DeepCopy()
260260
config.JoinConfiguration = nil

controlplane/kubeadm/internal/controllers/controller_test.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@ func TestReconcileReturnErrorWhenOwnerClusterIsMissing(t *testing.T) {
147147
g.Expect(result).To(Equal(ctrl.Result{}))
148148

149149
// calling reconcile should return error
150-
g.Expect(env.Delete(ctx, cluster)).To(Succeed())
150+
g.Expect(env.CleanupAndWait(ctx, cluster)).To(Succeed())
151151

152152
g.Eventually(func() error {
153153
_, err := r.Reconcile(ctx, ctrl.Request{NamespacedName: util.ObjectKey(kcp)})
@@ -461,6 +461,7 @@ func TestKubeadmControlPlaneReconciler_adoption(t *testing.T) {
461461
cluster, kcp, tmpl := createClusterWithControlPlane(metav1.NamespaceDefault)
462462
cluster.Spec.ControlPlaneEndpoint.Host = "bar"
463463
cluster.Spec.ControlPlaneEndpoint.Port = 6443
464+
cluster.Status.InfrastructureReady = true
464465
kcp.Spec.Version = version
465466

466467
fmc := &fakeManagementCluster{
@@ -525,6 +526,7 @@ func TestKubeadmControlPlaneReconciler_adoption(t *testing.T) {
525526
cluster, kcp, tmpl := createClusterWithControlPlane(metav1.NamespaceDefault)
526527
cluster.Spec.ControlPlaneEndpoint.Host = "bar"
527528
cluster.Spec.ControlPlaneEndpoint.Port = 6443
529+
cluster.Status.InfrastructureReady = true
528530
kcp.Spec.Version = version
529531

530532
fmc := &fakeManagementCluster{
@@ -631,6 +633,7 @@ func TestKubeadmControlPlaneReconciler_adoption(t *testing.T) {
631633
cluster, kcp, tmpl := createClusterWithControlPlane(metav1.NamespaceDefault)
632634
cluster.Spec.ControlPlaneEndpoint.Host = "nodomain.example.com1"
633635
cluster.Spec.ControlPlaneEndpoint.Port = 6443
636+
cluster.Status.InfrastructureReady = true
634637
kcp.Spec.Version = version
635638

636639
now := metav1.Now()
@@ -697,6 +700,7 @@ func TestKubeadmControlPlaneReconciler_adoption(t *testing.T) {
697700
cluster, kcp, tmpl := createClusterWithControlPlane(metav1.NamespaceDefault)
698701
cluster.Spec.ControlPlaneEndpoint.Host = "nodomain.example.com2"
699702
cluster.Spec.ControlPlaneEndpoint.Port = 6443
703+
cluster.Status.InfrastructureReady = true
700704
kcp.Spec.Version = "v1.17.0"
701705

702706
fmc := &fakeManagementCluster{

controlplane/kubeadm/internal/controllers/scale_test.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,7 @@ func TestKubeadmControlPlaneReconciler_scaleUpControlPlane(t *testing.T) {
129129
initObjs := []client.Object{fakeGenericMachineTemplateCRD, cluster.DeepCopy(), kcp.DeepCopy(), genericMachineTemplate.DeepCopy()}
130130
cluster.Spec.ControlPlaneEndpoint.Host = "nodomain.example.com"
131131
cluster.Spec.ControlPlaneEndpoint.Port = 6443
132+
cluster.Status.InfrastructureReady = true
132133

133134
beforeMachines := collections.New()
134135
for i := 0; i < 2; i++ {

controlplane/kubeadm/internal/controllers/upgrade_test.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ func TestKubeadmControlPlaneReconciler_RolloutStrategy_ScaleUp(t *testing.T) {
4444
cluster, kcp, genericMachineTemplate := createClusterWithControlPlane(metav1.NamespaceDefault)
4545
cluster.Spec.ControlPlaneEndpoint.Host = Host
4646
cluster.Spec.ControlPlaneEndpoint.Port = 6443
47+
cluster.Status.InfrastructureReady = true
4748
kcp.Spec.KubeadmConfigSpec.ClusterConfiguration = nil
4849
kcp.Spec.Replicas = pointer.Int32Ptr(1)
4950
setKCPHealthy(kcp)

docs/book/src/developer/providers/machine-infrastructure.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,8 @@ The following diagram shows the typical logic for a machine infrastructure provi
113113
1. If the `Cluster` to which this resource belongs cannot be found, exit the reconciliation
114114
1. Add the provider-specific finalizer, if needed
115115
1. If the associated `Cluster`'s `status.infrastructureReady` is `false`, exit the reconciliation
116+
1. **Note**: This check should not block any further delete reconciliation flows.
117+
1. **Note**: This check should only be performed after appropriate owner references (if any) are updated.
116118
1. If the associated `Machine`'s `spec.bootstrap.dataSecretName` is `nil`, exit the reconciliation
117119
1. Reconcile provider-specific machine infrastructure
118120
1. If any errors are encountered:

docs/book/src/developer/providers/v1.1-to-v1.2.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,3 +81,8 @@ in ClusterAPI are kept in sync with the versions used by `sigs.k8s.io/controller
8181
- `WaitForControlPlaneAndMachinesReady`
8282
- `DiscoveryAndWaitForMachineDeployments`
8383
- The `AssertControlPlaneFailureDomains` function in the E2E test framework has been modified to allow proper failure domain testing.
84+
85+
- After investigating an [issue](https://github.com/kubernetes-sigs/cluster-api/issues/6006) we discovered that improper implementation of a check on `cluster.status.infrastructureReady` can lead to problems during cluster deletion. As a consequence, we recommend that all providers ensure:
86+
  - The check for `cluster.status.infrastructureReady=true`, usually found at the beginning of the reconcile loop for control-plane providers, is implemented after setting the external object refs;
87+
  - The check for `cluster.status.infrastructureReady=true`, usually found at the beginning of the reconcile loop for infrastructure providers, does not prevent the object from being deleted
88+
ref. [PR #6183](https://github.com/kubernetes-sigs/cluster-api/pull/6183)

test/framework/controlplane_helpers.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -401,12 +401,12 @@ func ScaleAndWaitControlPlane(ctx context.Context, input ScaleAndWaitControlPlan
401401
return -1, err
402402
}
403403

404-
selectorMap, err := metav1.LabelSelectorAsMap(kcpLabelSelector)
404+
selector, err := metav1.LabelSelectorAsSelector(kcpLabelSelector)
405405
if err != nil {
406406
return -1, err
407407
}
408408
machines := &clusterv1.MachineList{}
409-
if err := input.ClusterProxy.GetClient().List(ctx, machines, client.InNamespace(input.ControlPlane.Namespace), client.MatchingLabels(selectorMap)); err != nil {
409+
if err := input.ClusterProxy.GetClient().List(ctx, machines, &client.ListOptions{LabelSelector: selector, Namespace: input.ControlPlane.Namespace}); err != nil {
410410
return -1, err
411411
}
412412
nodeRefCount := 0

test/infrastructure/docker/internal/controllers/dockermachine_controller.go

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -135,13 +135,6 @@ func (r *DockerMachineReconciler) Reconcile(ctx context.Context, req ctrl.Reques
135135
return ctrl.Result{}, nil
136136
}
137137

138-
// Check if the infrastructure is ready, otherwise return and wait for the cluster object to be updated
139-
if !cluster.Status.InfrastructureReady {
140-
log.Info("Waiting for DockerCluster Controller to create cluster infrastructure")
141-
conditions.MarkFalse(dockerMachine, infrav1.ContainerProvisionedCondition, infrav1.WaitingForClusterInfrastructureReason, clusterv1.ConditionSeverityInfo, "")
142-
return ctrl.Result{}, nil
143-
}
144-
145138
// Create a helper for managing the docker container hosting the machine.
146139
externalMachine, err := docker.NewMachine(ctx, cluster, machine.Name, nil)
147140
if err != nil {
@@ -192,6 +185,13 @@ func patchDockerMachine(ctx context.Context, patchHelper *patch.Helper, dockerMa
192185
func (r *DockerMachineReconciler) reconcileNormal(ctx context.Context, cluster *clusterv1.Cluster, machine *clusterv1.Machine, dockerMachine *infrav1.DockerMachine, externalMachine *docker.Machine, externalLoadBalancer *docker.LoadBalancer) (res ctrl.Result, retErr error) {
193186
log := ctrl.LoggerFrom(ctx)
194187

188+
// Check if the infrastructure is ready, otherwise return and wait for the cluster object to be updated
189+
if !cluster.Status.InfrastructureReady {
190+
log.Info("Waiting for DockerCluster Controller to create cluster infrastructure")
191+
conditions.MarkFalse(dockerMachine, infrav1.ContainerProvisionedCondition, infrav1.WaitingForClusterInfrastructureReason, clusterv1.ConditionSeverityInfo, "")
192+
return ctrl.Result{}, nil
193+
}
194+
195195
// if the machine is already provisioned, return
196196
if dockerMachine.Spec.ProviderID != nil {
197197
// ensure ready state is set.

0 commit comments

Comments
 (0)