Skip to content

Commit f73a277

Browse files
author
Yuvaraj Kakaraparthi
committed
prevent blocking of KCP and DockerMachine controllers
KCP and DockerMachine controllers are currently blocked if the infrastrucutre is not ready. This leads to cases where if the infrstructure is not ready we cannot delete the cluster. This PR fixes this behavior and prevents blocking the controllers for this case.
1 parent 0195739 commit f73a277

File tree

8 files changed

+29
-16
lines changed

8 files changed

+29
-16
lines changed

controlplane/kubeadm/internal/controllers/controller.go

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -149,12 +149,6 @@ func (r *KubeadmControlPlaneReconciler) Reconcile(ctx context.Context, req ctrl.
149149
return ctrl.Result{}, nil
150150
}
151151

152-
// Wait for the cluster infrastructure to be ready before creating machines
153-
if !cluster.Status.InfrastructureReady {
154-
log.Info("Cluster infrastructure is not ready yet")
155-
return ctrl.Result{}, nil
156-
}
157-
158152
// Initialize the patch helper.
159153
patchHelper, err := patch.NewHelper(kcp, r.Client)
160154
if err != nil {
@@ -255,6 +249,12 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, cluster *
255249
return ctrl.Result{}, err
256250
}
257251

252+
// Wait for the cluster infrastructure to be ready before creating machines
253+
if !cluster.Status.InfrastructureReady {
254+
log.Info("Cluster infrastructure is not ready yet")
255+
return ctrl.Result{}, nil
256+
}
257+
258258
// Generate Cluster Certificates if needed
259259
config := kcp.Spec.KubeadmConfigSpec.DeepCopy()
260260
config.JoinConfiguration = nil

controlplane/kubeadm/internal/controllers/controller_test.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@ func TestReconcileReturnErrorWhenOwnerClusterIsMissing(t *testing.T) {
147147
g.Expect(result).To(Equal(ctrl.Result{}))
148148

149149
// calling reconcile should return error
150-
g.Expect(env.Delete(ctx, cluster)).To(Succeed())
150+
g.Expect(env.CleanupAndWait(ctx, cluster)).To(Succeed())
151151

152152
g.Eventually(func() error {
153153
_, err := r.Reconcile(ctx, ctrl.Request{NamespacedName: util.ObjectKey(kcp)})
@@ -461,6 +461,7 @@ func TestKubeadmControlPlaneReconciler_adoption(t *testing.T) {
461461
cluster, kcp, tmpl := createClusterWithControlPlane(metav1.NamespaceDefault)
462462
cluster.Spec.ControlPlaneEndpoint.Host = "bar"
463463
cluster.Spec.ControlPlaneEndpoint.Port = 6443
464+
cluster.Status.InfrastructureReady = true
464465
kcp.Spec.Version = version
465466

466467
fmc := &fakeManagementCluster{
@@ -525,6 +526,7 @@ func TestKubeadmControlPlaneReconciler_adoption(t *testing.T) {
525526
cluster, kcp, tmpl := createClusterWithControlPlane(metav1.NamespaceDefault)
526527
cluster.Spec.ControlPlaneEndpoint.Host = "bar"
527528
cluster.Spec.ControlPlaneEndpoint.Port = 6443
529+
cluster.Status.InfrastructureReady = true
528530
kcp.Spec.Version = version
529531

530532
fmc := &fakeManagementCluster{
@@ -631,6 +633,7 @@ func TestKubeadmControlPlaneReconciler_adoption(t *testing.T) {
631633
cluster, kcp, tmpl := createClusterWithControlPlane(metav1.NamespaceDefault)
632634
cluster.Spec.ControlPlaneEndpoint.Host = "nodomain.example.com1"
633635
cluster.Spec.ControlPlaneEndpoint.Port = 6443
636+
cluster.Status.InfrastructureReady = true
634637
kcp.Spec.Version = version
635638

636639
now := metav1.Now()
@@ -697,6 +700,7 @@ func TestKubeadmControlPlaneReconciler_adoption(t *testing.T) {
697700
cluster, kcp, tmpl := createClusterWithControlPlane(metav1.NamespaceDefault)
698701
cluster.Spec.ControlPlaneEndpoint.Host = "nodomain.example.com2"
699702
cluster.Spec.ControlPlaneEndpoint.Port = 6443
703+
cluster.Status.InfrastructureReady = true
700704
kcp.Spec.Version = "v1.17.0"
701705

702706
fmc := &fakeManagementCluster{

controlplane/kubeadm/internal/controllers/scale_test.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,7 @@ func TestKubeadmControlPlaneReconciler_scaleUpControlPlane(t *testing.T) {
129129
initObjs := []client.Object{fakeGenericMachineTemplateCRD, cluster.DeepCopy(), kcp.DeepCopy(), genericMachineTemplate.DeepCopy()}
130130
cluster.Spec.ControlPlaneEndpoint.Host = "nodomain.example.com"
131131
cluster.Spec.ControlPlaneEndpoint.Port = 6443
132+
cluster.Status.InfrastructureReady = true
132133

133134
beforeMachines := collections.New()
134135
for i := 0; i < 2; i++ {

controlplane/kubeadm/internal/controllers/upgrade_test.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ func TestKubeadmControlPlaneReconciler_RolloutStrategy_ScaleUp(t *testing.T) {
4444
cluster, kcp, genericMachineTemplate := createClusterWithControlPlane(metav1.NamespaceDefault)
4545
cluster.Spec.ControlPlaneEndpoint.Host = Host
4646
cluster.Spec.ControlPlaneEndpoint.Port = 6443
47+
cluster.Status.InfrastructureReady = true
4748
kcp.Spec.KubeadmConfigSpec.ClusterConfiguration = nil
4849
kcp.Spec.Replicas = pointer.Int32Ptr(1)
4950
setKCPHealthy(kcp)

docs/book/src/developer/providers/machine-infrastructure.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,8 @@ The following diagram shows the typical logic for a machine infrastructure provi
113113
1. If the `Cluster` to which this resource belongs cannot be found, exit the reconciliation
114114
1. Add the provider-specific finalizer, if needed
115115
1. If the associated `Cluster`'s `status.infrastructureReady` is `false`, exit the reconciliation
116+
1. **Note**: This check should not be blocking any further delete reconciliation flows.
117+
1. **Note**: This check should only be performed after appropriate owner references (if any) are updated.
116118
1. If the associated `Machine`'s `spec.bootstrap.dataSecretName` is `nil`, exit the reconciliation
117119
1. Reconcile provider-specific machine infrastructure
118120
1. If any errors are encountered:

docs/book/src/developer/providers/v1.1-to-v1.2.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,3 +81,8 @@ in ClusterAPI are kept in sync with the versions used by `sigs.k8s.io/controller
8181
- `WaitForControlPlaneAndMachinesReady`
8282
- `DiscoveryAndWaitForMachineDeployments`
8383
- The `AssertControlPlaneFailureDomains` function in the E2E test framework has been modified to allow proper failure domain testing.
84+
85+
- After investigating an [issue](https://github.com/kubernetes-sigs/cluster-api/issues/6006) we discovered that improper implementation of a check on `cluster.status.infrastructureReady` can lead to problems during cluster deletion. As a consequence, we recommend that all providers ensure:
86+
- The check for `cluster.status.infrastructureReady=true` usually existing at the beginning of the reconcile loop for control-plane providers is implemented after setting external objects ref;
87+
- The check for `cluster.status.infrastructureReady=true` usually existing at the beginning of the reconcile loop for infrastructure provider does not prevent the object to be deleted
88+
rif. [PR #6183](https://github.com/kubernetes-sigs/cluster-api/pull/6183)

test/framework/controlplane_helpers.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -401,12 +401,12 @@ func ScaleAndWaitControlPlane(ctx context.Context, input ScaleAndWaitControlPlan
401401
return -1, err
402402
}
403403

404-
selectorMap, err := metav1.LabelSelectorAsMap(kcpLabelSelector)
404+
selector, err := metav1.LabelSelectorAsSelector(kcpLabelSelector)
405405
if err != nil {
406406
return -1, err
407407
}
408408
machines := &clusterv1.MachineList{}
409-
if err := input.ClusterProxy.GetClient().List(ctx, machines, client.InNamespace(input.ControlPlane.Namespace), client.MatchingLabels(selectorMap)); err != nil {
409+
if err := input.ClusterProxy.GetClient().List(ctx, machines, &client.ListOptions{LabelSelector: selector, Namespace: input.ControlPlane.Namespace}); err != nil {
410410
return -1, err
411411
}
412412
nodeRefCount := 0

test/infrastructure/docker/internal/controllers/dockermachine_controller.go

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -135,13 +135,6 @@ func (r *DockerMachineReconciler) Reconcile(ctx context.Context, req ctrl.Reques
135135
return ctrl.Result{}, nil
136136
}
137137

138-
// Check if the infrastructure is ready, otherwise return and wait for the cluster object to be updated
139-
if !cluster.Status.InfrastructureReady {
140-
log.Info("Waiting for DockerCluster Controller to create cluster infrastructure")
141-
conditions.MarkFalse(dockerMachine, infrav1.ContainerProvisionedCondition, infrav1.WaitingForClusterInfrastructureReason, clusterv1.ConditionSeverityInfo, "")
142-
return ctrl.Result{}, nil
143-
}
144-
145138
// Create a helper for managing the docker container hosting the machine.
146139
externalMachine, err := docker.NewMachine(ctx, cluster, machine.Name, nil)
147140
if err != nil {
@@ -192,6 +185,13 @@ func patchDockerMachine(ctx context.Context, patchHelper *patch.Helper, dockerMa
192185
func (r *DockerMachineReconciler) reconcileNormal(ctx context.Context, cluster *clusterv1.Cluster, machine *clusterv1.Machine, dockerMachine *infrav1.DockerMachine, externalMachine *docker.Machine, externalLoadBalancer *docker.LoadBalancer) (res ctrl.Result, retErr error) {
193186
log := ctrl.LoggerFrom(ctx)
194187

188+
// Check if the infrastructure is ready, otherwise return and wait for the cluster object to be updated
189+
if !cluster.Status.InfrastructureReady {
190+
log.Info("Waiting for DockerCluster Controller to create cluster infrastructure")
191+
conditions.MarkFalse(dockerMachine, infrav1.ContainerProvisionedCondition, infrav1.WaitingForClusterInfrastructureReason, clusterv1.ConditionSeverityInfo, "")
192+
return ctrl.Result{}, nil
193+
}
194+
195195
// if the machine is already provisioned, return
196196
if dockerMachine.Spec.ProviderID != nil {
197197
// ensure ready state is set.

0 commit comments

Comments
 (0)