Commit 85e51f1

Fix Deletion Deadlock
There's a race where the infrastructure can be deleted before the machines, but machine deletion depends on the infrastructure, so deletion gets stuck forever (unless you manually delete the machines from Nova and remove the finalizer). The simple fix is to defer deletion of the infrastructure until the machines have been purged.
1 parent 094b5c3 commit 85e51f1

File tree

1 file changed: +18 -2 lines changed

controllers/openstackcluster_controller.go

Lines changed: 18 additions & 2 deletions
@@ -20,6 +20,7 @@ import (
 	"context"
 	"fmt"
 	"reflect"
+	"time"
 
 	"github.com/pkg/errors"
 	corev1 "k8s.io/api/core/v1"
@@ -30,6 +31,7 @@ import (
 	capierrors "sigs.k8s.io/cluster-api/errors"
 	"sigs.k8s.io/cluster-api/util"
 	"sigs.k8s.io/cluster-api/util/annotations"
+	"sigs.k8s.io/cluster-api/util/collections"
 	"sigs.k8s.io/cluster-api/util/patch"
 	"sigs.k8s.io/cluster-api/util/predicates"
 	ctrl "sigs.k8s.io/controller-runtime"
@@ -119,16 +121,30 @@ func (r *OpenStackClusterReconciler) Reconcile(ctx context.Context, req ctrl.Req
 
 	// Handle deleted clusters
 	if !openStackCluster.DeletionTimestamp.IsZero() {
-		return reconcileDelete(scope, cluster, openStackCluster)
+		return r.reconcileDelete(ctx, scope, cluster, openStackCluster)
 	}
 
 	// Handle non-deleted clusters
 	return reconcileNormal(scope, cluster, openStackCluster)
 }
 
-func reconcileDelete(scope scope.Scope, cluster *clusterv1.Cluster, openStackCluster *infrav1.OpenStackCluster) (ctrl.Result, error) {
+func (r *OpenStackClusterReconciler) reconcileDelete(ctx context.Context, scope scope.Scope, cluster *clusterv1.Cluster, openStackCluster *infrav1.OpenStackCluster) (ctrl.Result, error) {
 	scope.Logger().Info("Reconciling Cluster delete")
 
+	// Wait for machines to be deleted before removing the finalizer as they
+	// depend on this resource to deprovision. Additionally it appears that
+	// allowing the Kubernetes API to vanish too quickly will upset the capi
+	// kubeadm control plane controller.
+	machines, err := collections.GetFilteredMachinesForCluster(ctx, r.Client, cluster)
+	if err != nil {
+		return ctrl.Result{}, err
+	}
+
+	if len(machines) != 0 {
+		scope.Logger().Info("Waiting for machines to be deleted", "remaining", len(machines))
+		return ctrl.Result{RequeueAfter: 5 * time.Second}, nil
+	}
+
 	if err := deleteBastion(scope, cluster, openStackCluster); err != nil {
 		return reconcile.Result{}, err
 	}
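For reference, the heart of the fix can be read as a standalone gating check: list the cluster's Machines and requeue the delete reconcile while any remain, and only proceed with infrastructure teardown (and eventual finalizer removal) once they are gone. The sketch below is illustrative, not code from this repository: the helper name waitForMachinesGone is hypothetical, and the clusterv1 and controller-runtime import paths assume current cluster-api v1beta1 packaging, which may not match the exact versions this commit targets.

package sketch

import (
	"context"
	"time"

	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
	"sigs.k8s.io/cluster-api/util/collections"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// waitForMachinesGone is a hypothetical helper showing the pattern this commit
// introduces: it lists the cluster's Machines and asks the caller to requeue
// while any remain, so the infrastructure (which the machines need in order to
// deprovision) is only torn down after all of them have been deleted.
func waitForMachinesGone(ctx context.Context, c client.Client, cluster *clusterv1.Cluster) (bool, ctrl.Result, error) {
	machines, err := collections.GetFilteredMachinesForCluster(ctx, c, cluster)
	if err != nil {
		return false, ctrl.Result{}, err
	}
	if len(machines) != 0 {
		// Machines still exist: come back in a few seconds rather than
		// removing the finalizer and deleting the infrastructure now.
		return false, ctrl.Result{RequeueAfter: 5 * time.Second}, nil
	}
	// No machines left: safe to continue with bastion and infrastructure deletion.
	return true, ctrl.Result{}, nil
}

Polling via RequeueAfter keeps the delete reconcile non-blocking at the cost of a fixed five-second retry interval, which matches the behaviour in the diff above.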
