Commit 9ebb706
Don't try to resolve machine on delete if cluster not ready
This fixes a bug where, if we created a machine for a cluster which never became ready, we would never be able to 'resolve' the machine and could therefore never delete it. We address this in several layers.

Firstly, we move the point in the machine controller at which we add the finalizer: we no longer add it until we are writing resolved, so this situation can never occur for newly created machines. This is safe because we don't create resources until we have observed that both the finalizer has been added and resolved is up to date, so we don't need the finalizer to protect resources which cannot have been created yet.

Secondly, we short-circuit the delete flow if the cluster is not ready. This is safe for the same reason, but is only relevant to machines created before v0.10.

Lastly, we surface and restrict the circumstances in which 'Resolved' is required on delete at all. On closer inspection, it is only required in the very specific circumstance that the machine has volumes defined and is being deleted without its instance ever having been created. To make this more obvious we split volume deletion out of DeleteInstance and only resolve the machine spec when it is actually required.

Two other factors make this change larger than it might otherwise be: we hit a cyclomatic complexity limit in reconcileDelete(), requiring a refactor; and we remove the DeleteInstance tests, which become quite trivial after separating out DeleteVolumes, replacing them with a much more comprehensive set of tests for reconcileDelete.
1 parent: 6c83bc4
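The heart of the fix is the ordering invariant described above: the finalizer and status.Resolved are persisted, and the reconcile requeued, before any OpenStack resource is created, so the delete path may conclude that a machine with no Resolved has no resources. A minimal, self-contained sketch of that invariant, using stand-in types and a placeholder finalizer value rather than the controller's real API:

package main

import "fmt"

// Stand-in types for illustration only; the real controller uses
// OpenStackMachine, status.Resolved, and controllerutil from
// controller-runtime.
type machine struct {
    finalizers []string
    resolved   *string
}

// addFinalizer mimics controllerutil.AddFinalizer: it returns true only
// if the finalizer was not already present.
func addFinalizer(m *machine, f string) bool {
    for _, existing := range m.finalizers {
        if existing == f {
            return false
        }
    }
    m.finalizers = append(m.finalizers, f)
    return true
}

// reconcile returns true if it changed the object and must requeue.
// Creation is only reached once a previous reconcile has persisted both
// the finalizer and the resolved spec, which is why the delete path may
// assume "no Resolved => nothing was created".
func reconcile(m *machine) (requeue bool) {
    changed := false
    if m.resolved == nil {
        spec := "resolved-spec" // stands in for compute.ResolveMachineSpec
        m.resolved = &spec
        changed = true
    }
    if addFinalizer(m, "machine-finalizer") { // placeholder finalizer value
        changed = true
    }
    if changed {
        return true // caller patches the object, then requeues
    }
    fmt.Println("finalizer and resolved observed; safe to create resources")
    return false
}

func main() {
    m := &machine{}
    for reconcile(m) {
    }
}

Each loop iteration stands in for a separate reconcile starting from persisted state; the real controller achieves the same effect by returning early and letting the patch helper write the object.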

File tree: 7 files changed, +576 −227 lines changed


controllers/openstackcluster_controller.go

Lines changed: 13 additions & 6 deletions
@@ -290,7 +290,18 @@ func deleteBastion(scope *scope.WithLogger, cluster *clusterv1.Cluster, openStac
         }
     }
 
-    if instanceStatus != nil {
+    // If no instance was created we currently need to check for orphaned
+    // volumes. This requires resolving the instance spec.
+    // TODO: write volumes to status resources on creation so this is no longer required.
+    if instanceStatus == nil {
+        instanceSpec, err := bastionToInstanceSpec(openStackCluster, cluster)
+        if err != nil {
+            return err
+        }
+        if err := computeService.DeleteVolumes(instanceSpec); err != nil {
+            return fmt.Errorf("delete volumes: %w", err)
+        }
+    } else {
         instanceNS, err := instanceStatus.NetworkStatus()
         if err != nil {
             return err
@@ -307,11 +318,7 @@ func deleteBastion(scope *scope.WithLogger, cluster *clusterv1.Cluster, openStac
         }
     }
 
-    instanceSpec, err := bastionToInstanceSpec(openStackCluster, cluster)
-    if err != nil {
-        return err
-    }
-    if err = computeService.DeleteInstance(openStackCluster, instanceStatus, instanceSpec); err != nil {
+    if err = computeService.DeleteInstance(openStackCluster, instanceStatus); err != nil {
         handleUpdateOSCError(openStackCluster, fmt.Errorf("failed to delete bastion: %w", err))
         return fmt.Errorf("failed to delete bastion: %w", err)
     }
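With volume deletion split out of DeleteInstance, deleteBastion only resolves the bastion spec when there is no instance at all. The sketch below illustrates the reasoning under an assumption implied by the TODO comment: boot-from-volume volumes are created before the server, under names derived from the instance spec, so a creation that failed early can leave volumes no server references. Types, names, and the naming scheme here are illustrative, not the provider's real API:

package main

import "fmt"

type instanceSpec struct {
    name          string
    hasRootVolume bool
}

// deleteVolumes re-derives the volume name from the spec and removes any
// orphan it finds in the (fake) cloud inventory.
func deleteVolumes(spec instanceSpec, cloud map[string]bool) {
    if !spec.hasRootVolume {
        return
    }
    name := fmt.Sprintf("%s-root", spec.name) // assumed deterministic naming
    if cloud[name] {
        delete(cloud, name)
        fmt.Println("deleted orphaned volume:", name)
    }
}

func main() {
    // The server "bastion" was never created, but its root volume was.
    cloud := map[string]bool{"bastion-root": true}
    deleteVolumes(instanceSpec{name: "bastion", hasRootVolume: true}, cloud)
}

When a server does exist, its attached volumes can be found from the server record itself, which is presumably why DeleteInstance no longer needs an instance spec argument.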

controllers/openstackcluster_controller_test.go

Lines changed: 4 additions & 1 deletion
@@ -208,6 +208,9 @@ var _ = Describe("OpenStackCluster controller", func() {
             testCluster.Status = infrav1.OpenStackClusterStatus{
                 Bastion: &infrav1.BastionStatus{
                     ID: "bastion-uuid",
+                    Resolved: &infrav1.ResolvedMachineSpec{
+                        ImageID: "imageID",
+                    },
                 },
             }
             err = k8sClient.Status().Update(ctx, testCluster)
@@ -221,8 +224,8 @@ var _ = Describe("OpenStackCluster controller", func() {
             computeClientRecorder.GetServer("bastion-uuid").Return(nil, gophercloud.ErrResourceNotFound{})
 
             err = deleteBastion(scope, capiCluster, testCluster)
-            Expect(testCluster.Status.Bastion).To(BeNil())
             Expect(err).To(BeNil())
+            Expect(testCluster.Status.Bastion).To(BeNil())
         })
         It("should adopt an existing bastion even if its uuid is not stored in status", func() {
             testCluster.SetName("adopt-existing-bastion")
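Incidentally, the reordered assertions encode a small testing discipline: check the returned error before asserting on derived state, so a failure reports its own cause rather than a downstream symptom. The same idea in a minimal standard-library test (a hypothetical stand-in, not the project's Ginkgo suite):

package example

import "testing"

// deleteThing stands in for deleteBastion: it clears state and may fail.
func deleteThing(state *string) error {
    *state = ""
    return nil
}

func TestDeleteThing(t *testing.T) {
    state := "bastion-uuid"
    err := deleteThing(&state)
    // Assert the error first: if the call failed, a state assertion
    // would only report a misleading secondary symptom.
    if err != nil {
        t.Fatalf("deleteThing: %v", err)
    }
    if state != "" {
        t.Errorf("state not cleared: %q", state)
    }
}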

controllers/openstackmachine_controller.go

Lines changed: 141 additions & 68 deletions
@@ -169,26 +169,21 @@ func resolveMachineResources(scope *scope.WithLogger, clusterResourceName string
         openStackMachine.Status.Resolved = resolved
     }
     // Resolve and store resources
-    changed, err := compute.ResolveMachineSpec(scope,
+    return compute.ResolveMachineSpec(scope,
         &openStackMachine.Spec, resolved,
         clusterResourceName, openStackMachine.Name,
         openStackCluster, getManagedSecurityGroup(openStackCluster, machine))
-    if err != nil {
-        return false, err
-    }
-    if changed {
-        // If the resolved machine spec changed we need to start the reconcile again to prevent inconsistency between reconciles.
-        return true, nil
-    }
+}
 
+func adoptMachineResources(scope *scope.WithLogger, openStackMachine *infrav1.OpenStackMachine) error {
     resources := openStackMachine.Status.Resources
     if resources == nil {
         resources = &infrav1.MachineResources{}
         openStackMachine.Status.Resources = resources
     }
 
     // Adopt any existing resources
-    return false, compute.AdoptMachineResources(scope, resolved, resources)
+    return compute.AdoptMachineResources(scope, openStackMachine.Status.Resolved, resources)
 }
 
 func patchMachine(ctx context.Context, patchHelper *patch.Helper, openStackMachine *infrav1.OpenStackMachine, machine *clusterv1.Machine, options ...patch.Option) error {
@@ -256,66 +251,72 @@ func (r *OpenStackMachineReconciler) reconcileDelete(scope *scope.WithLogger, cl
         return ctrl.Result{}, err
     }
 
-    // We may have resources to adopt if the cluster is ready
-    if openStackCluster.Status.Ready && openStackCluster.Status.Network != nil {
-        if _, err := resolveMachineResources(scope, clusterResourceName, openStackCluster, openStackMachine, machine); err != nil {
-            return ctrl.Result{}, err
-        }
+    // Nothing to do if the cluster is not ready because no machine resources were created.
+    if !openStackCluster.Status.Ready || openStackCluster.Status.Network == nil {
+        // The finalizer should not have been added yet in this case,
+        // but the following handles the upgrade case.
+        controllerutil.RemoveFinalizer(openStackMachine, infrav1.MachineFinalizer)
+        return ctrl.Result{}, nil
     }
 
-    if openStackCluster.Spec.APIServerLoadBalancer.IsEnabled() {
-        loadBalancerService, err := loadbalancer.NewService(scope)
-        if err != nil {
-            return ctrl.Result{}, err
-        }
+    // For machines created after v0.10, or any machine which has been
+    // reconciled at least once by v0.10 or later, status.Resolved always
+    // exists before any resources are created. We can therefore assume
+    // that if it does not exist, no resources were created.
+    //
+    // There is an upgrade edge case where a machine may have been marked
+    // deleted before upgrade but we are completing it after upgrade. For
+    // this use case only we make a best effort to resolve resources before
+    // continuing, but if we get an error we log it and continue anyway.
+    // This has the potential to leak resources, but only in this specific
+    // edge case. The alternative is to continue retrying until it succeeds,
+    // but that risks never deleting a machine which cannot be resolved due
+    // to a spec error.
+    //
+    // This code can and should be deleted in a future release when we are
+    // sure that all machines have been reconciled at least by a v0.10 or
+    // later controller.
+    if _, err := resolveMachineResources(scope, clusterResourceName, openStackCluster, openStackMachine, machine); err != nil {
+        // Return the error, but allow the resource to be removed anyway.
+        controllerutil.RemoveFinalizer(openStackMachine, infrav1.MachineFinalizer)
+        return ctrl.Result{}, err
+    }
 
-        err = loadBalancerService.DeleteLoadBalancerMember(openStackCluster, machine, openStackMachine, clusterResourceName)
-        if err != nil {
-            conditions.MarkFalse(openStackMachine, infrav1.APIServerIngressReadyCondition, infrav1.LoadBalancerMemberErrorReason, clusterv1.ConditionSeverityWarning, "Machine could not be removed from load balancer: %v", err)
-            return ctrl.Result{}, err
-        }
+    // Check for any orphaned resources
+    // N.B. Unlike resolveMachineResources, we must always look for orphaned resources in the delete path.
+    if err := adoptMachineResources(scope, openStackMachine); err != nil {
+        return ctrl.Result{}, fmt.Errorf("adopting machine resources: %w", err)
     }
 
-    var instanceStatus *compute.InstanceStatus
-    if openStackMachine.Status.InstanceID != nil {
-        instanceStatus, err = computeService.GetInstanceStatus(*openStackMachine.Status.InstanceID)
-        if err != nil {
-            return ctrl.Result{}, err
-        }
-    } else if instanceStatus, err = computeService.GetInstanceStatusByName(openStackMachine, openStackMachine.Name); err != nil {
+    instanceStatus, err := getInstanceStatus(openStackMachine, computeService)
+    if err != nil {
         return ctrl.Result{}, err
     }
-    if !openStackCluster.Spec.APIServerLoadBalancer.IsEnabled() && util.IsControlPlaneMachine(machine) && openStackCluster.Spec.APIServerFloatingIP == nil {
-        if instanceStatus != nil {
-            instanceNS, err := instanceStatus.NetworkStatus()
-            if err != nil {
-                openStackMachine.SetFailure(
-                    capierrors.UpdateMachineError,
-                    fmt.Errorf("get network status for OpenStack instance %s with ID %s: %v", instanceStatus.Name(), instanceStatus.ID(), err),
-                )
-                return ctrl.Result{}, nil
-            }
 
-            addresses := instanceNS.Addresses()
-            for _, address := range addresses {
-                if address.Type == corev1.NodeExternalIP {
-                    if err = networkingService.DeleteFloatingIP(openStackMachine, address.Address); err != nil {
-                        conditions.MarkFalse(openStackMachine, infrav1.APIServerIngressReadyCondition, infrav1.FloatingIPErrorReason, clusterv1.ConditionSeverityError, "Deleting floating IP failed: %v", err)
-                        return ctrl.Result{}, fmt.Errorf("delete floating IP %q: %w", address.Address, err)
-                    }
-                }
-            }
+    if util.IsControlPlaneMachine(machine) {
+        if err := removeAPIServerEndpoint(scope, openStackCluster, openStackMachine, instanceStatus, clusterResourceName); err != nil {
+            return ctrl.Result{}, err
         }
     }
 
-    instanceSpec, err := machineToInstanceSpec(openStackCluster, machine, openStackMachine, "")
-    if err != nil {
-        return ctrl.Result{}, err
+    // If no instance was created we currently need to check for orphaned
+    // volumes. This requires resolving the instance spec.
+    // TODO: write volumes to status resources on creation so this is no longer required.
+    if instanceStatus == nil && openStackMachine.Status.Resolved != nil {
+        instanceSpec, err := machineToInstanceSpec(openStackCluster, machine, openStackMachine, "")
+        if err != nil {
+            return ctrl.Result{}, err
        }
+        if err := computeService.DeleteVolumes(instanceSpec); err != nil {
+            return ctrl.Result{}, fmt.Errorf("delete volumes: %w", err)
+        }
     }
 
-    if err := computeService.DeleteInstance(openStackMachine, instanceStatus, instanceSpec); err != nil {
-        conditions.MarkFalse(openStackMachine, infrav1.InstanceReadyCondition, infrav1.InstanceDeleteFailedReason, clusterv1.ConditionSeverityError, "Deleting instance failed: %v", err)
-        return ctrl.Result{}, fmt.Errorf("delete instance: %w", err)
+    if instanceStatus != nil {
+        if err := computeService.DeleteInstance(openStackMachine, instanceStatus); err != nil {
+            conditions.MarkFalse(openStackMachine, infrav1.InstanceReadyCondition, infrav1.InstanceDeleteFailedReason, clusterv1.ConditionSeverityError, "Deleting instance failed: %v", err)
+            return ctrl.Result{}, fmt.Errorf("delete instance: %w", err)
+        }
     }
 
     trunkSupported, err := networkingService.IsTrunkExtSupported()
@@ -341,6 +342,62 @@ func (r *OpenStackMachineReconciler) reconcileDelete(scope *scope.WithLogger, cl
     return ctrl.Result{}, nil
 }
 
+func getInstanceStatus(openStackMachine *infrav1.OpenStackMachine, computeService *compute.Service) (*compute.InstanceStatus, error) {
+    if openStackMachine.Status.InstanceID != nil {
+        return computeService.GetInstanceStatus(*openStackMachine.Status.InstanceID)
+    }
+    return computeService.GetInstanceStatusByName(openStackMachine, openStackMachine.Name)
+}
+
+func removeAPIServerEndpoint(scope *scope.WithLogger, openStackCluster *infrav1.OpenStackCluster, openStackMachine *infrav1.OpenStackMachine, instanceStatus *compute.InstanceStatus, clusterResourceName string) error {
+    if openStackCluster.Spec.APIServerLoadBalancer.IsEnabled() {
+        loadBalancerService, err := loadbalancer.NewService(scope)
+        if err != nil {
+            return err
+        }
+
+        err = loadBalancerService.DeleteLoadBalancerMember(openStackCluster, openStackMachine, clusterResourceName)
+        if err != nil {
+            conditions.MarkFalse(openStackMachine, infrav1.APIServerIngressReadyCondition, infrav1.LoadBalancerMemberErrorReason, clusterv1.ConditionSeverityWarning, "Machine could not be removed from load balancer: %v", err)
+            return err
+        }
+        return nil
+    }
+
+    // XXX(mdbooth): This looks wrong to me. Surely we should only ever
+    // disassociate the floating IP here. I would expect the API server
+    // floating IP to be created and deleted with the cluster. And if the
+    // delete behaviour is correct, we leak it if the instance was
+    // previously deleted.
+    if openStackCluster.Spec.APIServerFloatingIP == nil && instanceStatus != nil {
+        instanceNS, err := instanceStatus.NetworkStatus()
+        if err != nil {
+            openStackMachine.SetFailure(
+                capierrors.UpdateMachineError,
+                fmt.Errorf("get network status for OpenStack instance %s with ID %s: %v", instanceStatus.Name(), instanceStatus.ID(), err),
+            )
+            return nil
+        }
+
+        networkingService, err := networking.NewService(scope)
+        if err != nil {
+            return err
+        }
+
+        addresses := instanceNS.Addresses()
+        for _, address := range addresses {
+            if address.Type == corev1.NodeExternalIP {
+                if err = networkingService.DeleteFloatingIP(openStackMachine, address.Address); err != nil {
+                    conditions.MarkFalse(openStackMachine, infrav1.APIServerIngressReadyCondition, infrav1.FloatingIPErrorReason, clusterv1.ConditionSeverityError, "Deleting floating IP failed: %v", err)
+                    return fmt.Errorf("delete floating IP %q: %w", address.Address, err)
+                }
+            }
+        }
+    }
+
+    return nil
+}
+
 // GetPortIDs returns a list of port IDs from a list of PortStatus.
 func GetPortIDs(ports []infrav1.PortStatus) []string {
     portIDs := make([]string, len(ports))
@@ -489,34 +546,50 @@ func (r *OpenStackMachineReconciler) reconcileNormal(ctx context.Context, scope
     return ctrl.Result{}, nil
 }
 
-    // If the OpenStackMachine doesn't have our finalizer, add it.
-    if controllerutil.AddFinalizer(openStackMachine, infrav1.MachineFinalizer) {
-        // Register the finalizer immediately to avoid orphaning OpenStack resources on delete
-        return ctrl.Result{}, nil
-    }
-
     if !openStackCluster.Status.Ready {
         scope.Logger().Info("Cluster infrastructure is not ready yet, re-queuing machine")
         conditions.MarkFalse(openStackMachine, infrav1.InstanceReadyCondition, infrav1.WaitingForClusterInfrastructureReason, clusterv1.ConditionSeverityInfo, "")
        return ctrl.Result{RequeueAfter: waitForClusterInfrastructureReadyDuration}, nil
     }
 
-    if changed, err := resolveMachineResources(scope, clusterResourceName, openStackCluster, openStackMachine, machine); changed || err != nil {
-        scope.Logger().V(6).Info("Machine resources updated, requeuing")
-        return ctrl.Result{}, err
-    }
-
     // Make sure bootstrap data is available and populated.
     if machine.Spec.Bootstrap.DataSecretName == nil {
         scope.Logger().Info("Bootstrap data secret reference is not yet available")
         conditions.MarkFalse(openStackMachine, infrav1.InstanceReadyCondition, infrav1.WaitingForBootstrapDataReason, clusterv1.ConditionSeverityInfo, "")
         return ctrl.Result{}, nil
     }
-    userData, err := r.getBootstrapData(ctx, machine, openStackMachine)
+
+    changed, err := resolveMachineResources(scope, clusterResourceName, openStackCluster, openStackMachine, machine)
     if err != nil {
         return ctrl.Result{}, err
     }
+
+    // Also add the finalizer when writing resolved resources so we can start creating resources on the next reconcile.
+    if controllerutil.AddFinalizer(openStackMachine, infrav1.MachineFinalizer) {
+        changed = true
+    }
+
+    // We requeue if we either added the finalizer or resolved machine
+    // resources. This means that we never create any resources unless we
+    // have observed that the finalizer and resolved machine resources were
+    // successfully written in a previous transaction. This in turn means
+    // that in the delete path we can be sure that if there are no resolved
+    // resources then no resources were created.
+    if changed {
+        scope.Logger().V(6).Info("Machine resources updated, requeuing")
+        return ctrl.Result{}, nil
+    }
+
+    // Check for orphaned resources previously created but not written to the status
+    if err := adoptMachineResources(scope, openStackMachine); err != nil {
+        return ctrl.Result{}, fmt.Errorf("adopting machine resources: %w", err)
    }
+
     scope.Logger().Info("Reconciling Machine")
+    userData, err := r.getBootstrapData(ctx, machine, openStackMachine)
+    if err != nil {
+        return ctrl.Result{}, err
+    }
 
     computeService, err := compute.NewService(scope)
     if err != nil {