Skip to content

Commit da795db

Browse files
authored
Merge pull request kubernetes-sigs#10177 from g-gaston/race-condition-machine-controller-release-1.6
[release-1.6] 🐛 Watch external objects for machine before deleting
2 parents fc43a06 + a64cd41 commit da795db

File tree

3 files changed

+241
-35
lines changed

3 files changed

+241
-35
lines changed

internal/controllers/machine/machine_controller.go

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -333,7 +333,7 @@ func (r *Reconciler) reconcileDelete(ctx context.Context, cluster *clusterv1.Clu
333333
if err != nil {
334334
switch err {
335335
case errNoControlPlaneNodes, errLastControlPlaneNode, errNilNodeRef, errClusterIsBeingDeleted, errControlPlaneIsBeingDeleted:
336-
var nodeName = ""
336+
nodeName := ""
337337
if m.Status.NodeRef != nil {
338338
nodeName = m.Status.NodeRef.Name
339339
}
@@ -427,7 +427,7 @@ func (r *Reconciler) reconcileDelete(ctx context.Context, cluster *clusterv1.Clu
427427
return ctrl.Result{}, errors.Wrap(err, "failed to patch Machine")
428428
}
429429

430-
infrastructureDeleted, err := r.reconcileDeleteInfrastructure(ctx, m)
430+
infrastructureDeleted, err := r.reconcileDeleteInfrastructure(ctx, cluster, m)
431431
if err != nil {
432432
return ctrl.Result{}, err
433433
}
@@ -436,7 +436,7 @@ func (r *Reconciler) reconcileDelete(ctx context.Context, cluster *clusterv1.Clu
436436
return ctrl.Result{}, nil
437437
}
438438

439-
bootstrapDeleted, err := r.reconcileDeleteBootstrap(ctx, m)
439+
bootstrapDeleted, err := r.reconcileDeleteBootstrap(ctx, cluster, m)
440440
if err != nil {
441441
return ctrl.Result{}, err
442442
}
@@ -723,8 +723,8 @@ func (r *Reconciler) deleteNode(ctx context.Context, cluster *clusterv1.Cluster,
723723
return nil
724724
}
725725

726-
func (r *Reconciler) reconcileDeleteBootstrap(ctx context.Context, m *clusterv1.Machine) (bool, error) {
727-
obj, err := r.reconcileDeleteExternal(ctx, m, m.Spec.Bootstrap.ConfigRef)
726+
func (r *Reconciler) reconcileDeleteBootstrap(ctx context.Context, cluster *clusterv1.Cluster, m *clusterv1.Machine) (bool, error) {
727+
obj, err := r.reconcileDeleteExternal(ctx, cluster, m, m.Spec.Bootstrap.ConfigRef)
728728
if err != nil {
729729
return false, err
730730
}
@@ -743,8 +743,8 @@ func (r *Reconciler) reconcileDeleteBootstrap(ctx context.Context, m *clusterv1.
743743
return false, nil
744744
}
745745

746-
func (r *Reconciler) reconcileDeleteInfrastructure(ctx context.Context, m *clusterv1.Machine) (bool, error) {
747-
obj, err := r.reconcileDeleteExternal(ctx, m, &m.Spec.InfrastructureRef)
746+
func (r *Reconciler) reconcileDeleteInfrastructure(ctx context.Context, cluster *clusterv1.Cluster, m *clusterv1.Machine) (bool, error) {
747+
obj, err := r.reconcileDeleteExternal(ctx, cluster, m, &m.Spec.InfrastructureRef)
748748
if err != nil {
749749
return false, err
750750
}
@@ -764,7 +764,7 @@ func (r *Reconciler) reconcileDeleteInfrastructure(ctx context.Context, m *clust
764764
}
765765

766766
// reconcileDeleteExternal tries to delete external references.
767-
func (r *Reconciler) reconcileDeleteExternal(ctx context.Context, m *clusterv1.Machine, ref *corev1.ObjectReference) (*unstructured.Unstructured, error) {
767+
func (r *Reconciler) reconcileDeleteExternal(ctx context.Context, cluster *clusterv1.Cluster, m *clusterv1.Machine, ref *corev1.ObjectReference) (*unstructured.Unstructured, error) {
768768
if ref == nil {
769769
return nil, nil
770770
}
@@ -777,6 +777,14 @@ func (r *Reconciler) reconcileDeleteExternal(ctx context.Context, m *clusterv1.M
777777
}
778778

779779
if obj != nil {
780+
// reconcileExternal ensures that we set the object's OwnerReferences correctly and watch the object.
781+
// The machine delete logic depends on reconciling the machine when the external objects are deleted.
782+
// This avoids a race condition where the machine is deleted before the external objects are ever reconciled
783+
// by this controller.
784+
if _, err := r.ensureExternalOwnershipAndWatch(ctx, cluster, m, ref); err != nil {
785+
return nil, err
786+
}
787+
780788
// Issue a delete request.
781789
if err := r.Client.Delete(ctx, obj); err != nil && !apierrors.IsNotFound(err) {
782790
return obj, errors.Wrapf(err,

internal/controllers/machine/machine_controller_phases.go

Lines changed: 35 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -43,9 +43,7 @@ import (
4343
"sigs.k8s.io/cluster-api/util/patch"
4444
)
4545

46-
var (
47-
externalReadyWait = 30 * time.Second
48-
)
46+
var externalReadyWait = 30 * time.Second
4947

5048
func (r *Reconciler) reconcilePhase(_ context.Context, m *clusterv1.Machine) {
5149
originalPhase := m.Status.Phase
@@ -89,12 +87,44 @@ func (r *Reconciler) reconcilePhase(_ context.Context, m *clusterv1.Machine) {
8987

9088
// reconcileExternal handles generic unstructured objects referenced by a Machine.
9189
func (r *Reconciler) reconcileExternal(ctx context.Context, cluster *clusterv1.Cluster, m *clusterv1.Machine, ref *corev1.ObjectReference) (external.ReconcileOutput, error) {
92-
log := ctrl.LoggerFrom(ctx)
93-
9490
if err := utilconversion.UpdateReferenceAPIContract(ctx, r.Client, ref); err != nil {
9591
return external.ReconcileOutput{}, err
9692
}
9793

94+
result, err := r.ensureExternalOwnershipAndWatch(ctx, cluster, m, ref)
95+
if err != nil {
96+
return external.ReconcileOutput{}, err
97+
}
98+
if result.RequeueAfter > 0 || result.Paused {
99+
return result, nil
100+
}
101+
102+
obj := result.Result
103+
104+
// Set failure reason and message, if any.
105+
failureReason, failureMessage, err := external.FailuresFrom(obj)
106+
if err != nil {
107+
return external.ReconcileOutput{}, err
108+
}
109+
if failureReason != "" {
110+
machineStatusError := capierrors.MachineStatusError(failureReason)
111+
m.Status.FailureReason = &machineStatusError
112+
}
113+
if failureMessage != "" {
114+
m.Status.FailureMessage = pointer.String(
115+
fmt.Sprintf("Failure detected from referenced resource %v with name %q: %s",
116+
obj.GroupVersionKind(), obj.GetName(), failureMessage),
117+
)
118+
}
119+
120+
return external.ReconcileOutput{Result: obj}, nil
121+
}
122+
123+
// ensureExternalOwnershipAndWatch ensures that only the Machine owns the external object,
124+
// adds a watch to the external object if one does not already exist and adds the necessary labels.
125+
func (r *Reconciler) ensureExternalOwnershipAndWatch(ctx context.Context, cluster *clusterv1.Cluster, m *clusterv1.Machine, ref *corev1.ObjectReference) (external.ReconcileOutput, error) {
126+
log := ctrl.LoggerFrom(ctx)
127+
98128
obj, err := external.Get(ctx, r.UnstructuredCachingClient, ref, m.Namespace)
99129
if err != nil {
100130
if apierrors.IsNotFound(errors.Cause(err)) {
@@ -146,22 +176,6 @@ func (r *Reconciler) reconcileExternal(ctx context.Context, cluster *clusterv1.C
146176
return external.ReconcileOutput{}, err
147177
}
148178

149-
// Set failure reason and message, if any.
150-
failureReason, failureMessage, err := external.FailuresFrom(obj)
151-
if err != nil {
152-
return external.ReconcileOutput{}, err
153-
}
154-
if failureReason != "" {
155-
machineStatusError := capierrors.MachineStatusError(failureReason)
156-
m.Status.FailureReason = &machineStatusError
157-
}
158-
if failureMessage != "" {
159-
m.Status.FailureMessage = pointer.String(
160-
fmt.Sprintf("Failure detected from referenced resource %v with name %q: %s",
161-
obj.GroupVersionKind(), obj.GetName(), failureMessage),
162-
)
163-
}
164-
165179
return external.ReconcileOutput{Result: obj}, nil
166180
}
167181

0 commit comments

Comments
 (0)