Skip to content

Commit 8ca8170

Browse files
authored
Merge pull request #8464 from ykakarap/pr-mhc-upgrade-deadlock-fix
🐛 avoid errors when MHC and upgrade occur together in classy clusters
2 parents a2c1f5d + 20032af commit 8ca8170

File tree

9 files changed

+357
-194
lines changed

9 files changed

+357
-194
lines changed

internal/controllers/topology/cluster/conditions.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,11 @@ func (r *Reconciler) reconcileTopologyReconciledCondition(s *scope.Scope, cluste
133133
case s.UpgradeTracker.ControlPlane.IsScaling:
134134
msgBuilder.WriteString(" Control plane is reconciling desired replicas")
135135

136+
case len(s.UpgradeTracker.MachineDeployments.UpgradingNames()) > 0:
137+
fmt.Fprintf(msgBuilder, " MachineDeployment(s) %s are upgrading",
138+
computeMachineDeploymentNameList(s.UpgradeTracker.MachineDeployments.UpgradingNames()),
139+
)
140+
136141
case s.Current.MachineDeployments.IsAnyRollingOut():
137142
fmt.Fprintf(msgBuilder, " MachineDeployment(s) %s are rolling out",
138143
computeMachineDeploymentNameList(s.UpgradeTracker.MachineDeployments.RolloutNames()),

internal/controllers/topology/cluster/conditions_test.go

Lines changed: 42 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,9 @@ import (
2323
"github.com/pkg/errors"
2424
corev1 "k8s.io/api/core/v1"
2525
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
26+
"k8s.io/apimachinery/pkg/runtime"
27+
"sigs.k8s.io/controller-runtime/pkg/client"
28+
"sigs.k8s.io/controller-runtime/pkg/client/fake"
2629

2730
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
2831
runtimehooksv1 "sigs.k8s.io/cluster-api/exp/runtime/hooks/api/v1alpha1"
@@ -32,11 +35,16 @@ import (
3235
)
3336

3437
func TestReconcileTopologyReconciledCondition(t *testing.T) {
38+
g := NewWithT(t)
39+
scheme := runtime.NewScheme()
40+
g.Expect(clusterv1.AddToScheme(scheme)).To(Succeed())
41+
3542
tests := []struct {
3643
name string
3744
reconcileErr error
3845
s *scope.Scope
3946
cluster *clusterv1.Cluster
47+
machines []*clusterv1.Machine
4048
wantConditionStatus corev1.ConditionStatus
4149
wantConditionReason string
4250
wantConditionMessage string
@@ -382,7 +390,7 @@ func TestReconcileTopologyReconciledCondition(t *testing.T) {
382390
wantConditionStatus: corev1.ConditionTrue,
383391
},
384392
{
385-
name: "should set the condition to false is some machine deployments have not picked the new version because other machine deployments are rolling out (not all replicas ready)",
393+
name: "should set the condition to false if some machine deployments have not picked the new version because other machine deployments are upgrading",
386394
reconcileErr: nil,
387395
cluster: &clusterv1.Cluster{},
388396
s: &scope.Scope{
@@ -404,6 +412,11 @@ func TestReconcileTopologyReconciledCondition(t *testing.T) {
404412
Object: builder.MachineDeployment("ns1", "md0-abc123").
405413
WithReplicas(2).
406414
WithVersion("v1.22.0").
415+
WithSelector(metav1.LabelSelector{
416+
MatchLabels: map[string]string{
417+
clusterv1.ClusterTopologyMachineDeploymentNameLabel: "md0",
418+
},
419+
}).
407420
WithStatus(clusterv1.MachineDeploymentStatus{
408421
// MD is not ready because we don't have 2 updated, ready and available replicas.
409422
Replicas: int32(2),
@@ -418,67 +431,11 @@ func TestReconcileTopologyReconciledCondition(t *testing.T) {
418431
Object: builder.MachineDeployment("ns1", "md1-abc123").
419432
WithReplicas(2).
420433
WithVersion("v1.21.2").
421-
WithStatus(clusterv1.MachineDeploymentStatus{
422-
Replicas: int32(2),
423-
UpdatedReplicas: int32(2),
424-
ReadyReplicas: int32(2),
425-
AvailableReplicas: int32(2),
426-
UnavailableReplicas: int32(0),
434+
WithSelector(metav1.LabelSelector{
435+
MatchLabels: map[string]string{
436+
clusterv1.ClusterTopologyMachineDeploymentNameLabel: "md1",
437+
},
427438
}).
428-
Build(),
429-
},
430-
},
431-
},
432-
UpgradeTracker: func() *scope.UpgradeTracker {
433-
ut := scope.NewUpgradeTracker()
434-
ut.ControlPlane.PendingUpgrade = false
435-
ut.MachineDeployments.MarkRollingOut("md0-abc123")
436-
ut.MachineDeployments.MarkPendingUpgrade("md1-abc123")
437-
return ut
438-
}(),
439-
HookResponseTracker: scope.NewHookResponseTracker(),
440-
},
441-
wantConditionStatus: corev1.ConditionFalse,
442-
wantConditionReason: clusterv1.TopologyReconciledMachineDeploymentsUpgradePendingReason,
443-
wantConditionMessage: "MachineDeployment(s) md1-abc123 upgrade to version v1.22.0 on hold. MachineDeployment(s) md0-abc123 are rolling out",
444-
},
445-
{
446-
name: "should set the condition to false if some machine deployments have not picked the new version because other machine deployments are rolling out (unavailable replica)",
447-
reconcileErr: nil,
448-
cluster: &clusterv1.Cluster{},
449-
s: &scope.Scope{
450-
Blueprint: &scope.ClusterBlueprint{
451-
Topology: &clusterv1.Topology{
452-
Version: "v1.22.0",
453-
},
454-
},
455-
Current: &scope.ClusterState{
456-
Cluster: &clusterv1.Cluster{},
457-
ControlPlane: &scope.ControlPlaneState{
458-
Object: builder.ControlPlane("ns1", "controlplane1").
459-
WithVersion("v1.22.0").
460-
WithReplicas(3).
461-
Build(),
462-
},
463-
MachineDeployments: scope.MachineDeploymentsStateMap{
464-
"md0": &scope.MachineDeploymentState{
465-
Object: builder.MachineDeployment("ns1", "md0-abc123").
466-
WithReplicas(2).
467-
WithVersion("v1.22.0").
468-
WithStatus(clusterv1.MachineDeploymentStatus{
469-
// MD is not ready because we still have an unavailable replica.
470-
Replicas: int32(2),
471-
UpdatedReplicas: int32(2),
472-
ReadyReplicas: int32(2),
473-
AvailableReplicas: int32(2),
474-
UnavailableReplicas: int32(1),
475-
}).
476-
Build(),
477-
},
478-
"md1": &scope.MachineDeploymentState{
479-
Object: builder.MachineDeployment("ns1", "md1-abc123").
480-
WithReplicas(2).
481-
WithVersion("v1.21.2").
482439
WithStatus(clusterv1.MachineDeploymentStatus{
483440
Replicas: int32(2),
484441
UpdatedReplicas: int32(2),
@@ -493,15 +450,25 @@ func TestReconcileTopologyReconciledCondition(t *testing.T) {
493450
UpgradeTracker: func() *scope.UpgradeTracker {
494451
ut := scope.NewUpgradeTracker()
495452
ut.ControlPlane.PendingUpgrade = false
496-
ut.MachineDeployments.MarkRollingOut("md0-abc123")
453+
ut.MachineDeployments.MarkUpgradingAndRollingOut("md0-abc123")
497454
ut.MachineDeployments.MarkPendingUpgrade("md1-abc123")
498455
return ut
499456
}(),
500457
HookResponseTracker: scope.NewHookResponseTracker(),
501458
},
459+
machines: []*clusterv1.Machine{
460+
builder.Machine("ns1", "md0-machine0").
461+
WithLabels(map[string]string{clusterv1.ClusterTopologyMachineDeploymentNameLabel: "md0"}).
462+
WithVersion("v1.21.2"). // Machine's version does not match MachineDeployment's version
463+
Build(),
464+
builder.Machine("ns1", "md1-machine0").
465+
WithLabels(map[string]string{clusterv1.ClusterTopologyMachineDeploymentNameLabel: "md1"}).
466+
WithVersion("v1.21.2").
467+
Build(),
468+
},
502469
wantConditionStatus: corev1.ConditionFalse,
503470
wantConditionReason: clusterv1.TopologyReconciledMachineDeploymentsUpgradePendingReason,
504-
wantConditionMessage: "MachineDeployment(s) md1-abc123 upgrade to version v1.22.0 on hold. MachineDeployment(s) md0-abc123 are rolling out",
471+
wantConditionMessage: "MachineDeployment(s) md1-abc123 upgrade to version v1.22.0 on hold. MachineDeployment(s) md0-abc123 are upgrading",
505472
},
506473
{
507474
name: "should set the condition to false if some machine deployments have not picked the new version because their upgrade has been deferred",
@@ -624,7 +591,18 @@ func TestReconcileTopologyReconciledCondition(t *testing.T) {
624591
t.Run(tt.name, func(t *testing.T) {
625592
g := NewWithT(t)
626593

627-
r := &Reconciler{}
594+
objs := []client.Object{}
595+
if tt.s != nil && tt.s.Current != nil {
596+
for _, mds := range tt.s.Current.MachineDeployments {
597+
objs = append(objs, mds.Object)
598+
}
599+
}
600+
for _, m := range tt.machines {
601+
objs = append(objs, m)
602+
}
603+
fakeClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(objs...).Build()
604+
605+
r := &Reconciler{Client: fakeClient}
628606
err := r.reconcileTopologyReconciledCondition(tt.s, tt.cluster, tt.reconcileErr)
629607
if tt.wantErr {
630608
g.Expect(err).To(HaveOccurred())

internal/controllers/topology/cluster/desired_state.go

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ func (r *Reconciler) computeDesiredState(ctx context.Context, s *scope.Scope) (*
8989
// If required, compute the desired state of the MachineDeployments from the list of MachineDeploymentTopologies
9090
// defined in the cluster.
9191
if s.Blueprint.HasMachineDeployments() {
92-
desiredState.MachineDeployments, err = computeMachineDeployments(ctx, s, desiredState.ControlPlane)
92+
desiredState.MachineDeployments, err = r.computeMachineDeployments(ctx, s, desiredState.ControlPlane)
9393
if err != nil {
9494
return nil, errors.Wrapf(err, "failed to compute MachineDeployments")
9595
}
@@ -523,12 +523,22 @@ func calculateRefDesiredAPIVersion(currentRef *corev1.ObjectReference, desiredRe
523523
}
524524

525525
// computeMachineDeployments computes the desired state of the list of MachineDeployments.
526-
func computeMachineDeployments(ctx context.Context, s *scope.Scope, desiredControlPlaneState *scope.ControlPlaneState) (scope.MachineDeploymentsStateMap, error) {
527-
// Mark all the machine deployments that are currently rolling out.
528-
// This captured information will be used for
529-
// - Building the TopologyReconciled condition.
530-
// - Making upgrade decisions on machine deployments.
526+
func (r *Reconciler) computeMachineDeployments(ctx context.Context, s *scope.Scope, desiredControlPlaneState *scope.ControlPlaneState) (scope.MachineDeploymentsStateMap, error) {
527+
// Mark all the MachineDeployments that are currently upgrading.
528+
// This captured information is used for:
529+
// - Building the TopologyReconciled condition.
530+
// - Making upgrade decisions on machine deployments.
531+
upgradingMDs, err := s.Current.MachineDeployments.Upgrading(ctx, r.Client)
532+
if err != nil {
533+
return nil, err
534+
}
535+
s.UpgradeTracker.MachineDeployments.MarkUpgradingAndRollingOut(upgradingMDs...)
536+
537+
// Mark all MachineDeployments that are currently rolling out.
538+
// This captured information is used for:
539+
// - Building the TopologyReconciled condition (when control plane upgrade is pending)
531540
s.UpgradeTracker.MachineDeployments.MarkRollingOut(s.Current.MachineDeployments.RollingOut()...)
541+
532542
machineDeploymentsStateMap := make(scope.MachineDeploymentsStateMap)
533543
for _, mdTopology := range s.Blueprint.Topology.Workers.MachineDeployments {
534544
desiredMachineDeployment, err := computeMachineDeployment(ctx, s, desiredControlPlaneState, mdTopology)
@@ -844,7 +854,7 @@ func computeMachineDeploymentVersion(s *scope.Scope, machineDeploymentTopology c
844854

845855
// Control plane and machine deployments are stable.
846856
// Ready to pick up the topology version.
847-
s.UpgradeTracker.MachineDeployments.MarkRollingOut(currentMDState.Object.Name)
857+
s.UpgradeTracker.MachineDeployments.MarkUpgradingAndRollingOut(currentMDState.Object.Name)
848858
return desiredVersion, nil
849859
}
850860

0 commit comments

Comments
 (0)