Skip to content

Commit a922ab8

Browse files
🐛 Fix race conditions ScaleDownOldMS (#12812)
* Fix ScaleDownOldMS * Minor improvements * Address feedback * Fix review findings Signed-off-by: Stefan Büringer [email protected] * More comments --------- Signed-off-by: Stefan Büringer [email protected] Co-authored-by: Stefan Bueringer <[email protected]>
1 parent 82f5743 commit a922ab8

12 files changed

+1442
-333
lines changed

internal/controllers/machinedeployment/machinedeployment_rolling.go

Lines changed: 196 additions & 156 deletions
Large diffs are not rendered by default.

internal/controllers/machinedeployment/machinedeployment_rolling_test.go

Lines changed: 421 additions & 126 deletions
Large diffs are not rendered by default.

internal/controllers/machinedeployment/machinedeployment_rollout_sequence_test.go

Lines changed: 77 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ import (
2828
"time"
2929

3030
"github.com/google/go-cmp/cmp"
31+
"github.com/onsi/ginkgo/v2"
3132
. "github.com/onsi/gomega"
3233
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3334
"k8s.io/apimachinery/pkg/util/intstr"
@@ -39,7 +40,7 @@ import (
3940
"sigs.k8s.io/cluster-api/internal/controllers/machinedeployment/mdutil"
4041
)
4142

42-
type rolloutSequenceTestCase struct {
43+
type rolloutRollingSequenceTestCase struct {
4344
name string
4445
maxSurge int32
4546
maxUnavailable int32
@@ -107,11 +108,12 @@ type rolloutSequenceTestCase struct {
107108
seed int64
108109
}
109110

110-
func Test_rolloutSequencesWithPredictableReconcileOrder(t *testing.T) {
111+
func Test_rolloutRollingSequences(t *testing.T) {
111112
ctx := context.Background()
112113
ctx = ctrl.LoggerInto(ctx, klog.Background())
114+
klog.SetOutput(ginkgo.GinkgoWriter)
113115

114-
tests := []rolloutSequenceTestCase{
116+
tests := []rolloutRollingSequenceTestCase{
115117
// Regular rollout (no in-place)
116118

117119
{ // scale out by 1
@@ -161,7 +163,7 @@ func Test_rolloutSequencesWithPredictableReconcileOrder(t *testing.T) {
161163
maxSurge: 3,
162164
maxUnavailable: 1,
163165
currentScope: &rolloutScope{ // Manually providing a scope simulating a MD originally with 6 replica in the middle of a rollout, with 3 machines already created in the newMS and 3 still on the oldMS, and then MD scaled up to 12.
164-
machineDeployment: createMD("v2", 12, 3, 1),
166+
machineDeployment: createMD("v2", 12, withRolloutStrategy(3, 1)),
165167
machineSets: []*clusterv1.MachineSet{
166168
createMS("ms1", "v1", 3),
167169
createMS("ms2", "v2", 3),
@@ -189,7 +191,7 @@ func Test_rolloutSequencesWithPredictableReconcileOrder(t *testing.T) {
189191
maxSurge: 3,
190192
maxUnavailable: 1,
191193
currentScope: &rolloutScope{ // Manually providing a scope simulating a MD originally with 12 replica in the middle of a rollout, with 3 machines already created in the newMS and 9 still on the oldMS, and then MD scaled down to 6.
192-
machineDeployment: createMD("v2", 6, 3, 1),
194+
machineDeployment: createMD("v2", 6, withRolloutStrategy(3, 1)),
193195
machineSets: []*clusterv1.MachineSet{
194196
createMS("ms1", "v1", 9),
195197
createMS("ms2", "v2", 3),
@@ -223,7 +225,7 @@ func Test_rolloutSequencesWithPredictableReconcileOrder(t *testing.T) {
223225
maxSurge: 3,
224226
maxUnavailable: 1,
225227
currentScope: &rolloutScope{ // Manually providing a scope simulating a MD with 6 replica in the middle of a rollout, with 3 machines already created in the newMS and 3 still on the oldMS, and then MD spec is changed.
226-
machineDeployment: createMD("v3", 6, 3, 1),
228+
machineDeployment: createMD("v3", 6, withRolloutStrategy(3, 1)),
227229
machineSets: []*clusterv1.MachineSet{
228230
createMS("ms1", "v1", 3),
229231
createMS("ms2", "v2", 3),
@@ -249,9 +251,8 @@ func Test_rolloutSequencesWithPredictableReconcileOrder(t *testing.T) {
249251
}
250252

251253
testWithPredictableReconcileOrder := true
252-
// TODO(in-place): enable tests with random reconcile order as soon as the issues in reconcileOldMachineSets are fixed
253-
testWithRandomReconcileOrderFromConstantSeed := false
254-
testWithRandomReconcileOrderFromRandomSeed := false
254+
testWithRandomReconcileOrderFromConstantSeed := true
255+
testWithRandomReconcileOrderFromRandomSeed := true
255256

256257
for _, tt := range tests {
257258
t.Run(tt.name, func(t *testing.T) {
@@ -260,11 +261,9 @@ func Test_rolloutSequencesWithPredictableReconcileOrder(t *testing.T) {
260261
if testWithPredictableReconcileOrder {
261262
tt.maxIterations = 50
262263
tt.randomControllerOrder = false
263-
if tt.logAndGoldenFileName == "" {
264-
tt.logAndGoldenFileName = strings.ToLower(tt.name)
265-
}
264+
tt.logAndGoldenFileName = strings.ToLower(tt.name)
266265
t.Run("default", func(t *testing.T) {
267-
runTestCase(ctx, t, tt)
266+
runRolloutRollingTestCase(ctx, t, tt)
268267
})
269268
}
270269

@@ -273,13 +272,9 @@ func Test_rolloutSequencesWithPredictableReconcileOrder(t *testing.T) {
273272
tt.name = fmt.Sprintf("%s, random(0)", name)
274273
tt.randomControllerOrder = true
275274
tt.seed = 0
276-
// TODO(in-place): drop the following line as soon as issue with scale down are fixed
277-
tt.skipLogToFileAndGoldenFileCheck = true
278-
if tt.logAndGoldenFileName == "" {
279-
tt.logAndGoldenFileName = strings.ToLower(tt.name)
280-
}
275+
tt.logAndGoldenFileName = strings.ToLower(tt.name)
281276
t.Run("random(0)", func(t *testing.T) {
282-
runTestCase(ctx, t, tt)
277+
runRolloutRollingTestCase(ctx, t, tt)
283278
})
284279
}
285280

@@ -291,15 +286,15 @@ func Test_rolloutSequencesWithPredictableReconcileOrder(t *testing.T) {
291286
tt.randomControllerOrder = true
292287
tt.skipLogToFileAndGoldenFileCheck = true
293288
t.Run(fmt.Sprintf("random(%d)", tt.seed), func(t *testing.T) {
294-
runTestCase(ctx, t, tt)
289+
runRolloutRollingTestCase(ctx, t, tt)
295290
})
296291
}
297292
}
298293
})
299294
}
300295
}
301296

302-
func runTestCase(ctx context.Context, t *testing.T, tt rolloutSequenceTestCase) {
297+
func runRolloutRollingTestCase(ctx context.Context, t *testing.T, tt rolloutRollingSequenceTestCase) {
303298
t.Helper()
304299
g := NewWithT(t)
305300

@@ -310,7 +305,7 @@ func runTestCase(ctx context.Context, t *testing.T, tt rolloutSequenceTestCase)
310305
// Init current and desired state from test case
311306
current := tt.currentScope.Clone()
312307
if current == nil {
313-
current = initCurrentRolloutScope(tt)
308+
current = initCurrentRolloutScope(tt.currentMachineNames, withRolloutStrategy(tt.maxSurge, tt.maxUnavailable))
314309
}
315310
desired := computeDesiredRolloutScope(current, tt.desiredMachineNames)
316311

@@ -324,12 +319,15 @@ func runTestCase(ctx context.Context, t *testing.T, tt rolloutSequenceTestCase)
324319
i := 1
325320
maxIterations := tt.maxIterations
326321
for {
327-
taskOrder := defaultTaskOrder(current)
322+
taskList := getTaskListRolloutRolling(current)
323+
taskCount := len(taskList)
324+
taskOrder := defaultTaskOrder(taskCount)
328325
if tt.randomControllerOrder {
329-
taskOrder = randomTaskOrder(current, rng)
326+
taskOrder = randomTaskOrder(taskCount, rng)
330327
}
331328
for _, taskID := range taskOrder {
332-
if taskID == 0 {
329+
task := taskList[taskID]
330+
if task == "md" {
333331
fLogger.Logf("[MD controller] Iteration %d, Reconcile md", i)
334332
fLogger.Logf("[MD controller] - Input to rollout planner\n%s", current)
335333

@@ -339,8 +337,9 @@ func runTestCase(ctx context.Context, t *testing.T, tt rolloutSequenceTestCase)
339337
p.newMS = current.newMS()
340338
p.oldMSs = current.oldMSs()
341339

342-
err := p.Plan(ctx)
340+
err := p.planRolloutRolling(ctx)
343341
g.Expect(err).ToNot(HaveOccurred())
342+
344343
// Apply changes.
345344
for _, ms := range current.machineSets {
346345
if scaleIntent, ok := p.scaleIntents[ms.Name]; ok {
@@ -383,8 +382,8 @@ func runTestCase(ctx context.Context, t *testing.T, tt rolloutSequenceTestCase)
383382

384383
// Run mutators faking other controllers
385384
for _, ms := range current.machineSets {
386-
if fmt.Sprintf("ms%d", taskID) == ms.Name {
387-
fLogger.Logf("[MS controller] Iteration %d, Reconcile ms%d, %s", i, taskID, msLog(ms, current.machineSetMachines[ms.Name]))
385+
if ms.Name == task {
386+
fLogger.Logf("[MS controller] Iteration %d, Reconcile %s, %s", i, ms.Name, msLog(ms, current.machineSetMachines[ms.Name]))
388387
machineSetControllerMutator(fLogger, ms, current)
389388
break
390389
}
@@ -486,15 +485,14 @@ type rolloutScope struct {
486485
machineUID int32
487486
}
488487

489-
// Init creates current state and desired state for rolling out a md from currentMachines to wantMachineNames.
490-
func initCurrentRolloutScope(tt rolloutSequenceTestCase) (current *rolloutScope) {
488+
func initCurrentRolloutScope(currentMachineNames []string, mdOptions ...machineDeploymentOption) (current *rolloutScope) {
491489
// create current state, with a MD with
492490
// - given MaxSurge, MaxUnavailable
493491
// - replica counters assuming all the machines are at stable state
494492
// - spec different from the MachineSets and Machines we are going to create down below (to simulate a change that triggers a rollout, but it is not yet started)
495-
mdReplicaCount := int32(len(tt.currentMachineNames))
493+
mdReplicaCount := int32(len(currentMachineNames))
496494
current = &rolloutScope{
497-
machineDeployment: createMD("v2", mdReplicaCount, tt.maxSurge, tt.maxUnavailable),
495+
machineDeployment: createMD("v2", mdReplicaCount, mdOptions...),
498496
}
499497

500498
// Create current MS, with
@@ -507,13 +505,12 @@ func initCurrentRolloutScope(tt rolloutSequenceTestCase) (current *rolloutScope)
507505
// - spec at stable state (rollout is not yet propagated to machines)
508506
var totMachines int32
509507
currentMachines := []*clusterv1.Machine{}
510-
for _, machineSetMachineName := range tt.currentMachineNames {
508+
for _, machineSetMachineName := range currentMachineNames {
511509
totMachines++
512510
currentMachines = append(currentMachines, createM(machineSetMachineName, ms.Name, ms.Spec.Template.Spec.FailureDomain))
513511
}
514512
current.machineSetMachines = map[string][]*clusterv1.Machine{}
515513
current.machineSetMachines[ms.Name] = currentMachines
516-
517514
current.machineUID = totMachines
518515

519516
// TODO(in-place): this should be removed as soon as rolloutPlanner will take care of creating newMS
@@ -835,20 +832,30 @@ func maxSurgeToleration() func(log *fileLogger, _ int, _ *rolloutScope, _, _ int
835832
}
836833
}
837834

835+
func getTaskListRolloutRolling(current *rolloutScope) []string {
836+
taskList := make([]string, 0)
837+
taskList = append(taskList, "md")
838+
for _, ms := range current.machineSets {
839+
taskList = append(taskList, ms.Name)
840+
}
841+
taskList = append(taskList, fmt.Sprintf("ms%d", len(current.machineSets)+1)) // r the MachineSet that might be created when reconciling md
842+
return taskList
843+
}
844+
838845
// default task order ensure the controllers are run in a consistent and predictable way: md, ms1, ms2 and so on.
839-
func defaultTaskOrder(current *rolloutScope) []int {
846+
func defaultTaskOrder(taskCount int) []int {
840847
taskOrder := []int{}
841-
for t := range len(current.machineSets) + 1 + 1 { // +1 is for the MachineSet that might be created when reconciling md, +1 is for the md itself
848+
for t := range taskCount {
842849
taskOrder = append(taskOrder, t)
843850
}
844851
return taskOrder
845852
}
846853

847-
func randomTaskOrder(current *rolloutScope, rng *rand.Rand) []int {
854+
func randomTaskOrder(taskCount int, rng *rand.Rand) []int {
848855
u := &UniqueRand{
849856
rng: rng,
850857
generated: map[int]bool{},
851-
max: len(current.machineSets) + 1 + 1, // +1 is for the MachineSet that might be created when reconciling md, +1 is for the md itself
858+
max: taskCount,
852859
}
853860
taskOrder := []int{}
854861
for !u.Done() {
@@ -864,32 +871,47 @@ func randomTaskOrder(current *rolloutScope, rng *rand.Rand) []int {
864871
return taskOrder
865872
}
866873

867-
func createMD(failureDomain string, replicas int32, maxSurge, maxUnavailable int32) *clusterv1.MachineDeployment {
868-
return &clusterv1.MachineDeployment{
874+
type machineDeploymentOption func(md *clusterv1.MachineDeployment)
875+
876+
func withRolloutStrategy(maxSurge, maxUnavailable int32) func(md *clusterv1.MachineDeployment) {
877+
return func(md *clusterv1.MachineDeployment) {
878+
md.Spec.Rollout.Strategy = clusterv1.MachineDeploymentRolloutStrategy{
879+
Type: clusterv1.RollingUpdateMachineDeploymentStrategyType,
880+
RollingUpdate: clusterv1.MachineDeploymentRolloutStrategyRollingUpdate{
881+
MaxSurge: ptr.To(intstr.FromInt32(maxSurge)),
882+
MaxUnavailable: ptr.To(intstr.FromInt32(maxUnavailable)),
883+
},
884+
}
885+
}
886+
}
887+
888+
func createMD(failureDomain string, replicas int32, options ...machineDeploymentOption) *clusterv1.MachineDeployment {
889+
md := &clusterv1.MachineDeployment{
869890
ObjectMeta: metav1.ObjectMeta{Name: "md"},
870891
Spec: clusterv1.MachineDeploymentSpec{
871892
// Note: using failureDomain as a template field to determine upToDate
872893
Template: clusterv1.MachineTemplateSpec{Spec: clusterv1.MachineSpec{FailureDomain: failureDomain}},
873894
Replicas: &replicas,
874-
Rollout: clusterv1.MachineDeploymentRolloutSpec{
875-
Strategy: clusterv1.MachineDeploymentRolloutStrategy{
876-
Type: clusterv1.RollingUpdateMachineDeploymentStrategyType,
877-
RollingUpdate: clusterv1.MachineDeploymentRolloutStrategyRollingUpdate{
878-
MaxSurge: ptr.To(intstr.FromInt32(maxSurge)),
879-
MaxUnavailable: ptr.To(intstr.FromInt32(maxUnavailable)),
880-
},
881-
},
882-
},
883895
},
884896
Status: clusterv1.MachineDeploymentStatus{
885897
Replicas: &replicas,
886898
AvailableReplicas: &replicas,
887899
},
888900
}
901+
for _, opt := range options {
902+
opt(md)
903+
}
904+
return md
889905
}
890906

891-
func createMS(name, failureDomain string, replicas int32) *clusterv1.MachineSet {
892-
return &clusterv1.MachineSet{
907+
func withStatusAvailableReplicas(r int32) fakeMachineSetOption {
908+
return func(ms *clusterv1.MachineSet) {
909+
ms.Status.AvailableReplicas = ptr.To(r)
910+
}
911+
}
912+
913+
func createMS(name, failureDomain string, replicas int32, opts ...fakeMachineSetOption) *clusterv1.MachineSet {
914+
ms := &clusterv1.MachineSet{
893915
ObjectMeta: metav1.ObjectMeta{
894916
Name: name,
895917
},
@@ -903,6 +925,10 @@ func createMS(name, failureDomain string, replicas int32) *clusterv1.MachineSet
903925
AvailableReplicas: ptr.To(replicas),
904926
},
905927
}
928+
for _, opt := range opts {
929+
opt(ms)
930+
}
931+
return ms
906932
}
907933

908934
func createM(name, ownedByMS, failureDomain string) *clusterv1.Machine {
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
## Regular rollout, 12 Replicas, maxSurge 3, maxUnavailable 1, scale down to 6, random(0)
2+
3+
[Test] Initial state
4+
md, 6/6 replicas
5+
- ms1, 9/9 replicas (m4,m5,m6,m7,m8,m9,m10,m11,m12)
6+
- ms2, 3/3 replicas (m13,m14,m15)
7+
[Test] Rollout 12 replicas, MaxSurge=3, MaxUnavailable=1, random(0)
8+
[MS controller] Iteration 1, Reconcile ms2, 3/3 replicas (m13,m14,m15)
9+
[MS controller] Iteration 1, Reconcile ms1, 9/9 replicas (m4,m5,m6,m7,m8,m9,m10,m11,m12)
10+
[MD controller] Iteration 1, Reconcile md
11+
[MD controller] - Input to rollout planner
12+
md, 6/6 replicas
13+
- ms1, 9/9 replicas (m4,m5,m6,m7,m8,m9,m10,m11,m12)
14+
- ms2, 3/3 replicas (m13,m14,m15)
15+
[MD controller] - Result of rollout planner
16+
md, 12/6 replicas
17+
- ms1, 9/2 replicas (m4,m5,m6,m7,m8,m9,m10,m11,m12)
18+
- ms2, 3/3 replicas (m13,m14,m15)
19+
[Toleration] tolerate maxSurge breach
20+
[MS controller] Iteration 2, Reconcile ms1, 9/2 replicas (m4,m5,m6,m7,m8,m9,m10,m11,m12)
21+
[MS controller] - ms1 scale down to 2/2 replicas (m4,m5,m6,m7,m8,m9,m10 deleted)
22+
[MS controller] Iteration 2, Reconcile ms1, 2/2 replicas (m11,m12)
23+
[MD controller] Iteration 3, Reconcile md
24+
[MD controller] - Input to rollout planner
25+
md, 12/6 replicas
26+
- ms1, 2/2 replicas (m11,m12)
27+
- ms2, 3/3 replicas (m13,m14,m15)
28+
[MD controller] - Result of rollout planner
29+
md, 5/6 replicas
30+
- ms1, 2/2 replicas (m11,m12)
31+
- ms2, 3/6 replicas (m13,m14,m15)
32+
[MS controller] Iteration 3, Reconcile ms2, 3/6 replicas (m13,m14,m15)
33+
[MS controller] - ms2 scale up to 6/6 replicas (m16,m17,m18 created)
34+
[MD controller] Iteration 4, Reconcile md
35+
[MD controller] - Input to rollout planner
36+
md, 5/6 replicas
37+
- ms1, 2/2 replicas (m11,m12)
38+
- ms2, 6/6 replicas (m13,m14,m15,m16,m17,m18)
39+
[MD controller] - Result of rollout planner
40+
md, 8/6 replicas
41+
- ms1, 2/0 replicas (m11,m12)
42+
- ms2, 6/6 replicas (m13,m14,m15,m16,m17,m18)
43+
[MS controller] Iteration 4, Reconcile ms1, 2/0 replicas (m11,m12)
44+
[MS controller] - ms1 scale down to 0/0 replicas (m11,m12 deleted)
45+
[MS controller] Iteration 4, Reconcile ms2, 6/6 replicas (m13,m14,m15,m16,m17,m18)
46+
[MS controller] Iteration 5, Reconcile ms2, 6/6 replicas (m13,m14,m15,m16,m17,m18)
47+
[MD controller] Iteration 5, Reconcile md
48+
[MD controller] - Input to rollout planner
49+
md, 8/6 replicas
50+
- ms1, 0/0 replicas ()
51+
- ms2, 6/6 replicas (m13,m14,m15,m16,m17,m18)
52+
[MD controller] - Result of rollout planner
53+
md, 6/6 replicas
54+
- ms1, 0/0 replicas ()
55+
- ms2, 6/6 replicas (m13,m14,m15,m16,m17,m18)
56+
[Test] Final state
57+
md, 6/6 replicas
58+
- ms1, 0/0 replicas ()
59+
- ms2, 6/6 replicas (m13,m14,m15,m16,m17,m18)

0 commit comments

Comments
 (0)