Skip to content

Commit 6d84b7e

Browse files
harikrongali authored and jessesuen committed
fix: canary scaledown event could violate maxUnavailable (argoproj#1429)
Signed-off-by: hari rongali <[email protected]>
1 parent c2e351b commit 6d84b7e

File tree

3 files changed

+143
-24
lines changed

3 files changed

+143
-24
lines changed

test/e2e/canary_test.go

Lines changed: 56 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,58 @@ func (s *CanarySuite) TestRolloutScalingWhenPaused() {
112112
ExpectCanaryStablePodCount(1, 3)
113113
}
114114

115+
116+
// TestRolloutWithMaxSurgeScalingDuringUpdate verifies behavior when scaling a rollout up/down in middle of update and with maxSurge 100%
//
// With replicas=4 and maxSurge=100%, an update may surge up to 4 extra pods
// (8 total). The test breaks the canary with a bad command so the update never
// completes, then scales the rollout up and down mid-update and checks the
// canary/stable pod split each time.
// NOTE(review): expected counts (e.g. 4 canary + 3 stable = 7) presumably follow
// from maxSurge=100% with default maxUnavailable — confirm against
// CalculateReplicaCountsForCanary.
func (s *CanarySuite) TestRolloutWithMaxSurgeScalingDuringUpdate() {
	s.Given().
		HealthyRollout(`
apiVersion: argoproj.io/v1alpha1
kind: Rollout
metadata:
  name: updatescaling
spec:
  replicas: 4
  strategy:
    canary:
      maxSurge: 100%
  selector:
    matchLabels:
      app: updatescaling
  template:
    metadata:
      labels:
        app: updatescaling
    spec:
      containers:
      - name: updatescaling
        image: nginx:1.19-alpine
        resources:
          requests:
            memory: 16Mi
            cpu: 1m`).
		When().
		// Patch in a command that cannot start so canary pods never become
		// available and the rollout stays mid-update for the scaling checks.
		PatchSpec(`
spec:
  template:
    spec:
      containers:
      - name: updatescaling
        command: [/bad-command]`).
		WaitForRolloutReplicas(7).
		Then().
		ExpectCanaryStablePodCount(4, 3).
		When().
		// Scale up mid-update: total should grow to 11 (8 canary + 3 stable).
		ScaleRollout(8).
		WaitForRolloutReplicas(11).
		Then().
		ExpectCanaryStablePodCount(8, 3).
		When().
		// Scale back down mid-update: total should shrink to 7 again.
		ScaleRollout(4).
		WaitForRolloutReplicas(7).
		Then().
		ExpectCanaryStablePodCount(4, 3)
}
166+
115167
// TestRolloutScalingDuringUpdate verifies behavior when scaling a rollout up/down in middle of update
116168
func (s *CanarySuite) TestRolloutScalingDuringUpdate() {
117169
s.Given().
@@ -160,8 +212,10 @@ spec:
160212
// See: https://github.com/argoproj/argo-rollouts/issues/738
161213
ExpectCanaryStablePodCount(6, 4).
162214
When().
163-
ScaleRollout(4)
164-
// WaitForRolloutReplicas(4) // this doesn't work yet (bug)
215+
ScaleRollout(4).
216+
WaitForRolloutReplicas(6).
217+
Then().
218+
ExpectCanaryStablePodCount(2, 4)
165219
}
166220

167221
// TestReduceWeightAndHonorMaxUnavailable verifies we honor maxUnavailable when decreasing weight or aborting

utils/replicaset/canary.go

Lines changed: 71 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -175,44 +175,93 @@ func CalculateReplicaCountsForCanary(rollout *v1alpha1.Rollout, newRS *appsv1.Re
175175
}
176176

177177
minAvailableReplicaCount := rolloutSpecReplica - MaxUnavailable(rollout)
178+
178179
// isIncreasing indicates if we are supposed to be increasing our canary replica count.
179180
// If so, we can ignore pod availability of the stableRS. Otherwise, if we are reducing our
180181
// weight (e.g. we are aborting), then we can ignore pod availability of the canaryRS.
181182
isIncreasing := newRS == nil || desiredNewRSReplicaCount >= *newRS.Spec.Replicas
182183
replicasToScaleDown := GetReplicasForScaleDown(newRS, !isIncreasing) + GetReplicasForScaleDown(stableRS, isIncreasing)
183-
184184
if replicasToScaleDown <= minAvailableReplicaCount {
185185
// Cannot scale down stableRS or newRS without going below min available replica count
186186
return newRSReplicaCount, stableRSReplicaCount
187187
}
188188

189189
scaleDownCount := replicasToScaleDown - minAvailableReplicaCount
190190

191-
if newRS != nil && *newRS.Spec.Replicas > desiredNewRSReplicaCount {
192-
// if the controller doesn't have to use every replica to achieve the desired count, it only scales down to the
193-
// desired count.
194-
if *newRS.Spec.Replicas-scaleDownCount < desiredNewRSReplicaCount {
195-
newRSReplicaCount = desiredNewRSReplicaCount
196-
// Calculating how many replicas were used to scale down to the desired count
197-
scaleDownCount = scaleDownCount - (*newRS.Spec.Replicas - desiredNewRSReplicaCount)
198-
} else {
199-
// The controller is using every replica it can to get closer to desired state.
200-
newRSReplicaCount = *newRS.Spec.Replicas - scaleDownCount
201-
scaleDownCount = 0
202-
}
191+
if !isIncreasing {
192+
// Skip scalingDown Stable replicaSet when Canary availability is not taken into calculation for scaleDown
193+
newRSReplicaCount = calculateScaleDownReplicaCount(newRS, desiredNewRSReplicaCount, scaleDownCount, newRSReplicaCount)
194+
newRSReplicaCount, stableRSReplicaCount = adjustReplicaWithinLimits(newRS, stableRS, newRSReplicaCount, stableRSReplicaCount, maxReplicaCountAllowed, minAvailableReplicaCount)
195+
} else {
196+
// Skip scalingDown canary replicaSet when StableSet availability is not taken into calculation for scaleDown
197+
stableRSReplicaCount = calculateScaleDownReplicaCount(stableRS, desiredStableRSReplicaCount, scaleDownCount, stableRSReplicaCount)
198+
stableRSReplicaCount, newRSReplicaCount = adjustReplicaWithinLimits(stableRS, newRS, stableRSReplicaCount, newRSReplicaCount, maxReplicaCountAllowed, minAvailableReplicaCount)
203199
}
200+
return newRSReplicaCount, stableRSReplicaCount
201+
}
204202

205-
if scaleStableRS && *stableRS.Spec.Replicas > desiredStableRSReplicaCount {
206-
// This follows the same logic as scaling down the newRS except with the stableRS and it does not need to
207-
// set the scaleDownCount again since it's not used again
208-
if *stableRS.Spec.Replicas-scaleDownCount < desiredStableRSReplicaCount {
209-
stableRSReplicaCount = desiredStableRSReplicaCount
210-
} else {
211-
stableRSReplicaCount = *stableRS.Spec.Replicas - scaleDownCount
212-
}
203+
// calculateScaleDownReplicaCount calculates drainRSReplicaCount
204+
// drainRSReplicaCount can be either stableRS count or canaryRS count
205+
// drainRSReplicaCount corresponds to RS whose availability is not considered in calculating replicasToScaleDown
206+
func calculateScaleDownReplicaCount(drainRS *appsv1.ReplicaSet, desireRSReplicaCount int32, scaleDownCount int32, drainRSReplicaCount int32) int32 {
207+
if drainRS != nil && *drainRS.Spec.Replicas > desireRSReplicaCount {
208+
// if the controller doesn't have to use every replica to achieve the desired count,
209+
// it can scales down to the desired count or get closer to desired state.
210+
drainRSReplicaCount = maxValue(desireRSReplicaCount, *drainRS.Spec.Replicas-scaleDownCount)
213211
}
212+
return drainRSReplicaCount
213+
}
214214

215-
return newRSReplicaCount, stableRSReplicaCount
215+
// adjustReplicaWithinLimits adjusts the replica counters so that the pair stays within
// the maxSurge (maxReplicaCountAllowed) and maxUnavailable (minAvailableReplicaCount)
// limits.
// drainRSReplicaCount corresponds to the RS whose availability is not considered in
// calculating replicasToScaleDown; adjustRSReplicaCount corresponds to the RS whose
// availability is taken into account while adjusting for the maxUnavailable limit.
// Returns the (possibly reduced) drainRSReplicaCount and adjustRSReplicaCount.
func adjustReplicaWithinLimits(drainRS *appsv1.ReplicaSet, adjustRS *appsv1.ReplicaSet, drainRSReplicaCount int32, adjustRSReplicaCount int32, maxReplicaCountAllowed int32, minAvailableReplicaCount int32) (int32, int32) {
	extraAvailableAdjustRS := int32(0)
	totalAvailableReplicas := int32(0)
	// How far the current combined count is over the maxSurge ceiling (0 if within it).
	overTheLimitVal := maxValue(0, adjustRSReplicaCount+drainRSReplicaCount-maxReplicaCountAllowed)
	if drainRS != nil {
		// Availability contributed by the drain RS, capped at its target count.
		totalAvailableReplicas = totalAvailableReplicas + minValue(drainRS.Status.AvailableReplicas, drainRSReplicaCount)
	}
	if adjustRS != nil {
		// 1. Adjust adjustRSReplicaCount to be within maxSurge.
		adjustRSReplicaCount = adjustRSReplicaCount - overTheLimitVal
		// 2. Calculate availability corresponding to the adjusted count.
		totalAvailableReplicas = totalAvailableReplicas + minValue(adjustRS.Status.AvailableReplicas, adjustRSReplicaCount)
		// 3. Calculate the decrease in availability of adjustRS caused by (1).
		extraAvailableAdjustRS = maxValue(0, adjustRS.Status.AvailableReplicas-adjustRSReplicaCount)

		// 4. Now calculate how far the count is from the maxUnavailable limit.
		moreToNeedAvailableReplicas := maxValue(0, minAvailableReplicaCount-totalAvailableReplicas)
		// 5. From (3) we got the count for the decrease in availability caused by (1);
		// take the min of (3) & (4) and add it back to adjustRS.
		// The remainder of moreToNeedAvailableReplicas can be ignored as it is part of
		// drainRS: there is no case of deviating from the maxUnavailable limit via
		// drainRS because in that event the scale-down calculation would not even
		// occur — the caller checks replicasToScaleDown <= minAvailableReplicaCount.
		adjustRSReplicaCount = adjustRSReplicaCount + minValue(extraAvailableAdjustRS, moreToNeedAvailableReplicas)
		// 6. Recalculate the overshoot of maxSurge caused by the adjustment in (5).
		overTheLimitVal = maxValue(0, adjustRSReplicaCount+drainRSReplicaCount-maxReplicaCountAllowed)
		// 7. We can safely subtract from drainRS; deviation from the maxUnavailable
		// limit won't occur here, as explained in (5).
		drainRSReplicaCount = drainRSReplicaCount - overTheLimitVal
	}

	return drainRSReplicaCount, adjustRSReplicaCount
}
252+
253+
// minValue returns the smaller of the two int32 values.
func minValue(countA int32, countB int32) int32 {
	if countA <= countB {
		return countA
	}
	return countB
}
259+
260+
// maxValue returns the larger of the two int32 values.
func maxValue(countA int32, countB int32) int32 {
	if countA >= countB {
		return countA
	}
	return countB
}
217266

218267
// BeforeStartingStep checks if canary rollout is at the starting step

utils/replicaset/canary_test.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -336,6 +336,22 @@ func TestCalculateReplicaCountsForCanary(t *testing.T) {
336336
expectedStableReplicaCount: 7,
337337
expectedCanaryReplicaCount: 3,
338338
},
339+
{
340+
name: "Scale down stable and canary available",
341+
rolloutSpecReplicas: 10,
342+
setWeight: 100,
343+
maxSurge: intstr.FromInt(1),
344+
maxUnavailable: intstr.FromInt(0),
345+
346+
stableSpecReplica: 10,
347+
stableAvailableReplica: 2,
348+
349+
canarySpecReplica: 10,
350+
canaryAvailableReplica: 8,
351+
352+
expectedStableReplicaCount: 2,
353+
expectedCanaryReplicaCount: 9,
354+
},
339355
{
340356
name: "Do not scale down newRS or stable when older RS count >= scaleDownCount",
341357
rolloutSpecReplicas: 10,

0 commit comments

Comments
 (0)