Commit 9a9b0cd

fix: replicaset count would flap when interrupting update with new pod spec (argoproj#1479)
Signed-off-by: Jesse Suen <[email protected]>
1 parent 8c3e1a2 commit 9a9b0cd

5 files changed: 176 additions, 14 deletions

rollout/canary.go

Lines changed: 27 additions & 4 deletions
@@ -185,10 +185,33 @@ func (c *rolloutContext) scaleDownOldReplicaSetsForCanary(oldRSs []*appsv1.Repli
 				desiredReplicaCount = *targetRS.Spec.Replicas - maxScaleDown
 			}
 		} else {
-			// For traffic shaped canary, we leave the old ReplicaSets up until scaleDownDelaySeconds
-			annotationedRSs, desiredReplicaCount, err = c.scaleDownDelayHelper(targetRS, annotationedRSs, rolloutReplicas)
-			if err != nil {
-				return totalScaledDown, err
+			if rolloututil.IsFullyPromoted(c.rollout) || replicasetutil.HasScaleDownDeadline(targetRS) {
+				// If we are fully promoted and we encounter an old ReplicaSet, we can infer that
+				// this ReplicaSet is likely the previous stable and should follow scaleDownDelaySeconds
+				annotationedRSs, desiredReplicaCount, err = c.scaleDownDelayHelper(targetRS, annotationedRSs, rolloutReplicas)
+				if err != nil {
+					return totalScaledDown, err
+				}
+			} else {
+				// If we get here, we are *not* fully promoted and are in the middle of an update.
+				// We just encountered a scaled-up ReplicaSet which is neither the stable nor the canary
+				// and does not yet have a scale-down deadline. This happens when a user changes their
+				// mind in the middle of a V1 -> V2 update and then applies a V3. We are deciding
+				// what to do with the defunct, intermediate V2 ReplicaSet right now.
+				if replicasetutil.IsReplicaSetReady(c.newRS) && replicasetutil.IsReplicaSetReady(c.stableRS) {
+					// If both the new and stable RS are available, we can infer that it is safe to
+					// scale down this ReplicaSet, since traffic should have shifted away from this RS.
+					// TODO: instead of checking availability of canary/stable, a better way to determine
+					// whether it is safe to scale this down is to check if traffic is directed to the RS.
+					// But to do so, we need the new status.canary.weights field in PR:
+					// https://github.com/argoproj/argo-rollouts/pull/1430
+					c.log.Infof("scaling down intermediate RS '%s'", targetRS.Name)
+				} else {
+					// The new and stable ReplicaSets have not both reached readiness. This implies
+					// we might not have shifted traffic away from this ReplicaSet, so we need to
+					// keep it scaled up.
+					continue
+				}
 			}
 		}
 		if *targetRS.Spec.Replicas == desiredReplicaCount {
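The comments above boil down to one small decision. The following is a minimal sketch of that logic, not the controller code itself; the boolean parameters stand in for rolloututil.IsFullyPromoted, replicasetutil.HasScaleDownDeadline, and replicasetutil.IsReplicaSetReady on the new and stable ReplicaSets.

// Hedged sketch (not part of the commit): the scale-down decision for an old,
// non-stable, non-canary ReplicaSet in scaleDownOldReplicaSetsForCanary.
// A true result means the ReplicaSet must stay scaled up for now.
func keepIntermediateRS(fullyPromoted, hasScaleDownDeadline, newRSReady, stableRSReady bool) bool {
	if fullyPromoted || hasScaleDownDeadline {
		// Likely the previous stable RS: it is handled by scaleDownDelayHelper
		// and follows scaleDownDelaySeconds rather than being kept indefinitely.
		return false
	}
	// Mid-update intermediate RS (the defunct V2 in a V1 -> V2 -> V3 sequence):
	// only release it once both the new and stable RS are ready, i.e. once
	// traffic can be assumed to have shifted away from it.
	return !(newRSReady && stableRSReady)
}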

rollout/canary_test.go

Lines changed: 87 additions & 0 deletions
@@ -706,6 +706,93 @@ func TestCanaryRolloutScaleDownOldRsDontScaleDownTooMuch(t *testing.T) {
 
 }
 
+// TestCanaryDontScaleDownOldRsDuringInterruptedUpdate tests that we prevent scale down of an
+// intermediate V2 ReplicaSet when applying a V3 spec in the middle of updating a traffic-routed
+// canary going from V1 -> V2 (i.e. because we haven't shifted traffic away from V2 yet).
+func TestCanaryDontScaleDownOldRsDuringInterruptedUpdate(t *testing.T) {
+	f := newFixture(t)
+	defer f.Close()
+
+	steps := []v1alpha1.CanaryStep{
+		{
+			SetWeight: pointer.Int32Ptr(100),
+		},
+		{
+			Pause: &v1alpha1.RolloutPause{},
+		},
+	}
+	r1 := newCanaryRollout("foo", 5, nil, steps, pointer.Int32Ptr(1), intstr.FromInt(1), intstr.FromInt(0))
+	r1.Spec.Strategy.Canary.TrafficRouting = &v1alpha1.RolloutTrafficRouting{
+		SMI: &v1alpha1.SMITrafficRouting{},
+	}
+	r1.Spec.Strategy.Canary.StableService = "stable-svc"
+	r1.Spec.Strategy.Canary.CanaryService = "canary-svc"
+	r2 := bumpVersion(r1)
+	r3 := bumpVersion(r2)
+
+	stableSvc := newService("stable-svc", 80, map[string]string{v1alpha1.DefaultRolloutUniqueLabelKey: r1.Status.CurrentPodHash}, r1)
+	canarySvc := newService("canary-svc", 80, map[string]string{v1alpha1.DefaultRolloutUniqueLabelKey: r3.Status.CurrentPodHash}, r3)
+	r3.Status.StableRS = r1.Status.CurrentPodHash
+
+	rs1 := newReplicaSetWithStatus(r1, 5, 5)
+	rs2 := newReplicaSetWithStatus(r2, 5, 5)
+	rs3 := newReplicaSetWithStatus(r3, 5, 0)
+
+	f.objects = append(f.objects, r3)
+	f.kubeobjects = append(f.kubeobjects, rs1, rs2, rs3, canarySvc, stableSvc)
+	f.replicaSetLister = append(f.replicaSetLister, rs1, rs2, rs3)
+	f.serviceLister = append(f.serviceLister, canarySvc, stableSvc)
+
+	f.expectPatchRolloutAction(r3)
+	f.run(getKey(r3, t))
+}
+
+// TestCanaryScaleDownOldRsDuringInterruptedUpdate tests that we proceed with scale down of an
+// intermediate V2 ReplicaSet when applying a V3 spec in the middle of updating a traffic-routed
+// canary going from V1 -> V2 (i.e. after we have shifted traffic away from V2). This test is the
+// same as TestCanaryDontScaleDownOldRsDuringInterruptedUpdate but rs3 is fully available.
+func TestCanaryScaleDownOldRsDuringInterruptedUpdate(t *testing.T) {
+	f := newFixture(t)
+	defer f.Close()
+
+	steps := []v1alpha1.CanaryStep{
+		{
+			SetWeight: pointer.Int32Ptr(100),
+		},
+		{
+			Pause: &v1alpha1.RolloutPause{},
+		},
+	}
+	r1 := newCanaryRollout("foo", 5, nil, steps, pointer.Int32Ptr(1), intstr.FromInt(1), intstr.FromInt(0))
+	r1.Spec.Strategy.Canary.TrafficRouting = &v1alpha1.RolloutTrafficRouting{
+		SMI: &v1alpha1.SMITrafficRouting{},
+	}
+	r1.Spec.Strategy.Canary.StableService = "stable-svc"
+	r1.Spec.Strategy.Canary.CanaryService = "canary-svc"
+	r2 := bumpVersion(r1)
+	r3 := bumpVersion(r2)
+
+	stableSvc := newService("stable-svc", 80, map[string]string{v1alpha1.DefaultRolloutUniqueLabelKey: r1.Status.CurrentPodHash}, r1)
+	canarySvc := newService("canary-svc", 80, map[string]string{v1alpha1.DefaultRolloutUniqueLabelKey: r3.Status.CurrentPodHash}, r3)
+	r3.Status.StableRS = r1.Status.CurrentPodHash
+
+	rs1 := newReplicaSetWithStatus(r1, 5, 5)
+	rs2 := newReplicaSetWithStatus(r2, 5, 5)
+	rs3 := newReplicaSetWithStatus(r3, 5, 5)
+
+	f.objects = append(f.objects, r3)
+	f.kubeobjects = append(f.kubeobjects, rs1, rs2, rs3, canarySvc, stableSvc)
+	f.replicaSetLister = append(f.replicaSetLister, rs1, rs2, rs3)
+	f.serviceLister = append(f.serviceLister, canarySvc, stableSvc)
+
+	f.expectPatchRolloutAction(r3)
+	updatedRSIndex := f.expectUpdateReplicaSetAction(rs2)
+	f.run(getKey(r3, t))
+
+	updatedRs2 := f.getUpdatedReplicaSet(updatedRSIndex)
+	assert.Equal(t, int32(0), *updatedRs2.Spec.Replicas)
+}
+
 func TestRollBackToStable(t *testing.T) {
 	f := newFixture(t)
 	defer f.Close()

rollout/service.go

Lines changed: 1 addition & 10 deletions
@@ -242,7 +242,7 @@ func (c *rolloutContext) reconcileStableAndCanaryService() error {
 		return err
 	}
 
-	if c.newRSReady() {
+	if replicasetutil.IsReplicaSetReady(c.newRS) {
 		err = c.ensureSVCTargets(c.rollout.Spec.Strategy.Canary.CanaryService, c.newRS)
 		if err != nil {
 			return err
@@ -267,12 +267,3 @@ func (c *rolloutContext) ensureSVCTargets(svcName string, rs *appsv1.ReplicaSet)
 	}
 	return nil
 }
-
-func (c *rolloutContext) newRSReady() bool {
-	if c.newRS == nil {
-		return false
-	}
-	replicas := c.newRS.Spec.Replicas
-	readyReplicas := c.newRS.Status.ReadyReplicas
-	return replicas != nil && *replicas != 0 && readyReplicas != 0 && *replicas <= readyReplicas
-}

utils/replicaset/replicaset.go

Lines changed: 10 additions & 0 deletions
@@ -616,3 +616,13 @@ func GetPodsOwnedByReplicaSet(ctx context.Context, client kubernetes.Interface,
 	}
 	return podOwnedByRS, nil
 }
+
+// IsReplicaSetReady returns whether a ReplicaSet is scaled up and its ready count is >= the desired count
+func IsReplicaSetReady(rs *appsv1.ReplicaSet) bool {
+	if rs == nil {
+		return false
+	}
+	replicas := rs.Spec.Replicas
+	readyReplicas := rs.Status.ReadyReplicas
+	return replicas != nil && *replicas != 0 && readyReplicas != 0 && *replicas <= readyReplicas
+}
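A minimal standalone usage sketch of the new helper (not part of the commit; it assumes the github.com/argoproj/argo-rollouts/utils/replicaset import path and mirrors the unit tests below):

package main

import (
	"fmt"

	appsv1 "k8s.io/api/apps/v1"
	"k8s.io/utils/pointer"

	replicasetutil "github.com/argoproj/argo-rollouts/utils/replicaset"
)

func main() {
	// 5 desired, 5 ready: considered ready.
	rs := &appsv1.ReplicaSet{
		Spec:   appsv1.ReplicaSetSpec{Replicas: pointer.Int32Ptr(5)},
		Status: appsv1.ReplicaSetStatus{ReadyReplicas: 5},
	}
	fmt.Println(replicasetutil.IsReplicaSetReady(rs)) // true

	// Scaled to zero: treated as not ready.
	rs.Spec.Replicas = pointer.Int32Ptr(0)
	rs.Status.ReadyReplicas = 0
	fmt.Println(replicasetutil.IsReplicaSetReady(rs)) // false

	// A nil ReplicaSet is also not ready.
	fmt.Println(replicasetutil.IsReplicaSetReady(nil)) // false
}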

utils/replicaset/replicaset_test.go

Lines changed: 51 additions & 0 deletions
@@ -1247,3 +1247,54 @@ spec:
 
 	assert.True(t, PodTemplateEqualIgnoreHash(&live, &desired))
 }
+
+func TestIsReplicaSetReady(t *testing.T) {
+	{
+		assert.False(t, IsReplicaSetReady(nil))
+	}
+	{
+		rs := appsv1.ReplicaSet{
+			Spec: appsv1.ReplicaSetSpec{
+				Replicas: pointer.Int32Ptr(1),
+			},
+			Status: appsv1.ReplicaSetStatus{
+				ReadyReplicas: 0,
+			},
+		}
+		assert.False(t, IsReplicaSetReady(&rs))
+	}
+	{
+		rs := appsv1.ReplicaSet{
+			Spec: appsv1.ReplicaSetSpec{
+				Replicas: pointer.Int32Ptr(1),
+			},
+			Status: appsv1.ReplicaSetStatus{
+				ReadyReplicas: 1,
+			},
+		}
+		assert.True(t, IsReplicaSetReady(&rs))
+	}
+	{
+		rs := appsv1.ReplicaSet{
+			Spec: appsv1.ReplicaSetSpec{
+				Replicas: pointer.Int32Ptr(1),
+			},
+			Status: appsv1.ReplicaSetStatus{
+				ReadyReplicas: 2,
+			},
+		}
+		assert.True(t, IsReplicaSetReady(&rs))
+	}
+	{
+		rs := appsv1.ReplicaSet{
+			Spec: appsv1.ReplicaSetSpec{
+				Replicas: pointer.Int32Ptr(0),
+			},
+			Status: appsv1.ReplicaSetStatus{
+				ReadyReplicas: 0,
+			},
+		}
+		// NOTE: currently consider scaled down replicas as not ready
+		assert.False(t, IsReplicaSetReady(&rs))
+	}
+}
