Skip to content

Commit 68cbef9

Browse files
authored
feat: introduce abortScaleDownDelaySeconds to control scale down of preview/canary upon abort (argoproj#1160)
Signed-off-by: Hui Kang <[email protected]>
1 parent 6fd6b39 commit 68cbef9

20 files changed

+587
-16
lines changed

docs/features/scaledown-aborted-rs.md

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# Scaledown New Replicaset on Aborted Rollout
2+
3+
Upon an aborted update, we may scale down the new replicaset for all strategies. Users can then choose to leave the new replicaset scaled up indefinitely by setting abortScaleDownDelaySeconds to 0, or adjust the value to something larger (or smaller).
4+
5+
The following table summarizes the behavior under combinations of rollout strategy and `abortScaleDownDelaySeconds`. Note that `abortScaleDownDelaySeconds` is not applicable to argo-rollouts v1.0.
6+
`abortScaleDownDelaySeconds = nil` is the default, which means in v1.1 across all rollout strategies, the new replicaset
7+
is scaled down in 30 seconds on abort by default.
8+
9+
| strategy | v1.0 behavior | abortScaleDownDelaySeconds | v1.1 behavior |
10+
|--------------------------------------------:|:-----------------------------:|:--------------------------:|:-----------------------------:|
11+
| blue-green | does not scale down | nil | scales down after 30 seconds |
12+
| blue-green | does not scale down | 0 | does not scale down |
13+
| blue-green | does not scale down | N | scales down after N seconds |
14+
| basic canary | rolling update back to stable | N/A | rolling update back to stable |
15+
| canary w/ traffic routing | scales down immediately | nil | scales down after 30 seconds |
16+
| canary w/ traffic routing | scales down immediately | 0 | does not scale down |
17+
| canary w/ traffic routing | scales down immediately | N | scales down after N seconds |
18+
| canary w/ traffic routing + setCanaryScale | does not scale down (bug) | * | should behave like canary w/ traffic routing |

docs/features/specification.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,10 @@ spec:
115115
# down. Defaults to nil
116116
scaleDownDelayRevisionLimit: 2
117117

118+
# Add a delay in second before scaling down the preview replicaset
119+
# if update is aborted. 0 means not to scale down. Default is 30 second
120+
abortScaleDownDelaySeconds: 30
121+
118122
# Anti Affinity configuration between desired and previous ReplicaSet.
119123
# Only one must be specified
120124
antiAffinity:
@@ -295,6 +299,11 @@ spec:
295299
rootService: root-svc # optional
296300
trafficSplitName: rollout-example-traffic-split # optional
297301

302+
# Add a delay in second before scaling down the canary pods when update
303+
# is aborted for canary strategy with traffic routing (not applicable for basic canary).
304+
# 0 means canary pods are not scaled down. Default is 30 seconds.
305+
abortScaleDownDelaySeconds: 30
306+
298307
status:
299308
pauseConditions:
300309
- reason: StepPause

manifests/crds/rollout-crd.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,9 @@ spec:
9191
properties:
9292
blueGreen:
9393
properties:
94+
abortScaleDownDelaySeconds:
95+
format: int32
96+
type: integer
9497
activeMetadata:
9598
properties:
9699
annotations:
@@ -224,6 +227,9 @@ spec:
224227
type: object
225228
canary:
226229
properties:
230+
abortScaleDownDelaySeconds:
231+
format: int32
232+
type: integer
227233
analysis:
228234
properties:
229235
args:

manifests/install.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9780,6 +9780,9 @@ spec:
97809780
properties:
97819781
blueGreen:
97829782
properties:
9783+
abortScaleDownDelaySeconds:
9784+
format: int32
9785+
type: integer
97839786
activeMetadata:
97849787
properties:
97859788
annotations:
@@ -9913,6 +9916,9 @@ spec:
99139916
type: object
99149917
canary:
99159918
properties:
9919+
abortScaleDownDelaySeconds:
9920+
format: int32
9921+
type: integer
99169922
analysis:
99179923
properties:
99189924
args:

manifests/namespace-install.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9780,6 +9780,9 @@ spec:
97809780
properties:
97819781
blueGreen:
97829782
properties:
9783+
abortScaleDownDelaySeconds:
9784+
format: int32
9785+
type: integer
97839786
activeMetadata:
97849787
properties:
97859788
annotations:
@@ -9913,6 +9916,9 @@ spec:
99139916
type: object
99149917
canary:
99159918
properties:
9919+
abortScaleDownDelaySeconds:
9920+
format: int32
9921+
type: integer
99169922
analysis:
99179923
properties:
99189924
args:

pkg/apiclient/rollout/rollout.swagger.json

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -662,6 +662,11 @@
662662
"activeMetadata": {
663663
"$ref": "#/definitions/github.com.argoproj.argo_rollouts.pkg.apis.rollouts.v1alpha1.PodTemplateMetadata",
664664
"title": "ActiveMetadata specify labels and annotations which will be attached to the active pods for\nthe duration which they act as a active pod, and will be removed after"
665+
},
666+
"abortScaleDownDelaySeconds": {
667+
"type": "integer",
668+
"format": "int32",
669+
"title": "AbortScaleDownDelaySeconds adds a delay in second before scaling down the preview replicaset\nif update is aborted. 0 means not to scale down.\nDefault is 30 second\n+optional"
665670
}
666671
},
667672
"title": "BlueGreenStrategy defines parameters for Blue Green deployment"
@@ -766,6 +771,11 @@
766771
"type": "integer",
767772
"format": "int32",
768773
"title": "ScaleDownDelayRevisionLimit limits the number of old RS that can run at one time before getting scaled down\n+optional"
774+
},
775+
"abortScaleDownDelaySeconds": {
776+
"type": "integer",
777+
"format": "int32",
778+
"title": "AbortScaleDownDelaySeconds adds a delay in second before scaling down the canary pods when update\nis aborted for canary strategy with traffic routing (not applicable for basic canary).\n0 means canary pods are not scaled down.\nDefault is 30 seconds.\n+optional"
769779
}
770780
},
771781
"title": "CanaryStrategy defines parameters for a Replica Based Canary"

pkg/apis/rollouts/v1alpha1/types.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,11 @@ type BlueGreenStrategy struct {
202202
// ActiveMetadata specify labels and annotations which will be attached to the active pods for
203203
// the duration which they act as a active pod, and will be removed after
204204
ActiveMetadata *PodTemplateMetadata `json:"activeMetadata,omitempty" protobuf:"bytes,13,opt,name=activeMetadata"`
205+
// AbortScaleDownDelaySeconds adds a delay in second before scaling down the preview replicaset
206+
// if update is aborted. 0 means not to scale down.
207+
// Default is 30 second
208+
// +optional
209+
AbortScaleDownDelaySeconds *int32 `json:"abortScaleDownDelaySeconds,omitempty" protobuf:"varint,14,opt,name=abortScaleDownDelaySeconds"`
205210
}
206211

207212
// AntiAffinity defines which inter-pod scheduling rule to use for anti-affinity injection
@@ -282,6 +287,12 @@ type CanaryStrategy struct {
282287
// ScaleDownDelayRevisionLimit limits the number of old RS that can run at one time before getting scaled down
283288
// +optional
284289
ScaleDownDelayRevisionLimit *int32 `json:"scaleDownDelayRevisionLimit,omitempty" protobuf:"varint,12,opt,name=scaleDownDelayRevisionLimit"`
290+
// AbortScaleDownDelaySeconds adds a delay in second before scaling down the canary pods when update
291+
// is aborted for canary strategy with traffic routing (not applicable for basic canary).
292+
// 0 means canary pods are not scaled down.
293+
// Default is 30 seconds.
294+
// +optional
295+
AbortScaleDownDelaySeconds *int32 `json:"abortScaleDownDelaySeconds,omitempty" protobuf:"varint,13,opt,name=abortScaleDownDelaySeconds"`
285296
}
286297

287298
// ALBTrafficRouting configuration for ALB ingress controller to control traffic routing

pkg/apis/rollouts/v1alpha1/zz_generated.deepcopy.go

Lines changed: 10 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

rollout/bluegreen_test.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,11 @@ var (
2626

2727
func newBlueGreenRollout(name string, replicas int, revisionHistoryLimit *int32, activeSvc string, previewSvc string) *v1alpha1.Rollout {
2828
rollout := newRollout(name, replicas, revisionHistoryLimit, map[string]string{"foo": "bar"})
29+
abortScaleDownDelaySeconds := int32(0)
2930
rollout.Spec.Strategy.BlueGreen = &v1alpha1.BlueGreenStrategy{
30-
ActiveService: activeSvc,
31-
PreviewService: previewSvc,
31+
ActiveService: activeSvc,
32+
PreviewService: previewSvc,
33+
AbortScaleDownDelaySeconds: &abortScaleDownDelaySeconds,
3234
}
3335
rollout.Status.CurrentStepHash = conditions.ComputeStepHash(rollout)
3436
rollout.Status.CurrentPodHash = controller.ComputeHash(&rollout.Spec.Template, rollout.Status.CollisionCount)

rollout/replicaset.go

Lines changed: 46 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -39,13 +39,12 @@ func (c *rolloutContext) removeScaleDownDelay(rs *appsv1.ReplicaSet) error {
3939
}
4040

4141
// addScaleDownDelay injects the `scale-down-deadline` annotation to the ReplicaSet, or if
42-
// scaleDownDelaySeconds is zero, removes it if it exists
43-
func (c *rolloutContext) addScaleDownDelay(rs *appsv1.ReplicaSet) error {
42+
// scaleDownDelaySeconds is zero, removes the annotation if it exists
43+
func (c *rolloutContext) addScaleDownDelay(rs *appsv1.ReplicaSet, scaleDownDelaySeconds time.Duration) error {
4444
if rs == nil {
4545
return nil
4646
}
4747
ctx := context.TODO()
48-
scaleDownDelaySeconds := time.Duration(defaults.GetScaleDownDelaySecondsOrDefault(c.rollout))
4948
if scaleDownDelaySeconds == 0 {
5049
// If scaledown deadline is zero, it means we need to remove any replicasets with the delay
5150
// This might happen if we switch from canary with traffic routing to basic canary
@@ -91,11 +90,11 @@ func (c *Controller) getReplicaSetsForRollouts(r *v1alpha1.Rollout) ([]*appsv1.R
9190
return cm.ClaimReplicaSets(rsList)
9291
}
9392

94-
// removeScaleDownDelays removes the scale-down-deadline annotation from the new/stable ReplicaSets,
93+
// removeScaleDownDeadlines removes the scale-down-deadline annotation from the new/stable ReplicaSets,
9594
// in the event that we moved back to an older revision that is still within its scaleDownDelay.
9695
func (c *rolloutContext) removeScaleDownDeadlines() error {
9796
var toRemove []*appsv1.ReplicaSet
98-
if c.newRS != nil {
97+
if c.newRS != nil && !c.isScaleDownOnabort() {
9998
toRemove = append(toRemove, c.newRS)
10099
}
101100
if c.stableRS != nil {
@@ -120,10 +119,52 @@ func (c *rolloutContext) reconcileNewReplicaSet() (bool, error) {
120119
if err != nil {
121120
return false, err
122121
}
122+
123+
if c.isScaleDownOnabort() {
124+
c.log.Infof("Scale down new rs '%s' on abort", c.newRS.Name)
125+
126+
// if the newRS has scale down annotation, check if it should be scaled down now
127+
if scaleDownAtStr, ok := c.newRS.Annotations[v1alpha1.DefaultReplicaSetScaleDownDeadlineAnnotationKey]; ok {
128+
c.log.Infof("New rs '%s' has scaledown deadline annotation: %s", c.newRS.Name, scaleDownAtStr)
129+
scaleDownAtTime, err := time.Parse(time.RFC3339, scaleDownAtStr)
130+
if err != nil {
131+
c.log.Warnf("Unable to read scaleDownAt label on rs '%s'", c.newRS.Name)
132+
} else {
133+
now := metav1.Now()
134+
scaleDownAt := metav1.NewTime(scaleDownAtTime)
135+
if scaleDownAt.After(now.Time) {
136+
c.log.Infof("RS '%s' has not reached the scaleDownTime", c.newRS.Name)
137+
remainingTime := scaleDownAt.Sub(now.Time)
138+
if remainingTime < c.resyncPeriod {
139+
c.enqueueRolloutAfter(c.rollout, remainingTime)
140+
return false, nil
141+
}
142+
} else {
143+
c.log.Infof("RS '%s' has reached the scaleDownTime", c.newRS.Name)
144+
newReplicasCount = int32(0)
145+
}
146+
}
147+
} else {
148+
abortScaleDownDelaySeconds := time.Duration(defaults.GetAbortScaleDownDelaySecondsOrDefault(c.rollout))
149+
err = c.addScaleDownDelay(c.newRS, abortScaleDownDelaySeconds)
150+
if err != nil {
151+
return false, err
152+
}
153+
}
154+
}
155+
123156
scaled, _, err := c.scaleReplicaSetAndRecordEvent(c.newRS, newReplicasCount)
124157
return scaled, err
125158
}
126159

160+
func (c *rolloutContext) isScaleDownOnabort() bool {
161+
abortScaleDownDelaySeconds := time.Duration(defaults.GetAbortScaleDownDelaySecondsOrDefault(c.rollout))
162+
if c.pauseContext != nil && c.pauseContext.IsAborted() && abortScaleDownDelaySeconds > 0 {
163+
return true
164+
}
165+
return false
166+
}
167+
127168
// reconcileOtherReplicaSets reconciles "other" ReplicaSets.
128169
// Other ReplicaSets are ReplicaSets are neither the new or stable (allRSs - newRS - stableRS)
129170
func (c *rolloutContext) reconcileOtherReplicaSets() (bool, error) {

0 commit comments

Comments
 (0)