@@ -29,18 +29,67 @@ export PATH="$PATH:/usr/bin:/usr/local/bin"
2929# If true, all messages will be printed. If false, only fatal errors are printed.
3030DEBUG=true
3131
32+ # If true, all commands will have an initial jitter - use this only if deploying to a significant number of instances
33+ INITIAL_JITTER=false
34+
3235# Number of times to check for a resource to be in the desired state.
3336WAITER_ATTEMPTS=60
3437
3538# Number of seconds to wait between attempts for resource to be in a state for instance in ASG.
36- WAITER_INTERVAL_ASG=1
39+ WAITER_INTERVAL_ASG=3
3740
3841# Number of seconds to wait between attempts for resource to be in a state for ALB registration/deregistration.
3942WAITER_INTERVAL_ALB=10
4043
4144# AutoScaling Standby features at minimum require this version to work.
4245MIN_CLI_VERSION=' 1.10.55'
4346
47+ #
48+ # Performs CLI command and provides expotential backoff with Jitter between any failed CLI commands
49+ # FullJitter algorithm taken from: https://www.awsarchitectureblog.com/2015/03/backoff.html
50+ # Optional pre-jitter can be enabled via GLOBAL var INITIAL_JITTER (set to "true" to enable)
51+ #
52+ exec_with_fulljitter_retry () {
53+ local MAX_RETRIES=${EXPBACKOFF_MAX_RETRIES:- 8} # Max number of retries
54+ local BASE=${EXPBACKOFF_BASE:- 2} # Base value for backoff calculation
55+ local MAX=${EXPBACKOFF_MAX:- 120} # Max value for backoff calculation
56+ local FAILURES=0
57+ local RESP
58+
59+ # Perform initial jitter sleep if enabled
60+ if [ " $INITIAL_JITTER " = " true" ]; then
61+ local SECONDS=$(( $RANDOM % ( ($BASE * 2 ) ** 2 ) ))
62+ sleep $SECONDS
63+ fi
64+
65+ # Execute Provided Command
66+ RESP=$( eval $@ )
67+ until [ $? -eq 0 ]; do
68+ FAILURES=$(( $FAILURES + 1 ))
69+ if (( $FAILURES > $MAX_RETRIES )) ; then
70+ echo " $@ " >&2
71+ echo " * Failed, max retries exceeded" >&2
72+ return 1
73+ else
74+ local SECONDS=$(( $RANDOM % ( ($BASE * 2 ) ** $FAILURES ) ))
75+ if (( $SECONDS > $MAX )) ; then
76+ SECONDS=$MAX
77+ fi
78+
79+ echo " $@ " >&2
80+ echo " * $FAILURES failure(s), retrying in $SECONDS second(s)" >&2
81+ sleep $SECONDS
82+
83+ # Re-Execute provided command
84+ RESP=$( eval $@ )
85+ fi
86+ done
87+
88+ # Echo out CLI response which is captured by calling function
89+ echo $RESP
90+ return 0
91+ }
92+
4493# Usage: get_instance_region
4594#
4695# Writes to STDOUT the AWS region as known by the local instance.
@@ -54,7 +103,7 @@ get_instance_region() {
54103 echo $AWS_REGION
55104}
56105
57- AWS_CLI=" aws --region $( get_instance_region) "
106+ AWS_CLI=" exec_with_fulljitter_retry aws --region $( get_instance_region) "
58107
59108# Usage: autoscaling_group_name <EC2 instance ID>
60109#
@@ -110,8 +159,8 @@ autoscaling_enter_standby() {
110159
111160 msg " Checking to see if ASG ${asg_name} will let us decrease desired capacity"
112161 local min_desired=$( $AWS_CLI autoscaling describe-auto-scaling-groups \
113- --auto-scaling-group-name " ${asg_name} " \
114- --query ' AutoScalingGroups[0].[MinSize, DesiredCapacity]' \
162+ --auto-scaling-group-name \ "${asg_name} \ " \
163+ --query \ ' AutoScalingGroups[0].[MinSize, DesiredCapacity]\ ' \
115164 --output text)
116165
117166 local min_cap=$( echo $min_desired | awk ' {print $1}' )
@@ -124,7 +173,7 @@ autoscaling_enter_standby() {
124173 local new_min=$(( $min_cap - 1 ))
125174 msg " Decrementing ASG ${asg_name} 's minimum size to $new_min "
126175 msg $( $AWS_CLI autoscaling update-auto-scaling-group \
127- --auto-scaling-group-name " ${asg_name} " \
176+ --auto-scaling-group-name \ "${asg_name} \ " \
128177 --min-size $new_min )
129178 if [ $? != 0 ]; then
130179 msg " Failed to reduce ASG ${asg_name} 's minimum size to $new_min . Cannot put this instance into Standby."
@@ -139,7 +188,7 @@ autoscaling_enter_standby() {
139188 msg " Putting instance $instance_id into Standby"
140189 $AWS_CLI autoscaling enter-standby \
141190 --instance-ids $instance_id \
142- --auto-scaling-group-name " ${asg_name} " \
191+ --auto-scaling-group-name \ "${asg_name} \ " \
143192 --should-decrement-desired-capacity
144193 if [ $? != 0 ]; then
145194 msg " Failed to put instance $instance_id into Standby for ASG ${asg_name} ."
@@ -185,7 +234,7 @@ autoscaling_exit_standby() {
185234 msg " Moving instance $instance_id out of Standby"
186235 $AWS_CLI autoscaling exit-standby \
187236 --instance-ids $instance_id \
188- --auto-scaling-group-name " ${asg_name} "
237+ --auto-scaling-group-name \ "${asg_name} \ "
189238 if [ $? != 0 ]; then
190239 msg " Failed to put instance $instance_id back into InService for ASG ${asg_name} ."
191240 return 1
@@ -201,16 +250,16 @@ autoscaling_exit_standby() {
201250
202251 if [ -a /tmp/asgmindecremented ]; then
203252 local min_desired=$( $AWS_CLI autoscaling describe-auto-scaling-groups \
204- --auto-scaling-group-name " ${asg_name} " \
205- --query ' AutoScalingGroups[0].[MinSize, DesiredCapacity]' \
253+ --auto-scaling-group-name \ "${asg_name} \ " \
254+ --query \ ' AutoScalingGroups[0].[MinSize, DesiredCapacity]\ ' \
206255 --output text)
207256
208257 local min_cap=$( echo $min_desired | awk ' {print $1}' )
209258
210259 local new_min=$(( $min_cap + 1 ))
211260 msg " Incrementing ASG ${asg_name} 's minimum size to $new_min "
212261 msg $( $AWS_CLI autoscaling update-auto-scaling-group \
213- --auto-scaling-group-name " ${asg_name} " \
262+ --auto-scaling-group-name \ "${asg_name} \ " \
214263 --min-size $new_min )
215264 if [ $? != 0 ]; then
216265 msg " Failed to increase ASG ${asg_name} 's minimum size to $new_min ."
@@ -237,7 +286,7 @@ get_instance_state_asg() {
237286
238287 local state=$( $AWS_CLI autoscaling describe-auto-scaling-instances \
239288 --instance-ids $instance_id \
240- --query " AutoScalingInstances[?InstanceId == \` $instance_id \` ].LifecycleState | [0]" \
289+ --query \ " AutoScalingInstances[? InstanceId == \' $instance_id \' ].LifecycleState \ | [0]\ " \
241290 --output text)
242291 if [ $? != 0 ]; then
243292 return 1
@@ -261,7 +310,7 @@ reset_waiter_timeout() {
261310 msg " Getting target group health check configuration for target group $target_group_name "
262311 local target_group_info=$( $AWS_CLI elbv2 describe-target-groups \
263312 --names $target_group_name \
264- --query ' TargetGroups[*].[HealthCheckIntervalSeconds,HealthCheckTimeoutSeconds,HealthyThresholdCount]' \
313+ --query \ ' TargetGroups[* ].[HealthCheckIntervalSeconds,HealthCheckTimeoutSeconds,HealthyThresholdCount]\ ' \
265314 --output text)
266315
267316 if [ $? != 0 ]; then
@@ -278,13 +327,13 @@ reset_waiter_timeout() {
278327 msg " Getting target group arn for target group $target_group_name "
279328 local target_group_arn=$( $AWS_CLI elbv2 describe-target-groups \
280329 --names $target_group \
281- --query ' TargetGroups[*].[TargetGroupArn]' \
330+ --query \ ' TargetGroups[* ].[TargetGroupArn]\ ' \
282331 --output text)
283332
284333 msg " Getting instance deregistration delay timeout for target group $target_group with target group arn $target_group_arn "
285334 local timeout=$( $AWS_CLI elbv2 describe-target-group-attributes \
286335 --target-group-arn $target_group_arn \
287- --query " Attributes[?Key=='deregistration_delay.timeout_seconds'].Value[]" \
336+ --query \ " Attributes[? Key== \ ' deregistration_delay.timeout_seconds\ ' ].Value[]\ " \
288337 --output text)
289338 else
290339 msg " Unknown state name, '$state_name '" ;
@@ -363,7 +412,7 @@ get_instance_health_target_group() {
363412
364413 local target_group_info=$( $AWS_CLI elbv2 describe-target-groups \
365414 --names $target_group \
366- --query ' TargetGroups[*].[TargetGroupArn,Port]' \
415+ --query \ ' TargetGroups[* ].[TargetGroupArn,Port]\ ' \
367416 --output text)
368417
369418 if [ $? != 0 ]; then
@@ -383,7 +432,7 @@ get_instance_health_target_group() {
383432 local instance_status=$( $AWS_CLI elbv2 describe-target-health \
384433 --target-group-arn $target_group_arn \
385434 --targets Id=$instance_id ,Port=$target_group_port \
386- --query ' TargetHealthDescriptions[*].TargetHealth[].State' \
435+ --query \ ' TargetHealthDescriptions[* ].TargetHealth[].State\ ' \
387436 --output text 2> /dev/null)
388437
389438 if [ $? == 0 ]; then
@@ -411,7 +460,7 @@ deregister_instance() {
411460 # The target group arn is required to query instance health against the target group
412461 local target_group_arn=$( $AWS_CLI elbv2 describe-target-groups \
413462 --names $target_group_name \
414- --query ' TargetGroups[*].[TargetGroupArn]' \
463+ --query \ ' TargetGroups[* ].[TargetGroupArn]\ ' \
415464 --output text)
416465
417466 if [ $? != 0 ]; then
@@ -447,7 +496,7 @@ register_instance() {
447496
448497 local target_group_info=$( $AWS_CLI elbv2 describe-target-groups \
449498 --names $target_group_name \
450- --query ' TargetGroups[*].[TargetGroupArn,Port]' \
499+ --query \ ' TargetGroups[* ].[TargetGroupArn,Port]\ ' \
451500 --output text)
452501
453502 if [ $? != 0 ]; then
0 commit comments