Skip to content

Commit cce5875

Browse files
authored
Merge pull request #65 from andyfase/add-backoff-to-common
Add backoff with Jitter to load-balancing scripts
2 parents 7889380 + 8934b1f commit cce5875

File tree

2 files changed

+141
-44
lines changed

2 files changed

+141
-44
lines changed

load-balancing/elb-v2/common_functions.sh

Lines changed: 67 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -29,18 +29,67 @@ export PATH="$PATH:/usr/bin:/usr/local/bin"
2929
# If true, all messages will be printed. If false, only fatal errors are printed.
3030
DEBUG=true
3131

32+
# If true, all commands will have an initial jitter - use this only if deploying to a significant number of instances
33+
INITIAL_JITTER=false
34+
3235
# Number of times to check for a resource to be in the desired state.
3336
WAITER_ATTEMPTS=60
3437

3538
# Number of seconds to wait between attempts for resource to be in a state for instance in ASG.
36-
WAITER_INTERVAL_ASG=1
39+
WAITER_INTERVAL_ASG=3
3740

3841
# Number of seconds to wait between attempts for resource to be in a state for ALB registration/deregistration.
3942
WAITER_INTERVAL_ALB=10
4043

4144
# AutoScaling Standby features at minimum require this version to work.
4245
MIN_CLI_VERSION='1.10.55'
4346

47+
# Usage: exec_with_fulljitter_retry <command> [args...]
#
#   Runs the given command, retrying with exponential backoff and "full jitter"
#   whenever it exits non-zero. Before retry number N the function sleeps a
#   random whole number of seconds in [0, min(MAX, BASE * 2^N)].
#   FullJitter algorithm taken from: https://www.awsarchitectureblog.com/2015/03/backoff.html
#
#   The arguments are joined with spaces and run through `eval`, so callers can
#   (and must) pass shell quoting as escaped literal characters, e.g. \"...\".
#   Because of that, never pass untrusted text to this function.
#
#   Globals read (all optional):
#     INITIAL_JITTER         - "true" adds a random sleep before the first attempt
#     EXPBACKOFF_MAX_RETRIES - retries allowed after the first attempt (default 8)
#     EXPBACKOFF_BASE        - base, in seconds, of the backoff window (default 2)
#     EXPBACKOFF_MAX         - upper bound, in seconds, of any retry sleep (default 120)
#
#   Writes the command's STDOUT, unmodified, to STDOUT once it succeeds.
#   Progress and failure messages go to STDERR.
#   Returns 0 on success, 1 once the retries are exhausted.
exec_with_fulljitter_retry() {
    local max_retries=${EXPBACKOFF_MAX_RETRIES:-8} # Max number of retries
    local base=${EXPBACKOFF_BASE:-2}               # Base value for backoff calculation
    local max=${EXPBACKOFF_MAX:-120}               # Max value for backoff calculation
    local failures=0
    local resp
    local window
    # Deliberately not named SECONDS: that is a special bash variable.
    local delay

    # Perform initial jitter sleep if enabled
    if [ "${INITIAL_JITTER:-false}" = "true" ]; then
        window=$(( (base * 2) ** 2 ))
        # Guard against a zero/negative window (modulo by zero is a fatal error).
        if (( window > 0 )); then
            delay=$(( RANDOM % window ))
            sleep "$delay"
        fi
    fi

    # Execute the provided command; the loop condition is the command itself,
    # so success/failure never depends on a stale $?.
    until resp=$(eval "$@"); do
        failures=$(( failures + 1 ))
        if (( failures > max_retries )); then
            printf '%s\n' "$*" >&2
            echo " * Failed, max retries exceeded" >&2
            return 1
        fi

        # Backoff window: min(max, base * 2^failures). The exponent is bounded
        # so that a very large retry budget cannot overflow the arithmetic.
        if (( failures < 30 )); then
            window=$(( base * (2 ** failures) ))
        else
            window=$max
        fi
        if (( window > max || window < 0 )); then
            window=$max
        fi

        # Full jitter: draw uniformly from the (already capped) window, so the
        # sleep stays random even after the cap has been reached.
        delay=$(( RANDOM % (window + 1) ))

        printf '%s\n' "$*" >&2
        echo " * $failures failure(s), retrying in $delay second(s)" >&2
        sleep "$delay"
    done

    # Emit the CLI response for the calling function to capture. Quoted so that
    # tabs/newlines survive and glob characters are not expanded.
    printf '%s\n' "$resp"
    return 0
}
92+
4493
# Usage: get_instance_region
4594
#
4695
# Writes to STDOUT the AWS region as known by the local instance.
@@ -54,7 +103,7 @@ get_instance_region() {
54103
echo $AWS_REGION
55104
}
56105

57-
AWS_CLI="aws --region $(get_instance_region)"
106+
AWS_CLI="exec_with_fulljitter_retry aws --region $(get_instance_region)"
58107

59108
# Usage: autoscaling_group_name <EC2 instance ID>
60109
#
@@ -110,8 +159,8 @@ autoscaling_enter_standby() {
110159

111160
msg "Checking to see if ASG ${asg_name} will let us decrease desired capacity"
112161
local min_desired=$($AWS_CLI autoscaling describe-auto-scaling-groups \
113-
--auto-scaling-group-name "${asg_name}" \
114-
--query 'AutoScalingGroups[0].[MinSize, DesiredCapacity]' \
162+
--auto-scaling-group-name \"${asg_name}\" \
163+
--query \'AutoScalingGroups[0].[MinSize, DesiredCapacity]\' \
115164
--output text)
116165

117166
local min_cap=$(echo $min_desired | awk '{print $1}')
@@ -124,7 +173,7 @@ autoscaling_enter_standby() {
124173
local new_min=$(($min_cap - 1))
125174
msg "Decrementing ASG ${asg_name}'s minimum size to $new_min"
126175
msg $($AWS_CLI autoscaling update-auto-scaling-group \
127-
--auto-scaling-group-name "${asg_name}" \
176+
--auto-scaling-group-name \"${asg_name}\" \
128177
--min-size $new_min)
129178
if [ $? != 0 ]; then
130179
msg "Failed to reduce ASG ${asg_name}'s minimum size to $new_min. Cannot put this instance into Standby."
@@ -139,7 +188,7 @@ autoscaling_enter_standby() {
139188
msg "Putting instance $instance_id into Standby"
140189
$AWS_CLI autoscaling enter-standby \
141190
--instance-ids $instance_id \
142-
--auto-scaling-group-name "${asg_name}" \
191+
--auto-scaling-group-name \"${asg_name}\" \
143192
--should-decrement-desired-capacity
144193
if [ $? != 0 ]; then
145194
msg "Failed to put instance $instance_id into Standby for ASG ${asg_name}."
@@ -185,7 +234,7 @@ autoscaling_exit_standby() {
185234
msg "Moving instance $instance_id out of Standby"
186235
$AWS_CLI autoscaling exit-standby \
187236
--instance-ids $instance_id \
188-
--auto-scaling-group-name "${asg_name}"
237+
--auto-scaling-group-name \"${asg_name}\"
189238
if [ $? != 0 ]; then
190239
msg "Failed to put instance $instance_id back into InService for ASG ${asg_name}."
191240
return 1
@@ -201,16 +250,16 @@ autoscaling_exit_standby() {
201250

202251
if [ -a /tmp/asgmindecremented ]; then
203252
local min_desired=$($AWS_CLI autoscaling describe-auto-scaling-groups \
204-
--auto-scaling-group-name "${asg_name}" \
205-
--query 'AutoScalingGroups[0].[MinSize, DesiredCapacity]' \
253+
--auto-scaling-group-name \"${asg_name}\" \
254+
--query \'AutoScalingGroups[0].[MinSize, DesiredCapacity]\' \
206255
--output text)
207256

208257
local min_cap=$(echo $min_desired | awk '{print $1}')
209258

210259
local new_min=$(($min_cap + 1))
211260
msg "Incrementing ASG ${asg_name}'s minimum size to $new_min"
212261
msg $($AWS_CLI autoscaling update-auto-scaling-group \
213-
--auto-scaling-group-name "${asg_name}" \
262+
--auto-scaling-group-name \"${asg_name}\" \
214263
--min-size $new_min)
215264
if [ $? != 0 ]; then
216265
msg "Failed to increase ASG ${asg_name}'s minimum size to $new_min."
@@ -237,7 +286,7 @@ get_instance_state_asg() {
237286

238287
local state=$($AWS_CLI autoscaling describe-auto-scaling-instances \
239288
--instance-ids $instance_id \
240-
--query "AutoScalingInstances[?InstanceId == \`$instance_id\`].LifecycleState | [0]" \
289+
--query \"AutoScalingInstances[?InstanceId == \'$instance_id\'].LifecycleState \| [0]\" \
241290
--output text)
242291
if [ $? != 0 ]; then
243292
return 1
@@ -261,7 +310,7 @@ reset_waiter_timeout() {
261310
msg "Getting target group health check configuration for target group $target_group_name"
262311
local target_group_info=$($AWS_CLI elbv2 describe-target-groups \
263312
--names $target_group_name \
264-
--query 'TargetGroups[*].[HealthCheckIntervalSeconds,HealthCheckTimeoutSeconds,HealthyThresholdCount]' \
313+
--query \'TargetGroups[*].[HealthCheckIntervalSeconds,HealthCheckTimeoutSeconds,HealthyThresholdCount]\' \
265314
--output text)
266315

267316
if [ $? != 0 ]; then
@@ -278,13 +327,13 @@ reset_waiter_timeout() {
278327
msg "Getting target group arn for target group $target_group_name"
279328
local target_group_arn=$($AWS_CLI elbv2 describe-target-groups \
280329
--names $target_group \
281-
--query 'TargetGroups[*].[TargetGroupArn]' \
330+
--query \'TargetGroups[*].[TargetGroupArn]\' \
282331
--output text)
283332

284333
msg "Getting instance deregistration delay timeout for target group $target_group with target group arn $target_group_arn"
285334
local timeout=$($AWS_CLI elbv2 describe-target-group-attributes \
286335
--target-group-arn $target_group_arn \
287-
--query "Attributes[?Key=='deregistration_delay.timeout_seconds'].Value[]" \
336+
--query \"Attributes[?Key==\'deregistration_delay.timeout_seconds\'].Value[]\" \
288337
--output text)
289338
else
290339
msg "Unknown state name, '$state_name'";
@@ -363,7 +412,7 @@ get_instance_health_target_group() {
363412

364413
local target_group_info=$($AWS_CLI elbv2 describe-target-groups \
365414
--names $target_group \
366-
--query 'TargetGroups[*].[TargetGroupArn,Port]' \
415+
--query \'TargetGroups[*].[TargetGroupArn,Port]\' \
367416
--output text)
368417

369418
if [ $? != 0 ]; then
@@ -383,7 +432,7 @@ get_instance_health_target_group() {
383432
local instance_status=$($AWS_CLI elbv2 describe-target-health \
384433
--target-group-arn $target_group_arn \
385434
--targets Id=$instance_id,Port=$target_group_port \
386-
--query 'TargetHealthDescriptions[*].TargetHealth[].State' \
435+
--query \'TargetHealthDescriptions[*].TargetHealth[].State\' \
387436
--output text 2>/dev/null)
388437

389438
if [ $? == 0 ]; then
@@ -411,7 +460,7 @@ deregister_instance() {
411460
# The target group arn is required to query instance health against the target group
412461
local target_group_arn=$($AWS_CLI elbv2 describe-target-groups \
413462
--names $target_group_name \
414-
--query 'TargetGroups[*].[TargetGroupArn]' \
463+
--query \'TargetGroups[*].[TargetGroupArn]\' \
415464
--output text)
416465

417466
if [ $? != 0 ]; then
@@ -447,7 +496,7 @@ register_instance() {
447496

448497
local target_group_info=$($AWS_CLI elbv2 describe-target-groups \
449498
--names $target_group_name \
450-
--query 'TargetGroups[*].[TargetGroupArn,Port]' \
499+
--query \'TargetGroups[*].[TargetGroupArn,Port]\' \
451500
--output text)
452501

453502
if [ $? != 0 ]; then

0 commit comments

Comments
 (0)