This repository was archived by the owner on May 3, 2023. It is now read-only.

Commit 2ccec71

Add logic for disabling cluster-autoscaler before performing rolling update
1 parent 8b9d5dc commit 2ccec71

File tree

1 file changed: +71 -26 lines changed


eks_node_rollout/eks_node_rollout.py

Lines changed: 71 additions & 26 deletions
@@ -198,33 +198,78 @@ def rollout_nodes(cluster_name, dry_run, debug):
     logging.info(f"Beginning rolling updates on ASG {asg_name}...")
     instances = describe_nodes_not_matching_lt(asg_client=asg_client, ec2_client=ec2_client, asg_name=asg_name)
 
-    for instance in instances:
-        before_instance_count = 0
-        after_instance_count = 0
-
-        before_instance_count = get_num_of_instances(asg_client=asg_client, ec2_client=ec2_client, asg_name=asg_name)
-        add_time = datetime.datetime.now(datetime.timezone.utc)
-        add_node(asg_client=asg_client, asg_name=asg_name, dry_run=dry_run)
-        logging.info(f'Waiting for instance to be created...')
-        time.sleep(25)  # new instance takes a bit to show up in API, don't bother polling yet
-        latest_instance = get_latest_instance(asg_client=asg_client, ec2_client=ec2_client, asg_name=asg_name, add_time=add_time, dry_run=dry_run)
-        latest_node_name = latest_instance["PrivateDnsName"]
-        logging.info(f'Waiting for instance {latest_node_name} to be "Ready"...')
-        time.sleep(25)  # instance will never be ready before this, don't bother polling yet
-        wait_for_ready_node(latest_node_name)
-        logging.info(f'Node {latest_node_name} is now "Ready".')
-        after_instance_count = get_num_of_instances(asg_client=asg_client, ec2_client=ec2_client, asg_name=asg_name)
-
-        # because get_latest_instance() doesn't necessarily return the instance launched by add_node(), this is just a safety precaution to ensure we've actually launched a node
-        logging.info(f"Had {before_instance_count} instances in {asg_name} before, now have {after_instance_count} instances")
-        if not dry_run:
-            assert after_instance_count > before_instance_count
+    response = asg_client.describe_auto_scaling_groups(
+        AutoScalingGroupNames=[
+            asg_name
+        ]
+    )
+    # check if cluster-autoscaler tag exists to begin with as we will set the value later on
+    is_cluster_autoscaler_tag_present = len([x for x in response["AutoScalingGroups"][0]["Tags"] if x["Key"] == "k8s.io/cluster-autoscaler/enabled"]) > 0
+    logging.info(f"cluster-autoscaler detected on {asg_name}: {is_cluster_autoscaler_tag_present}.")
 
-        node_name = instance["PrivateDnsName"]
-        logging.info(f'Draining node {node_name} (--dry-run={dry_run})')
-        output = kubectl.drain(node_name, "--force", "--delete-local-data=true", "--ignore-daemonsets=true", "--timeout=120s", f"--dry-run={dry_run}")
-        print(output.stdout.decode())
-        terminate_node(asg_client, instance["InstanceId"], dry_run)
+    if is_cluster_autoscaler_tag_present:
+        # prevent cluster-autoscaler from interrupting our rollout
+        logging.info(f"Suspending cluster-autoscaler on {asg_name}...")
+        if not dry_run:
+            asg_client.delete_tags(
+                Tags=[
+                    {
+                        'ResourceId': asg_name,
+                        'ResourceType': "auto-scaling-group",
+                        'Key': "k8s.io/cluster-autoscaler/enabled"
+                    },
+                ]
+            )
+
+    try:
+        for instance in instances:
+            before_instance_count = 0
+            after_instance_count = 0
+
+            before_instance_count = get_num_of_instances(asg_client=asg_client, ec2_client=ec2_client, asg_name=asg_name)
+            add_time = datetime.datetime.now(datetime.timezone.utc)
+            add_node(asg_client=asg_client, asg_name=asg_name, dry_run=dry_run)
+            logging.info(f'Waiting for instance to be created...')
+            logging.info(f'Sleeping 25s before polling.')
+            time.sleep(25)  # new instance takes a bit to show up in API, don't bother polling yet
+            latest_instance = get_latest_instance(asg_client=asg_client, ec2_client=ec2_client, asg_name=asg_name, add_time=add_time, dry_run=dry_run)
+            latest_node_name = latest_instance["PrivateDnsName"]
+            logging.info(f'Waiting for instance {latest_node_name} to be "Ready"...')
+            logging.info(f'Sleeping 25s before polling.')
+            time.sleep(25)  # instance will never be ready before this, don't bother polling yet
+            wait_for_ready_node(latest_node_name)
+            logging.info(f'Node {latest_node_name} is now "Ready".')
+            after_instance_count = get_num_of_instances(asg_client=asg_client, ec2_client=ec2_client, asg_name=asg_name)
+
+            # because get_latest_instance() doesn't necessarily return the instance launched by add_node(), this is just a safety precaution to ensure we've actually launched a node
+            logging.info(f"Had {before_instance_count} instances in {asg_name} before, now have {after_instance_count} instances")
+            if not dry_run:
+                assert after_instance_count > before_instance_count
+
+            node_name = instance["PrivateDnsName"]
+            logging.info(f'Draining node {node_name} (--dry-run={dry_run})')
+            output = kubectl.drain(node_name, "--force", "--delete-local-data=true", "--ignore-daemonsets=true", "--timeout=120s", f"--dry-run={dry_run}")
+            print(output.stdout.decode().rstrip())
+
+            terminate_node(asg_client, instance["InstanceId"], dry_run)
+    except Exception:
+        logging.critical(f"Failed to upgrade all nodes in {asg_name}.")
+    finally:
+        if is_cluster_autoscaler_tag_present:
+            # always re-enable cluster-autoscaler even if we fail partway through
+            logging.info(f"Re-enabling cluster-autoscaler on {asg_name}...")
+            if not dry_run:
+                asg_client.create_or_update_tags(
+                    Tags=[
+                        {
+                            'ResourceId': asg_name,
+                            'ResourceType': "auto-scaling-group",
+                            'Key': "k8s.io/cluster-autoscaler/enabled",
+                            'Value': "true",
+                            'PropagateAtLaunch': False
+                        },
+                    ]
+                )
 
     logging.info(f"All instances in {asg_name} are up to date.")
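
The try/finally added here ensures the k8s.io/cluster-autoscaler/enabled tag is restored even if a node fails to launch, become Ready, or drain partway through the rollout. As an illustration only (not part of this commit), the same suspend/re-enable pattern could be factored into a context manager; the helper name autoscaler_suspended below is hypothetical, while the boto3 Auto Scaling calls and the tag key are the ones used in the diff above.

import contextlib

@contextlib.contextmanager
def autoscaler_suspended(asg_client, asg_name, dry_run=False):
    """Delete the cluster-autoscaler enable tag on entry and restore it on exit."""
    tag_key = "k8s.io/cluster-autoscaler/enabled"
    group = asg_client.describe_auto_scaling_groups(
        AutoScalingGroupNames=[asg_name]
    )["AutoScalingGroups"][0]
    tag_present = any(tag["Key"] == tag_key for tag in group["Tags"])
    if tag_present and not dry_run:
        asg_client.delete_tags(Tags=[{
            "ResourceId": asg_name,
            "ResourceType": "auto-scaling-group",
            "Key": tag_key,
        }])
    try:
        yield
    finally:
        # restore the tag even if the rollout raised partway through
        if tag_present and not dry_run:
            asg_client.create_or_update_tags(Tags=[{
                "ResourceId": asg_name,
                "ResourceType": "auto-scaling-group",
                "Key": tag_key,
                "Value": "true",
                "PropagateAtLaunch": False,
            }])

# hypothetical usage around the rollout loop from the diff:
#     with autoscaler_suspended(asg_client, asg_name, dry_run=dry_run):
#         for instance in instances:
#             ...  # add_node / drain / terminate_node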
