@@ -131,7 +131,7 @@ def get_latest_instance(asg_client, ec2_client, asg_name, add_time, dry_run=True
131131 return latest_instance
132132
133133
134- def get_num_of_instances (asg_client , ec2_client , asg_name ):
134+ def get_num_of_instances (asg_client , ec2_client , asg_name , exclude_ids ):
135135 """Returns number of instances in an ASG"""
136136
137137 instances = []
@@ -141,7 +141,7 @@ def get_num_of_instances(asg_client, ec2_client, asg_name):
141141 asg_name
142142 ]
143143 )
144- instance_ids = [instance ["InstanceId" ] for instance in response ["AutoScalingGroups" ][0 ]["Instances" ]]
144+ instance_ids = [instance ["InstanceId" ] for instance in response ["AutoScalingGroups" ][0 ]["Instances" ] if instance [ "InstanceId" ] not in exclude_ids ]
145145 response = ec2_client .describe_instances (InstanceIds = instance_ids )
146146 for reservation in response ["Reservations" ]:
147147 for instance in reservation ["Instances" ]:
@@ -262,11 +262,12 @@ def rollout_nodes(cluster_name, drain_timeout, dry_run, debug):
262262 disable_autoscaling (asg_client = asg_client , asg_name = asg_name , dry_run = dry_run )
263263
264264 try :
265+ terminated_ids = []
265266 for instance in instances :
266267 before_instance_count = 0
267268 after_instance_count = 0
268269
269- before_instance_count = get_num_of_instances (asg_client = asg_client , ec2_client = ec2_client , asg_name = asg_name )
270+ before_instance_count = get_num_of_instances (asg_client = asg_client , ec2_client = ec2_client , asg_name = asg_name , exclude_ids = terminated_ids )
270271 add_time = datetime .datetime .now (datetime .timezone .utc )
271272 add_node (asg_client = asg_client , asg_name = asg_name , dry_run = dry_run )
272273 logging .info (f'Waiting for instance to be created...' )
@@ -279,7 +280,7 @@ def rollout_nodes(cluster_name, drain_timeout, dry_run, debug):
279280 time .sleep (40 ) # instance will never be ready before this, don't bother polling yet
280281 wait_for_ready_node (latest_node_name )
281282 logging .info (f'Node { latest_node_name } is now "Ready".' )
282- after_instance_count = get_num_of_instances (asg_client = asg_client , ec2_client = ec2_client , asg_name = asg_name )
283+ after_instance_count = get_num_of_instances (asg_client = asg_client , ec2_client = ec2_client , asg_name = asg_name , exclude_ids = terminated_ids )
283284
284285 # because get_latest_instance() doesn't necessarily return the instance launched by add_node(), this is just a safety precaution to ensure we've actually launched a node
285286 logging .info (f"Had { before_instance_count } instances in { asg_name } before, now have { after_instance_count } instances" )
@@ -292,6 +293,7 @@ def rollout_nodes(cluster_name, drain_timeout, dry_run, debug):
292293 print (output .stdout .decode ().rstrip ())
293294
294295 terminate_node (asg_client , instance ["InstanceId" ], dry_run )
296+ terminated_ids .append (instance ["InstanceId" ])
295297 except Exception :
296298 logging .critical (f"Failed to upgrade all nodes in { asg_name } ." )
297299 raise
0 commit comments