@@ -198,33 +198,79 @@ def rollout_nodes(cluster_name, dry_run, debug):
     logging.info(f"Beginning rolling updates on ASG {asg_name}...")
     instances = describe_nodes_not_matching_lt(asg_client=asg_client, ec2_client=ec2_client, asg_name=asg_name)
 
-    for instance in instances:
-        before_instance_count = 0
-        after_instance_count = 0
-
-        before_instance_count = get_num_of_instances(asg_client=asg_client, ec2_client=ec2_client, asg_name=asg_name)
-        add_time = datetime.datetime.now(datetime.timezone.utc)
-        add_node(asg_client=asg_client, asg_name=asg_name, dry_run=dry_run)
-        logging.info(f'Waiting for instance to be created...')
-        time.sleep(25)  # new instance takes a bit to show up in API, don't bother polling yet
-        latest_instance = get_latest_instance(asg_client=asg_client, ec2_client=ec2_client, asg_name=asg_name, add_time=add_time, dry_run=dry_run)
-        latest_node_name = latest_instance["PrivateDnsName"]
-        logging.info(f'Waiting for instance {latest_node_name} to be "Ready"...')
-        time.sleep(25)  # instance will never be ready before this, don't bother polling yet
-        wait_for_ready_node(latest_node_name)
-        logging.info(f'Node {latest_node_name} is now "Ready".')
-        after_instance_count = get_num_of_instances(asg_client=asg_client, ec2_client=ec2_client, asg_name=asg_name)
-
-        # because get_latest_instance() doesn't necessarily return the instance launched by add_node(), this is just a safety precaution to ensure we've actually launched a node
-        logging.info(f"Had {before_instance_count} instances in {asg_name} before, now have {after_instance_count} instances")
-        if not dry_run:
-            assert after_instance_count > before_instance_count
+    response = asg_client.describe_auto_scaling_groups(
+        AutoScalingGroupNames=[
+            asg_name
+        ]
+    )
+    # check if the cluster-autoscaler tag exists to begin with, as we will set the value later on
+    is_cluster_autoscaler_tag_present = len([x for x in response["AutoScalingGroups"][0]["Tags"] if x["Key"] == "k8s.io/cluster-autoscaler/enabled"]) > 0
+    logging.info(f"cluster-autoscaler detected on {asg_name}: {is_cluster_autoscaler_tag_present}.")
 
-        node_name = instance["PrivateDnsName"]
-        logging.info(f'Draining node {node_name} (--dry-run={dry_run})')
-        output = kubectl.drain(node_name, "--force", "--delete-local-data=true", "--ignore-daemonsets=true", "--timeout=120s", f"--dry-run={dry_run}")
-        print(output.stdout.decode())
-        terminate_node(asg_client, instance["InstanceId"], dry_run)
+    if is_cluster_autoscaler_tag_present:
+        # prevent cluster-autoscaler from interrupting our rollout
+        logging.info(f"Suspending cluster-autoscaler on {asg_name}...")
+        if not dry_run:
+            asg_client.delete_tags(
+                Tags=[
+                    {
+                        'ResourceId': asg_name,
+                        'ResourceType': "auto-scaling-group",
+                        'Key': "k8s.io/cluster-autoscaler/enabled"
+                    },
+                ]
+            )
+
+    try:
+        for instance in instances:
+            before_instance_count = 0
+            after_instance_count = 0
+
+            before_instance_count = get_num_of_instances(asg_client=asg_client, ec2_client=ec2_client, asg_name=asg_name)
+            add_time = datetime.datetime.now(datetime.timezone.utc)
+            add_node(asg_client=asg_client, asg_name=asg_name, dry_run=dry_run)
+            logging.info(f'Waiting for instance to be created...')
+            logging.info(f'Sleeping 25s before polling.')
+            time.sleep(25)  # new instance takes a bit to show up in API, don't bother polling yet
+            latest_instance = get_latest_instance(asg_client=asg_client, ec2_client=ec2_client, asg_name=asg_name, add_time=add_time, dry_run=dry_run)
+            latest_node_name = latest_instance["PrivateDnsName"]
+            logging.info(f'Waiting for instance {latest_node_name} to be "Ready"...')
+            logging.info(f'Sleeping 25s before polling.')
+            time.sleep(25)  # instance will never be ready before this, don't bother polling yet
+            wait_for_ready_node(latest_node_name)
+            logging.info(f'Node {latest_node_name} is now "Ready".')
+            after_instance_count = get_num_of_instances(asg_client=asg_client, ec2_client=ec2_client, asg_name=asg_name)
+
+            # because get_latest_instance() doesn't necessarily return the instance launched by add_node(), this is just a safety precaution to ensure we've actually launched a node
+            logging.info(f"Had {before_instance_count} instances in {asg_name} before, now have {after_instance_count} instances")
+            if not dry_run:
+                assert after_instance_count > before_instance_count
+
+            node_name = instance["PrivateDnsName"]
+            logging.info(f'Draining node {node_name} (--dry-run={dry_run})')
+            output = kubectl.drain(node_name, "--force", "--delete-local-data=true", "--ignore-daemonsets=true", "--timeout=120s", f"--dry-run={dry_run}")
+            print(output.stdout.decode().rstrip())
+
+            terminate_node(asg_client, instance["InstanceId"], dry_run)
+    except Exception:
+        logging.critical(f"Failed to upgrade all nodes in {asg_name}.")
+        raise  # re-raise so a partial rollout isn't reported as success below
+    finally:
+        if is_cluster_autoscaler_tag_present:
+            # always re-enable cluster-autoscaler even if we fail partway through
+            logging.info(f"Re-enabling cluster-autoscaler on {asg_name}...")
+            if not dry_run:
+                asg_client.create_or_update_tags(
+                    Tags=[
+                        {
+                            'ResourceId': asg_name,
+                            'ResourceType': "auto-scaling-group",
+                            'Key': "k8s.io/cluster-autoscaler/enabled",
+                            'Value': "true",
+                            'PropagateAtLaunch': False
+                        },
+                    ]
+                )
 
     logging.info(f"All instances in {asg_name} are up to date.")
 
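The suspend/re-enable dance around the rollout is the heart of this change: with tag-based auto-discovery, cluster-autoscaler only manages ASGs carrying the `k8s.io/cluster-autoscaler/enabled` tag, so deleting the tag takes the group out of its hands for the duration. A minimal standalone sketch of that pattern, assuming valid AWS credentials, an existing ASG name, and a hypothetical `do_rollout()` callback standing in for the per-instance loop:

```python
import boto3

CA_TAG_KEY = "k8s.io/cluster-autoscaler/enabled"

def rollout_with_autoscaler_suspended(asg_name, do_rollout):
    asg_client = boto3.client("autoscaling")
    response = asg_client.describe_auto_scaling_groups(AutoScalingGroupNames=[asg_name])
    tags = response["AutoScalingGroups"][0]["Tags"]
    tag_present = any(t["Key"] == CA_TAG_KEY for t in tags)
    if tag_present:
        # removing the tag stops cluster-autoscaler from managing this ASG
        asg_client.delete_tags(Tags=[{
            "ResourceId": asg_name,
            "ResourceType": "auto-scaling-group",
            "Key": CA_TAG_KEY,
        }])
    try:
        do_rollout()
    finally:
        if tag_present:
            # restore the tag even if the rollout failed partway through
            asg_client.create_or_update_tags(Tags=[{
                "ResourceId": asg_name,
                "ResourceType": "auto-scaling-group",
                "Key": CA_TAG_KEY,
                "Value": "true",
                "PropagateAtLaunch": False,
            }])
```

Checking for the tag first, rather than deleting unconditionally, avoids re-creating a tag on groups that never opted in to autoscaling in the first place.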
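`wait_for_ready_node()` is referenced here but defined outside this hunk. A plausible sketch, assuming the same `sh`-wrapped `kubectl` used by the drain call above (the poll interval and timeout are illustrative, not the commit's actual values):

```python
import time

from sh import kubectl  # assumes kubectl is on PATH and configured for the cluster

def wait_for_ready_node(node_name, poll_seconds=10, timeout_seconds=600):
    """Block until the node reports a Ready condition of "True"."""
    jsonpath = '{.status.conditions[?(@.type=="Ready")].status}'
    deadline = time.time() + timeout_seconds
    while time.time() < deadline:
        # query only the Ready condition's status ("True"/"False"/"Unknown")
        status = kubectl.get("node", node_name, "-o", f"jsonpath={jsonpath}").stdout.decode()
        if status == "True":
            return
        time.sleep(poll_seconds)
    raise TimeoutError(f"Node {node_name} did not become Ready within {timeout_seconds}s")
```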