@@ -386,13 +386,18 @@ async def _activate_drained_nodes(
386386 if node .assigned_tasks
387387 ]
388388
389- # activate these nodes now
390- await asyncio .gather (
391- * (
392- _activate_and_notify (app , auto_scaling_mode , node )
393- for node in nodes_to_activate
389+ if not nodes_to_activate :
390+ return cluster
391+
392+ with log_context (
393+ _logger , logging .INFO , f"activate { len (nodes_to_activate )} drained nodes"
394+ ):
395+ await asyncio .gather (
396+ * (
397+ _activate_and_notify (app , auto_scaling_mode , node )
398+ for node in nodes_to_activate
399+ )
394400 )
395- )
396401 new_active_node_ids = {node .ec2_instance .id for node in nodes_to_activate }
397402 remaining_drained_nodes = [
398403 node
@@ -421,12 +426,17 @@ async def _start_buffer_instances(
421426 if not instances_to_start :
422427 return cluster
423428 # change the buffer machine to an active one
424- await get_ec2_client (app ).set_instances_tags (
425- instances_to_start ,
426- tags = get_activated_buffer_ec2_tags (app , auto_scaling_mode ),
427- )
429+ with log_context (
430+ _logger , logging .INFO , f"start { len (instances_to_start )} buffer machines"
431+ ):
432+ await get_ec2_client (app ).set_instances_tags (
433+ instances_to_start ,
434+ tags = get_activated_buffer_ec2_tags (app , auto_scaling_mode ),
435+ )
428436
429- started_instances = await get_ec2_client (app ).start_instances (instances_to_start )
437+ started_instances = await get_ec2_client (app ).start_instances (
438+ instances_to_start
439+ )
430440 started_instance_ids = [i .id for i in started_instances ]
431441
432442 return dataclasses .replace (
@@ -615,10 +625,10 @@ async def _find_needed_instances(
615625 _logger .exception ("Unexpected error:" )
616626
617627 _logger .info (
618- "found following needed %s instances: %s" ,
628+ "found following %s needed instances: %s" ,
619629 len (needed_new_instance_types_for_tasks ),
620630 [
621- f"{ i .instance_type .name = } :{ i .instance_type .resources } with { len (i .assigned_tasks )} tasks "
631+ f"{ i .instance_type .name } :{ i .instance_type .resources } takes { len (i .assigned_tasks )} task { 's' if len ( i . assigned_tasks ) > 1 else '' } "
622632 for i in needed_new_instance_types_for_tasks
623633 ],
624634 )
@@ -810,39 +820,6 @@ async def _launch_instances(
810820 return new_pending_instances
811821
812822
813- async def _scale_up_cluster (
814- app : FastAPI ,
815- cluster : Cluster ,
816- unassigned_tasks : list ,
817- auto_scaling_mode : BaseAutoscaling ,
818- allowed_instance_types : list [EC2InstanceType ],
819- ) -> Cluster :
820- app_settings : ApplicationSettings = app .state .settings
821- assert app_settings .AUTOSCALING_EC2_ACCESS # nosec
822- assert app_settings .AUTOSCALING_EC2_INSTANCES # nosec
823-
824- # let's start these
825- if needed_ec2_instances := await _find_needed_instances (
826- app , unassigned_tasks , allowed_instance_types , cluster , auto_scaling_mode
827- ):
828- await auto_scaling_mode .log_message_from_tasks (
829- app ,
830- unassigned_tasks ,
831- "service is pending due to missing resources, scaling up cluster now..." ,
832- level = logging .INFO ,
833- )
834- new_pending_instances = await _launch_instances (
835- app , needed_ec2_instances , unassigned_tasks , auto_scaling_mode
836- )
837- cluster .pending_ec2s .extend (
838- [NonAssociatedInstance (ec2_instance = i ) for i in new_pending_instances ]
839- )
840- # NOTE: to check the logs of UserData in EC2 instance
841- # run: tail -f -n 1000 /var/log/cloud-init-output.log in the instance
842-
843- return cluster
844-
845-
846823async def _find_drainable_nodes (
847824 app : FastAPI , cluster : Cluster
848825) -> list [AssociatedInstance ]:
@@ -898,23 +875,25 @@ async def _deactivate_empty_nodes(app: FastAPI, cluster: Cluster) -> Cluster:
898875 if not active_empty_instances :
899876 return cluster
900877
901- # drain this empty nodes
902- updated_nodes : list [Node ] = await asyncio .gather (
903- * (
904- utils_docker .set_node_osparc_ready (
905- app_settings ,
906- docker_client ,
907- node .node ,
908- ready = False ,
878+ with log_context (
879+ _logger , logging .INFO , f"drain { len (active_empty_instances )} empty nodes"
880+ ):
881+ updated_nodes : list [Node ] = await asyncio .gather (
882+ * (
883+ utils_docker .set_node_osparc_ready (
884+ app_settings ,
885+ docker_client ,
886+ node .node ,
887+ ready = False ,
888+ )
889+ for node in active_empty_instances
909890 )
910- for node in active_empty_instances
911- )
912- )
913- if updated_nodes :
914- _logger .info (
915- "following nodes were set to drain: '%s'" ,
916- f"{ [node .description .hostname for node in updated_nodes if node .description ]} " ,
917891 )
892+ if updated_nodes :
893+ _logger .info (
894+ "following nodes were set to drain: '%s'" ,
895+ f"{ [node .description .hostname for node in updated_nodes if node .description ]} " ,
896+ )
918897 newly_drained_instances = [
919898 AssociatedInstance (node = node , ec2_instance = instance .ec2_instance )
920899 for instance , node in zip (active_empty_instances , updated_nodes , strict = True )
@@ -1021,10 +1000,15 @@ async def _try_scale_down_cluster(app: FastAPI, cluster: Cluster) -> Cluster:
10211000 if i .ec2_instance .id
10221001 not in (new_terminating_instance_ids + terminated_instance_ids )
10231002 ]
1003+ still_terminating_nodes = [
1004+ i
1005+ for i in cluster .terminating_nodes
1006+ if i .ec2_instance .id not in terminated_instance_ids
1007+ ]
10241008 return dataclasses .replace (
10251009 cluster ,
10261010 drained_nodes = still_drained_nodes ,
1027- terminating_nodes = cluster . terminating_nodes + new_terminating_instances ,
1011+ terminating_nodes = still_terminating_nodes + new_terminating_instances ,
10281012 terminated_instances = cluster .terminated_instances
10291013 + [
10301014 NonAssociatedInstance (ec2_instance = i .ec2_instance )
@@ -1119,7 +1103,7 @@ async def _drain_retired_nodes(
11191103 )
11201104
11211105
1122- async def _scale_down_unused_cluster_machines (
1106+ async def _scale_down_unused_cluster_instances (
11231107 app : FastAPI ,
11241108 cluster : Cluster ,
11251109 auto_scaling_mode : BaseAutoscaling ,
@@ -1129,57 +1113,89 @@ async def _scale_down_unused_cluster_machines(
11291113 return await _try_scale_down_cluster (app , cluster )
11301114
11311115
1132- async def _autoscale_cluster (
1116+ async def _scale_up_cluster (
11331117 app : FastAPI ,
11341118 cluster : Cluster ,
11351119 auto_scaling_mode : BaseAutoscaling ,
11361120 allowed_instance_types : list [EC2InstanceType ],
1121+ unassigned_tasks : list ,
11371122) -> Cluster :
1138- # 1. check if we have pending tasks
1139- unrunnable_tasks = await auto_scaling_mode .list_unrunnable_tasks (app )
1140- _logger .info ("found %s unrunnable tasks" , len (unrunnable_tasks ))
1141- # NOTE: this function predicts how the backend will assign tasks
1142- queued_or_missing_instance_tasks , cluster = await _assign_tasks_to_current_cluster (
1143- app , unrunnable_tasks , cluster , auto_scaling_mode
1144- )
1145- # 2. try to activate drained nodes to cover some of the tasks
1146- cluster = await _activate_drained_nodes (app , cluster , auto_scaling_mode )
1147-
1148- # 3. start buffer instances to cover the remaining tasks
1149- cluster = await _start_buffer_instances (app , cluster , auto_scaling_mode )
1150-
1151- # 4. scale down unused machines
1152- cluster = await _scale_down_unused_cluster_machines (app , cluster , auto_scaling_mode )
1153-
1154- # 4. let's check if there are still pending tasks or if the reserve was used
11551123 app_settings = get_application_settings (app )
11561124 assert app_settings .AUTOSCALING_EC2_INSTANCES # nosec
1125+ if not unassigned_tasks and (
1126+ len (cluster .buffer_drained_nodes )
1127+ >= app_settings .AUTOSCALING_EC2_INSTANCES .EC2_INSTANCES_MACHINES_BUFFER
1128+ ):
1129+ return cluster
1130+
11571131 if (
1158- queued_or_missing_instance_tasks
1159- or (
1160- len (cluster .buffer_drained_nodes )
1161- < app_settings .AUTOSCALING_EC2_INSTANCES .EC2_INSTANCES_MACHINES_BUFFER
1162- )
1163- and (
1164- cluster .total_number_of_machines ()
1165- < app_settings .AUTOSCALING_EC2_INSTANCES .EC2_INSTANCES_MAX_INSTANCES
1166- )
1132+ cluster .total_number_of_machines ()
1133+ >= app_settings .AUTOSCALING_EC2_INSTANCES .EC2_INSTANCES_MAX_INSTANCES
11671134 ):
11681135 _logger .info (
1169- "%s unrunnable tasks could not be assigned, slowly trying to scale up..." ,
1170- len (queued_or_missing_instance_tasks ),
1136+ "cluster already hit the maximum allowed amount of instances (%s), not scaling up. "
1137+ "%s tasks will wait until instances are free." ,
1138+ app_settings .AUTOSCALING_EC2_INSTANCES .EC2_INSTANCES_MAX_INSTANCES ,
1139+ len (unassigned_tasks ),
11711140 )
1172- cluster = await _scale_up_cluster (
1141+ return cluster
1142+
1143+ # now we scale up
1144+ assert app_settings .AUTOSCALING_EC2_ACCESS # nosec
1145+
1146+ # let's start these
1147+ if needed_ec2_instances := await _find_needed_instances (
1148+ app , unassigned_tasks , allowed_instance_types , cluster , auto_scaling_mode
1149+ ):
1150+ await auto_scaling_mode .log_message_from_tasks (
11731151 app ,
1174- cluster ,
1175- queued_or_missing_instance_tasks ,
1176- auto_scaling_mode ,
1177- allowed_instance_types ,
1152+ unassigned_tasks ,
1153+ "service is pending due to missing resources, scaling up cluster now..." ,
1154+ level = logging .INFO ,
1155+ )
1156+ new_pending_instances = await _launch_instances (
1157+ app , needed_ec2_instances , unassigned_tasks , auto_scaling_mode
1158+ )
1159+ cluster .pending_ec2s .extend (
1160+ [NonAssociatedInstance (ec2_instance = i ) for i in new_pending_instances ]
11781161 )
1162+ # NOTE: to check the logs of UserData in EC2 instance
1163+ # run: tail -f -n 1000 /var/log/cloud-init-output.log in the instance
11791164
11801165 return cluster
11811166
11821167
1168+ async def _autoscale_cluster (
1169+ app : FastAPI ,
1170+ cluster : Cluster ,
1171+ auto_scaling_mode : BaseAutoscaling ,
1172+ allowed_instance_types : list [EC2InstanceType ],
1173+ ) -> Cluster :
1174+ # 1. check if we have pending tasks
1175+ unnasigned_pending_tasks = await auto_scaling_mode .list_unrunnable_tasks (app )
1176+ _logger .info ("found %s pending tasks" , len (unnasigned_pending_tasks ))
1177+ # NOTE: this function predicts how the backend will assign tasks
1178+ still_pending_tasks , cluster = await _assign_tasks_to_current_cluster (
1179+ app , unnasigned_pending_tasks , cluster , auto_scaling_mode
1180+ )
1181+
1182+ # 2. activate available drained nodes to cover some of the tasks
1183+ cluster = await _activate_drained_nodes (app , cluster , auto_scaling_mode )
1184+
1185+ # 3. start buffer instances to cover the remaining tasks
1186+ cluster = await _start_buffer_instances (app , cluster , auto_scaling_mode )
1187+
1188+ # 4. scale down unused instances
1189+ cluster = await _scale_down_unused_cluster_instances (
1190+ app , cluster , auto_scaling_mode
1191+ )
1192+
1193+ # 5. scale up if necessary
1194+ return await _scale_up_cluster (
1195+ app , cluster , auto_scaling_mode , allowed_instance_types , still_pending_tasks
1196+ )
1197+
1198+
11831199async def _notify_autoscaling_status (
11841200 app : FastAPI , cluster : Cluster , auto_scaling_mode : BaseAutoscaling
11851201) -> None :
0 commit comments