Commit 3a40819

improve logging
1 parent 0de1bed commit 3a40819

File tree

1 file changed: 113 additions, 97 deletions

services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py

Lines changed: 113 additions & 97 deletions
@@ -386,13 +386,18 @@ async def _activate_drained_nodes(
         if node.assigned_tasks
     ]
 
-    # activate these nodes now
-    await asyncio.gather(
-        *(
-            _activate_and_notify(app, auto_scaling_mode, node)
-            for node in nodes_to_activate
+    if not nodes_to_activate:
+        return cluster
+
+    with log_context(
+        _logger, logging.INFO, f"activate {len(nodes_to_activate)} drained nodes"
+    ):
+        await asyncio.gather(
+            *(
+                _activate_and_notify(app, auto_scaling_mode, node)
+                for node in nodes_to_activate
+            )
         )
-    )
     new_active_node_ids = {node.ec2_instance.id for node in nodes_to_activate}
     remaining_drained_nodes = [
         node
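
The recurring device in this commit is the log_context context manager, which brackets a block of work with opening and closing log records. The real implementation lives in the project's service library; a minimal sketch of what the call sites above seem to assume could look like this (the "start"/"done" wording is illustrative, not the project's actual message format):

import contextlib
import logging
from collections.abc import Iterator

@contextlib.contextmanager
def log_context(logger: logging.Logger, level: int, msg: str) -> Iterator[None]:
    # hypothetical reimplementation: log on entry, log again on exit, so the
    # wrapped block is bracketed even when it raises
    logger.log(level, "start: %s", msg)
    try:
        yield
    finally:
        logger.log(level, "done: %s", msg)

Combined with the new early return, an empty nodes_to_activate list now produces no log lines at all, instead of a contentless gather over zero coroutines.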
@@ -421,12 +426,17 @@ async def _start_buffer_instances(
     if not instances_to_start:
         return cluster
     # change the buffer machine to an active one
-    await get_ec2_client(app).set_instances_tags(
-        instances_to_start,
-        tags=get_activated_buffer_ec2_tags(app, auto_scaling_mode),
-    )
+    with log_context(
+        _logger, logging.INFO, f"start {len(instances_to_start)} buffer machines"
+    ):
+        await get_ec2_client(app).set_instances_tags(
+            instances_to_start,
+            tags=get_activated_buffer_ec2_tags(app, auto_scaling_mode),
+        )
 
-    started_instances = await get_ec2_client(app).start_instances(instances_to_start)
+        started_instances = await get_ec2_client(app).start_instances(
+            instances_to_start
+        )
     started_instance_ids = [i.id for i in started_instances]
 
     return dataclasses.replace(
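
The same pattern applies here, with one subtlety: start_instances moved inside the with block, so the closing log record covers both EC2 round-trips, not just the re-tagging. A reduced sketch, reusing the log_context sketch above (the coroutines are stand-ins, not the real get_ec2_client() API):

import asyncio
import logging

_logger = logging.getLogger(__name__)

async def set_instances_tags(instances: list[str]) -> None:
    await asyncio.sleep(0)  # stand-in for the AWS call

async def start_instances(instances: list[str]) -> list[str]:
    await asyncio.sleep(0)  # stand-in for the AWS call
    return instances

async def start_buffer(instances: list[str]) -> list[str]:
    if not instances:  # guard first: a no-op emits no log lines at all
        return []
    with log_context(_logger, logging.INFO, f"start {len(instances)} buffer machines"):
        await set_instances_tags(instances)
        # still inside the block: the closing record times the start request too
        return await start_instances(instances)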
@@ -615,10 +625,10 @@ async def _find_needed_instances(
         _logger.exception("Unexpected error:")
 
     _logger.info(
-        "found following needed %s instances: %s",
+        "found following %s needed instances: %s",
         len(needed_new_instance_types_for_tasks),
         [
-            f"{i.instance_type.name=}:{i.instance_type.resources} with {len(i.assigned_tasks)} tasks"
+            f"{i.instance_type.name}:{i.instance_type.resources} takes {len(i.assigned_tasks)} task{'s' if len(i.assigned_tasks)>1 else ''}"
             for i in needed_new_instance_types_for_tasks
         ],
     )
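
Besides reordering the words, the new message drops the f-string "=" debug specifier (which interpolates the expression text plus the repr of its value) and pluralizes "task" correctly. The difference in plain Python:

name = "t2.micro"
n_tasks = 1
# "{name=}" is meant for debugging: it prints the expression and its repr
assert f"{name=}" == "name='t2.micro'"
# plain interpolation is what an operator-facing log wants
assert f"{name}" == "t2.micro"
# and the conditional suffix handles singular vs plural
assert f"takes {n_tasks} task{'s' if n_tasks > 1 else ''}" == "takes 1 task"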
@@ -810,39 +820,6 @@ async def _launch_instances(
     return new_pending_instances
 
 
-async def _scale_up_cluster(
-    app: FastAPI,
-    cluster: Cluster,
-    unassigned_tasks: list,
-    auto_scaling_mode: BaseAutoscaling,
-    allowed_instance_types: list[EC2InstanceType],
-) -> Cluster:
-    app_settings: ApplicationSettings = app.state.settings
-    assert app_settings.AUTOSCALING_EC2_ACCESS  # nosec
-    assert app_settings.AUTOSCALING_EC2_INSTANCES  # nosec
-
-    # let's start these
-    if needed_ec2_instances := await _find_needed_instances(
-        app, unassigned_tasks, allowed_instance_types, cluster, auto_scaling_mode
-    ):
-        await auto_scaling_mode.log_message_from_tasks(
-            app,
-            unassigned_tasks,
-            "service is pending due to missing resources, scaling up cluster now...",
-            level=logging.INFO,
-        )
-        new_pending_instances = await _launch_instances(
-            app, needed_ec2_instances, unassigned_tasks, auto_scaling_mode
-        )
-        cluster.pending_ec2s.extend(
-            [NonAssociatedInstance(ec2_instance=i) for i in new_pending_instances]
-        )
-        # NOTE: to check the logs of UserData in EC2 instance
-        # run: tail -f -n 1000 /var/log/cloud-init-output.log in the instance
-
-    return cluster
-
-
 async def _find_drainable_nodes(
     app: FastAPI, cluster: Cluster
 ) -> list[AssociatedInstance]:
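
The removed _scale_up_cluster is not deleted outright; it reappears below, after _scale_down_unused_cluster_instances, so that scale-down is evaluated before scale-up in the new flow. Its core uses the walrus operator to bind and test the result of _find_needed_instances in one expression; the pattern in isolation, with a stand-in coroutine:

import asyncio

async def find_needed_instances() -> list[str]:
    return ["t2.micro"]  # stand-in for the real lookup

async def scale_up() -> None:
    # ":=" binds the awaited result and tests its truthiness in one step,
    # so an empty list skips the whole launch branch
    if needed := await find_needed_instances():
        print(f"launching {needed}")

asyncio.run(scale_up())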
@@ -898,23 +875,25 @@ async def _deactivate_empty_nodes(app: FastAPI, cluster: Cluster) -> Cluster:
     if not active_empty_instances:
         return cluster
 
-    # drain this empty nodes
-    updated_nodes: list[Node] = await asyncio.gather(
-        *(
-            utils_docker.set_node_osparc_ready(
-                app_settings,
-                docker_client,
-                node.node,
-                ready=False,
+    with log_context(
+        _logger, logging.INFO, f"drain {len(active_empty_instances)} empty nodes"
+    ):
+        updated_nodes: list[Node] = await asyncio.gather(
+            *(
+                utils_docker.set_node_osparc_ready(
+                    app_settings,
+                    docker_client,
+                    node.node,
+                    ready=False,
+                )
+                for node in active_empty_instances
             )
-            for node in active_empty_instances
-        )
-    )
-    if updated_nodes:
-        _logger.info(
-            "following nodes were set to drain: '%s'",
-            f"{[node.description.hostname for node in updated_nodes if node.description]}",
         )
+        if updated_nodes:
+            _logger.info(
+                "following nodes were set to drain: '%s'",
+                f"{[node.description.hostname for node in updated_nodes if node.description]}",
+            )
     newly_drained_instances = [
         AssociatedInstance(node=node, ec2_instance=instance.ec2_instance)
         for instance, node in zip(active_empty_instances, updated_nodes, strict=True)
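
This hunk feeds a generator of coroutines to asyncio.gather, and the code right after it zips the inputs against the results with strict=True. That is sound because gather returns results in submission order regardless of completion order, and strict=True (Python 3.10+) raises if the lengths ever diverge. A minimal demonstration:

import asyncio

async def echo_after(value: str, delay: float) -> str:
    await asyncio.sleep(delay)
    return value

async def main() -> None:
    inputs = [("a", 0.02), ("b", 0.0)]
    # "b" finishes first, but gather still yields ["a", "b"]: results keep
    # submission order, so zipping inputs with results stays aligned
    results = await asyncio.gather(*(echo_after(v, d) for v, d in inputs))
    for (value, _), result in zip(inputs, results, strict=True):
        assert value == result

asyncio.run(main())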
@@ -1021,10 +1000,15 @@ async def _try_scale_down_cluster(app: FastAPI, cluster: Cluster) -> Cluster:
         if i.ec2_instance.id
         not in (new_terminating_instance_ids + terminated_instance_ids)
     ]
+    still_terminating_nodes = [
+        i
+        for i in cluster.terminating_nodes
+        if i.ec2_instance.id not in terminated_instance_ids
+    ]
     return dataclasses.replace(
         cluster,
         drained_nodes=still_drained_nodes,
-        terminating_nodes=cluster.terminating_nodes + new_terminating_instances,
+        terminating_nodes=still_terminating_nodes + new_terminating_instances,
         terminated_instances=cluster.terminated_instances
         + [
             NonAssociatedInstance(ec2_instance=i.ec2_instance)
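
Beyond logging, this hunk fixes bookkeeping: terminating_nodes previously only ever grew, keeping entries whose instances had already been terminated; now already-terminated IDs are filtered out before the new list is assembled. The same fix reduced to plain data (MiniCluster and settle are illustrative stand-ins, with instance IDs as bare strings):

import dataclasses

@dataclasses.dataclass(frozen=True)
class MiniCluster:
    terminating_nodes: list[str]
    terminated_instances: list[str]

def settle(cluster: MiniCluster, terminated_ids: list[str]) -> MiniCluster:
    # drop nodes whose termination completed, instead of appending forever
    still_terminating = [
        node for node in cluster.terminating_nodes if node not in terminated_ids
    ]
    return dataclasses.replace(
        cluster,
        terminating_nodes=still_terminating,
        terminated_instances=cluster.terminated_instances + terminated_ids,
    )

before = MiniCluster(terminating_nodes=["i-1", "i-2"], terminated_instances=[])
after = settle(before, terminated_ids=["i-1"])
assert after.terminating_nodes == ["i-2"]
assert after.terminated_instances == ["i-1"]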
@@ -1119,7 +1103,7 @@ async def _drain_retired_nodes(
     )
 
 
-async def _scale_down_unused_cluster_machines(
+async def _scale_down_unused_cluster_instances(
     app: FastAPI,
     cluster: Cluster,
     auto_scaling_mode: BaseAutoscaling,
@@ -1129,57 +1113,89 @@ async def _scale_down_unused_cluster_machines(
     return await _try_scale_down_cluster(app, cluster)
 
 
-async def _autoscale_cluster(
+async def _scale_up_cluster(
     app: FastAPI,
     cluster: Cluster,
     auto_scaling_mode: BaseAutoscaling,
     allowed_instance_types: list[EC2InstanceType],
+    unassigned_tasks: list,
 ) -> Cluster:
-    # 1. check if we have pending tasks
-    unrunnable_tasks = await auto_scaling_mode.list_unrunnable_tasks(app)
-    _logger.info("found %s unrunnable tasks", len(unrunnable_tasks))
-    # NOTE: this function predicts how the backend will assign tasks
-    queued_or_missing_instance_tasks, cluster = await _assign_tasks_to_current_cluster(
-        app, unrunnable_tasks, cluster, auto_scaling_mode
-    )
-    # 2. try to activate drained nodes to cover some of the tasks
-    cluster = await _activate_drained_nodes(app, cluster, auto_scaling_mode)
-
-    # 3. start buffer instances to cover the remaining tasks
-    cluster = await _start_buffer_instances(app, cluster, auto_scaling_mode)
-
-    # 4. scale down unused machines
-    cluster = await _scale_down_unused_cluster_machines(app, cluster, auto_scaling_mode)
-
-    # 4. let's check if there are still pending tasks or if the reserve was used
     app_settings = get_application_settings(app)
     assert app_settings.AUTOSCALING_EC2_INSTANCES  # nosec
+    if not unassigned_tasks and (
+        len(cluster.buffer_drained_nodes)
+        >= app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MACHINES_BUFFER
+    ):
+        return cluster
+
     if (
-        queued_or_missing_instance_tasks
-        or (
-            len(cluster.buffer_drained_nodes)
-            < app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MACHINES_BUFFER
-        )
-        and (
-            cluster.total_number_of_machines()
-            < app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES
-        )
+        cluster.total_number_of_machines()
+        >= app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES
     ):
         _logger.info(
-            "%s unrunnable tasks could not be assigned, slowly trying to scale up...",
-            len(queued_or_missing_instance_tasks),
+            "cluster already hit the maximum allowed amount of instances (%s), not scaling up. "
+            "%s tasks will wait until instances are free.",
+            app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES,
+            len(unassigned_tasks),
         )
-        cluster = await _scale_up_cluster(
+        return cluster
+
+    # now we scale up
+    assert app_settings.AUTOSCALING_EC2_ACCESS  # nosec
+
+    # let's start these
+    if needed_ec2_instances := await _find_needed_instances(
+        app, unassigned_tasks, allowed_instance_types, cluster, auto_scaling_mode
+    ):
+        await auto_scaling_mode.log_message_from_tasks(
             app,
-            cluster,
-            queued_or_missing_instance_tasks,
-            auto_scaling_mode,
-            allowed_instance_types,
+            unassigned_tasks,
+            "service is pending due to missing resources, scaling up cluster now...",
+            level=logging.INFO,
+        )
+        new_pending_instances = await _launch_instances(
+            app, needed_ec2_instances, unassigned_tasks, auto_scaling_mode
+        )
+        cluster.pending_ec2s.extend(
+            [NonAssociatedInstance(ec2_instance=i) for i in new_pending_instances]
        )
+        # NOTE: to check the logs of UserData in EC2 instance
+        # run: tail -f -n 1000 /var/log/cloud-init-output.log in the instance
 
     return cluster
 
 
+async def _autoscale_cluster(
+    app: FastAPI,
+    cluster: Cluster,
+    auto_scaling_mode: BaseAutoscaling,
+    allowed_instance_types: list[EC2InstanceType],
+) -> Cluster:
+    # 1. check if we have pending tasks
+    unnasigned_pending_tasks = await auto_scaling_mode.list_unrunnable_tasks(app)
+    _logger.info("found %s pending tasks", len(unnasigned_pending_tasks))
+    # NOTE: this function predicts how the backend will assign tasks
+    still_pending_tasks, cluster = await _assign_tasks_to_current_cluster(
+        app, unnasigned_pending_tasks, cluster, auto_scaling_mode
+    )
+
+    # 2. activate available drained nodes to cover some of the tasks
+    cluster = await _activate_drained_nodes(app, cluster, auto_scaling_mode)
+
+    # 3. start buffer instances to cover the remaining tasks
+    cluster = await _start_buffer_instances(app, cluster, auto_scaling_mode)
+
+    # 4. scale down unused instances
+    cluster = await _scale_down_unused_cluster_instances(
+        app, cluster, auto_scaling_mode
+    )
+
+    # 5. scale up if necessary
+    return await _scale_up_cluster(
+        app, cluster, auto_scaling_mode, allowed_instance_types, still_pending_tasks
+    )
+
+
 async def _notify_autoscaling_status(
     app: FastAPI, cluster: Cluster, auto_scaling_mode: BaseAutoscaling
 ) -> None:
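
The rebuilt _scale_up_cluster replaces the old compound condition with two early returns: one when nothing is pending and the warm buffer is already full, one when the hard instance cap is reached (now with an explanatory log line instead of silence). The decision logic, reduced to a sketch (buffer_target and max_instances stand in for the EC2_INSTANCES_MACHINES_BUFFER and EC2_INSTANCES_MAX_INSTANCES settings):

import logging

_logger = logging.getLogger(__name__)

def should_scale_up(
    pending_tasks: int,
    buffer_drained_nodes: int,
    total_machines: int,
    *,
    buffer_target: int,
    max_instances: int,
) -> bool:
    # guard 1: nothing pending and the warm buffer is already full
    if not pending_tasks and buffer_drained_nodes >= buffer_target:
        return False
    # guard 2: hard cap reached; tasks must wait for instances to free up
    if total_machines >= max_instances:
        _logger.info(
            "cluster already hit the maximum allowed amount of instances (%s), "
            "not scaling up. %s tasks will wait until instances are free.",
            max_instances,
            pending_tasks,
        )
        return False
    return True

assert should_scale_up(0, 2, 5, buffer_target=2, max_instances=10) is False
assert should_scale_up(3, 0, 10, buffer_target=2, max_instances=10) is False
assert should_scale_up(3, 0, 5, buffer_target=2, max_instances=10) is True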
