Skip to content

Commit f9fffc8

Browse files
authored
♻️ 📝 Minor refactor and doc of autoscaling service (#6551)
1 parent 118b363 commit f9fffc8

File tree

11 files changed

+119
-136
lines changed

11 files changed

+119
-136
lines changed

packages/aws-library/src/aws_library/ec2/_client.py

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import logging
33
from collections.abc import Iterable, Sequence
44
from dataclasses import dataclass
5-
from typing import cast
5+
from typing import Literal, cast
66

77
import aioboto3
88
import botocore.exceptions
@@ -66,20 +66,28 @@ async def ping(self) -> bool:
6666
@ec2_exception_handler(_logger)
6767
async def get_ec2_instance_capabilities(
6868
self,
69-
instance_type_names: set[InstanceTypeType],
69+
instance_type_names: set[InstanceTypeType] | Literal["ALL"],
7070
) -> list[EC2InstanceType]:
71-
"""returns the ec2 instance types from a list of instance type names
72-
NOTE: the order might differ!
71+
"""Returns the ec2 instance types from a list of instance type names (sorted by name)
72+
7373
Arguments:
74-
instance_type_names -- the types to filter with
74+
instance_type_names -- the types to filter with or "ALL", to return all EC2 possible instances
7575
7676
Raises:
7777
Ec2InstanceTypeInvalidError: some invalid types were used as filter
7878
ClustersKeeperRuntimeError: unexpected error communicating with EC2
7979
8080
"""
81+
if instance_type_names == "ALL":
82+
selection_or_all_if_empty = []
83+
else:
84+
selection_or_all_if_empty = list(instance_type_names)
85+
if len(selection_or_all_if_empty) == 0:
86+
msg = "`instance_type_names` cannot be an empty set. Use either a selection or 'ALL'"
87+
raise ValueError(msg)
88+
8189
instance_types = await self.client.describe_instance_types(
82-
InstanceTypes=list(instance_type_names)
90+
InstanceTypes=selection_or_all_if_empty
8391
)
8492
list_instances: list[EC2InstanceType] = []
8593
for instance in instance_types.get("InstanceTypes", []):
@@ -95,7 +103,7 @@ async def get_ec2_instance_capabilities(
95103
),
96104
)
97105
)
98-
return list_instances
106+
return sorted(list_instances, key=lambda i: i.name)
99107

100108
@ec2_exception_handler(_logger)
101109
async def launch_instances(

packages/aws-library/tests/test_ec2_client.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -116,23 +116,25 @@ async def test_get_ec2_instance_capabilities(
116116
)
117117
)
118118
assert instance_types
119-
assert len(instance_types) == len(ec2_allowed_instances)
119+
assert [_.name for _ in instance_types] == sorted(ec2_allowed_instances)
120120

121-
# all the instance names are found and valid
122-
assert all(i.name in ec2_allowed_instances for i in instance_types)
123-
for instance_type_name in ec2_allowed_instances:
124-
assert any(i.name == instance_type_name for i in instance_types)
125121

126-
127-
async def test_get_ec2_instance_capabilities_empty_list_returns_all_options(
122+
async def test_get_ec2_instance_capabilities_returns_all_options(
128123
simcore_ec2_api: SimcoreEC2API,
129124
):
130-
instance_types = await simcore_ec2_api.get_ec2_instance_capabilities(set())
125+
instance_types = await simcore_ec2_api.get_ec2_instance_capabilities("ALL")
131126
assert instance_types
132127
# NOTE: this might need adaptation when moto is updated
133128
assert 700 < len(instance_types) < 828
134129

135130

131+
async def test_get_ec2_instance_capabilities_raise_with_empty_set(
132+
simcore_ec2_api: SimcoreEC2API,
133+
):
134+
with pytest.raises(ValueError, match="instance_type_names"):
135+
await simcore_ec2_api.get_ec2_instance_capabilities(set())
136+
137+
136138
async def test_get_ec2_instance_capabilities_with_invalid_type_raises(
137139
simcore_ec2_api: SimcoreEC2API,
138140
faker: Faker,

packages/service-integration/src/service_integration/pytest_plugin/folder_structure.py

Lines changed: 0 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -46,21 +46,6 @@ def metadata_file(project_slug_dir: Path, request: pytest.FixtureRequest) -> Pat
4646
return metadata_file
4747

4848

49-
def get_expected_files(docker_name: str) -> tuple[str, ...]:
50-
return (
51-
".cookiecutterrc",
52-
".dockerignore",
53-
"metadata:metadata.yml",
54-
f"docker/{docker_name}:entrypoint.sh",
55-
f"docker/{docker_name}:Dockerfile",
56-
"service.cli:execute.sh",
57-
"docker-compose-build.yml",
58-
"docker-compose-meta.yml",
59-
"docker-compose.devel.yml",
60-
"docker-compose.yml",
61-
)
62-
63-
6449
def assert_path_in_repo(expected_path: str, project_slug_dir: Path):
6550

6651
if ":" in expected_path:

packages/service-library/src/servicelib/rabbitmq/rpc_interfaces/clusters_keeper/ec2_instances.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from typing import Literal
2+
13
from models_library.api_schemas_clusters_keeper import CLUSTERS_KEEPER_RPC_NAMESPACE
24
from models_library.api_schemas_clusters_keeper.ec2_instances import EC2InstanceTypeGet
35
from models_library.rabbitmq_basic_types import RPCMethodName
@@ -7,7 +9,7 @@
79

810

911
async def get_instance_type_details(
10-
client: RabbitMQRPCClient, *, instance_type_names: set[str]
12+
client: RabbitMQRPCClient, *, instance_type_names: set[str] | Literal["ALL"]
1113
) -> list[EC2InstanceTypeGet]:
1214
"""**Remote method**
1315

services/autoscaling/.cookiecutterrc

Lines changed: 0 additions & 20 deletions
This file was deleted.

services/autoscaling/src/simcore_service_autoscaling/core/settings.py

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,8 @@ class Config(EC2Settings.Config):
6060
class EC2InstancesSettings(BaseCustomSettings):
6161
EC2_INSTANCES_ALLOWED_TYPES: dict[str, EC2InstanceBootSpecific] = Field(
6262
...,
63-
description="Defines which EC2 instances are considered as candidates for new EC2 instance and their respective boot specific parameters",
63+
description="Defines which EC2 instances are considered as candidates for new EC2 instance and their respective boot specific parameters"
64+
"NOTE: minimum length >0",
6465
)
6566

6667
EC2_INSTANCES_KEY_NAME: str = Field(
@@ -133,7 +134,7 @@ class EC2InstancesSettings(BaseCustomSettings):
133134

134135
@validator("EC2_INSTANCES_TIME_BEFORE_DRAINING")
135136
@classmethod
136-
def ensure_draining_delay_time_is_in_range(
137+
def _ensure_draining_delay_time_is_in_range(
137138
cls, value: datetime.timedelta
138139
) -> datetime.timedelta:
139140
if value < datetime.timedelta(seconds=10):
@@ -144,7 +145,7 @@ def ensure_draining_delay_time_is_in_range(
144145

145146
@validator("EC2_INSTANCES_TIME_BEFORE_TERMINATION")
146147
@classmethod
147-
def ensure_termination_delay_time_is_in_range(
148+
def _ensure_termination_delay_time_is_in_range(
148149
cls, value: datetime.timedelta
149150
) -> datetime.timedelta:
150151
if value < datetime.timedelta(minutes=0):
@@ -155,12 +156,18 @@ def ensure_termination_delay_time_is_in_range(
155156

156157
@validator("EC2_INSTANCES_ALLOWED_TYPES")
157158
@classmethod
158-
def check_valid_instance_names(
159+
def _check_valid_instance_names_and_not_empty(
159160
cls, value: dict[str, EC2InstanceBootSpecific]
160161
) -> dict[str, EC2InstanceBootSpecific]:
161162
# NOTE: needed because of a flaw in BaseCustomSettings
162163
# issubclass raises TypeError if used on Aliases
163164
parse_obj_as(list[InstanceTypeType], list(value))
165+
166+
if not value:
167+
# NOTE: Field( ... , min_items=...) cannot be used to contraint number of iterms in a dict
168+
msg = "At least one item expecte EC2_INSTANCES_ALLOWED_TYPES, got none"
169+
raise ValueError(msg)
170+
164171
return value
165172

166173

@@ -293,12 +300,12 @@ def LOG_LEVEL(self): # noqa: N802
293300

294301
@validator("AUTOSCALING_LOGLEVEL")
295302
@classmethod
296-
def valid_log_level(cls, value: str) -> str:
303+
def _valid_log_level(cls, value: str) -> str:
297304
return cls.validate_log_level(value)
298305

299306
@root_validator()
300307
@classmethod
301-
def exclude_both_dynamic_computational_mode(cls, values):
308+
def _exclude_both_dynamic_computational_mode(cls, values):
302309
if (
303310
values.get("AUTOSCALING_DASK") is not None
304311
and values.get("AUTOSCALING_NODES_MONITORING") is not None

services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py

Lines changed: 52 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import collections
33
import dataclasses
44
import datetime
5+
import functools
56
import itertools
67
import logging
78
from typing import Final, cast
@@ -327,30 +328,30 @@ async def _try_attach_pending_ec2s(
327328
)
328329

329330

330-
async def sorted_allowed_instance_types(app: FastAPI) -> list[EC2InstanceType]:
331+
async def _sorted_allowed_instance_types(app: FastAPI) -> list[EC2InstanceType]:
331332
app_settings: ApplicationSettings = app.state.settings
332333
assert app_settings.AUTOSCALING_EC2_INSTANCES # nosec
333334
ec2_client = get_ec2_client(app)
334335

335-
# some instances might be able to run several tasks
336+
allowed_instance_type_names = list(
337+
app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_ALLOWED_TYPES
338+
)
339+
340+
assert ( # nosec
341+
allowed_instance_type_names
342+
), "EC2_INSTANCES_ALLOWED_TYPES cannot be empty!"
343+
336344
allowed_instance_types: list[
337345
EC2InstanceType
338346
] = await ec2_client.get_ec2_instance_capabilities(
339-
cast(
340-
set[InstanceTypeType],
341-
set(
342-
app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_ALLOWED_TYPES,
343-
),
344-
)
347+
cast(set[InstanceTypeType], set(allowed_instance_type_names))
345348
)
346349

347-
def _sort_according_to_allowed_types(instance_type: EC2InstanceType) -> int:
348-
assert app_settings.AUTOSCALING_EC2_INSTANCES # nosec
349-
return list(
350-
app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_ALLOWED_TYPES
351-
).index(f"{instance_type.name}")
350+
def _as_selection(instance_type: EC2InstanceType) -> int:
351+
# NOTE: will raise ValueError if allowed_instance_types not in allowed_instance_type_names
352+
return allowed_instance_type_names.index(f"{instance_type.name}")
352353

353-
allowed_instance_types.sort(key=_sort_according_to_allowed_types)
354+
allowed_instance_types.sort(key=_as_selection)
354355
return allowed_instance_types
355356

356357

@@ -497,51 +498,44 @@ async def _assign_tasks_to_current_cluster(
497498
cluster: Cluster,
498499
auto_scaling_mode: BaseAutoscaling,
499500
) -> tuple[list, Cluster]:
501+
"""
502+
Evaluates whether a task can be executed on any instance within the cluster. If the task's resource requirements are met, the task is *denoted* as assigned to the cluster.
503+
Note: This is an estimation only since actual scheduling is handled by Dask/Docker (depending on the mode).
504+
505+
Returns:
506+
A tuple containing:
507+
- A list of unassigned tasks (tasks whose resource requirements cannot be fulfilled by the available machines in the cluster).
508+
- The same cluster instance passed as input.
509+
"""
500510
unassigned_tasks = []
511+
assignment_predicates = [
512+
functools.partial(_try_assign_task_to_ec2_instance, instances=instances)
513+
for instances in (
514+
cluster.active_nodes,
515+
cluster.drained_nodes + cluster.buffer_drained_nodes,
516+
cluster.pending_nodes,
517+
cluster.pending_ec2s,
518+
cluster.buffer_ec2s,
519+
)
520+
]
521+
501522
for task in tasks:
502523
task_required_resources = auto_scaling_mode.get_task_required_resources(task)
503524
task_required_ec2_instance = await auto_scaling_mode.get_task_defined_instance(
504525
app, task
505526
)
506527

507-
assignment_functions = [
508-
lambda task, required_ec2, required_resources: _try_assign_task_to_ec2_instance(
509-
task,
510-
instances=cluster.active_nodes,
511-
task_required_ec2_instance=required_ec2,
512-
task_required_resources=required_resources,
513-
),
514-
lambda task, required_ec2, required_resources: _try_assign_task_to_ec2_instance(
515-
task,
516-
instances=cluster.drained_nodes + cluster.buffer_drained_nodes,
517-
task_required_ec2_instance=required_ec2,
518-
task_required_resources=required_resources,
519-
),
520-
lambda task, required_ec2, required_resources: _try_assign_task_to_ec2_instance(
521-
task,
522-
instances=cluster.pending_nodes,
523-
task_required_ec2_instance=required_ec2,
524-
task_required_resources=required_resources,
525-
),
526-
lambda task, required_ec2, required_resources: _try_assign_task_to_ec2_instance(
527-
task,
528-
instances=cluster.pending_ec2s,
529-
task_required_ec2_instance=required_ec2,
530-
task_required_resources=required_resources,
531-
),
532-
lambda task, required_ec2, required_resources: _try_assign_task_to_ec2_instance(
533-
task,
534-
instances=cluster.buffer_ec2s,
535-
task_required_ec2_instance=required_ec2,
536-
task_required_resources=required_resources,
537-
),
538-
]
539-
540528
if any(
541-
assignment(task, task_required_ec2_instance, task_required_resources)
542-
for assignment in assignment_functions
529+
is_assigned(
530+
task,
531+
task_required_ec2_instance=task_required_ec2_instance,
532+
task_required_resources=task_required_resources,
533+
)
534+
for is_assigned in assignment_predicates
543535
):
544-
_logger.debug("assigned task to cluster")
536+
_logger.debug(
537+
"task %s is assigned to one instance available in cluster", task
538+
)
545539
else:
546540
unassigned_tasks.append(task)
547541

@@ -1131,7 +1125,7 @@ async def _autoscale_cluster(
11311125
# 1. check if we have pending tasks and resolve them by activating some drained nodes
11321126
unrunnable_tasks = await auto_scaling_mode.list_unrunnable_tasks(app)
11331127
_logger.info("found %s unrunnable tasks", len(unrunnable_tasks))
1134-
1128+
# NOTE: this function predicts how dask will assign a task to a machine
11351129
queued_or_missing_instance_tasks, cluster = await _assign_tasks_to_current_cluster(
11361130
app, unrunnable_tasks, cluster, auto_scaling_mode
11371131
)
@@ -1217,11 +1211,13 @@ async def auto_scale_cluster(
12171211
If there are such tasks, this method will allocate new machines in AWS to cope with
12181212
the additional load.
12191213
"""
1220-
1221-
allowed_instance_types = await sorted_allowed_instance_types(app)
1214+
# current state
1215+
allowed_instance_types = await _sorted_allowed_instance_types(app)
12221216
cluster = await _analyze_current_cluster(
12231217
app, auto_scaling_mode, allowed_instance_types
12241218
)
1219+
1220+
# cleanup
12251221
cluster = await _cleanup_disconnected_nodes(app, cluster)
12261222
cluster = await _terminate_broken_ec2s(app, cluster)
12271223
cluster = await _make_pending_buffer_ec2s_join_cluster(app, cluster)
@@ -1230,8 +1226,11 @@ async def auto_scale_cluster(
12301226
)
12311227
cluster = await _drain_retired_nodes(app, cluster)
12321228

1229+
# desired state
12331230
cluster = await _autoscale_cluster(
12341231
app, cluster, auto_scaling_mode, allowed_instance_types
12351232
)
1233+
1234+
# notify
12361235
await _notify_machine_creation_progress(app, cluster, auto_scaling_mode)
12371236
await _notify_autoscaling_status(app, cluster, auto_scaling_mode)

0 commit comments

Comments
 (0)