From c8ab88997562001ac05607dd0659944d3d89c068 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 25 Sep 2025 11:37:19 +0200 Subject: [PATCH 01/93] disable hack --- .../modules/cluster_scaling/_provider_computational.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_computational.py b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_computational.py index c9b2d498fd66..92be7fe1f4be 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_computational.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_computational.py @@ -88,13 +88,7 @@ async def list_unrunnable_tasks(self, app: FastAPI) -> list[DaskTask]: def get_task_required_resources(self, task) -> Resources: assert self # nosec - task_required_resources = utils.resources_from_dask_task(task) - # ensure cpu is set at least to 1 as dask-workers use 1 thread per CPU - if task_required_resources.cpus < 1.0: - task_required_resources = task_required_resources.model_copy( - update={"cpus": 1.0} - ) - return task_required_resources + return utils.resources_from_dask_task(task) async def get_task_defined_instance( self, app: FastAPI, task From d0fc1c638e4ac73c9cd93cf6b60ee4a57cc5f51f Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 25 Sep 2025 11:37:59 +0200 Subject: [PATCH 02/93] refactor --- .../modules/cluster_scaling/_auto_scaling_core.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_auto_scaling_core.py b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_auto_scaling_core.py index ff4b0ad4f5b6..5b74cb412fad 100644 --- 
a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_auto_scaling_core.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_auto_scaling_core.py @@ -728,15 +728,15 @@ async def _find_needed_instances( task_required_resources = auto_scaling_mode.get_task_required_resources( task ) - task_required_ec2_instance = ( - await auto_scaling_mode.get_task_defined_instance(app, task) + task_required_ec2 = await auto_scaling_mode.get_task_defined_instance( + app, task ) # first check if we can assign the task to one of the newly tobe created instances if _try_assign_task_to_ec2_instance_type( task, instances=needed_new_instance_types_for_tasks, - task_required_ec2_instance=task_required_ec2_instance, + task_required_ec2_instance=task_required_ec2, task_required_resources=task_required_resources, ): continue @@ -744,12 +744,12 @@ async def _find_needed_instances( # so we need to find what we can create now try: # check if exact instance type is needed first - if task_required_ec2_instance: + if task_required_ec2: defined_ec2 = find_selected_instance_type_for_task( - task_required_ec2_instance, + task_required_ec2, available_ec2_types, task, - auto_scaling_mode.get_task_required_resources(task), + task_required_resources, ) needed_new_instance_types_for_tasks.append( AssignedTasksToInstanceType( @@ -763,7 +763,7 @@ async def _find_needed_instances( # we go for best fitting type best_ec2_instance = utils_ec2.find_best_fitting_ec2_instance( available_ec2_types, - auto_scaling_mode.get_task_required_resources(task), + task_required_resources, score_type=utils_ec2.closest_instance_policy, ) needed_new_instance_types_for_tasks.append( From 29465a59f25421981a416830bc1076426af8f0cb Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 25 Sep 2025 13:11:40 +0200 Subject: [PATCH 03/93] added generic resource to ec2 resource model --- .../src/aws_library/ec2/_models.py | 102 
+++++++++++++++--- 1 file changed, 88 insertions(+), 14 deletions(-) diff --git a/packages/aws-library/src/aws_library/ec2/_models.py b/packages/aws-library/src/aws_library/ec2/_models.py index e08e207b0b0e..d5c1ca62442d 100644 --- a/packages/aws-library/src/aws_library/ec2/_models.py +++ b/packages/aws-library/src/aws_library/ec2/_models.py @@ -14,46 +14,120 @@ Field, NonNegativeFloat, NonNegativeInt, + StrictFloat, + StrictInt, StringConstraints, field_validator, ) from pydantic.config import JsonDict from types_aiobotocore_ec2.literals import InstanceStateNameType, InstanceTypeType +GenericResourceValue: TypeAlias = StrictInt | StrictFloat | str + class Resources(BaseModel, frozen=True): cpus: NonNegativeFloat ram: ByteSize + generic_resources: Annotated[ + dict[str, GenericResourceValue], + Field( + default_factory=dict, + description=( + "Arbitrary additional resources (e.g. {'threads': 8}). " + "Numeric values are treated as quantities and participate in add/sub/compare." + ), + ), + ] = DEFAULT_FACTORY @classmethod def create_as_empty(cls) -> "Resources": return cls(cpus=0, ram=ByteSize(0)) def __ge__(self, other: "Resources") -> bool: - return self.cpus >= other.cpus and self.ram >= other.ram + if not (self.cpus >= other.cpus and self.ram >= other.ram): + return False + # ensure all numeric generic resources in `other` are satisfied by `self` + for k, v in other.generic_resources.items(): + if isinstance(v, int | float): + lhs_val = self.generic_resources.get(k, 0) + if not isinstance(lhs_val, int | float) or lhs_val < v: + return False + continue + # non-numeric must be equal and present + if k not in self.generic_resources or self.generic_resources[k] != v: + return False + return True def __gt__(self, other: "Resources") -> bool: - return self.cpus > other.cpus or self.ram > other.ram + if self.cpus > other.cpus or self.ram > other.ram: + return True + for k, v in other.generic_resources.items(): + lhs_val = self.generic_resources.get(k) + if ( + 
isinstance(v, int | float) + and isinstance(lhs_val, int | float) + and lhs_val > v + ): + return True + if not isinstance(v, int | float) and lhs_val is not None and lhs_val != v: + return True + return False def __add__(self, other: "Resources") -> "Resources": + """operator for adding two Resources + Note that only numeric generic resources are added + Non-numeric generic resources are ignored + """ + merged: dict[str, GenericResourceValue] = {} + keys = set(self.generic_resources) | set(other.generic_resources) + for k in keys: + a = self.generic_resources.get(k) + b = other.generic_resources.get(k) + # adding non numeric values does not make sense, so we skip those for the resulting resource + if isinstance(a, int | float) and isinstance(b, int | float): + merged[k] = a + b + elif a is None and isinstance(b, int | float): + merged[k] = b + elif b is None and isinstance(a, int | float): + merged[k] = a + return Resources.model_construct( - **{ - key: a + b - for (key, a), b in zip( - self.model_dump().items(), other.model_dump().values(), strict=True - ) - } + cpus=self.cpus + other.cpus, + ram=self.ram + other.ram, + generic_resources=merged, ) def __sub__(self, other: "Resources") -> "Resources": + """operator for subtracting two Resources + Note that only numeric generic resources are subtracted + Non-numeric generic resources are ignored + """ + merged: dict[str, GenericResourceValue] = {} + keys = set(self.generic_resources) | set(other.generic_resources) + for k in keys: + a = self.generic_resources.get(k) + b = other.generic_resources.get(k) + # subtracting non numeric values does not make sense, so we skip those for the resulting resource + if isinstance(a, int | float) and isinstance(b, int | float): + merged[k] = a - b + elif a is None and isinstance(b, int | float): + merged[k] = -b + elif b is None and isinstance(a, int | float): + merged[k] = a + return Resources.model_construct( - **{ - key: a - b - for (key, a), b in zip( - 
self.model_dump().items(), other.model_dump().values(), strict=True - ) - } + cpus=self.cpus - other.cpus, + ram=self.ram - other.ram, + generic_resources=merged, + ) + + def __hash__(self) -> int: + """Deterministic hash including cpus, ram (in bytes) and generic_resources.""" + # sort generic_resources items to ensure order-independent hashing + generic_items: tuple[tuple[str, GenericResourceValue], ...] = tuple( + sorted(self.generic_resources.items()) ) + return hash((self.cpus, self.ram, generic_items)) @field_validator("cpus", mode="before") @classmethod From 3915a82af6d820f892d3e11bc1085e6b88dc6a7d Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 25 Sep 2025 13:56:21 +0200 Subject: [PATCH 04/93] do not define gt it is weird for resources --- .../src/aws_library/ec2/_models.py | 42 +++++++-------- packages/aws-library/tests/test_ec2_models.py | 53 +++++++++++-------- .../src/simcore_service_autoscaling/models.py | 6 +-- .../_provider_computational.py | 3 ++ 4 files changed, 57 insertions(+), 47 deletions(-) diff --git a/packages/aws-library/src/aws_library/ec2/_models.py b/packages/aws-library/src/aws_library/ec2/_models.py index d5c1ca62442d..5cdd5ba305b4 100644 --- a/packages/aws-library/src/aws_library/ec2/_models.py +++ b/packages/aws-library/src/aws_library/ec2/_models.py @@ -44,35 +44,31 @@ def create_as_empty(cls) -> "Resources": return cls(cpus=0, ram=ByteSize(0)) def __ge__(self, other: "Resources") -> bool: + """operator for >= comparison + if self has greater or equal resources than other, returns True + Note that generic_resources are compared only if they are numeric + Non-numeric generic resources must be equal in both or only defined in self + to be considered greater or equal + """ + if not (self.cpus >= other.cpus and self.ram >= other.ram): return False - # ensure all numeric generic resources in `other` are satisfied by `self` - for k, v in other.generic_resources.items(): - if isinstance(v, 
int | float): - lhs_val = self.generic_resources.get(k, 0) - if not isinstance(lhs_val, int | float) or lhs_val < v: + + keys = set(self.generic_resources) | set(other.generic_resources) + for k in keys: + a = self.generic_resources.get(k) + b = other.generic_resources.get( + k, a + ) # NOTE: get from other, default to a so that non-existing keys are considered equal + if isinstance(a, int | float) and isinstance(b, int | float): + if not (a >= b): return False - continue - # non-numeric must be equal and present - if k not in self.generic_resources or self.generic_resources[k] != v: + elif a != b: + assert isinstance(a, str | None) # nosec + assert isinstance(b, int | float | str | None) # nosec return False return True - def __gt__(self, other: "Resources") -> bool: - if self.cpus > other.cpus or self.ram > other.ram: - return True - for k, v in other.generic_resources.items(): - lhs_val = self.generic_resources.get(k) - if ( - isinstance(v, int | float) - and isinstance(lhs_val, int | float) - and lhs_val > v - ): - return True - if not isinstance(v, int | float) and lhs_val is not None and lhs_val != v: - return True - return False - def __add__(self, other: "Resources") -> "Resources": """operator for adding two Resources Note that only numeric generic resources are added diff --git a/packages/aws-library/tests/test_ec2_models.py b/packages/aws-library/tests/test_ec2_models.py index ed232ad0043d..a2953e1d6b7e 100644 --- a/packages/aws-library/tests/test_ec2_models.py +++ b/packages/aws-library/tests/test_ec2_models.py @@ -37,46 +37,57 @@ Resources(cpus=0.1, ram=ByteSize(1)), False, ), - ], -) -def test_resources_ge_operator( - a: Resources, b: Resources, a_greater_or_equal_than_b: bool -): - assert (a >= b) is a_greater_or_equal_than_b - - -@pytest.mark.parametrize( - "a,b,a_greater_than_b", - [ ( - Resources(cpus=0.2, ram=ByteSize(0)), - Resources(cpus=0.1, ram=ByteSize(0)), + Resources(cpus=0.1, ram=ByteSize(0), generic_resources={"GPU": 1}), + 
Resources(cpus=0.1, ram=ByteSize(1)), + False, # ram is not enough + ), + ( + Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"GPU": 1}), + Resources(cpus=0.1, ram=ByteSize(1)), True, ), ( - Resources(cpus=0.1, ram=ByteSize(0)), - Resources(cpus=0.1, ram=ByteSize(0)), + Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"GPU": 1}), + Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"GPU": 1}), + True, + ), + ( + Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"GPU": 1}), + Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"GPU": 2}), False, ), ( Resources(cpus=0.1, ram=ByteSize(1)), - Resources(cpus=0.1, ram=ByteSize(0)), + Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"GPU": 2}), + False, + ), + ( + Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"GPU": "2"}), + Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"GPU": 2}), + False, + ), + ( + Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"SSE": "yes"}), + Resources(cpus=0.1, ram=ByteSize(1)), True, ), ( - Resources(cpus=0.05, ram=ByteSize(1)), - Resources(cpus=0.1, ram=ByteSize(0)), + Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"SSE": "yes"}), + Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"SSE": "yes"}), True, ), ( - Resources(cpus=0.1, ram=ByteSize(0)), Resources(cpus=0.1, ram=ByteSize(1)), + Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"SSE": "yes"}), False, ), ], ) -def test_resources_gt_operator(a: Resources, b: Resources, a_greater_than_b: bool): - assert (a > b) is a_greater_than_b +def test_resources_ge_operator( + a: Resources, b: Resources, a_greater_or_equal_than_b: bool +): + assert (a >= b) is a_greater_or_equal_than_b @pytest.mark.parametrize( diff --git a/services/autoscaling/src/simcore_service_autoscaling/models.py b/services/autoscaling/src/simcore_service_autoscaling/models.py index 7645b300e8de..ca697a13fa06 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/models.py +++ 
b/services/autoscaling/src/simcore_service_autoscaling/models.py @@ -23,6 +23,9 @@ def assign_task(self, task, task_resources: Resources) -> None: def has_resources_for_task(self, task_resources: Resources) -> bool: return bool(self.available_resources >= task_resources) + def has_assigned_tasks(self) -> bool: + return len(self.assigned_tasks) > 0 + @dataclass(frozen=True, kw_only=True, slots=True) class AssignedTasksToInstanceType(_TaskAssignmentMixin): @@ -37,9 +40,6 @@ def __post_init__(self) -> None: if self.available_resources == Resources.create_as_empty(): object.__setattr__(self, "available_resources", self.ec2_instance.resources) - def has_assigned_tasks(self) -> bool: - return bool(self.available_resources < self.ec2_instance.resources) - @dataclass(frozen=True, kw_only=True, slots=True) class AssociatedInstance(_BaseInstance): diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_computational.py b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_computational.py index 92be7fe1f4be..a47323e42587 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_computational.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_computational.py @@ -88,6 +88,9 @@ async def list_unrunnable_tasks(self, app: FastAPI) -> list[DaskTask]: def get_task_required_resources(self, task) -> Resources: assert self # nosec + # NOTE: a dask worker can take a task if it has a free thread, regardless of its resources + # so we need to be careful when interpreting the resources, adding the thread here will mimick this + return utils.resources_from_dask_task(task) async def get_task_defined_instance( From 2209e2c914ee4e657e4289e9f55865a05f6a2eb2 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 25 Sep 2025 14:06:10 +0200 Subject: [PATCH 05/93] improve coverage --- 
packages/aws-library/tests/test_ec2_models.py | 69 ++++++++++++++++++- 1 file changed, 68 insertions(+), 1 deletion(-) diff --git a/packages/aws-library/tests/test_ec2_models.py b/packages/aws-library/tests/test_ec2_models.py index a2953e1d6b7e..adaf81158bb8 100644 --- a/packages/aws-library/tests/test_ec2_models.py +++ b/packages/aws-library/tests/test_ec2_models.py @@ -103,6 +103,36 @@ def test_resources_ge_operator( Resources(cpus=1, ram=ByteSize(34)), Resources(cpus=1.1, ram=ByteSize(35)), ), + ( + Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"GPU": 1}), + Resources(cpus=1, ram=ByteSize(34)), + Resources(cpus=1.1, ram=ByteSize(35), generic_resources={"GPU": 1}), + ), + ( + Resources(cpus=0.1, ram=ByteSize(1)), + Resources(cpus=1, ram=ByteSize(34), generic_resources={"GPU": 1}), + Resources(cpus=1.1, ram=ByteSize(35), generic_resources={"GPU": 1}), + ), + ( + Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"GPU": 1}), + Resources(cpus=1, ram=ByteSize(34), generic_resources={"GPU": 1}), + Resources(cpus=1.1, ram=ByteSize(35), generic_resources={"GPU": 2}), + ), + ( + Resources( + cpus=0.1, ram=ByteSize(1), generic_resources={"GPU": 1, "SSE": "yes"} + ), + Resources(cpus=1, ram=ByteSize(34), generic_resources={"GPU": 1}), + Resources(cpus=1.1, ram=ByteSize(35), generic_resources={"GPU": 2}), + ), # string resources are not summed + ( + Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"GPU": "1"}), + Resources(cpus=1, ram=ByteSize(34), generic_resources={"GPU": 1}), + Resources( + cpus=1.1, + ram=ByteSize(35), + ), + ), # string resources are ignored in summation ], ) def test_resources_add(a: Resources, b: Resources, result: Resources): @@ -112,7 +142,9 @@ def test_resources_add(a: Resources, b: Resources, result: Resources): def test_resources_create_as_empty(): - assert Resources.create_as_empty() == Resources(cpus=0, ram=ByteSize(0)) + assert Resources.create_as_empty() == Resources( + cpus=0, ram=ByteSize(0), generic_resources={} + ) 
@pytest.mark.parametrize( @@ -128,6 +160,41 @@ def test_resources_create_as_empty(): Resources(cpus=1, ram=ByteSize(1)), Resources.model_construct(cpus=-0.9, ram=ByteSize(33)), ), + ( + Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"GPU": 1}), + Resources(cpus=1, ram=ByteSize(34)), + Resources.model_construct( + cpus=-0.9, ram=ByteSize(-33), generic_resources={"GPU": 1} + ), + ), + ( + Resources(cpus=0.1, ram=ByteSize(1)), + Resources(cpus=1, ram=ByteSize(34), generic_resources={"GPU": 1}), + Resources.model_construct( + cpus=-0.9, ram=ByteSize(-33), generic_resources={"GPU": -1} + ), + ), + ( + Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"GPU": 1}), + Resources(cpus=1, ram=ByteSize(34), generic_resources={"GPU": 1}), + Resources.model_construct( + cpus=-0.9, ram=ByteSize(-33), generic_resources={"GPU": 0} + ), + ), + ( + Resources( + cpus=0.1, ram=ByteSize(1), generic_resources={"GPU": 1, "SSE": "yes"} + ), + Resources(cpus=1, ram=ByteSize(34), generic_resources={"GPU": 1}), + Resources.model_construct( + cpus=-0.9, ram=ByteSize(-33), generic_resources={"GPU": 0} + ), + ), # string resources are not summed + ( + Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"GPU": "1"}), + Resources(cpus=1, ram=ByteSize(34), generic_resources={"GPU": 1}), + Resources.model_construct(cpus=-0.9, ram=ByteSize(-33)), + ), # string resources are ignored in summation ], ) def test_resources_sub(a: Resources, b: Resources, result: Resources): From 4db4871da2b41afa9bc06169e2be8a33748062f6 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 25 Sep 2025 14:07:28 +0200 Subject: [PATCH 06/93] ruff --- packages/aws-library/tests/test_ec2_models.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/packages/aws-library/tests/test_ec2_models.py b/packages/aws-library/tests/test_ec2_models.py index adaf81158bb8..53c9ed031ac7 100644 --- a/packages/aws-library/tests/test_ec2_models.py +++ 
b/packages/aws-library/tests/test_ec2_models.py @@ -226,7 +226,11 @@ def test_ec2_instance_data_hashable(faker: Faker): cpus=faker.pyfloat(min_value=0.1), ram=ByteSize(faker.pyint(min_value=123)), ), - {AWSTagKey("mytagkey"): AWSTagValue("mytagvalue")}, + { + TypeAdapter(AWSTagKey) + .validate_python("mytagkey"): TypeAdapter(AWSTagValue) + .validate_python("mytagvalue") + }, ) } second_set_of_ec2s = { @@ -241,7 +245,11 @@ def test_ec2_instance_data_hashable(faker: Faker): cpus=faker.pyfloat(min_value=0.1), ram=ByteSize(faker.pyint(min_value=123)), ), - {AWSTagKey("mytagkey"): AWSTagValue("mytagvalue")}, + { + TypeAdapter(AWSTagKey) + .validate_python("mytagkey"): TypeAdapter(AWSTagValue) + .validate_python("mytagvalue") + }, ) } From e5012b671c299c12a15188b78626955edbd0bf23 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 25 Sep 2025 14:12:34 +0200 Subject: [PATCH 07/93] add missing test --- packages/aws-library/tests/test_ec2_models.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/packages/aws-library/tests/test_ec2_models.py b/packages/aws-library/tests/test_ec2_models.py index 53c9ed031ac7..1dc3521ab6e9 100644 --- a/packages/aws-library/tests/test_ec2_models.py +++ b/packages/aws-library/tests/test_ec2_models.py @@ -4,7 +4,13 @@ import pytest -from aws_library.ec2._models import AWSTagKey, AWSTagValue, EC2InstanceData, Resources +from aws_library.ec2._models import ( + AWSTagKey, + AWSTagValue, + EC2InstanceBootSpecific, + EC2InstanceData, + Resources, +) from faker import Faker from pydantic import ByteSize, TypeAdapter, ValidationError @@ -256,3 +262,11 @@ def test_ec2_instance_data_hashable(faker: Faker): union_of_sets = first_set_of_ec2s.union(second_set_of_ec2s) assert next(iter(first_set_of_ec2s)) in union_of_sets assert next(iter(second_set_of_ec2s)) in union_of_sets + + +def test_ec2_instance_boot_specific_with_invalid_custome_script(faker: Faker): + valid_model = 
EC2InstanceBootSpecific.model_json_schema()["examples"][0] + invalid_model = {**valid_model, "custom_boot_scripts": ["echo 'missing end quote"]} + + with pytest.raises(ValueError, match="Invalid bash call"): + EC2InstanceBootSpecific(**invalid_model) From cf3915862506c21c0f49189f531f9139edc48ee0 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 25 Sep 2025 14:14:04 +0200 Subject: [PATCH 08/93] ruff --- packages/aws-library/src/aws_library/ec2/_models.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/packages/aws-library/src/aws_library/ec2/_models.py b/packages/aws-library/src/aws_library/ec2/_models.py index 5cdd5ba305b4..9aba3ccf838b 100644 --- a/packages/aws-library/src/aws_library/ec2/_models.py +++ b/packages/aws-library/src/aws_library/ec2/_models.py @@ -244,8 +244,9 @@ def validate_bash_calls(cls, v): temp_file.flush() # NOTE: this will not capture runtime errors, but at least some syntax errors such as invalid quotes sh.bash( - "-n", temp_file.name - ) # pyright: ignore[reportCallIssue] # sh is untyped, but this call is safe for bash syntax checking + "-n", + temp_file.name, # pyright: ignore[reportCallIssue] + ) # sh is untyped, but this call is safe for bash syntax checking except sh.ErrorReturnCode as exc: msg = f"Invalid bash call in custom_boot_scripts: {v}, Error: {exc.stderr}" raise ValueError(msg) from exc From cfc3ec70b2aa4cdf5a22ee773b9ae77da0766084 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 25 Sep 2025 14:16:26 +0200 Subject: [PATCH 09/93] use ge operator --- .../src/simcore_service_autoscaling/utils/cluster_scaling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/utils/cluster_scaling.py b/services/autoscaling/src/simcore_service_autoscaling/utils/cluster_scaling.py index 13c25dcd2112..5a64de471cd8 100644 --- 
a/services/autoscaling/src/simcore_service_autoscaling/utils/cluster_scaling.py +++ b/services/autoscaling/src/simcore_service_autoscaling/utils/cluster_scaling.py @@ -109,7 +109,7 @@ def find_selected_instance_type_for_task( selected_instance = filtered_instances[0] # check that the assigned resources and the machine resource fit - if task_required_resources > selected_instance.resources: + if task_required_resources <= selected_instance.resources: raise TaskRequirementsAboveRequiredEC2InstanceTypeError( task=task, instance_type=selected_instance, From 9e3e916e4b992743d9d54bf3edf08ef0f4df9b2d Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 25 Sep 2025 16:26:11 +0200 Subject: [PATCH 10/93] added model dump flat --- packages/aws-library/src/aws_library/ec2/_models.py | 6 ++++++ packages/aws-library/tests/test_ec2_models.py | 8 ++++++++ 2 files changed, 14 insertions(+) diff --git a/packages/aws-library/src/aws_library/ec2/_models.py b/packages/aws-library/src/aws_library/ec2/_models.py index 9aba3ccf838b..e2ad26fbba2e 100644 --- a/packages/aws-library/src/aws_library/ec2/_models.py +++ b/packages/aws-library/src/aws_library/ec2/_models.py @@ -125,6 +125,12 @@ def __hash__(self) -> int: ) return hash((self.cpus, self.ram, generic_items)) + def model_dump_flat(self) -> dict[str, float | int]: + """Like model_dump, but flattens ram to bytes and generic_resources to top level keys""" + base = self.model_dump() + base.update(base.pop("generic_resources")) + return base + @field_validator("cpus", mode="before") @classmethod def _floor_cpus_to_0(cls, v: float) -> float: diff --git a/packages/aws-library/tests/test_ec2_models.py b/packages/aws-library/tests/test_ec2_models.py index 1dc3521ab6e9..1dca9b5a3a0b 100644 --- a/packages/aws-library/tests/test_ec2_models.py +++ b/packages/aws-library/tests/test_ec2_models.py @@ -209,6 +209,14 @@ def test_resources_sub(a: Resources, b: Resources, result: Resources): assert a == 
result +def test_resources_model_dump_flat(): + r = Resources( + cpus=0.1, ram=ByteSize(1024), generic_resources={"GPU": 2, "SSE": "yes"} + ) + flat = r.model_dump_flat() + assert flat == {"cpus": 0.1, "ram": 1024, "GPU": 2, "SSE": "yes"} + + @pytest.mark.parametrize("ec2_tag_key", ["", "/", " ", ".", "..", "_index"]) def test_aws_tag_key_invalid(ec2_tag_key: str): # for a key it raises From 4dd4191f14a11e5fc0bd2fc1702330373093932e Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 25 Sep 2025 16:32:16 +0200 Subject: [PATCH 11/93] both direction --- .../aws-library/src/aws_library/ec2/_models.py | 14 ++++++++++++-- packages/aws-library/tests/test_ec2_models.py | 7 +++++-- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/packages/aws-library/src/aws_library/ec2/_models.py b/packages/aws-library/src/aws_library/ec2/_models.py index e2ad26fbba2e..3a4aaf42ec19 100644 --- a/packages/aws-library/src/aws_library/ec2/_models.py +++ b/packages/aws-library/src/aws_library/ec2/_models.py @@ -125,12 +125,22 @@ def __hash__(self) -> int: ) return hash((self.cpus, self.ram, generic_items)) - def model_dump_flat(self) -> dict[str, float | int]: - """Like model_dump, but flattens ram to bytes and generic_resources to top level keys""" + def as_flat_dict(self) -> dict[str, int | float | str]: + """Like model_dump, but flattens generic_resources to top level keys""" base = self.model_dump() base.update(base.pop("generic_resources")) return base + @classmethod + def from_flat_dict(cls, data: dict[str, int | float | str]) -> "Resources": + """Inverse of as_flat_dict""" + generic_resources = {k: v for k, v in data.items() if k not in {"cpus", "ram"}} + return cls( + cpus=data.get("cpus", 0), + ram=ByteSize(data.get("ram", 0)), + generic_resources=generic_resources, + ) + @field_validator("cpus", mode="before") @classmethod def _floor_cpus_to_0(cls, v: float) -> float: diff --git a/packages/aws-library/tests/test_ec2_models.py 
b/packages/aws-library/tests/test_ec2_models.py index 1dca9b5a3a0b..d1ef498f0c28 100644 --- a/packages/aws-library/tests/test_ec2_models.py +++ b/packages/aws-library/tests/test_ec2_models.py @@ -209,13 +209,16 @@ def test_resources_sub(a: Resources, b: Resources, result: Resources): assert a == result -def test_resources_model_dump_flat(): +def test_resources_flat_dict(): r = Resources( cpus=0.1, ram=ByteSize(1024), generic_resources={"GPU": 2, "SSE": "yes"} ) - flat = r.model_dump_flat() + flat = r.as_flat_dict() assert flat == {"cpus": 0.1, "ram": 1024, "GPU": 2, "SSE": "yes"} + reconstructed = Resources.from_flat_dict(flat) + assert reconstructed == r + @pytest.mark.parametrize("ec2_tag_key", ["", "/", " ", ".", "..", "_index"]) def test_aws_tag_key_invalid(ec2_tag_key: str): From 073050fc22024bad4f2b180ac8473498926a7a07 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 25 Sep 2025 16:45:22 +0200 Subject: [PATCH 12/93] added ENV variables for nthreads and threads multiplier --- .../simcore_service_autoscaling/core/settings.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/services/autoscaling/src/simcore_service_autoscaling/core/settings.py b/services/autoscaling/src/simcore_service_autoscaling/core/settings.py index 0ae53b943954..38f994bcea6e 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/core/settings.py +++ b/services/autoscaling/src/simcore_service_autoscaling/core/settings.py @@ -14,6 +14,7 @@ AnyUrl, Field, NonNegativeInt, + PositiveInt, TypeAdapter, field_validator, model_validator, @@ -241,6 +242,18 @@ class DaskMonitoringSettings(BaseCustomSettings): description="defines the authentication of the clusters created via clusters-keeper (can be None or TLS)", ), ] + DASK_NTHREADS: Annotated[ + NonNegativeInt, + Field( + description="if >0, it overrides the default number of threads per process in the dask-sidecars, (see description in dask-sidecar)", + ), + ] + 
DASK_NTHREADS_MULTIPLIER: Annotated[ + PositiveInt, + Field( + description="if >1, it overrides the default number of threads per process in the dask-sidecars, by multiplying the number of vCPUs with this factor (see description in dask-sidecar)", + ), + ] class ApplicationSettings(BaseApplicationSettings, MixinLoggingSettings): From 74bcf7d19e9a0f3432e72ea4cd466e8f22fabbc4 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 25 Sep 2025 16:45:42 +0200 Subject: [PATCH 13/93] use ge operator instead of incorrect gt operator --- .../src/simcore_service_autoscaling/utils/cluster_scaling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/utils/cluster_scaling.py b/services/autoscaling/src/simcore_service_autoscaling/utils/cluster_scaling.py index 5a64de471cd8..1cff28a0bb46 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/utils/cluster_scaling.py +++ b/services/autoscaling/src/simcore_service_autoscaling/utils/cluster_scaling.py @@ -109,7 +109,7 @@ def find_selected_instance_type_for_task( selected_instance = filtered_instances[0] # check that the assigned resources and the machine resource fit - if task_required_resources <= selected_instance.resources: + if not (task_required_resources <= selected_instance.resources): raise TaskRequirementsAboveRequiredEC2InstanceTypeError( task=task, instance_type=selected_instance, From 9a7eb37638d93064c5b7bbc1785dbb4d888e1f31 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 25 Sep 2025 16:46:52 +0200 Subject: [PATCH 14/93] define variables for tests --- services/autoscaling/tests/unit/conftest.py | 4 ++-- .../unit/test_modules_cluster_scaling_computational.py | 10 ++++------ 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/services/autoscaling/tests/unit/conftest.py b/services/autoscaling/tests/unit/conftest.py index 
57c9b381fc2d..192cc4932dde 100644 --- a/services/autoscaling/tests/unit/conftest.py +++ b/services/autoscaling/tests/unit/conftest.py @@ -380,8 +380,8 @@ def enabled_computational_mode( "AUTOSCALING_DASK": "{}", "DASK_MONITORING_URL": faker.url(), "DASK_SCHEDULER_AUTH": "{}", - "DASK_MONITORING_USER_NAME": faker.user_name(), - "DASK_MONITORING_PASSWORD": faker.password(), + "DASK_NTHREADS": f"{faker.pyint(min_value=0, max_value=10)}", + "DASK_NTHREADS_MULTIPLIER": f"{faker.pyint(min_value=1, max_value=4)}", }, ) diff --git a/services/autoscaling/tests/unit/test_modules_cluster_scaling_computational.py b/services/autoscaling/tests/unit/test_modules_cluster_scaling_computational.py index f83eaac9ea8b..fbfd965cd34a 100644 --- a/services/autoscaling/tests/unit/test_modules_cluster_scaling_computational.py +++ b/services/autoscaling/tests/unit/test_modules_cluster_scaling_computational.py @@ -14,7 +14,7 @@ from collections.abc import Awaitable, Callable, Iterator from copy import deepcopy from dataclasses import dataclass -from typing import Any, Final, cast +from typing import Any, cast from unittest import mock import arrow @@ -285,13 +285,11 @@ class _ScaleUpParams: expected_num_instances: int -_RESOURCE_TO_DASK_RESOURCE_MAP: Final[dict[str, str]] = {"CPUS": "CPU", "RAM": "RAM"} - - def _dask_task_resources_from_resources(resources: Resources) -> DaskTaskResources: return { - _RESOURCE_TO_DASK_RESOURCE_MAP[res_key.upper()]: res_value - for res_key, res_value in resources.model_dump().items() + "CPU": resources.cpus, + "RAM": resources.ram, + **dict(resources.generic_resources.items()), } From 2a67d9ec547101e663194d262ea62d47e70b50be Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 25 Sep 2025 16:48:49 +0200 Subject: [PATCH 15/93] pass nthreads and multiplier also to the autoscaling service --- .../src/simcore_service_clusters_keeper/data/docker-compose.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/services/clusters-keeper/src/simcore_service_clusters_keeper/data/docker-compose.yml b/services/clusters-keeper/src/simcore_service_clusters_keeper/data/docker-compose.yml index dc44dd9ece75..d3ba68cb76a8 100644 --- a/services/clusters-keeper/src/simcore_service_clusters_keeper/data/docker-compose.yml +++ b/services/clusters-keeper/src/simcore_service_clusters_keeper/data/docker-compose.yml @@ -104,6 +104,8 @@ services: AUTOSCALING_RABBITMQ: ${AUTOSCALING_RABBITMQ} DASK_MONITORING_URL: tls://dask-scheduler:8786 DASK_SCHEDULER_AUTH: '{"type":"tls","tls_ca_file":"${DASK_TLS_CA_FILE}","tls_client_cert":"${DASK_TLS_CERT}","tls_client_key":"${DASK_TLS_KEY}"}' + DASK_NTHREADS: ${DASK_NTHREADS} + DASK_NTHREADS_MULTIPLIER: ${DASK_NTHREADS_MULTIPLIER} EC2_INSTANCES_ALLOWED_TYPES: ${WORKERS_EC2_INSTANCES_ALLOWED_TYPES} EC2_INSTANCES_COLD_START_DOCKER_IMAGES_PRE_PULLING: ${WORKERS_EC2_INSTANCES_COLD_START_DOCKER_IMAGES_PRE_PULLING} EC2_INSTANCES_CUSTOM_TAGS: ${WORKERS_EC2_INSTANCES_CUSTOM_TAGS} From 2a3019f84d1f728dabdd34ec58e0055daf36efa8 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 25 Sep 2025 16:58:22 +0200 Subject: [PATCH 16/93] ongoing --- .../cluster_scaling/_provider_computational.py | 18 ++++++++++++------ .../modules/dask.py | 18 ++++++++++++++++++ 2 files changed, 30 insertions(+), 6 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_computational.py b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_computational.py index a47323e42587..a605c7ea0484 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_computational.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_computational.py @@ -1,6 +1,6 @@ import collections import logging -from typing import cast +from typing import Any, cast from aws_library.ec2 import 
EC2InstanceData, EC2Tags, Resources from fastapi import FastAPI @@ -90,8 +90,10 @@ def get_task_required_resources(self, task) -> Resources: assert self # nosec # NOTE: a dask worker can take a task if it has a free thread, regardless of its resources # so we need to be careful when interpreting the resources, adding the thread here will mimic this - - return utils.resources_from_dask_task(task) + task_required_resources = utils.resources_from_dask_task(task) + # TODO: should we add a generic resource for threads? + # task_required_resources.generic_resources[_DASK_WORKER_THREAD_RESOURCE_NAME] = 1 + return task_required_resources async def get_task_defined_instance( self, app: FastAPI, task @@ -138,10 +140,14 @@ async def compute_cluster_used_resources( list_of_used_resources: list[Resources] = await logged_gather( *(self.compute_node_used_resources(app, i) for i in instances) ) - counter = collections.Counter(dict.fromkeys(Resources.model_fields, 0)) + counter = collections.Counter() for result in list_of_used_resources: - counter.update(result.model_dump()) - return Resources.model_validate(dict(counter)) + counter.update(result.as_flat_dict()) + + flat_counter: dict[str, Any] = dict(counter) + flat_counter.setdefault("cpus", 0) + flat_counter.setdefault("ram", 0) + return Resources.from_flat_dict(flat_counter) async def compute_cluster_total_resources( self, app: FastAPI, instances: list[AssociatedInstance] diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py index 966593295e87..ce52f997bdd5 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py @@ -15,6 +15,7 @@ from models_library.clusters import ClusterAuthentication, TLSAuthentication from pydantic import AnyUrl, ByteSize, TypeAdapter +from ...core.settings import DaskMonitoringSettings from ..core.errors
import ( DaskNoWorkersError, DaskSchedulerNotFoundError, @@ -39,6 +40,7 @@ async def _wrap_client_async_routine( _DASK_SCHEDULER_CONNECT_TIMEOUT_S: Final[int] = 5 +_DASK_WORKER_THREAD_RESOURCE_NAME: Final[str] = "threads" @contextlib.asynccontextmanager @@ -326,3 +328,19 @@ async def try_retire_nodes( await _wrap_client_async_routine( client.retire_workers(close_workers=False, remove=False) ) + + +async def add_instance_generic_resources( + settings: DaskMonitoringSettings, instance: EC2InstanceData +) -> None: + instance_threads = round(instance.available_resources.cpus) + if settings.AUTOSCALING_DASK.DASK_NTHREADS > 0: + # this overrides everything + instance_threads = settings.AUTOSCALING_DASK.DASK_NTHREADS + if settings.AUTOSCALING_DASK.DASK_NTHREADS_MULTIPLIER > 1: + instance_threads = ( + instance_threads * settings.AUTOSCALING_DASK.DASK_NTHREADS_MULTIPLIER + ) + instance.available_resources.generic_resources[ + _DASK_WORKER_THREAD_RESOURCE_NAME + ] = instance_threads From 30b4367b9ff604dc88baf497cba2e376e6151461 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 25 Sep 2025 17:01:57 +0200 Subject: [PATCH 17/93] typo --- .../autoscaling/src/simcore_service_autoscaling/modules/dask.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py index ce52f997bdd5..6ed2bd7741bd 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py @@ -15,12 +15,12 @@ from models_library.clusters import ClusterAuthentication, TLSAuthentication from pydantic import AnyUrl, ByteSize, TypeAdapter -from ...core.settings import DaskMonitoringSettings from ..core.errors import ( DaskNoWorkersError, DaskSchedulerNotFoundError, DaskWorkerNotFoundError, ) +from ..core.settings import 
DaskMonitoringSettings from ..models import AssociatedInstance, DaskTask, DaskTaskId from ..utils.utils_ec2 import ( node_host_name_from_ec2_private_dns, From 3c8732141b71e14344c4bb795e42051aec37429e Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 25 Sep 2025 17:24:09 +0200 Subject: [PATCH 18/93] fix counter --- .../simcore_service_autoscaling/utils/utils_docker.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/utils/utils_docker.py b/services/autoscaling/src/simcore_service_autoscaling/utils/utils_docker.py index a48951986763..e2b9e044b9ef 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/utils/utils_docker.py +++ b/services/autoscaling/src/simcore_service_autoscaling/utils/utils_docker.py @@ -395,14 +395,16 @@ async def compute_cluster_used_resources( docker_client: AutoscalingDocker, nodes: list[Node] ) -> Resources: """Returns the total amount of resources (reservations) used on each of the given nodes""" - list_of_used_resources = await logged_gather( + list_of_used_resources: list[Resources] = await logged_gather( *(compute_node_used_resources(docker_client, node) for node in nodes) ) - counter = collections.Counter(dict.fromkeys(list(Resources.model_fields), 0)) + flat_counter = collections.Counter() for result in list_of_used_resources: - counter.update(result.model_dump()) + flat_counter.update(result.as_flat_dict()) + flat_counter.setdefault("cpus", 0) + flat_counter.setdefault("ram", 0) - return Resources.model_validate(dict(counter)) + return Resources.from_flat_dict(dict(flat_counter)) _COMMAND_TIMEOUT_S = 10 From 7971dea38d800da74f4de4da2567e1ea23bdabb8 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 25 Sep 2025 17:24:26 +0200 Subject: [PATCH 19/93] fixed test assert --- .../tests/unit/test_modules_cluster_scaling_dynamic.py | 1 + 1 file changed, 1 
insertion(+) diff --git a/services/autoscaling/tests/unit/test_modules_cluster_scaling_dynamic.py b/services/autoscaling/tests/unit/test_modules_cluster_scaling_dynamic.py index 8ba17f3f34ff..4b2682805345 100644 --- a/services/autoscaling/tests/unit/test_modules_cluster_scaling_dynamic.py +++ b/services/autoscaling/tests/unit/test_modules_cluster_scaling_dynamic.py @@ -437,6 +437,7 @@ async def test_cluster_scaling_with_no_services_and_machine_buffer_starts_expect / 1e9, "ram": app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MACHINES_BUFFER * fake_node.description.resources.memory_bytes, + "generic_resources": {}, }, ) From 89819621c1b5917f2145bfb4ee2c8bc4aa583174 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 25 Sep 2025 17:27:08 +0200 Subject: [PATCH 20/93] fix assert --- .../tests/unit/test_modules_cluster_scaling_dynamic.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/services/autoscaling/tests/unit/test_modules_cluster_scaling_dynamic.py b/services/autoscaling/tests/unit/test_modules_cluster_scaling_dynamic.py index 4b2682805345..bf9e42a1ec67 100644 --- a/services/autoscaling/tests/unit/test_modules_cluster_scaling_dynamic.py +++ b/services/autoscaling/tests/unit/test_modules_cluster_scaling_dynamic.py @@ -713,11 +713,9 @@ async def _assert_wait_for_ec2_instances_running() -> list[InstanceTypeDef]: cluster_total_resources={ "cpus": fake_attached_node.description.resources.nano_cp_us / 1e9, "ram": fake_attached_node.description.resources.memory_bytes, + "generic_resources": {}, }, - cluster_used_resources={ - "cpus": float(0), - "ram": 0, - }, + cluster_used_resources={"cpus": float(0), "ram": 0, "generic_resources": {}}, instances_running=scale_up_params.expected_num_instances, ) mock_rabbitmq_post_message.reset_mock() From 8fd2fcea2e359c0b5af7b3ecc6b3cced5f1c021e Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 25 Sep 2025 
17:30:30 +0200 Subject: [PATCH 21/93] typo --- packages/aws-library/tests/test_ec2_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/aws-library/tests/test_ec2_models.py b/packages/aws-library/tests/test_ec2_models.py index d1ef498f0c28..b83b57e75f57 100644 --- a/packages/aws-library/tests/test_ec2_models.py +++ b/packages/aws-library/tests/test_ec2_models.py @@ -275,7 +275,7 @@ def test_ec2_instance_data_hashable(faker: Faker): assert next(iter(second_set_of_ec2s)) in union_of_sets -def test_ec2_instance_boot_specific_with_invalid_custome_script(faker: Faker): +def test_ec2_instance_boot_specific_with_invalid_custom_script(faker: Faker): valid_model = EC2InstanceBootSpecific.model_json_schema()["examples"][0] invalid_model = {**valid_model, "custom_boot_scripts": ["echo 'missing end quote"]} From f62edf4c13887a00a5a7ec72735133ff3cd95874 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 25 Sep 2025 17:32:22 +0200 Subject: [PATCH 22/93] wrong types --- .../modules/dask.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py index 6ed2bd7741bd..105551885aca 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py @@ -333,14 +333,12 @@ async def try_retire_nodes( async def add_instance_generic_resources( settings: DaskMonitoringSettings, instance: EC2InstanceData ) -> None: - instance_threads = round(instance.available_resources.cpus) - if settings.AUTOSCALING_DASK.DASK_NTHREADS > 0: + instance_threads = round(instance.resources.cpus) + if settings.DASK_NTHREADS > 0: # this overrides everything - instance_threads = settings.AUTOSCALING_DASK.DASK_NTHREADS - if settings.AUTOSCALING_DASK.DASK_NTHREADS_MULTIPLIER > 1: - 
instance_threads = ( - instance_threads * settings.AUTOSCALING_DASK.DASK_NTHREADS_MULTIPLIER - ) - instance.available_resources.generic_resources[ - _DASK_WORKER_THREAD_RESOURCE_NAME - ] = instance_threads + instance_threads = settings.DASK_NTHREADS + if settings.DASK_NTHREADS_MULTIPLIER > 1: + instance_threads = instance_threads * settings.DASK_NTHREADS_MULTIPLIER + instance.resources.generic_resources[_DASK_WORKER_THREAD_RESOURCE_NAME] = ( + instance_threads + ) From 6911754cf5f6ee5d8a371b4ea5e65652980c53da Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 25 Sep 2025 17:39:21 +0200 Subject: [PATCH 23/93] ongoing --- .../modules/cluster_scaling/_provider_computational.py | 2 +- .../src/simcore_service_autoscaling/utils/utils_docker.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_computational.py b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_computational.py index a605c7ea0484..b55c8fc2f02e 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_computational.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_computational.py @@ -140,7 +140,7 @@ async def compute_cluster_used_resources( list_of_used_resources: list[Resources] = await logged_gather( *(self.compute_node_used_resources(app, i) for i in instances) ) - counter = collections.Counter() + counter: collections.Counter = collections.Counter() for result in list_of_used_resources: counter.update(result.as_flat_dict()) diff --git a/services/autoscaling/src/simcore_service_autoscaling/utils/utils_docker.py b/services/autoscaling/src/simcore_service_autoscaling/utils/utils_docker.py index e2b9e044b9ef..f4feea61cfde 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/utils/utils_docker.py +++ 
b/services/autoscaling/src/simcore_service_autoscaling/utils/utils_docker.py @@ -398,7 +398,7 @@ async def compute_cluster_used_resources( list_of_used_resources: list[Resources] = await logged_gather( *(compute_node_used_resources(docker_client, node) for node in nodes) ) - flat_counter = collections.Counter() + flat_counter: collections.Counter = collections.Counter() for result in list_of_used_resources: flat_counter.update(result.as_flat_dict()) flat_counter.setdefault("cpus", 0) From 7c9a8b71645ba28df002126d16c5fa8db8676573 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 26 Sep 2025 08:04:29 +0200 Subject: [PATCH 24/93] mypy --- packages/aws-library/src/aws_library/ec2/_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/aws-library/src/aws_library/ec2/_models.py b/packages/aws-library/src/aws_library/ec2/_models.py index 3a4aaf42ec19..5045d34541df 100644 --- a/packages/aws-library/src/aws_library/ec2/_models.py +++ b/packages/aws-library/src/aws_library/ec2/_models.py @@ -136,7 +136,7 @@ def from_flat_dict(cls, data: dict[str, int | float | str]) -> "Resources": """Inverse of as_flat_dict""" generic_resources = {k: v for k, v in data.items() if k not in {"cpus", "ram"}} return cls( - cpus=data.get("cpus", 0), + cpus=float(data.get("cpus", 0)), ram=ByteSize(data.get("ram", 0)), generic_resources=generic_resources, ) From 6dfde1d79061ece28d98b4badf7b395aaef1981b Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 26 Sep 2025 08:44:18 +0200 Subject: [PATCH 25/93] added test for getting threads resources --- .../modules/dask.py | 6 +-- .../tests/unit/test_modules_dask.py | 41 +++++++++++++++++++ 2 files changed, 44 insertions(+), 3 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py index 105551885aca..1eb5a51aa5ed 100644 
--- a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py @@ -40,7 +40,7 @@ async def _wrap_client_async_routine( _DASK_SCHEDULER_CONNECT_TIMEOUT_S: Final[int] = 5 -_DASK_WORKER_THREAD_RESOURCE_NAME: Final[str] = "threads" +DASK_WORKER_THREAD_RESOURCE_NAME: Final[str] = "threads" @contextlib.asynccontextmanager @@ -330,7 +330,7 @@ async def try_retire_nodes( ) -async def add_instance_generic_resources( +def add_instance_generic_resources( settings: DaskMonitoringSettings, instance: EC2InstanceData ) -> None: instance_threads = round(instance.resources.cpus) @@ -339,6 +339,6 @@ async def add_instance_generic_resources( instance_threads = settings.DASK_NTHREADS if settings.DASK_NTHREADS_MULTIPLIER > 1: instance_threads = instance_threads * settings.DASK_NTHREADS_MULTIPLIER - instance.resources.generic_resources[_DASK_WORKER_THREAD_RESOURCE_NAME] = ( + instance.resources.generic_resources[DASK_WORKER_THREAD_RESOURCE_NAME] = ( instance_threads ) diff --git a/services/autoscaling/tests/unit/test_modules_dask.py b/services/autoscaling/tests/unit/test_modules_dask.py index 9c53865cfa30..c4cd3c52794a 100644 --- a/services/autoscaling/tests/unit/test_modules_dask.py +++ b/services/autoscaling/tests/unit/test_modules_dask.py @@ -31,8 +31,11 @@ EC2InstanceData, ) from simcore_service_autoscaling.modules.dask import ( + DASK_WORKER_THREAD_RESOURCE_NAME, + DaskMonitoringSettings, DaskTask, _scheduler_client, + add_instance_generic_resources, get_worker_still_has_results_in_memory, get_worker_used_resources, list_processing_tasks_per_worker, @@ -370,3 +373,41 @@ def _add_fct(x: int, y: int) -> int: ) == Resources.create_as_empty() ) + + +@pytest.mark.parametrize( + "dask_nthreads, dask_nthreads_multiplier, expected_threads_resource", + [(4, 1, 4), (4, 2, 8), (0, 2.0, -1)], +) +def test_add_instance_generic_resources( + fake_ec2_instance_data: Callable[..., EC2InstanceData], + faker: 
Faker, + dask_nthreads: int, + dask_nthreads_multiplier: int, + expected_threads_resource: int, +): + settings = DaskMonitoringSettings( + DASK_MONITORING_URL=faker.url(), + DASK_SCHEDULER_AUTH=NoAuthentication(), + DASK_NTHREADS=dask_nthreads, + DASK_NTHREADS_MULTIPLIER=dask_nthreads_multiplier, + ) + ec2_instance_data = fake_ec2_instance_data() + assert ec2_instance_data.resources.cpus > 0 + assert ec2_instance_data.resources.ram > 0 + assert ec2_instance_data.resources.generic_resources == {} + + add_instance_generic_resources(settings, ec2_instance_data) + assert ec2_instance_data.resources.generic_resources != {} + assert ( + DASK_WORKER_THREAD_RESOURCE_NAME + in ec2_instance_data.resources.generic_resources + ) + if expected_threads_resource < 0: + expected_threads_resource = ( + ec2_instance_data.resources.cpus * dask_nthreads_multiplier + ) + assert ( + ec2_instance_data.resources.generic_resources[DASK_WORKER_THREAD_RESOURCE_NAME] + == expected_threads_resource + ) From 03be3af3ab68c8e1f1197c871f67e20af5cb0bde Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 26 Sep 2025 08:51:16 +0200 Subject: [PATCH 26/93] added test --- .../tests/unit/test_modules_dask.py | 22 ++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/services/autoscaling/tests/unit/test_modules_dask.py b/services/autoscaling/tests/unit/test_modules_dask.py index c4cd3c52794a..a77fffd3fc43 100644 --- a/services/autoscaling/tests/unit/test_modules_dask.py +++ b/services/autoscaling/tests/unit/test_modules_dask.py @@ -38,6 +38,7 @@ add_instance_generic_resources, get_worker_still_has_results_in_memory, get_worker_used_resources, + is_worker_connected, list_processing_tasks_per_worker, list_unrunnable_tasks, ) @@ -380,15 +381,16 @@ def _add_fct(x: int, y: int) -> int: [(4, 1, 4), (4, 2, 8), (0, 2.0, -1)], ) def test_add_instance_generic_resources( + scheduler_url: AnyUrl, + scheduler_authentication: 
ClusterAuthentication, fake_ec2_instance_data: Callable[..., EC2InstanceData], - faker: Faker, dask_nthreads: int, dask_nthreads_multiplier: int, expected_threads_resource: int, ): settings = DaskMonitoringSettings( - DASK_MONITORING_URL=faker.url(), - DASK_SCHEDULER_AUTH=NoAuthentication(), + DASK_MONITORING_URL=scheduler_url, + DASK_SCHEDULER_AUTH=scheduler_authentication, DASK_NTHREADS=dask_nthreads, DASK_NTHREADS_MULTIPLIER=dask_nthreads_multiplier, ) @@ -411,3 +413,17 @@ def test_add_instance_generic_resources( ec2_instance_data.resources.generic_resources[DASK_WORKER_THREAD_RESOURCE_NAME] == expected_threads_resource ) + + +async def test_is_worker_connected( + scheduler_url: AnyUrl, + scheduler_authentication: ClusterAuthentication, + fake_ec2_instance_data: Callable[..., EC2InstanceData], +): + ec2_instance_data = fake_ec2_instance_data() + assert ( + await is_worker_connected( + scheduler_url, scheduler_authentication, ec2_instance_data + ) + is False + ) From c3117a7958bc887089da30df73955781d62abfaf Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 26 Sep 2025 10:07:23 +0200 Subject: [PATCH 27/93] improve testing --- .../modules/dask.py | 4 ++ .../tests/unit/test_modules_dask.py | 56 ++++++++++++++++++- 2 files changed, 58 insertions(+), 2 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py index 1eb5a51aa5ed..7d8f8c289c4f 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py @@ -103,6 +103,10 @@ def _find_by_worker_host( _, details = dask_worker if match := re.match(DASK_NAME_PATTERN, details["name"]): return bool(match.group("private_ip") == node_hostname) + _logger.warning( + "Unexpected worker name format: %s. 
TIP: this should be investigated", + details["name"], + ) return False filtered_workers = dict(filter(_find_by_worker_host, workers.items())) diff --git a/services/autoscaling/tests/unit/test_modules_dask.py b/services/autoscaling/tests/unit/test_modules_dask.py index a77fffd3fc43..28f6a3047d12 100644 --- a/services/autoscaling/tests/unit/test_modules_dask.py +++ b/services/autoscaling/tests/unit/test_modules_dask.py @@ -39,10 +39,11 @@ get_worker_still_has_results_in_memory, get_worker_used_resources, is_worker_connected, + is_worker_retired, list_processing_tasks_per_worker, list_unrunnable_tasks, ) -from tenacity import retry, stop_after_delay, wait_fixed +from tenacity import AsyncRetrying, retry, stop_after_delay, wait_fixed _authentication_types = [ NoAuthentication(), @@ -406,7 +407,7 @@ def test_add_instance_generic_resources( in ec2_instance_data.resources.generic_resources ) if expected_threads_resource < 0: - expected_threads_resource = ( + expected_threads_resource = int( ec2_instance_data.resources.cpus * dask_nthreads_multiplier ) assert ( @@ -419,6 +420,7 @@ async def test_is_worker_connected( scheduler_url: AnyUrl, scheduler_authentication: ClusterAuthentication, fake_ec2_instance_data: Callable[..., EC2InstanceData], + fake_localhost_ec2_instance_data: EC2InstanceData, ): ec2_instance_data = fake_ec2_instance_data() assert ( @@ -427,3 +429,53 @@ async def test_is_worker_connected( ) is False ) + + assert ( + await is_worker_connected( + scheduler_url, scheduler_authentication, fake_localhost_ec2_instance_data + ) + is True + ) + + +async def test_is_worker_retired( + dask_spec_local_cluster: distributed.SpecCluster, + scheduler_url: AnyUrl, + scheduler_authentication: ClusterAuthentication, + fake_ec2_instance_data: Callable[..., EC2InstanceData], + fake_localhost_ec2_instance_data: EC2InstanceData, +): + ec2_instance_data = fake_ec2_instance_data() + # fake instance is not connected, so it cannot be retired + assert ( + await is_worker_retired( + 
scheduler_url, scheduler_authentication, ec2_instance_data + ) + is False + ) + + # localhost is connected, but not retired + assert ( + await is_worker_retired( + scheduler_url, scheduler_authentication, fake_localhost_ec2_instance_data + ) + is False + ) + + # retire localhost worker + assert isinstance(dask_spec_local_cluster.scheduler, distributed.Scheduler) + await dask_spec_local_cluster.scheduler.retire_workers( + close_workers=True, remove=False + ) + async for attempt in AsyncRetrying( + stop=stop_after_delay(10), wait=wait_fixed(1), reraise=True + ): + with attempt: + assert ( + await is_worker_retired( + scheduler_url, + scheduler_authentication, + fake_localhost_ec2_instance_data, + ) + is True + ) From 37d1b449e040b448f8bbe0a7f70685381f5e1d8e Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 26 Sep 2025 10:17:01 +0200 Subject: [PATCH 28/93] improve testing --- services/autoscaling/tests/unit/test_modules_dask.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/services/autoscaling/tests/unit/test_modules_dask.py b/services/autoscaling/tests/unit/test_modules_dask.py index 28f6a3047d12..09e104c5cfe6 100644 --- a/services/autoscaling/tests/unit/test_modules_dask.py +++ b/services/autoscaling/tests/unit/test_modules_dask.py @@ -42,6 +42,7 @@ is_worker_retired, list_processing_tasks_per_worker, list_unrunnable_tasks, + try_retire_nodes, ) from tenacity import AsyncRetrying, retry, stop_after_delay, wait_fixed @@ -463,10 +464,7 @@ async def test_is_worker_retired( ) # retire localhost worker - assert isinstance(dask_spec_local_cluster.scheduler, distributed.Scheduler) - await dask_spec_local_cluster.scheduler.retire_workers( - close_workers=True, remove=False - ) + await try_retire_nodes(scheduler_url, scheduler_authentication) async for attempt in AsyncRetrying( stop=stop_after_delay(10), wait=wait_fixed(1), reraise=True ): From 0902d083d0d5e2f9931cd5e7d9daceb05b2024f6 Mon Sep 17 
00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 26 Sep 2025 11:17:28 +0200 Subject: [PATCH 29/93] ruff --- services/autoscaling/tests/unit/test_utils_cluster_scaling.py | 1 + 1 file changed, 1 insertion(+) diff --git a/services/autoscaling/tests/unit/test_utils_cluster_scaling.py b/services/autoscaling/tests/unit/test_utils_cluster_scaling.py index 1c325c1f6234..5525cedc9268 100644 --- a/services/autoscaling/tests/unit/test_utils_cluster_scaling.py +++ b/services/autoscaling/tests/unit/test_utils_cluster_scaling.py @@ -296,6 +296,7 @@ def test_sort_drained_nodes( assert app_settings.AUTOSCALING_EC2_INSTANCES machine_buffer_type = get_hot_buffer_type(random_fake_available_instances) _NUM_DRAINED_NODES = 20 + assert app_settings.AUTOSCALING_EC2_INSTANCES _NUM_NODE_WITH_TYPE_BUFFER = ( 3 * app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MACHINES_BUFFER ) From 64d84dc7247bbf5330469cd769f280d8d1a92911 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 26 Sep 2025 11:17:37 +0200 Subject: [PATCH 30/93] adding test --- .../autoscaling/tests/unit/test_modules_dask.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/services/autoscaling/tests/unit/test_modules_dask.py b/services/autoscaling/tests/unit/test_modules_dask.py index 09e104c5cfe6..4d3e667ad5df 100644 --- a/services/autoscaling/tests/unit/test_modules_dask.py +++ b/services/autoscaling/tests/unit/test_modules_dask.py @@ -36,6 +36,7 @@ DaskTask, _scheduler_client, add_instance_generic_resources, + compute_cluster_total_resources, get_worker_still_has_results_in_memory, get_worker_used_resources, is_worker_connected, @@ -378,6 +379,19 @@ def _add_fct(x: int, y: int) -> int: ) +async def test_compute_cluster_total_resources( + scheduler_url: AnyUrl, + scheduler_authentication: ClusterAuthentication, +): + # asking for resources of empty cluster returns empty resources + assert ( + await 
compute_cluster_total_resources( + scheduler_url, scheduler_authentication, [] + ) + == Resources.create_as_empty() + ) + + @pytest.mark.parametrize( "dask_nthreads, dask_nthreads_multiplier, expected_threads_resource", [(4, 1, 4), (4, 2, 8), (0, 2.0, -1)], From 067f8ff756e02b12976a08ca424b36856b9e6d1f Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 26 Sep 2025 11:45:19 +0200 Subject: [PATCH 31/93] implemented compute cluster total resources --- .../_provider_computational.py | 4 +++- .../modules/dask.py | 22 +++++++++++------ .../tests/unit/test_modules_dask.py | 24 ++++++++++++++++++- 3 files changed, 41 insertions(+), 9 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_computational.py b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_computational.py index b55c8fc2f02e..f73fa60a3c7d 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_computational.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_computational.py @@ -155,7 +155,9 @@ async def compute_cluster_total_resources( assert self # nosec try: return await dask.compute_cluster_total_resources( - _scheduler_url(app), _scheduler_auth(app), instances + _scheduler_url(app), + _scheduler_auth(app), + [i.ec2_instance for i in instances], ) except DaskNoWorkersError: return Resources.create_as_empty() diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py index 7d8f8c289c4f..c07baf992034 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py @@ -21,7 +21,7 @@ DaskWorkerNotFoundError, ) from ..core.settings import DaskMonitoringSettings -from ..models import 
AssociatedInstance, DaskTask, DaskTaskId +from ..models import DaskTask, DaskTaskId from ..utils.utils_ec2 import ( node_host_name_from_ec2_private_dns, node_ip_from_ec2_private_dns, @@ -306,23 +306,31 @@ def _list_processing_tasks_on_worker( async def compute_cluster_total_resources( scheduler_url: AnyUrl, authentication: ClusterAuthentication, - instances: list[AssociatedInstance], + instances: list[EC2InstanceData], ) -> Resources: if not instances: return Resources.create_as_empty() async with _scheduler_client(scheduler_url, authentication) as client: - instance_hosts = ( - node_ip_from_ec2_private_dns(i.ec2_instance) for i in instances - ) + instance_host_resources_map = { + node_ip_from_ec2_private_dns(i): i.resources for i in instances + } scheduler_info = client.scheduler_info() if "workers" not in scheduler_info or not scheduler_info["workers"]: raise DaskNoWorkersError(url=scheduler_url) workers: dict[str, Any] = scheduler_info["workers"] + cluster_resources = Resources.create_as_empty() for worker_details in workers.values(): - if worker_details["host"] not in instance_hosts: + if worker_details["host"] not in instance_host_resources_map: continue + worker_ram = worker_details["memory_limit"] + worker_threads = worker_details["nthreads"] + cluster_resources += Resources( + cpus=instance_host_resources_map[worker_details["host"]].cpus, + ram=TypeAdapter(ByteSize).validate_python(worker_ram), + generic_resources={DASK_WORKER_THREAD_RESOURCE_NAME: worker_threads}, + ) - return Resources.create_as_empty() + return cluster_resources async def try_retire_nodes( diff --git a/services/autoscaling/tests/unit/test_modules_dask.py b/services/autoscaling/tests/unit/test_modules_dask.py index 4d3e667ad5df..ebaab72ca406 100644 --- a/services/autoscaling/tests/unit/test_modules_dask.py +++ b/services/autoscaling/tests/unit/test_modules_dask.py @@ -380,8 +380,11 @@ def _add_fct(x: int, y: int) -> int: async def test_compute_cluster_total_resources( + 
dask_spec_local_cluster: distributed.SpecCluster, scheduler_url: AnyUrl, scheduler_authentication: ClusterAuthentication, + fake_ec2_instance_data: Callable[..., EC2InstanceData], + fake_localhost_ec2_instance_data: EC2InstanceData, ): # asking for resources of empty cluster returns empty resources assert ( @@ -390,6 +393,26 @@ async def test_compute_cluster_total_resources( ) == Resources.create_as_empty() ) + ec2_instance_data = fake_ec2_instance_data() + assert ec2_instance_data.resources.cpus > 0 + assert ec2_instance_data.resources.ram > 0 + assert ec2_instance_data.resources.generic_resources == {} + assert ( + await compute_cluster_total_resources( + scheduler_url, scheduler_authentication, [ec2_instance_data] + ) + == Resources.create_as_empty() + ), "this instance is not connected and should not be accounted for" + + cluster_total_resources = await compute_cluster_total_resources( + scheduler_url, scheduler_authentication, [fake_localhost_ec2_instance_data] + ) + assert cluster_total_resources.cpus > 0 + assert cluster_total_resources.ram > 0 + assert DASK_WORKER_THREAD_RESOURCE_NAME in cluster_total_resources.generic_resources + assert ( + cluster_total_resources.generic_resources[DASK_WORKER_THREAD_RESOURCE_NAME] == 2 + ) @pytest.mark.parametrize( @@ -454,7 +477,6 @@ async def test_is_worker_connected( async def test_is_worker_retired( - dask_spec_local_cluster: distributed.SpecCluster, scheduler_url: AnyUrl, scheduler_authentication: ClusterAuthentication, fake_ec2_instance_data: Callable[..., EC2InstanceData], From 7048af26086b947409d226fadc8feaf112aae815 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 26 Sep 2025 14:02:01 +0200 Subject: [PATCH 32/93] adjusted compute used resources --- .../modules/dask.py | 20 +++++++++++++++---- .../tests/unit/test_modules_dask.py | 6 +++++- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git 
a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py index c07baf992034..f6a234b2e508 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py @@ -295,12 +295,18 @@ def _list_processing_tasks_on_worker( total_resources_used.update(task_resources) _logger.debug("found %s for %s", f"{total_resources_used=}", f"{worker_url=}") - return Resources( + worker_used_resources = Resources( cpus=total_resources_used.get("CPU", 0), ram=TypeAdapter(ByteSize).validate_python( total_resources_used.get("RAM", 0) ), ) + if worker_processing_tasks: + worker_used_resources.generic_resources[ + DASK_WORKER_THREAD_RESOURCE_NAME + ] = len(worker_processing_tasks) + + return worker_used_resources async def compute_cluster_total_resources( @@ -322,11 +328,17 @@ async def compute_cluster_total_resources( for worker_details in workers.values(): if worker_details["host"] not in instance_host_resources_map: continue - worker_ram = worker_details["memory_limit"] + worker_dask_resources = worker_details["resources"] worker_threads = worker_details["nthreads"] cluster_resources += Resources( - cpus=instance_host_resources_map[worker_details["host"]].cpus, - ram=TypeAdapter(ByteSize).validate_python(worker_ram), + cpus=worker_dask_resources.get( + "CPU", instance_host_resources_map[worker_details["host"]].cpus + ), + ram=TypeAdapter(ByteSize).validate_python( + worker_dask_resources.get( + "RAM", instance_host_resources_map[worker_details["host"]].ram + ) + ), generic_resources={DASK_WORKER_THREAD_RESOURCE_NAME: worker_threads}, ) diff --git a/services/autoscaling/tests/unit/test_modules_dask.py b/services/autoscaling/tests/unit/test_modules_dask.py index ebaab72ca406..1b2b6aac1bb3 100644 --- a/services/autoscaling/tests/unit/test_modules_dask.py +++ b/services/autoscaling/tests/unit/test_modules_dask.py @@ 
-365,7 +365,11 @@ def _add_fct(x: int, y: int) -> int: await _wait_for_dask_scheduler_to_change_state() assert await get_worker_used_resources( scheduler_url, scheduler_authentication, fake_localhost_ec2_instance_data - ) == Resources(cpus=num_cpus, ram=ByteSize(0)) + ) == Resources( + cpus=num_cpus, + ram=ByteSize(0), + generic_resources={DASK_WORKER_THREAD_RESOURCE_NAME: 1}, + ) result = await future_queued_task.result(timeout=_DASK_SCHEDULER_REACTION_TIME_S) # type: ignore assert result == 7 From 52ee133ea861fdc626f08727a0a72ec8fff46881 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 26 Sep 2025 14:11:56 +0200 Subject: [PATCH 33/93] testing --- .../autoscaling/src/simcore_service_autoscaling/modules/dask.py | 1 - 1 file changed, 1 deletion(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py index f6a234b2e508..431554e2ad67 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py @@ -8,7 +8,6 @@ import dask.typing import distributed -import distributed.scheduler from aws_library.ec2 import EC2InstanceData, Resources from dask_task_models_library.resource_constraints import DaskTaskResources from distributed.core import Status From 171e702d80864cec0088bea7fcad45cdcb7caf7b Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 26 Sep 2025 14:56:19 +0200 Subject: [PATCH 34/93] simplify --- .../modules/dask.py | 53 +++++++++---------- .../tests/unit/test_modules_dask.py | 7 ++- 2 files changed, 32 insertions(+), 28 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py index 431554e2ad67..52bb06d497c4 100644 --- 
a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py @@ -185,21 +185,16 @@ def _list_tasks( return [ DaskTask( task_id=_dask_key_to_dask_task_id(task_id), - required_resources=task_resources, + required_resources=task_resources + | {DASK_WORKER_THREAD_RESOURCE_NAME: 1}, ) for task_id, task_resources in list_of_tasks.items() ] -async def list_processing_tasks_per_worker( - scheduler_url: AnyUrl, - authentication: ClusterAuthentication, -) -> dict[DaskWorkerUrl, list[DaskTask]]: - """ - Raises: - DaskSchedulerNotFoundError - """ - +async def _list_cluster_processing_tasks( + client: distributed.Client, +) -> dict[DaskWorkerUrl, list[tuple[dask.typing.Key, DaskTaskResources]]]: def _list_processing_tasks( dask_scheduler: distributed.Scheduler, ) -> dict[str, list[tuple[dask.typing.Key, DaskTaskResources]]]: @@ -211,13 +206,26 @@ def _list_processing_tasks( ) return worker_to_processing_tasks + list_of_tasks: dict[str, list[tuple[dask.typing.Key, DaskTaskResources]]] = ( + await client.run_on_scheduler(_list_processing_tasks) + ) + _logger.debug("found processing tasks: %s", list_of_tasks) + + return list_of_tasks + + +async def list_processing_tasks_per_worker( + scheduler_url: AnyUrl, + authentication: ClusterAuthentication, +) -> dict[DaskWorkerUrl, list[DaskTask]]: + """ + Raises: + DaskSchedulerNotFoundError + """ + async with _scheduler_client(scheduler_url, authentication) as client: - worker_to_tasks: dict[str, list[tuple[dask.typing.Key, DaskTaskResources]]] = ( - await _wrap_client_async_routine( - client.run_on_scheduler(_list_processing_tasks) - ) - ) - _logger.debug("found processing tasks: %s", worker_to_tasks) + worker_to_tasks = await _list_cluster_processing_tasks(client) + tasks_per_worker = defaultdict(list) for worker, tasks in worker_to_tasks.items(): for task_id, required_resources in tasks: @@ -277,17 +285,8 @@ def _list_processing_tasks_on_worker( async 
with _scheduler_client(scheduler_url, authentication) as client: worker_url, _ = _dask_worker_from_ec2_instance(client, ec2_instance) - - _logger.debug("looking for processing tasks for %s", f"{worker_url=}") - - # now get the used resources - worker_processing_tasks: list[tuple[dask.typing.Key, DaskTaskResources]] = ( - await _wrap_client_async_routine( - client.run_on_scheduler( - _list_processing_tasks_on_worker, worker_url=worker_url - ), - ) - ) + worker_to_tasks = await _list_cluster_processing_tasks(client) + worker_processing_tasks = worker_to_tasks.get(worker_url, []) total_resources_used: collections.Counter[str] = collections.Counter() for _, task_resources in worker_processing_tasks: diff --git a/services/autoscaling/tests/unit/test_modules_dask.py b/services/autoscaling/tests/unit/test_modules_dask.py index 1b2b6aac1bb3..4d721b3f28ad 100644 --- a/services/autoscaling/tests/unit/test_modules_dask.py +++ b/services/autoscaling/tests/unit/test_modules_dask.py @@ -126,7 +126,12 @@ async def test_list_unrunnable_tasks( future = create_dask_task(dask_task_impossible_resources) assert future assert await list_unrunnable_tasks(scheduler_url, scheduler_authentication) == [ - DaskTask(task_id=future.key, required_resources=dask_task_impossible_resources) + DaskTask( + task_id=future.key, + required_resources=( + dask_task_impossible_resources | {DASK_WORKER_THREAD_RESOURCE_NAME: 1} + ), + ) ] # remove that future, will remove the task del future From 7328d7500d16704b461b959e580e2cfc3167e4da Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 26 Sep 2025 15:10:45 +0200 Subject: [PATCH 35/93] simplify --- .../modules/dask.py | 19 +++++-------------- .../tests/unit/test_modules_dask.py | 5 ++++- 2 files changed, 9 insertions(+), 15 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py index 
52bb06d497c4..92d3c13ee250 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py @@ -202,7 +202,11 @@ def _list_processing_tasks( for task_key, task_state in dask_scheduler.tasks.items(): if task_state.processing_on: worker_to_processing_tasks[task_state.processing_on.address].append( - (task_key, task_state.resource_restrictions or {}) + ( + task_key, + (task_state.resource_restrictions or {}) + | {DASK_WORKER_THREAD_RESOURCE_NAME: 1}, + ) ) return worker_to_processing_tasks @@ -270,19 +274,6 @@ async def get_worker_used_resources( DaskNoWorkersError """ - def _list_processing_tasks_on_worker( - dask_scheduler: distributed.Scheduler, *, worker_url: str - ) -> list[tuple[dask.typing.Key, DaskTaskResources]]: - processing_tasks = [] - for task_key, task_state in dask_scheduler.tasks.items(): - if task_state.processing_on and ( - task_state.processing_on.address == worker_url - ): - processing_tasks.append( - (task_key, task_state.resource_restrictions or {}) - ) - return processing_tasks - async with _scheduler_client(scheduler_url, authentication) as client: worker_url, _ = _dask_worker_from_ec2_instance(client, ec2_instance) worker_to_tasks = await _list_cluster_processing_tasks(client) diff --git a/services/autoscaling/tests/unit/test_modules_dask.py b/services/autoscaling/tests/unit/test_modules_dask.py index 4d721b3f28ad..4f6b4ce73366 100644 --- a/services/autoscaling/tests/unit/test_modules_dask.py +++ b/services/autoscaling/tests/unit/test_modules_dask.py @@ -166,7 +166,10 @@ def _add_fct(x: int, y: int) -> int: scheduler_url, scheduler_authentication ) == { next(iter(dask_spec_cluster_client.scheduler_info()["workers"])): [ - DaskTask(task_id=DaskTaskId(future_queued_task.key), required_resources={}) + DaskTask( + task_id=DaskTaskId(future_queued_task.key), + required_resources={DASK_WORKER_THREAD_RESOURCE_NAME: 1}, + ) ] } From 
addb6fd058fd243f2625535aa2f57c987f677005 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 26 Sep 2025 15:10:59 +0200 Subject: [PATCH 36/93] create a typed dict --- .../src/dask_task_models_library/resource_constraints.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/packages/dask-task-models-library/src/dask_task_models_library/resource_constraints.py b/packages/dask-task-models-library/src/dask_task_models_library/resource_constraints.py index 3a81114ef878..27b5bb1cb192 100644 --- a/packages/dask-task-models-library/src/dask_task_models_library/resource_constraints.py +++ b/packages/dask-task-models-library/src/dask_task_models_library/resource_constraints.py @@ -1,8 +1,13 @@ -from typing import Any, TypeAlias +from typing import Literal, TypedDict from .constants import DASK_TASK_EC2_RESOURCE_RESTRICTION_KEY -DaskTaskResources: TypeAlias = dict[str, Any] + +class DaskTaskResources(TypedDict): + CPU: float + RAM: int # in bytes + # threads is a constant of 1 (enforced by static type checkers via Literal) + threads: Literal[1] def create_ec2_resource_constraint_key(ec2_instance_type: str) -> str: From 82889c4737bbca77d6c86c6c414642cfb4b9d38c Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 26 Sep 2025 15:29:58 +0200 Subject: [PATCH 37/93] simplify --- .../modules/dask.py | 81 +++++++++++-------- 1 file changed, 46 insertions(+), 35 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py index 92d3c13ee250..500eec402fa9 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py @@ -4,7 +4,7 @@ import re from collections import defaultdict from collections.abc import AsyncIterator, Coroutine -from typing import Any, Final, TypeAlias +from 
typing import Any, Final, TypeAlias, TypedDict import dask.typing import distributed @@ -119,6 +119,44 @@ def _find_by_worker_host( return next(iter(filtered_workers.items())) +class DaskClusterTasks(TypedDict): + processing: dict[DaskWorkerUrl, list[tuple[dask.typing.Key, DaskTaskResources]]] + unrunnable: dict[dask.typing.Key, DaskTaskResources] + + +async def _list_cluster_known_tasks( + client: distributed.Client, +) -> DaskClusterTasks: + def _list_on_scheduler( + dask_scheduler: distributed.Scheduler, + ) -> DaskClusterTasks: + worker_to_processing_tasks = defaultdict(list) + unrunnable_tasks = {} + for task_key, task_state in dask_scheduler.tasks.items(): + if task_state.processing_on: + worker_to_processing_tasks[task_state.processing_on.address].append( + ( + task_key, + (task_state.resource_restrictions or {}) + | {DASK_WORKER_THREAD_RESOURCE_NAME: 1}, + ) + ) + elif task_state in dask_scheduler.unrunnable: + unrunnable_tasks[task_key] = ( + task_state.resource_restrictions or {} + ) | {DASK_WORKER_THREAD_RESOURCE_NAME: 1} + + return DaskClusterTasks( + processing=dict(worker_to_processing_tasks), + unrunnable=unrunnable_tasks, + ) + + list_of_tasks: DaskClusterTasks = await client.run_on_scheduler(_list_on_scheduler) + _logger.debug("found tasks: %s", list_of_tasks) + + return list_of_tasks + + async def is_worker_connected( scheduler_url: AnyUrl, authentication: ClusterAuthentication, @@ -178,10 +216,9 @@ def _list_tasks( } async with _scheduler_client(scheduler_url, authentication) as client: - list_of_tasks: dict[dask.typing.Key, DaskTaskResources] = ( - await _wrap_client_async_routine(client.run_on_scheduler(_list_tasks)) - ) - _logger.debug("found unrunnable tasks: %s", list_of_tasks) + known_tasks = await _list_cluster_known_tasks(client) + list_of_tasks = known_tasks["unrunnable"] + return [ DaskTask( task_id=_dask_key_to_dask_task_id(task_id), @@ -192,32 +229,6 @@ def _list_tasks( ] -async def _list_cluster_processing_tasks( - client: 
distributed.Client, -) -> dict[DaskWorkerUrl, list[tuple[dask.typing.Key, DaskTaskResources]]]: - def _list_processing_tasks( - dask_scheduler: distributed.Scheduler, - ) -> dict[str, list[tuple[dask.typing.Key, DaskTaskResources]]]: - worker_to_processing_tasks = defaultdict(list) - for task_key, task_state in dask_scheduler.tasks.items(): - if task_state.processing_on: - worker_to_processing_tasks[task_state.processing_on.address].append( - ( - task_key, - (task_state.resource_restrictions or {}) - | {DASK_WORKER_THREAD_RESOURCE_NAME: 1}, - ) - ) - return worker_to_processing_tasks - - list_of_tasks: dict[str, list[tuple[dask.typing.Key, DaskTaskResources]]] = ( - await client.run_on_scheduler(_list_processing_tasks) - ) - _logger.debug("found processing tasks: %s", list_of_tasks) - - return list_of_tasks - - async def list_processing_tasks_per_worker( scheduler_url: AnyUrl, authentication: ClusterAuthentication, @@ -228,10 +239,10 @@ async def list_processing_tasks_per_worker( """ async with _scheduler_client(scheduler_url, authentication) as client: - worker_to_tasks = await _list_cluster_processing_tasks(client) + worker_to_tasks = await _list_cluster_known_tasks(client) tasks_per_worker = defaultdict(list) - for worker, tasks in worker_to_tasks.items(): + for worker, tasks in worker_to_tasks["processing"].items(): for task_id, required_resources in tasks: tasks_per_worker[worker].append( DaskTask( @@ -276,8 +287,8 @@ async def get_worker_used_resources( async with _scheduler_client(scheduler_url, authentication) as client: worker_url, _ = _dask_worker_from_ec2_instance(client, ec2_instance) - worker_to_tasks = await _list_cluster_processing_tasks(client) - worker_processing_tasks = worker_to_tasks.get(worker_url, []) + known_tasks = await _list_cluster_known_tasks(client) + worker_processing_tasks = known_tasks["processing"].get(worker_url, []) total_resources_used: collections.Counter[str] = collections.Counter() for _, task_resources in 
worker_processing_tasks: From 711a7e9d36a30dd4756cb1fb302a91d9e9b6020a Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 26 Sep 2025 15:35:28 +0200 Subject: [PATCH 38/93] moved naming --- .../dask_task_models_library/resource_constraints.py | 6 ++++-- .../src/simcore_service_autoscaling/modules/dask.py | 6 ++++-- services/autoscaling/tests/unit/test_modules_dask.py | 12 ++++++------ 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/packages/dask-task-models-library/src/dask_task_models_library/resource_constraints.py b/packages/dask-task-models-library/src/dask_task_models_library/resource_constraints.py index 27b5bb1cb192..49f050a05a46 100644 --- a/packages/dask-task-models-library/src/dask_task_models_library/resource_constraints.py +++ b/packages/dask-task-models-library/src/dask_task_models_library/resource_constraints.py @@ -1,9 +1,11 @@ -from typing import Literal, TypedDict +from typing import Final, Literal, TypedDict from .constants import DASK_TASK_EC2_RESOURCE_RESTRICTION_KEY +DASK_WORKER_THREAD_RESOURCE_NAME: Final[str] = "threads" -class DaskTaskResources(TypedDict): + +class DaskTaskResources(TypedDict, total=False): CPU: float RAM: int # in bytes # threads is a constant of 1 (enforced by static type checkers via Literal) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py index 500eec402fa9..9f16ca74e425 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py @@ -9,7 +9,10 @@ import dask.typing import distributed from aws_library.ec2 import EC2InstanceData, Resources -from dask_task_models_library.resource_constraints import DaskTaskResources +from dask_task_models_library.resource_constraints import ( + DASK_WORKER_THREAD_RESOURCE_NAME, + DaskTaskResources, +) from distributed.core import 
Status from models_library.clusters import ClusterAuthentication, TLSAuthentication from pydantic import AnyUrl, ByteSize, TypeAdapter @@ -39,7 +42,6 @@ async def _wrap_client_async_routine( _DASK_SCHEDULER_CONNECT_TIMEOUT_S: Final[int] = 5 -DASK_WORKER_THREAD_RESOURCE_NAME: Final[str] = "threads" @contextlib.asynccontextmanager diff --git a/services/autoscaling/tests/unit/test_modules_dask.py b/services/autoscaling/tests/unit/test_modules_dask.py index 4f6b4ce73366..6cc5b98a1516 100644 --- a/services/autoscaling/tests/unit/test_modules_dask.py +++ b/services/autoscaling/tests/unit/test_modules_dask.py @@ -11,6 +11,9 @@ import pytest from arrow import utcnow from aws_library.ec2 import Resources +from dask_task_models_library.resource_constraints import ( + DASK_WORKER_THREAD_RESOURCE_NAME, +) from faker import Faker from models_library.clusters import ( ClusterAuthentication, @@ -31,7 +34,6 @@ EC2InstanceData, ) from simcore_service_autoscaling.modules.dask import ( - DASK_WORKER_THREAD_RESOURCE_NAME, DaskMonitoringSettings, DaskTask, _scheduler_client, @@ -122,15 +124,13 @@ async def test_list_unrunnable_tasks( # we have nothing running now assert await list_unrunnable_tasks(scheduler_url, scheduler_authentication) == [] # start a task that cannot run - dask_task_impossible_resources = {"XRAM": 213} + dask_task_impossible_resources = DaskTaskResources(XRAM=213, threads=1) future = create_dask_task(dask_task_impossible_resources) assert future assert await list_unrunnable_tasks(scheduler_url, scheduler_authentication) == [ DaskTask( task_id=future.key, - required_resources=( - dask_task_impossible_resources | {DASK_WORKER_THREAD_RESOURCE_NAME: 1} - ), + required_resources=(dask_task_impossible_resources), ) ] # remove that future, will remove the task @@ -168,7 +168,7 @@ def _add_fct(x: int, y: int) -> int: next(iter(dask_spec_cluster_client.scheduler_info()["workers"])): [ DaskTask( task_id=DaskTaskId(future_queued_task.key), - 
required_resources={DASK_WORKER_THREAD_RESOURCE_NAME: 1}, + required_resources=DaskTaskResources(threads=1), ) ] } From b236f576ed69d1d3f5389484b145fa71c508daf6 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 26 Sep 2025 16:06:12 +0200 Subject: [PATCH 39/93] more --- .../modules/dask.py | 25 ++++++------------- 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py index 9f16ca74e425..78489740507b 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py @@ -208,15 +208,6 @@ async def list_unrunnable_tasks( DaskSchedulerNotFoundError """ - def _list_tasks( - dask_scheduler: distributed.Scheduler, - ) -> dict[dask.typing.Key, dict[str, float]]: - # NOTE: task.key can be a byte, str, or a tuple - return { - task.key: task.resource_restrictions or {} - for task in dask_scheduler.unrunnable - } - async with _scheduler_client(scheduler_url, authentication) as client: known_tasks = await _list_cluster_known_tasks(client) list_of_tasks = known_tasks["unrunnable"] @@ -224,8 +215,7 @@ def _list_tasks( return [ DaskTask( task_id=_dask_key_to_dask_task_id(task_id), - required_resources=task_resources - | {DASK_WORKER_THREAD_RESOURCE_NAME: 1}, + required_resources=task_resources, ) for task_id, task_resources in list_of_tasks.items() ] @@ -291,24 +281,23 @@ async def get_worker_used_resources( worker_url, _ = _dask_worker_from_ec2_instance(client, ec2_instance) known_tasks = await _list_cluster_known_tasks(client) worker_processing_tasks = known_tasks["processing"].get(worker_url, []) + if not worker_processing_tasks: + return Resources.create_as_empty() total_resources_used: collections.Counter[str] = collections.Counter() for _, task_resources in worker_processing_tasks: 
total_resources_used.update(task_resources) _logger.debug("found %s for %s", f"{total_resources_used=}", f"{worker_url=}") - worker_used_resources = Resources( + return Resources( cpus=total_resources_used.get("CPU", 0), ram=TypeAdapter(ByteSize).validate_python( total_resources_used.get("RAM", 0) ), + generic_resources={ + k: v for k, v in total_resources_used.items() if k not in {"CPU", "RAM"} + }, ) - if worker_processing_tasks: - worker_used_resources.generic_resources[ - DASK_WORKER_THREAD_RESOURCE_NAME - ] = len(worker_processing_tasks) - - return worker_used_resources async def compute_cluster_total_resources( From d9682ff755c8e0d075afb20a1c7ebcf7dc923761 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 26 Sep 2025 16:51:00 +0200 Subject: [PATCH 40/93] mypy --- .../src/simcore_service_autoscaling/modules/dask.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py index 78489740507b..fc6adfb6014a 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py @@ -149,8 +149,8 @@ def _list_on_scheduler( ) | {DASK_WORKER_THREAD_RESOURCE_NAME: 1} return DaskClusterTasks( - processing=dict(worker_to_processing_tasks), - unrunnable=unrunnable_tasks, + processing=worker_to_processing_tasks, # type: ignore[typeddict-item] + unrunnable=unrunnable_tasks, # type: ignore[typeddict-item] ) list_of_tasks: DaskClusterTasks = await client.run_on_scheduler(_list_on_scheduler) From fb5f001549e2de61b57ea83aba1152306a177d3f Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 26 Sep 2025 16:53:39 +0200 Subject: [PATCH 41/93] mypy --- .../modules/cluster_scaling/_utils_computational.py | 5 ++++- 1 file changed, 4 insertions(+), 1 
deletion(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_utils_computational.py b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_utils_computational.py index 4fb76ee5e129..6a351a4d1c9e 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_utils_computational.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_utils_computational.py @@ -3,8 +3,10 @@ from aws_library.ec2 import Resources from dask_task_models_library.resource_constraints import ( + DASK_WORKER_THREAD_RESOURCE_NAME, get_ec2_instance_type_from_resources, ) +from pydantic import ByteSize from ...models import DaskTask @@ -17,7 +19,8 @@ def resources_from_dask_task(task: DaskTask) -> Resources: return Resources( cpus=task.required_resources.get("CPU", _DEFAULT_MAX_CPU), - ram=task.required_resources.get("RAM", _DEFAULT_MAX_RAM), + ram=ByteSize(task.required_resources.get("RAM", _DEFAULT_MAX_RAM)), + generic_resources={DASK_WORKER_THREAD_RESOURCE_NAME: 1}, ) From adfdb9c3a01d7f9ac049bc1de5315c6243c153a5 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 26 Sep 2025 17:13:27 +0200 Subject: [PATCH 42/93] fix test --- .../cluster_scaling/_utils_computational.py | 29 +++++++++++++++---- ...les_cluster_scaling_utils_computational.py | 23 ++++++++++++--- 2 files changed, 43 insertions(+), 9 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_utils_computational.py b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_utils_computational.py index 6a351a4d1c9e..3584c7afd290 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_utils_computational.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_utils_computational.py @@ -3,7 +3,7 @@ from aws_library.ec2 import Resources 
from dask_task_models_library.resource_constraints import ( - DASK_WORKER_THREAD_RESOURCE_NAME, + DaskTaskResources, get_ec2_instance_type_from_resources, ) from pydantic import ByteSize @@ -15,13 +15,32 @@ _DEFAULT_MAX_CPU: Final[float] = 1 _DEFAULT_MAX_RAM: Final[int] = 1024 +_DASK_TO_RESOURCE_NAME_MAPPING: Final[dict[str, str]] = { + "CPU": "cpus", + "RAM": "ram", +} +_DEFAULT_DASK_RESOURCES: Final[DaskTaskResources] = DaskTaskResources( + CPU=_DEFAULT_MAX_CPU, RAM=ByteSize(_DEFAULT_MAX_RAM), threads=1 +) + def resources_from_dask_task(task: DaskTask) -> Resources: - return Resources( - cpus=task.required_resources.get("CPU", _DEFAULT_MAX_CPU), - ram=ByteSize(task.required_resources.get("RAM", _DEFAULT_MAX_RAM)), - generic_resources={DASK_WORKER_THREAD_RESOURCE_NAME: 1}, + task_resources = ( + _DEFAULT_DASK_RESOURCES | task.required_resources + ) # merge with defaults + + return Resources.from_flat_dict( + {_DASK_TO_RESOURCE_NAME_MAPPING.get(k, k): v for k, v in task_resources.items()} ) + # ({ + # "cpus": task.required_resources.get("CPU", _DEFAULT_MAX_CPU), + # "ram": task.required_resources.get("RAM", _DEFAULT_MAX_RAM), + # } + # ) + # return Resources( + # cpus=task.required_resources.get("CPU", _DEFAULT_MAX_CPU), + # ram=ByteSize(task.required_resources.get("RAM", _DEFAULT_MAX_RAM)), + # ) def get_task_instance_restriction(task: DaskTask) -> str | None: diff --git a/services/autoscaling/tests/unit/test_modules_cluster_scaling_utils_computational.py b/services/autoscaling/tests/unit/test_modules_cluster_scaling_utils_computational.py index e412487f4ea6..e051766dae31 100644 --- a/services/autoscaling/tests/unit/test_modules_cluster_scaling_utils_computational.py +++ b/services/autoscaling/tests/unit/test_modules_cluster_scaling_utils_computational.py @@ -6,6 +6,9 @@ import pytest from aws_library.ec2 import Resources +from dask_task_models_library.resource_constraints import ( + DASK_WORKER_THREAD_RESOURCE_NAME, +) from pydantic import ByteSize, TypeAdapter 
from simcore_service_autoscaling.models import DaskTask, DaskTaskResources from simcore_service_autoscaling.modules.cluster_scaling._utils_computational import ( @@ -23,13 +26,16 @@ Resources( cpus=_DEFAULT_MAX_CPU, ram=TypeAdapter(ByteSize).validate_python(_DEFAULT_MAX_RAM), + generic_resources={DASK_WORKER_THREAD_RESOURCE_NAME: 1}, ), id="missing resources returns defaults", ), pytest.param( DaskTask(task_id="fake", required_resources={"CPU": 2.5}), Resources( - cpus=2.5, ram=TypeAdapter(ByteSize).validate_python(_DEFAULT_MAX_RAM) + cpus=2.5, + ram=TypeAdapter(ByteSize).validate_python(_DEFAULT_MAX_RAM), + generic_resources={DASK_WORKER_THREAD_RESOURCE_NAME: 1}, ), id="only cpus defined", ), @@ -38,16 +44,25 @@ task_id="fake", required_resources={"CPU": 2.5, "RAM": 2 * 1024 * 1024 * 1024}, ), - Resources(cpus=2.5, ram=TypeAdapter(ByteSize).validate_python("2GiB")), + Resources( + cpus=2.5, + ram=TypeAdapter(ByteSize).validate_python("2GiB"), + generic_resources={DASK_WORKER_THREAD_RESOURCE_NAME: 1}, + ), id="cpu and ram defined", ), pytest.param( DaskTask( task_id="fake", - required_resources={"CPU": 2.5, "ram": 2 * 1024 * 1024 * 1024}, + required_resources={"CPU": 2.5, "xram": 2 * 1024 * 1024 * 1024}, # type: ignore ), Resources( - cpus=2.5, ram=TypeAdapter(ByteSize).validate_python(_DEFAULT_MAX_RAM) + cpus=2.5, + ram=TypeAdapter(ByteSize).validate_python(_DEFAULT_MAX_RAM), + generic_resources={ + DASK_WORKER_THREAD_RESOURCE_NAME: 1, + "xram": 2 * 1024 * 1024 * 1024, + }, ), id="invalid naming", ), From a402559bd3cc3e2b3aab7a0308370bfb917fc930 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 16 Oct 2025 17:08:56 +0200 Subject: [PATCH 43/93] mypy --- .../cluster_scaling/_utils_computational.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_utils_computational.py 
b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_utils_computational.py index 3584c7afd290..01fcff71523f 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_utils_computational.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_utils_computational.py @@ -1,5 +1,5 @@ import logging -from typing import Final +from typing import Final, cast from aws_library.ec2 import Resources from dask_task_models_library.resource_constraints import ( @@ -30,17 +30,11 @@ def resources_from_dask_task(task: DaskTask) -> Resources: ) # merge with defaults return Resources.from_flat_dict( - {_DASK_TO_RESOURCE_NAME_MAPPING.get(k, k): v for k, v in task_resources.items()} + { + _DASK_TO_RESOURCE_NAME_MAPPING.get(k, k): cast(int | float | str, v) + for k, v in task_resources.items() + } ) - # ({ - # "cpus": task.required_resources.get("CPU", _DEFAULT_MAX_CPU), - # "ram": task.required_resources.get("RAM", _DEFAULT_MAX_RAM), - # } - # ) - # return Resources( - # cpus=task.required_resources.get("CPU", _DEFAULT_MAX_CPU), - # ram=ByteSize(task.required_resources.get("RAM", _DEFAULT_MAX_RAM)), - # ) def get_task_instance_restriction(task: DaskTask) -> str | None: From 97a195b64ae505fb9f00e9b8e76ee4de729ca756 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 16 Oct 2025 17:09:03 +0200 Subject: [PATCH 44/93] improve docs --- packages/aws-library/src/aws_library/ec2/_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/aws-library/src/aws_library/ec2/_models.py b/packages/aws-library/src/aws_library/ec2/_models.py index 5045d34541df..20960b5b6862 100644 --- a/packages/aws-library/src/aws_library/ec2/_models.py +++ b/packages/aws-library/src/aws_library/ec2/_models.py @@ -59,7 +59,7 @@ def __ge__(self, other: "Resources") -> bool: a = self.generic_resources.get(k) b = other.generic_resources.get( k, a - ) # NOTE: 
get from other, default to a so that non-existing keys are considered equal + ) # NOTE: get from other, default to "a" resources so that non-existing keys can be compared as equal if isinstance(a, int | float) and isinstance(b, int | float): if not (a >= b): return False From 89c547cc760d9258add9ac64f1796809c48c6fc4 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 16 Oct 2025 17:11:18 +0200 Subject: [PATCH 45/93] remove todo --- .../modules/cluster_scaling/_provider_computational.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_computational.py b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_computational.py index f73fa60a3c7d..7d12ae19e6e7 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_computational.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_computational.py @@ -90,10 +90,7 @@ def get_task_required_resources(self, task) -> Resources: assert self # nosec # NOTE: a dask worker can take a task if it has a free thread, regardless of its resources # so we need to be careful when interpreting the resources, adding the thread here will mimick this - task_required_resources = utils.resources_from_dask_task(task) - # TODO: should we add a generic resource for threads? 
- # task_required_resources.generic_resources[_DASK_WORKER_THREAD_RESOURCE_NAME] = 1 - return task_required_resources + return utils.resources_from_dask_task(task) async def get_task_defined_instance( self, app: FastAPI, task From 6179fe79df5a3bc54e288bb9a1d111dad7a8d563 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 16 Oct 2025 17:46:29 +0200 Subject: [PATCH 46/93] re-added gt operator --- .../src/aws_library/ec2/_models.py | 9 +++ packages/aws-library/tests/test_ec2_models.py | 79 +++++++++++++++++++ 2 files changed, 88 insertions(+) diff --git a/packages/aws-library/src/aws_library/ec2/_models.py b/packages/aws-library/src/aws_library/ec2/_models.py index 20960b5b6862..4ca976618747 100644 --- a/packages/aws-library/src/aws_library/ec2/_models.py +++ b/packages/aws-library/src/aws_library/ec2/_models.py @@ -69,6 +69,15 @@ def __ge__(self, other: "Resources") -> bool: return False return True + def __gt__(self, other: "Resources") -> bool: + """operator for > comparison + if self has greater resources than other, returns True + Note that generic_resources are compared only if they are numeric + Non-numeric generic resources must be equal in both or only defined in self + to be considered greater + """ + return self >= other and self != other + def __add__(self, other: "Resources") -> "Resources": """operator for adding two Resources Note that only numeric generic resources are added diff --git a/packages/aws-library/tests/test_ec2_models.py b/packages/aws-library/tests/test_ec2_models.py index b83b57e75f57..00767b220f7f 100644 --- a/packages/aws-library/tests/test_ec2_models.py +++ b/packages/aws-library/tests/test_ec2_models.py @@ -96,6 +96,85 @@ def test_resources_ge_operator( assert (a >= b) is a_greater_or_equal_than_b +@pytest.mark.parametrize( + "a,b,a_greater_than_b", + [ + ( + Resources(cpus=0.2, ram=ByteSize(0)), + Resources(cpus=0.1, ram=ByteSize(0)), + True, + ), + ( + Resources(cpus=0.1, 
ram=ByteSize(0)), + Resources(cpus=0.1, ram=ByteSize(0)), + False, + ), + ( + Resources(cpus=0.1, ram=ByteSize(1)), + Resources(cpus=0.1, ram=ByteSize(0)), + True, + ), + ( + Resources(cpus=0.05, ram=ByteSize(1)), + Resources(cpus=0.1, ram=ByteSize(0)), + False, + ), + ( + Resources(cpus=0.1, ram=ByteSize(0)), + Resources(cpus=0.1, ram=ByteSize(1)), + False, + ), + ( + Resources(cpus=0.1, ram=ByteSize(0), generic_resources={"GPU": 1}), + Resources(cpus=0.1, ram=ByteSize(1)), + False, # ram is not enough + ), + ( + Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"GPU": 1}), + Resources(cpus=0.1, ram=ByteSize(1)), + True, + ), + ( + Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"GPU": 1}), + Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"GPU": 1}), + False, + ), + ( + Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"GPU": 1}), + Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"GPU": 2}), + False, + ), + ( + Resources(cpus=0.1, ram=ByteSize(1)), + Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"GPU": 2}), + False, + ), + ( + Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"GPU": "2"}), + Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"GPU": 2}), + False, + ), + ( + Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"SSE": "yes"}), + Resources(cpus=0.1, ram=ByteSize(1)), + True, + ), + ( + Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"SSE": "yes"}), + Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"SSE": "yes"}), + False, + ), + ( + Resources(cpus=0.1, ram=ByteSize(1)), + Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"SSE": "yes"}), + False, + ), + ], +) +def test_resources_gt_operator(a: Resources, b: Resources, a_greater_than_b: bool): + assert (a > b) is a_greater_than_b + + @pytest.mark.parametrize( "a,b,result", [ From 03a26142bcffe33d949e727809d599efe9e7741f Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 16 
Oct 2025 18:06:00 +0200 Subject: [PATCH 47/93] use Required --- .../dask_task_models_library/resource_constraints.py | 11 +++++++---- .../tests/test_resource_constraints.py | 12 ++++++++++++ 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/packages/dask-task-models-library/src/dask_task_models_library/resource_constraints.py b/packages/dask-task-models-library/src/dask_task_models_library/resource_constraints.py index 49f050a05a46..55c700541a73 100644 --- a/packages/dask-task-models-library/src/dask_task_models_library/resource_constraints.py +++ b/packages/dask-task-models-library/src/dask_task_models_library/resource_constraints.py @@ -1,4 +1,4 @@ -from typing import Final, Literal, TypedDict +from typing import Final, Literal, Required, TypedDict from .constants import DASK_TASK_EC2_RESOURCE_RESTRICTION_KEY @@ -6,10 +6,13 @@ class DaskTaskResources(TypedDict, total=False): - CPU: float - RAM: int # in bytes + CPU: Required[float] + RAM: Required[int] # in bytes # threads is a constant of 1 (enforced by static type checkers via Literal) - threads: Literal[1] + # NOTE: a dask worker can take a task if it has a free thread, + # regardless of its resources so we need to be careful when interpreting + # the resources, adding the thread here will mimick this + threads: Required[Literal[1]] def create_ec2_resource_constraint_key(ec2_instance_type: str) -> str: diff --git a/packages/dask-task-models-library/tests/test_resource_constraints.py b/packages/dask-task-models-library/tests/test_resource_constraints.py index 9a2c1e59e26b..121d2b740d23 100644 --- a/packages/dask-task-models-library/tests/test_resource_constraints.py +++ b/packages/dask-task-models-library/tests/test_resource_constraints.py @@ -1,11 +1,23 @@ from dask_task_models_library.constants import DASK_TASK_EC2_RESOURCE_RESTRICTION_KEY from dask_task_models_library.resource_constraints import ( + DaskTaskResources, create_ec2_resource_constraint_key, get_ec2_instance_type_from_resources, ) 
from faker import Faker +def test_dask_task_resource(faker: Faker): + task_resources = DaskTaskResources( + CPU=faker.pyfloat(min_value=0.1, max_value=100), + RAM=faker.pyint(min_value=1024, max_value=1024**3), + threads=1, + ) + assert task_resources["threads"] == 1 + assert task_resources["CPU"] > 0 + assert task_resources["RAM"] >= 1024 + + def test_create_ec2_resource_constraint_key(faker: Faker): faker_instance_type = faker.pystr() assert ( From 8e0f9055727de86163f0f930fc66122c23e3ba93 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 16 Oct 2025 18:06:53 +0200 Subject: [PATCH 48/93] improve docs --- .../modules/cluster_scaling/_utils_computational.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_utils_computational.py b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_utils_computational.py index 01fcff71523f..8ec895348108 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_utils_computational.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_utils_computational.py @@ -27,7 +27,7 @@ def resources_from_dask_task(task: DaskTask) -> Resources: task_resources = ( _DEFAULT_DASK_RESOURCES | task.required_resources - ) # merge with defaults + ) # merge with defaults to ensure there is always some minimal resource defined return Resources.from_flat_dict( { From 74d8ca81b18581810397c70d3fb9c310175d1cc0 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 16 Oct 2025 18:07:10 +0200 Subject: [PATCH 49/93] moved docs --- .../modules/cluster_scaling/_provider_computational.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_computational.py 
b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_computational.py index 7d12ae19e6e7..243674344a39 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_computational.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_computational.py @@ -88,8 +88,6 @@ async def list_unrunnable_tasks(self, app: FastAPI) -> list[DaskTask]: def get_task_required_resources(self, task) -> Resources: assert self # nosec - # NOTE: a dask worker can take a task if it has a free thread, regardless of its resources - # so we need to be careful when interpreting the resources, adding the thread here will mimick this return utils.resources_from_dask_task(task) async def get_task_defined_instance( From 1158e99423292fa1c88e8fe1b5f7b74cc9e70a68 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 16 Oct 2025 18:16:05 +0200 Subject: [PATCH 50/93] added mapping --- .../src/aws_library/ec2/_models.py | 21 ++++++++++++++----- packages/aws-library/tests/test_ec2_models.py | 7 +++++++ 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/packages/aws-library/src/aws_library/ec2/_models.py b/packages/aws-library/src/aws_library/ec2/_models.py index 4ca976618747..27e8024e5a42 100644 --- a/packages/aws-library/src/aws_library/ec2/_models.py +++ b/packages/aws-library/src/aws_library/ec2/_models.py @@ -141,12 +141,23 @@ def as_flat_dict(self) -> dict[str, int | float | str]: return base @classmethod - def from_flat_dict(cls, data: dict[str, int | float | str]) -> "Resources": - """Inverse of as_flat_dict""" - generic_resources = {k: v for k, v in data.items() if k not in {"cpus", "ram"}} + def from_flat_dict( + cls, + data: dict[str, int | float | str], + *, + mapping: dict[str, str] | None = None, + ) -> "Resources": + """Inverse of as_flat_dict with optional key mapping""" + mapped_data = data + if mapping: + mapped_data 
= {mapping.get(k, k): v for k, v in data.items()} + generic_resources = { + k: v for k, v in mapped_data.items() if k not in {"cpus", "ram"} + } + return cls( - cpus=float(data.get("cpus", 0)), - ram=ByteSize(data.get("ram", 0)), + cpus=float(mapped_data.get("cpus", 0)), + ram=ByteSize(mapped_data.get("ram", 0)), generic_resources=generic_resources, ) diff --git a/packages/aws-library/tests/test_ec2_models.py b/packages/aws-library/tests/test_ec2_models.py index 00767b220f7f..131b24da87f8 100644 --- a/packages/aws-library/tests/test_ec2_models.py +++ b/packages/aws-library/tests/test_ec2_models.py @@ -298,6 +298,13 @@ def test_resources_flat_dict(): reconstructed = Resources.from_flat_dict(flat) assert reconstructed == r + # test with mapping + flat_with_oter_names = {"CPU": 0.1, "RAM": 1024, "GPU": 2, "SSE": "yes"} + reconstructed2 = Resources.from_flat_dict( + flat_with_oter_names, mapping={"CPU": "cpus", "RAM": "ram"} + ) + assert reconstructed2 == r + @pytest.mark.parametrize("ec2_tag_key", ["", "/", " ", ".", "..", "_index"]) def test_aws_tag_key_invalid(ec2_tag_key: str): From 49b4bb9ded8af3afb9ae5233443084286edff17e Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 16 Oct 2025 18:17:40 +0200 Subject: [PATCH 51/93] added mapping --- .../modules/cluster_scaling/_utils_computational.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_utils_computational.py b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_utils_computational.py index 8ec895348108..80e846398096 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_utils_computational.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_utils_computational.py @@ -1,5 +1,5 @@ import logging -from typing import Final, cast +from typing import Final from aws_library.ec2 
import Resources from dask_task_models_library.resource_constraints import ( @@ -30,10 +30,7 @@ def resources_from_dask_task(task: DaskTask) -> Resources: ) # merge with defaults to ensure there is always some minimal resource defined return Resources.from_flat_dict( - { - _DASK_TO_RESOURCE_NAME_MAPPING.get(k, k): cast(int | float | str, v) - for k, v in task_resources.items() - } + task_resources.items(), mapping=_DASK_TO_RESOURCE_NAME_MAPPING ) From 83db960fb7a2d07aa6e15d45025b1361d661c593 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 16 Oct 2025 18:19:58 +0200 Subject: [PATCH 52/93] improve error --- .../src/simcore_service_autoscaling/modules/dask.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py index fc6adfb6014a..e759a0c08027 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py @@ -104,8 +104,8 @@ def _find_by_worker_host( _, details = dask_worker if match := re.match(DASK_NAME_PATTERN, details["name"]): return bool(match.group("private_ip") == node_hostname) - _logger.warning( - "Unexpected worker name format: %s. TIP: this should be investigated", + _logger.error( + "Unexpected worker name format: %s. 
TIP: this should be investigated as this is unexpected", details["name"], ) return False From 4d2281f1fc5abaf25051650ff78948cfee36afe6 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 16 Oct 2025 18:24:27 +0200 Subject: [PATCH 53/93] make private --- .../src/simcore_service_autoscaling/modules/dask.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py index e759a0c08027..d2ef18333574 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py @@ -121,17 +121,18 @@ def _find_by_worker_host( return next(iter(filtered_workers.items())) -class DaskClusterTasks(TypedDict): +class _DaskClusterTasks(TypedDict): processing: dict[DaskWorkerUrl, list[tuple[dask.typing.Key, DaskTaskResources]]] unrunnable: dict[dask.typing.Key, DaskTaskResources] async def _list_cluster_known_tasks( client: distributed.Client, -) -> DaskClusterTasks: +) -> _DaskClusterTasks: def _list_on_scheduler( dask_scheduler: distributed.Scheduler, - ) -> DaskClusterTasks: + ) -> _DaskClusterTasks: + worker_to_processing_tasks = defaultdict(list) unrunnable_tasks = {} for task_key, task_state in dask_scheduler.tasks.items(): @@ -148,12 +149,12 @@ def _list_on_scheduler( task_state.resource_restrictions or {} ) | {DASK_WORKER_THREAD_RESOURCE_NAME: 1} - return DaskClusterTasks( + return _DaskClusterTasks( processing=worker_to_processing_tasks, # type: ignore[typeddict-item] unrunnable=unrunnable_tasks, # type: ignore[typeddict-item] ) - list_of_tasks: DaskClusterTasks = await client.run_on_scheduler(_list_on_scheduler) + list_of_tasks: _DaskClusterTasks = await client.run_on_scheduler(_list_on_scheduler) _logger.debug("found tasks: %s", list_of_tasks) return list_of_tasks From 
7418819b9bbd3177fef3f2649f0eb93e3b86f0fb Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 17 Oct 2025 08:30:28 +0200 Subject: [PATCH 54/93] simplify --- .../simcore_service_autoscaling/modules/dask.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py index d2ef18333574..4f7495c0162d 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py @@ -28,6 +28,9 @@ node_host_name_from_ec2_private_dns, node_ip_from_ec2_private_dns, ) +from .cluster_scaling._utils_computational import ( + resources_from_dask_task, +) _logger = logging.getLogger(__name__) @@ -285,20 +288,12 @@ async def get_worker_used_resources( if not worker_processing_tasks: return Resources.create_as_empty() - total_resources_used: collections.Counter[str] = collections.Counter() + total_resources_used: collections.Counter = collections.Counter() for _, task_resources in worker_processing_tasks: total_resources_used.update(task_resources) _logger.debug("found %s for %s", f"{total_resources_used=}", f"{worker_url=}") - return Resources( - cpus=total_resources_used.get("CPU", 0), - ram=TypeAdapter(ByteSize).validate_python( - total_resources_used.get("RAM", 0) - ), - generic_resources={ - k: v for k, v in total_resources_used.items() if k not in {"CPU", "RAM"} - }, - ) + return resources_from_dask_task(total_resources_used) async def compute_cluster_total_resources( From 3f930bd9b691857ddb1d666ee766d303616dee39 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 17 Oct 2025 08:33:36 +0200 Subject: [PATCH 55/93] simplify --- .../modules/cluster_scaling/_utils_computational.py | 4 ++-- .../src/simcore_service_autoscaling/modules/dask.py | 6 ++++-- 2 
files changed, 6 insertions(+), 4 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_utils_computational.py b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_utils_computational.py index 80e846398096..0e6c8dbed549 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_utils_computational.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_utils_computational.py @@ -15,7 +15,7 @@ _DEFAULT_MAX_CPU: Final[float] = 1 _DEFAULT_MAX_RAM: Final[int] = 1024 -_DASK_TO_RESOURCE_NAME_MAPPING: Final[dict[str, str]] = { +DASK_TO_RESOURCE_NAME_MAPPING: Final[dict[str, str]] = { "CPU": "cpus", "RAM": "ram", } @@ -30,7 +30,7 @@ def resources_from_dask_task(task: DaskTask) -> Resources: ) # merge with defaults to ensure there is always some minimal resource defined return Resources.from_flat_dict( - task_resources.items(), mapping=_DASK_TO_RESOURCE_NAME_MAPPING + task_resources.items(), mapping=DASK_TO_RESOURCE_NAME_MAPPING ) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py index 4f7495c0162d..5570fccee15f 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py @@ -29,7 +29,7 @@ node_ip_from_ec2_private_dns, ) from .cluster_scaling._utils_computational import ( - resources_from_dask_task, + DASK_TO_RESOURCE_NAME_MAPPING, ) _logger = logging.getLogger(__name__) @@ -293,7 +293,9 @@ async def get_worker_used_resources( total_resources_used.update(task_resources) _logger.debug("found %s for %s", f"{total_resources_used=}", f"{worker_url=}") - return resources_from_dask_task(total_resources_used) + return Resources.from_flat_dict( + dict(total_resources_used), mapping=DASK_TO_RESOURCE_NAME_MAPPING + ) async def 
compute_cluster_total_resources( From ec8cbaf614e4665b8e111bb1797d1dd4a3f4f3e5 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 17 Oct 2025 08:46:54 +0200 Subject: [PATCH 56/93] fix computation --- .../modules/dask.py | 23 ++++++++----------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py index 5570fccee15f..1d921bbb3ba7 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py @@ -15,7 +15,7 @@ ) from distributed.core import Status from models_library.clusters import ClusterAuthentication, TLSAuthentication -from pydantic import AnyUrl, ByteSize, TypeAdapter +from pydantic import AnyUrl from ..core.errors import ( DaskNoWorkersError, @@ -306,7 +306,7 @@ async def compute_cluster_total_resources( if not instances: return Resources.create_as_empty() async with _scheduler_client(scheduler_url, authentication) as client: - instance_host_resources_map = { + ec2_instance_resources_map = { node_ip_from_ec2_private_dns(i): i.resources for i in instances } scheduler_info = client.scheduler_info() @@ -315,20 +315,17 @@ async def compute_cluster_total_resources( workers: dict[str, Any] = scheduler_info["workers"] cluster_resources = Resources.create_as_empty() for worker_details in workers.values(): - if worker_details["host"] not in instance_host_resources_map: + if worker_details["host"] not in ec2_instance_resources_map: continue + # get dask information about resources worker_dask_resources = worker_details["resources"] worker_threads = worker_details["nthreads"] - cluster_resources += Resources( - cpus=worker_dask_resources.get( - "CPU", instance_host_resources_map[worker_details["host"]].cpus - ), - ram=TypeAdapter(ByteSize).validate_python( - worker_dask_resources.get( 
- "RAM", instance_host_resources_map[worker_details["host"]].ram - ) - ), - generic_resources={DASK_WORKER_THREAD_RESOURCE_NAME: worker_threads}, + worker_dask_resources = { + **worker_dask_resources, + DASK_WORKER_THREAD_RESOURCE_NAME: worker_threads, + } + cluster_resources += Resources.from_flat_dict( + worker_dask_resources.items(), mapping=DASK_TO_RESOURCE_NAME_MAPPING ) return cluster_resources From f40f5dbae9bade6476917a21be3e3a6571fb0451 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 17 Oct 2025 08:47:37 +0200 Subject: [PATCH 57/93] type --- .../autoscaling/src/simcore_service_autoscaling/modules/dask.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py index 1d921bbb3ba7..c105057254c2 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py @@ -320,7 +320,7 @@ async def compute_cluster_total_resources( # get dask information about resources worker_dask_resources = worker_details["resources"] worker_threads = worker_details["nthreads"] - worker_dask_resources = { + worker_dask_resources: dict[str, int | float | str] = { **worker_dask_resources, DASK_WORKER_THREAD_RESOURCE_NAME: worker_threads, } From fd3a58d8f9df4df6ba28032e31b908cf09ba5fec Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 17 Oct 2025 08:48:21 +0200 Subject: [PATCH 58/93] type --- .../autoscaling/src/simcore_service_autoscaling/modules/dask.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py index c105057254c2..dd6dffde0aeb 100644 --- 
a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py @@ -325,7 +325,7 @@ async def compute_cluster_total_resources( DASK_WORKER_THREAD_RESOURCE_NAME: worker_threads, } cluster_resources += Resources.from_flat_dict( - worker_dask_resources.items(), mapping=DASK_TO_RESOURCE_NAME_MAPPING + worker_dask_resources, mapping=DASK_TO_RESOURCE_NAME_MAPPING ) return cluster_resources From b114585b8bad11557a34594931809410fcb7fbf2 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 17 Oct 2025 08:50:30 +0200 Subject: [PATCH 59/93] no need to call items --- .../modules/cluster_scaling/_utils_computational.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_utils_computational.py b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_utils_computational.py index 0e6c8dbed549..4b32fbbbede1 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_utils_computational.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_utils_computational.py @@ -30,7 +30,7 @@ def resources_from_dask_task(task: DaskTask) -> Resources: ) # merge with defaults to ensure there is always some minimal resource defined return Resources.from_flat_dict( - task_resources.items(), mapping=DASK_TO_RESOURCE_NAME_MAPPING + task_resources, mapping=DASK_TO_RESOURCE_NAME_MAPPING ) From f852935c1a0a9ad1f1153974d13ee8539d3266b9 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 17 Oct 2025 08:51:43 +0200 Subject: [PATCH 60/93] revert --- .../src/simcore_service_autoscaling/utils/cluster_scaling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/utils/cluster_scaling.py 
b/services/autoscaling/src/simcore_service_autoscaling/utils/cluster_scaling.py index 1cff28a0bb46..13c25dcd2112 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/utils/cluster_scaling.py +++ b/services/autoscaling/src/simcore_service_autoscaling/utils/cluster_scaling.py @@ -109,7 +109,7 @@ def find_selected_instance_type_for_task( selected_instance = filtered_instances[0] # check that the assigned resources and the machine resource fit - if not (task_required_resources <= selected_instance.resources): + if task_required_resources > selected_instance.resources: raise TaskRequirementsAboveRequiredEC2InstanceTypeError( task=task, instance_type=selected_instance, From b02ec9732487e8345b3f1b8b954cc976a98ddee7 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 17 Oct 2025 14:18:29 +0200 Subject: [PATCH 61/93] refactor --- .../aws-library/src/aws_library/ec2/__init__.py | 2 ++ packages/aws-library/src/aws_library/ec2/_models.py | 10 +++++----- .../modules/cluster_scaling/_utils_computational.py | 7 ++++--- .../src/simcore_service_autoscaling/modules/dask.py | 13 ++++++------- 4 files changed, 17 insertions(+), 15 deletions(-) diff --git a/packages/aws-library/src/aws_library/ec2/__init__.py b/packages/aws-library/src/aws_library/ec2/__init__.py index 0acff01ff0d6..127a6dd076db 100644 --- a/packages/aws-library/src/aws_library/ec2/__init__.py +++ b/packages/aws-library/src/aws_library/ec2/__init__.py @@ -17,6 +17,7 @@ EC2InstanceData, EC2InstanceType, EC2Tags, + GenericResourceValueType, Resources, ) @@ -36,6 +37,7 @@ "EC2NotConnectedError", "EC2RuntimeError", "EC2Tags", + "GenericResourceValueType", "Resources", "SimcoreEC2API", ) diff --git a/packages/aws-library/src/aws_library/ec2/_models.py b/packages/aws-library/src/aws_library/ec2/_models.py index 27e8024e5a42..6d4ff3a7cdca 100644 --- a/packages/aws-library/src/aws_library/ec2/_models.py +++ b/packages/aws-library/src/aws_library/ec2/_models.py @@ 
-22,14 +22,14 @@ from pydantic.config import JsonDict from types_aiobotocore_ec2.literals import InstanceStateNameType, InstanceTypeType -GenericResourceValue: TypeAlias = StrictInt | StrictFloat | str +GenericResourceValueType: TypeAlias = StrictInt | StrictFloat | str class Resources(BaseModel, frozen=True): cpus: NonNegativeFloat ram: ByteSize generic_resources: Annotated[ - dict[str, GenericResourceValue], + dict[str, GenericResourceValueType], Field( default_factory=dict, description=( @@ -83,7 +83,7 @@ def __add__(self, other: "Resources") -> "Resources": Note that only numeric generic resources are added Non-numeric generic resources are ignored """ - merged: dict[str, GenericResourceValue] = {} + merged: dict[str, GenericResourceValueType] = {} keys = set(self.generic_resources) | set(other.generic_resources) for k in keys: a = self.generic_resources.get(k) @@ -107,7 +107,7 @@ def __sub__(self, other: "Resources") -> "Resources": Note that only numeric generic resources are subtracted Non-numeric generic resources are ignored """ - merged: dict[str, GenericResourceValue] = {} + merged: dict[str, GenericResourceValueType] = {} keys = set(self.generic_resources) | set(other.generic_resources) for k in keys: a = self.generic_resources.get(k) @@ -129,7 +129,7 @@ def __sub__(self, other: "Resources") -> "Resources": def __hash__(self) -> int: """Deterministic hash including cpus, ram (in bytes) and generic_resources.""" # sort generic_resources items to ensure order-independent hashing - generic_items: tuple[tuple[str, GenericResourceValue], ...] = tuple( + generic_items: tuple[tuple[str, GenericResourceValueType], ...] 
= tuple( sorted(self.generic_resources.items()) ) return hash((self.cpus, self.ram, generic_items)) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_utils_computational.py b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_utils_computational.py index 4b32fbbbede1..f5ed682f6669 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_utils_computational.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_utils_computational.py @@ -1,7 +1,7 @@ import logging -from typing import Final +from typing import Final, cast -from aws_library.ec2 import Resources +from aws_library.ec2 import GenericResourceValueType, Resources from dask_task_models_library.resource_constraints import ( DaskTaskResources, get_ec2_instance_type_from_resources, @@ -30,7 +30,8 @@ def resources_from_dask_task(task: DaskTask) -> Resources: ) # merge with defaults to ensure there is always some minimal resource defined return Resources.from_flat_dict( - task_resources, mapping=DASK_TO_RESOURCE_NAME_MAPPING + cast(dict[str, GenericResourceValueType], task_resources), + mapping=DASK_TO_RESOURCE_NAME_MAPPING, ) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py index dd6dffde0aeb..f7eb0193de0f 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py @@ -135,7 +135,6 @@ async def _list_cluster_known_tasks( def _list_on_scheduler( dask_scheduler: distributed.Scheduler, ) -> _DaskClusterTasks: - worker_to_processing_tasks = defaultdict(list) unrunnable_tasks = {} for task_key, task_state in dask_scheduler.tasks.items(): @@ -319,13 +318,13 @@ async def compute_cluster_total_resources( continue # get dask information about resources worker_dask_resources = 
worker_details["resources"] - worker_threads = worker_details["nthreads"] - worker_dask_resources: dict[str, int | float | str] = { - **worker_dask_resources, - DASK_WORKER_THREAD_RESOURCE_NAME: worker_threads, - } + worker_dask_nthreads = worker_details["nthreads"] cluster_resources += Resources.from_flat_dict( - worker_dask_resources, mapping=DASK_TO_RESOURCE_NAME_MAPPING + { + **worker_dask_resources, + DASK_WORKER_THREAD_RESOURCE_NAME: worker_dask_nthreads, + }, + mapping=DASK_TO_RESOURCE_NAME_MAPPING, ) return cluster_resources From 44eb03d18563c2ef7f211935acc5164d3286b268 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 17 Oct 2025 14:18:38 +0200 Subject: [PATCH 62/93] better assert --- .../src/dask_task_models_library/container_tasks/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/dask-task-models-library/src/dask_task_models_library/container_tasks/utils.py b/packages/dask-task-models-library/src/dask_task_models_library/container_tasks/utils.py index d97b0c896c36..97cfb440f45b 100644 --- a/packages/dask-task-models-library/src/dask_task_models_library/container_tasks/utils.py +++ b/packages/dask-task-models-library/src/dask_task_models_library/container_tasks/utils.py @@ -34,7 +34,7 @@ def parse_dask_job_id( job_id: str, ) -> tuple[ServiceKey, ServiceVersion, UserID, ProjectID, NodeID]: parts = job_id.split(":") - assert len(parts) == _JOB_ID_PARTS # nosec + assert len(parts) == _JOB_ID_PARTS, f"unexpected job id {parts=}" # nosec return ( parts[0], parts[1], From 8916cc6d2d1975f594fdf9e228783938e609a529 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 17 Oct 2025 14:18:59 +0200 Subject: [PATCH 63/93] improving test --- services/autoscaling/tests/unit/conftest.py | 52 +++++++++++++++++++ ...t_modules_cluster_scaling_computational.py | 12 +++-- .../tests/unit/test_modules_dask.py | 4 +- .../tests/unit/test_utils_rabbitmq.py | 
25 --------- 4 files changed, 62 insertions(+), 31 deletions(-) diff --git a/services/autoscaling/tests/unit/conftest.py b/services/autoscaling/tests/unit/conftest.py index 192cc4932dde..3c77b01be372 100644 --- a/services/autoscaling/tests/unit/conftest.py +++ b/services/autoscaling/tests/unit/conftest.py @@ -30,6 +30,7 @@ Resources, ) from common_library.json_serialization import json_dumps +from dask_task_models_library.container_tasks.utils import generate_dask_job_id from deepdiff import DeepDiff from faker import Faker from fakeredis.aioredis import FakeRedis @@ -52,7 +53,11 @@ Service, TaskSpec, ) +from models_library.projects import ProjectID +from models_library.projects_nodes_io import NodeID from models_library.services_metadata_runtime import SimcoreContainerLabels +from models_library.services_types import ServiceKey, ServiceVersion +from models_library.users import UserID from pydantic import ByteSize, NonNegativeInt, PositiveInt, TypeAdapter from pytest_mock import MockType from pytest_mock.plugin import MockerFixture @@ -857,9 +862,55 @@ def _creator(**cluter_overrides) -> Cluster: return _creator +@pytest.fixture +def service_version() -> ServiceVersion: + return "1.0.234" + + +@pytest.fixture +def service_key() -> ServiceKey: + return "simcore/services/dynamic/test" + + +@pytest.fixture +def node_id(faker: Faker) -> NodeID: + return faker.uuid4(cast_to=None) + + +@pytest.fixture +def project_id(faker: Faker) -> ProjectID: + return faker.uuid4(cast_to=None) + + +@pytest.fixture +def user_id(faker: Faker) -> UserID: + return faker.pyint(min_value=1) + + +@pytest.fixture +def fake_dask_job_id( + service_key: ServiceKey, + service_version: ServiceVersion, + user_id: UserID, + project_id: ProjectID, + faker: Faker, +) -> Callable[[], str]: + def _() -> str: + return generate_dask_job_id( + service_key=service_key, + service_version=service_version, + user_id=user_id, + project_id=project_id, + node_id=faker.uuid4(cast_to=None), + ) + + return _ + + 
@pytest.fixture async def create_dask_task( dask_spec_cluster_client: distributed.Client, + fake_dask_job_id: Callable[[], str], ) -> Callable[..., distributed.Future]: def _remote_pytest_fct(x: int, y: int) -> int: return x + y @@ -874,6 +925,7 @@ def _creator( 43, resources=required_resources, pure=False, + key=fake_dask_job_id(), **overrides, ) assert future diff --git a/services/autoscaling/tests/unit/test_modules_cluster_scaling_computational.py b/services/autoscaling/tests/unit/test_modules_cluster_scaling_computational.py index fbfd965cd34a..0ec7755c3c61 100644 --- a/services/autoscaling/tests/unit/test_modules_cluster_scaling_computational.py +++ b/services/autoscaling/tests/unit/test_modules_cluster_scaling_computational.py @@ -126,10 +126,14 @@ def _assert_rabbit_autoscaling_message_sent( instances_running=0, ) expected_message = default_message.model_copy(update=message_update_kwargs) - mock_rabbitmq_post_message.assert_called_once_with( - app, - expected_message, - ) + # in this mock we get all kind of messages, we just want to assert one of them is the expected one and there is only one + autoscaling_status_messages = [ + call_args.args[1] + for call_args in mock_rabbitmq_post_message.call_args_list + if isinstance(call_args.args[1], RabbitAutoscalingStatusMessage) + ] + assert len(autoscaling_status_messages) == 1, "too many messages sent" + assert autoscaling_status_messages[0] == expected_message @pytest.fixture diff --git a/services/autoscaling/tests/unit/test_modules_dask.py b/services/autoscaling/tests/unit/test_modules_dask.py index 6cc5b98a1516..4ed547d9f4d2 100644 --- a/services/autoscaling/tests/unit/test_modules_dask.py +++ b/services/autoscaling/tests/unit/test_modules_dask.py @@ -124,7 +124,7 @@ async def test_list_unrunnable_tasks( # we have nothing running now assert await list_unrunnable_tasks(scheduler_url, scheduler_authentication) == [] # start a task that cannot run - dask_task_impossible_resources = DaskTaskResources(XRAM=213, 
threads=1) + dask_task_impossible_resources = DaskTaskResources(XRAM=213, threads=1) # type: ignore future = create_dask_task(dask_task_impossible_resources) assert future assert await list_unrunnable_tasks(scheduler_url, scheduler_authentication) == [ @@ -168,7 +168,7 @@ def _add_fct(x: int, y: int) -> int: next(iter(dask_spec_cluster_client.scheduler_info()["workers"])): [ DaskTask( task_id=DaskTaskId(future_queued_task.key), - required_resources=DaskTaskResources(threads=1), + required_resources=DaskTaskResources(threads=1), # type: ignore ) ] } diff --git a/services/autoscaling/tests/unit/test_utils_rabbitmq.py b/services/autoscaling/tests/unit/test_utils_rabbitmq.py index 006155b1e0fa..8741949e76a7 100644 --- a/services/autoscaling/tests/unit/test_utils_rabbitmq.py +++ b/services/autoscaling/tests/unit/test_utils_rabbitmq.py @@ -122,31 +122,6 @@ async def _(labels: dict[DockerLabelKey, str]) -> list[Task]: return _ -@pytest.fixture -def service_version() -> ServiceVersion: - return "1.0.0" - - -@pytest.fixture -def service_key() -> ServiceKey: - return "simcore/services/dynamic/test" - - -@pytest.fixture -def node_id(faker: Faker) -> NodeID: - return faker.uuid4(cast_to=None) - - -@pytest.fixture -def project_id(faker: Faker) -> ProjectID: - return faker.uuid4(cast_to=None) - - -@pytest.fixture -def user_id(faker: Faker) -> UserID: - return faker.pyint(min_value=1) - - @pytest.fixture def dask_task( service_key: ServiceKey, From 255a7d26cfd00a226a44a55911ad70ced8e2bccc Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 17 Oct 2025 16:36:47 +0200 Subject: [PATCH 64/93] add generic instances based on the provider --- .../modules/cluster_scaling/_auto_scaling_core.py | 3 +++ .../modules/cluster_scaling/_provider_computational.py | 9 +++++++++ .../modules/cluster_scaling/_provider_protocol.py | 4 ++++ .../src/simcore_service_autoscaling/modules/dask.py | 8 ++++++++ 
.../unit/test_modules_cluster_scaling_computational.py | 2 +- 5 files changed, 25 insertions(+), 1 deletion(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_auto_scaling_core.py b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_auto_scaling_core.py index 5b74cb412fad..967b85281d18 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_auto_scaling_core.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_auto_scaling_core.py @@ -112,6 +112,9 @@ async def _analyze_current_cluster( state_names=["stopped"], ) + for instance in itertools.chain(existing_ec2_instances, warm_buffer_ec2_instances): + auto_scaling_mode.add_instance_generic_resources(app, instance) + attached_ec2s, pending_ec2s = associate_ec2_instances_with_nodes( docker_nodes, existing_ec2_instances ) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_computational.py b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_computational.py index 243674344a39..70cb9eedad47 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_computational.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_computational.py @@ -182,3 +182,12 @@ async def is_instance_retired( async def try_retire_nodes(self, app: FastAPI) -> None: assert self # nosec await dask.try_retire_nodes(_scheduler_url(app), _scheduler_auth(app)) + + def add_instance_generic_resources( + self, app: FastAPI, instance: EC2InstanceData + ) -> None: + assert self # nosec + assert app # nosec + app_settings = get_application_settings(app) + assert app_settings.AUTOSCALING_DASK # nosec + dask.add_instance_generic_resources(app_settings.AUTOSCALING_DASK, instance) diff --git 
a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_protocol.py b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_protocol.py index 355394b9f1d3..71355d21bcf3 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_protocol.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_protocol.py @@ -47,3 +47,7 @@ async def is_instance_retired( ) -> bool: ... async def try_retire_nodes(self, app: FastAPI) -> None: ... + + def add_instance_generic_resources( + self, app: FastAPI, instance: EC2InstanceData + ) -> None: ... diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py index f7eb0193de0f..b7290d129919 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py @@ -12,6 +12,7 @@ from dask_task_models_library.resource_constraints import ( DASK_WORKER_THREAD_RESOURCE_NAME, DaskTaskResources, + create_ec2_resource_constraint_key, ) from distributed.core import Status from models_library.clusters import ClusterAuthentication, TLSAuthentication @@ -339,6 +340,9 @@ async def try_retire_nodes( ) +_LARGE_RESOURCE: Final[int] = 99999 + + def add_instance_generic_resources( settings: DaskMonitoringSettings, instance: EC2InstanceData ) -> None: @@ -351,3 +355,7 @@ def add_instance_generic_resources( instance.resources.generic_resources[DASK_WORKER_THREAD_RESOURCE_NAME] = ( instance_threads ) + + instance.resources.generic_resources[ + create_ec2_resource_constraint_key(instance.type) + ] = _LARGE_RESOURCE diff --git a/services/autoscaling/tests/unit/test_modules_cluster_scaling_computational.py b/services/autoscaling/tests/unit/test_modules_cluster_scaling_computational.py index 0ec7755c3c61..6cf0562976e5 100644 --- 
a/services/autoscaling/tests/unit/test_modules_cluster_scaling_computational.py +++ b/services/autoscaling/tests/unit/test_modules_cluster_scaling_computational.py @@ -638,7 +638,7 @@ async def test_cluster_scaling_up_and_down( # noqa: PLR0915 ) mock_docker_tag_node.reset_mock() mock_docker_set_node_availability.assert_not_called() - mock_rabbitmq_post_message.assert_called_once() + assert mock_rabbitmq_post_message.call_count == 3 mock_rabbitmq_post_message.reset_mock() # now we have 1 monitored node that needs to be mocked From 225491909141ca46af2cbf76d17158a9fe598fae Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 17 Oct 2025 16:39:41 +0200 Subject: [PATCH 65/93] sonar --- packages/aws-library/src/aws_library/ec2/_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/aws-library/src/aws_library/ec2/_models.py b/packages/aws-library/src/aws_library/ec2/_models.py index 6d4ff3a7cdca..c8bfb91d099d 100644 --- a/packages/aws-library/src/aws_library/ec2/_models.py +++ b/packages/aws-library/src/aws_library/ec2/_models.py @@ -61,7 +61,7 @@ def __ge__(self, other: "Resources") -> bool: k, a ) # NOTE: get from other, default to "a" resources so that non-existing keys can be compared as equal if isinstance(a, int | float) and isinstance(b, int | float): - if not (a >= b): + if a < b: return False elif a != b: assert isinstance(a, str | None) # nosec From b9d7428460e5de0f1b0970b1edda13075f0c9eb7 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 17 Oct 2025 17:15:11 +0200 Subject: [PATCH 66/93] fix? 
--- .../src/aws_library/ec2/_models.py | 42 +++++++++++-------- packages/aws-library/tests/test_ec2_models.py | 12 +++--- ...t_modules_cluster_scaling_computational.py | 1 - 3 files changed, 30 insertions(+), 25 deletions(-) diff --git a/packages/aws-library/src/aws_library/ec2/_models.py b/packages/aws-library/src/aws_library/ec2/_models.py index c8bfb91d099d..982d73dfa839 100644 --- a/packages/aws-library/src/aws_library/ec2/_models.py +++ b/packages/aws-library/src/aws_library/ec2/_models.py @@ -46,37 +46,43 @@ def create_as_empty(cls) -> "Resources": def __ge__(self, other: "Resources") -> bool: """operator for >= comparison if self has greater or equal resources than other, returns True + This will return True only if any of the resources in self is greater or equal to other + Note that generic_resources are compared only if they are numeric Non-numeric generic resources must be equal in both or only defined in self to be considered greater or equal """ + if self == other: + return True + return self > other + + def __gt__(self, other: "Resources") -> bool: + """operator for > comparison + if self has any resources gretaer than other, returns True (even if different resource types are smaller) - if not (self.cpus >= other.cpus and self.ram >= other.ram): - return False + Note that generic_resources are compared only if they are numeric + Non-numeric generic resources must be equal in both or only defined in self + to be considered greater + """ + if (self.cpus > other.cpus) or (self.ram > other.ram): + return True keys = set(self.generic_resources) | set(other.generic_resources) for k in keys: a = self.generic_resources.get(k) - b = other.generic_resources.get( - k, a - ) # NOTE: get from other, default to "a" resources so that non-existing keys can be compared as equal + b = other.generic_resources.get(k) + if a is None: + continue + if b is None: + return True if isinstance(a, int | float) and isinstance(b, int | float): - if a < b: - return False + if a > 
b: + return True elif a != b: assert isinstance(a, str | None) # nosec assert isinstance(b, int | float | str | None) # nosec - return False - return True - - def __gt__(self, other: "Resources") -> bool: - """operator for > comparison - if self has greater resources than other, returns True - Note that generic_resources are compared only if they are numeric - Non-numeric generic resources must be equal in both or only defined in self - to be considered greater - """ - return self >= other and self != other + return True + return False def __add__(self, other: "Resources") -> "Resources": """operator for adding two Resources diff --git a/packages/aws-library/tests/test_ec2_models.py b/packages/aws-library/tests/test_ec2_models.py index 131b24da87f8..0a77b88c38aa 100644 --- a/packages/aws-library/tests/test_ec2_models.py +++ b/packages/aws-library/tests/test_ec2_models.py @@ -36,7 +36,7 @@ ( Resources(cpus=0.05, ram=ByteSize(1)), Resources(cpus=0.1, ram=ByteSize(0)), - False, + True, # ram is larger ), ( Resources(cpus=0.1, ram=ByteSize(0)), @@ -46,7 +46,7 @@ ( Resources(cpus=0.1, ram=ByteSize(0), generic_resources={"GPU": 1}), Resources(cpus=0.1, ram=ByteSize(1)), - False, # ram is not enough + True, # GPU is larger ), ( Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"GPU": 1}), @@ -71,7 +71,7 @@ ( Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"GPU": "2"}), Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"GPU": 2}), - False, + True, # string resrouces are not comparable so "2" is considered larger ), ( Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"SSE": "yes"}), @@ -117,7 +117,7 @@ def test_resources_ge_operator( ( Resources(cpus=0.05, ram=ByteSize(1)), Resources(cpus=0.1, ram=ByteSize(0)), - False, + True, ), ( Resources(cpus=0.1, ram=ByteSize(0)), @@ -127,7 +127,7 @@ def test_resources_ge_operator( ( Resources(cpus=0.1, ram=ByteSize(0), generic_resources={"GPU": 1}), Resources(cpus=0.1, ram=ByteSize(1)), - False, # ram is not 
enough + True, # ram is not enough ), ( Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"GPU": 1}), @@ -152,7 +152,7 @@ def test_resources_ge_operator( ( Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"GPU": "2"}), Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"GPU": 2}), - False, + True, # string resources are not comparable, so a > b ), ( Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"SSE": "yes"}), diff --git a/services/autoscaling/tests/unit/test_modules_cluster_scaling_computational.py b/services/autoscaling/tests/unit/test_modules_cluster_scaling_computational.py index 6cf0562976e5..5f51a4f34c19 100644 --- a/services/autoscaling/tests/unit/test_modules_cluster_scaling_computational.py +++ b/services/autoscaling/tests/unit/test_modules_cluster_scaling_computational.py @@ -934,7 +934,6 @@ async def test_cluster_does_not_scale_up_if_defined_instance_is_not_fitting_reso [InstanceTypeType | None, Resources], DaskTaskResources ], ec2_client: EC2Client, - faker: Faker, caplog: pytest.LogCaptureFixture, ): # we have nothing running now From 824dfa7b9e16f025d8c13a1320b27b755da6ffc7 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 17 Oct 2025 17:35:37 +0200 Subject: [PATCH 67/93] improve error --- .../src/simcore_service_autoscaling/core/errors.py | 6 ++++-- .../simcore_service_autoscaling/utils/cluster_scaling.py | 1 + 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/core/errors.py b/services/autoscaling/src/simcore_service_autoscaling/core/errors.py index e4294631224a..0277acf38936 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/core/errors.py +++ b/services/autoscaling/src/simcore_service_autoscaling/core/errors.py @@ -18,7 +18,7 @@ class TaskRequiresUnauthorizedEC2InstanceTypeError(AutoscalingRuntimeError): class TaskRequirementsAboveRequiredEC2InstanceTypeError(AutoscalingRuntimeError): 
msg_template: str = ( - "Task {task} requires {instance_type} but requires {resources}. " + "Task {task} requires {instance_type} but requires {resources}. {resources_diff} are missing! " "TIP: Ensure task resources requirements fit required instance type available resources." ) @@ -43,4 +43,6 @@ class DaskNoWorkersError(AutoscalingRuntimeError): class DaskWorkerNotFoundError(AutoscalingRuntimeError): - msg_template: str = "Dask worker running on {worker_host} is not registered to scheduler in {url}, it is not found!" + msg_template: str = ( + "Dask worker running on {worker_host} is not registered to scheduler in {url}, it is not found!" + ) diff --git a/services/autoscaling/src/simcore_service_autoscaling/utils/cluster_scaling.py b/services/autoscaling/src/simcore_service_autoscaling/utils/cluster_scaling.py index 13c25dcd2112..93e86c99eaec 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/utils/cluster_scaling.py +++ b/services/autoscaling/src/simcore_service_autoscaling/utils/cluster_scaling.py @@ -114,6 +114,7 @@ def find_selected_instance_type_for_task( task=task, instance_type=selected_instance, resources=task_required_resources, + resources_diff=task_required_resources - selected_instance.resources, ) return selected_instance From 6020b89dd61f144a5060cf8c860907bb27b0612f Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 17 Oct 2025 17:36:59 +0200 Subject: [PATCH 68/93] add resource info on instance types as well --- .../cluster_scaling/_auto_scaling_core.py | 11 ++++++++-- .../_provider_computational.py | 12 +++++++++++ .../cluster_scaling/_provider_protocol.py | 5 +++++ .../modules/dask.py | 20 +++++++++++++++++++ 4 files changed, 46 insertions(+), 2 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_auto_scaling_core.py b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_auto_scaling_core.py index 
967b85281d18..912233cb58a3 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_auto_scaling_core.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_auto_scaling_core.py @@ -346,7 +346,9 @@ async def _try_attach_pending_ec2s( ) -async def _sorted_allowed_instance_types(app: FastAPI) -> list[EC2InstanceType]: +async def _sorted_allowed_instance_types( + app: FastAPI, auto_scaling_mode: AutoscalingProvider +) -> list[EC2InstanceType]: app_settings: ApplicationSettings = app.state.settings assert app_settings.AUTOSCALING_EC2_INSTANCES # nosec ec2_client = get_ec2_client(app) @@ -370,6 +372,8 @@ def _as_selection(instance_type: EC2InstanceType) -> int: return allowed_instance_type_names.index(f"{instance_type.name}") allowed_instance_types.sort(key=_as_selection) + for instance_type in allowed_instance_types: + auto_scaling_mode.add_instance_type_generic_resource(app, instance_type) return allowed_instance_types @@ -1578,7 +1582,10 @@ async def auto_scale_cluster( the additional load. 
""" # current state - allowed_instance_types = await _sorted_allowed_instance_types(app) + allowed_instance_types = await _sorted_allowed_instance_types( + app, auto_scaling_mode + ) + cluster = await _analyze_current_cluster( app, auto_scaling_mode, allowed_instance_types ) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_computational.py b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_computational.py index 70cb9eedad47..585d503ab4ce 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_computational.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_computational.py @@ -3,6 +3,7 @@ from typing import Any, cast from aws_library.ec2 import EC2InstanceData, EC2Tags, Resources +from aws_library.ec2._models import EC2InstanceType from fastapi import FastAPI from models_library.clusters import ClusterAuthentication from models_library.docker import DockerLabelKey @@ -191,3 +192,14 @@ def add_instance_generic_resources( app_settings = get_application_settings(app) assert app_settings.AUTOSCALING_DASK # nosec dask.add_instance_generic_resources(app_settings.AUTOSCALING_DASK, instance) + + def add_instance_type_generic_resource( + self, app: FastAPI, instance_type: EC2InstanceType + ) -> None: + assert self # nosec + assert app # nosec + app_settings = get_application_settings(app) + assert app_settings.AUTOSCALING_DASK # nosec + dask.add_instance_type_generic_resource( + app_settings.AUTOSCALING_DASK, instance_type + ) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_protocol.py b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_protocol.py index 71355d21bcf3..e161893f71bf 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_protocol.py +++ 
b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_protocol.py @@ -1,6 +1,7 @@ from typing import Protocol from aws_library.ec2 import EC2InstanceData, EC2Tags, Resources +from aws_library.ec2._models import EC2InstanceType from fastapi import FastAPI from models_library.docker import DockerLabelKey from models_library.generated_models.docker_rest_api import Node as DockerNode @@ -51,3 +52,7 @@ async def try_retire_nodes(self, app: FastAPI) -> None: ... def add_instance_generic_resources( self, app: FastAPI, instance: EC2InstanceData ) -> None: ... + + def add_instance_type_generic_resource( + self, app: FastAPI, instance_type: EC2InstanceType + ) -> None: ... diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py index b7290d129919..5d28f31223cc 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py @@ -9,6 +9,7 @@ import dask.typing import distributed from aws_library.ec2 import EC2InstanceData, Resources +from aws_library.ec2._models import EC2InstanceType from dask_task_models_library.resource_constraints import ( DASK_WORKER_THREAD_RESOURCE_NAME, DaskTaskResources, @@ -359,3 +360,22 @@ def add_instance_generic_resources( instance.resources.generic_resources[ create_ec2_resource_constraint_key(instance.type) ] = _LARGE_RESOURCE + + +def add_instance_type_generic_resource( + settings: DaskMonitoringSettings, instance_type: EC2InstanceType +) -> None: + instance_threads = round(instance_type.resources.cpus) + if settings.DASK_NTHREADS > 0: + # this overrides everything + instance_threads = settings.DASK_NTHREADS + if settings.DASK_NTHREADS_MULTIPLIER > 1: + instance_threads = instance_threads * settings.DASK_NTHREADS_MULTIPLIER + + instance_type.resources.generic_resources[DASK_WORKER_THREAD_RESOURCE_NAME] = ( + instance_threads 
+ ) + + instance_type.resources.generic_resources[ + create_ec2_resource_constraint_key(instance_type.name) + ] = _LARGE_RESOURCE From 1c0869f04c31e9d9473c8961dc8911c2caad0f18 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Sun, 19 Oct 2025 22:40:44 +0200 Subject: [PATCH 69/93] mypy --- .../cluster_scaling/_auto_scaling_core.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_auto_scaling_core.py b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_auto_scaling_core.py index 912233cb58a3..6c34fdaa2f57 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_auto_scaling_core.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_auto_scaling_core.py @@ -94,26 +94,32 @@ async def _analyze_current_cluster( docker_nodes: list[Node] = await auto_scaling_mode.get_monitored_nodes(app) # get the EC2 instances we have - existing_ec2_instances = await get_ec2_client(app).get_instances( + existing_ec2_instances: list[EC2InstanceData] = await get_ec2_client( + app + ).get_instances( key_names=[app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_KEY_NAME], tags=auto_scaling_mode.get_ec2_tags(app), state_names=["pending", "running"], ) - terminated_ec2_instances = await get_ec2_client(app).get_instances( + terminated_ec2_instances: list[EC2InstanceData] = await get_ec2_client( + app + ).get_instances( key_names=[app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_KEY_NAME], tags=auto_scaling_mode.get_ec2_tags(app), state_names=["terminated"], ) - warm_buffer_ec2_instances = await get_ec2_client(app).get_instances( + warm_buffer_ec2_instances: list[EC2InstanceData] = await get_ec2_client( + app + ).get_instances( key_names=[app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_KEY_NAME], 
tags=get_deactivated_warm_buffer_ec2_tags(auto_scaling_mode.get_ec2_tags(app)), state_names=["stopped"], ) - for instance in itertools.chain(existing_ec2_instances, warm_buffer_ec2_instances): - auto_scaling_mode.add_instance_generic_resources(app, instance) + for i in itertools.chain(existing_ec2_instances, warm_buffer_ec2_instances): + auto_scaling_mode.add_instance_generic_resources(app, i) attached_ec2s, pending_ec2s = associate_ec2_instances_with_nodes( docker_nodes, existing_ec2_instances From e55ec441b4b68714afe06cc793b35812d6d25087 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Sun, 19 Oct 2025 22:49:51 +0200 Subject: [PATCH 70/93] need to be fixed --- packages/aws-library/src/aws_library/ec2/_models.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/packages/aws-library/src/aws_library/ec2/_models.py b/packages/aws-library/src/aws_library/ec2/_models.py index 982d73dfa839..faf1380d09c4 100644 --- a/packages/aws-library/src/aws_library/ec2/_models.py +++ b/packages/aws-library/src/aws_library/ec2/_models.py @@ -43,6 +43,7 @@ class Resources(BaseModel, frozen=True): def create_as_empty(cls) -> "Resources": return cls(cpus=0, ram=ByteSize(0)) + # TODO: this is not ok. everything shall be compared! 
def __ge__(self, other: "Resources") -> bool: """operator for >= comparison if self has greater or equal resources than other, returns True @@ -58,7 +59,7 @@ def __ge__(self, other: "Resources") -> bool: def __gt__(self, other: "Resources") -> bool: """operator for > comparison - if self has any resources gretaer than other, returns True (even if different resource types are smaller) + if self has any resources greater than other, returns True (even if different resource types are smaller) Note that generic_resources are compared only if they are numeric Non-numeric generic resources must be equal in both or only defined in self From d2f241a67291d9e92f87226a2b0eef2996728de2 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 20 Oct 2025 08:55:07 +0200 Subject: [PATCH 71/93] done --- packages/aws-library/src/aws_library/ec2/_models.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/packages/aws-library/src/aws_library/ec2/_models.py b/packages/aws-library/src/aws_library/ec2/_models.py index faf1380d09c4..803bceb7ef88 100644 --- a/packages/aws-library/src/aws_library/ec2/_models.py +++ b/packages/aws-library/src/aws_library/ec2/_models.py @@ -43,11 +43,10 @@ class Resources(BaseModel, frozen=True): def create_as_empty(cls) -> "Resources": return cls(cpus=0, ram=ByteSize(0)) - # TODO: this is not ok. everything shall be compared! 
def __ge__(self, other: "Resources") -> bool: """operator for >= comparison if self has greater or equal resources than other, returns True - This will return True only if any of the resources in self is greater or equal to other + This will return True only if all of the resources in self are greater or equal to other Note that generic_resources are compared only if they are numeric Non-numeric generic resources must be equal in both or only defined in self @@ -59,10 +58,11 @@ def __ge__(self, other: "Resources") -> bool: def __gt__(self, other: "Resources") -> bool: """operator for > comparison - if self has any resources greater than other, returns True (even if different resource types are smaller) + if self has resources greater than other, returns True + This will return True only if all of the resources in self are greater than other Note that generic_resources are compared only if they are numeric - Non-numeric generic resources must be equal in both or only defined in self + Non-numeric generic resources must only be defined in self to be considered greater """ if (self.cpus > other.cpus) or (self.ram > other.ram): From 31aa4c299a3a62a7051a4beb67f34201cc2396d3 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 20 Oct 2025 13:29:11 +0200 Subject: [PATCH 72/93] fixed tests --- .../src/aws_library/ec2/_models.py | 26 +++++++++++-------- packages/aws-library/tests/test_ec2_models.py | 20 ++++++++++---- 2 files changed, 30 insertions(+), 16 deletions(-) diff --git a/packages/aws-library/src/aws_library/ec2/_models.py b/packages/aws-library/src/aws_library/ec2/_models.py index 803bceb7ef88..09f49bb6aae4 100644 --- a/packages/aws-library/src/aws_library/ec2/_models.py +++ b/packages/aws-library/src/aws_library/ec2/_models.py @@ -65,25 +65,29 @@ def __gt__(self, other: "Resources") -> bool: Non-numeric generic resources must only be defined in self to be considered greater """ - if (self.cpus > other.cpus) or 
(self.ram > other.ram): - return True + if (self.cpus < other.cpus) or (self.ram < other.ram): + return False keys = set(self.generic_resources) | set(other.generic_resources) for k in keys: a = self.generic_resources.get(k) b = other.generic_resources.get(k) if a is None: - continue + return False if b is None: - return True + # a is greater as b is not defined + continue if isinstance(a, int | float) and isinstance(b, int | float): - if a > b: - return True - elif a != b: - assert isinstance(a, str | None) # nosec - assert isinstance(b, int | float | str | None) # nosec - return True - return False + if a < b: + return False + else: + # remaining options is a is str and b is str or mixed types + assert isinstance(a, str) # nosec + assert isinstance(b, int | float | str) # nosec + + # here we have either everything greater or equal or non-comparable strings + + return self != other def __add__(self, other: "Resources") -> "Resources": """operator for adding two Resources diff --git a/packages/aws-library/tests/test_ec2_models.py b/packages/aws-library/tests/test_ec2_models.py index 0a77b88c38aa..0b1fa016fa79 100644 --- a/packages/aws-library/tests/test_ec2_models.py +++ b/packages/aws-library/tests/test_ec2_models.py @@ -36,7 +36,7 @@ ( Resources(cpus=0.05, ram=ByteSize(1)), Resources(cpus=0.1, ram=ByteSize(0)), - True, # ram is larger + False, # CPU is smaller ), ( Resources(cpus=0.1, ram=ByteSize(0)), @@ -46,7 +46,7 @@ ( Resources(cpus=0.1, ram=ByteSize(0), generic_resources={"GPU": 1}), Resources(cpus=0.1, ram=ByteSize(1)), - True, # GPU is larger + False, # RAM is smaller ), ( Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"GPU": 1}), @@ -71,7 +71,7 @@ ( Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"GPU": "2"}), Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"GPU": 2}), - True, # string resrouces are not comparable so "2" is considered larger + True, # string resources are not comparable so "2" is considered larger ), ( 
Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"SSE": "yes"}), @@ -117,7 +117,7 @@ def test_resources_ge_operator( ( Resources(cpus=0.05, ram=ByteSize(1)), Resources(cpus=0.1, ram=ByteSize(0)), - True, + False, # CPU is smaller ), ( Resources(cpus=0.1, ram=ByteSize(0)), @@ -127,7 +127,7 @@ def test_resources_ge_operator( ( Resources(cpus=0.1, ram=ByteSize(0), generic_resources={"GPU": 1}), Resources(cpus=0.1, ram=ByteSize(1)), - True, # ram is not enough + False, # ram is not enough ), ( Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"GPU": 1}), @@ -144,6 +144,11 @@ def test_resources_ge_operator( Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"GPU": 2}), False, ), + ( + Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"GPU": 2}), + Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"GPU": 1}), + True, + ), ( Resources(cpus=0.1, ram=ByteSize(1)), Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"GPU": 2}), @@ -169,6 +174,11 @@ def test_resources_ge_operator( Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"SSE": "yes"}), False, ), + ( + Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"SSE": "yes"}), + Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"SSE": "no"}), + True, + ), ], ) def test_resources_gt_operator(a: Resources, b: Resources, a_greater_than_b: bool): From ac714225a6c683036305b8195ecbc982b6e170fe Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 20 Oct 2025 13:55:03 +0200 Subject: [PATCH 73/93] added missing calls --- .../cluster_scaling/_provider_dynamic.py | 17 +++++++++++++++++ .../utils/cluster_scaling.py | 2 +- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_dynamic.py b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_dynamic.py index e6dbca840e37..ac28d9e775f3 100644 --- 
a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_dynamic.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_dynamic.py @@ -1,4 +1,5 @@ from aws_library.ec2 import EC2InstanceData, EC2Tags, Resources +from aws_library.ec2._models import EC2InstanceType from fastapi import FastAPI from models_library.docker import DockerLabelKey from models_library.generated_models.docker_rest_api import Node, Task @@ -104,3 +105,19 @@ async def try_retire_nodes(self, app: FastAPI) -> None: assert self # nosec assert app # nosec # nothing to do here + + def add_instance_generic_resources( + self, app: FastAPI, instance: EC2InstanceData + ) -> None: + assert self # nosec + assert app # nosec + assert instance # nosec + # nothing to do at the moment + + def add_instance_type_generic_resource( + self, app: FastAPI, instance_type: EC2InstanceType + ) -> None: + assert self # nosec + assert app # nosec + assert instance_type # nosec + # nothing to do at the moment diff --git a/services/autoscaling/src/simcore_service_autoscaling/utils/cluster_scaling.py b/services/autoscaling/src/simcore_service_autoscaling/utils/cluster_scaling.py index 93e86c99eaec..cc2c1ad3ee0c 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/utils/cluster_scaling.py +++ b/services/autoscaling/src/simcore_service_autoscaling/utils/cluster_scaling.py @@ -109,7 +109,7 @@ def find_selected_instance_type_for_task( selected_instance = filtered_instances[0] # check that the assigned resources and the machine resource fit - if task_required_resources > selected_instance.resources: + if not (task_required_resources <= selected_instance.resources): raise TaskRequirementsAboveRequiredEC2InstanceTypeError( task=task, instance_type=selected_instance, From 28bdfee62dac673a762e5654fb759d832ab08a46 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 20 Oct 2025 13:56:48 +0200 Subject: 
[PATCH 74/93] @copilot review --- services/autoscaling/tests/unit/test_utils_cluster_scaling.py | 1 - 1 file changed, 1 deletion(-) diff --git a/services/autoscaling/tests/unit/test_utils_cluster_scaling.py b/services/autoscaling/tests/unit/test_utils_cluster_scaling.py index 5525cedc9268..1c325c1f6234 100644 --- a/services/autoscaling/tests/unit/test_utils_cluster_scaling.py +++ b/services/autoscaling/tests/unit/test_utils_cluster_scaling.py @@ -296,7 +296,6 @@ def test_sort_drained_nodes( assert app_settings.AUTOSCALING_EC2_INSTANCES machine_buffer_type = get_hot_buffer_type(random_fake_available_instances) _NUM_DRAINED_NODES = 20 - assert app_settings.AUTOSCALING_EC2_INSTANCES _NUM_NODE_WITH_TYPE_BUFFER = ( 3 * app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MACHINES_BUFFER ) From 04e68cbf38745f266c1f14d13f8218b9ef5d63b9 Mon Sep 17 00:00:00 2001 From: Sylvain <35365065+sanderegg@users.noreply.github.com> Date: Mon, 20 Oct 2025 13:58:12 +0200 Subject: [PATCH 75/93] Update services/autoscaling/src/simcore_service_autoscaling/core/errors.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .../autoscaling/src/simcore_service_autoscaling/core/errors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/core/errors.py b/services/autoscaling/src/simcore_service_autoscaling/core/errors.py index 0277acf38936..d1020d382f76 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/core/errors.py +++ b/services/autoscaling/src/simcore_service_autoscaling/core/errors.py @@ -18,7 +18,7 @@ class TaskRequiresUnauthorizedEC2InstanceTypeError(AutoscalingRuntimeError): class TaskRequirementsAboveRequiredEC2InstanceTypeError(AutoscalingRuntimeError): msg_template: str = ( - "Task {task} requires {instance_type} but requires {resources}. {resources_diff} are missing! " + "Task {task} specifies instance type {instance_type} but requests {resources}. {resources_diff} are missing! 
" "TIP: Ensure task resources requirements fit required instance type available resources." ) From d396dab1738522f82fd35b1201501c03992a1af1 Mon Sep 17 00:00:00 2001 From: Sylvain <35365065+sanderegg@users.noreply.github.com> Date: Mon, 20 Oct 2025 14:05:26 +0200 Subject: [PATCH 76/93] Update services/autoscaling/tests/unit/test_modules_dask.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- services/autoscaling/tests/unit/test_modules_dask.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/autoscaling/tests/unit/test_modules_dask.py b/services/autoscaling/tests/unit/test_modules_dask.py index 4ed547d9f4d2..d99c0f2f0869 100644 --- a/services/autoscaling/tests/unit/test_modules_dask.py +++ b/services/autoscaling/tests/unit/test_modules_dask.py @@ -429,7 +429,7 @@ async def test_compute_cluster_total_resources( @pytest.mark.parametrize( "dask_nthreads, dask_nthreads_multiplier, expected_threads_resource", - [(4, 1, 4), (4, 2, 8), (0, 2.0, -1)], + [(4, 1, 4), (4, 2, 8), (0, 2, -1)], ) def test_add_instance_generic_resources( scheduler_url: AnyUrl, From 2b521095423cbe3b61e73296701e349dceb7fcca Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 20 Oct 2025 14:11:24 +0200 Subject: [PATCH 77/93] ensure thread is at least 1 --- .../src/simcore_service_autoscaling/modules/dask.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py index 5d28f31223cc..4f72410a9f8a 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py @@ -347,7 +347,7 @@ async def try_retire_nodes( def add_instance_generic_resources( settings: DaskMonitoringSettings, instance: EC2InstanceData ) -> None: - instance_threads = round(instance.resources.cpus) + 
instance_threads = min(1, round(instance.resources.cpus)) if settings.DASK_NTHREADS > 0: # this overrides everything instance_threads = settings.DASK_NTHREADS @@ -365,7 +365,7 @@ def add_instance_generic_resources( def add_instance_type_generic_resource( settings: DaskMonitoringSettings, instance_type: EC2InstanceType ) -> None: - instance_threads = round(instance_type.resources.cpus) + instance_threads = min(1, round(instance_type.resources.cpus)) if settings.DASK_NTHREADS > 0: # this overrides everything instance_threads = settings.DASK_NTHREADS From 56c4326f4b7697acb5c438cc693d20518cbc213a Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 20 Oct 2025 15:52:46 +0200 Subject: [PATCH 78/93] improve coverage --- .../tests/unit/test_utils_cluster_scaling.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/services/autoscaling/tests/unit/test_utils_cluster_scaling.py b/services/autoscaling/tests/unit/test_utils_cluster_scaling.py index 1c325c1f6234..5a15d63a6ed3 100644 --- a/services/autoscaling/tests/unit/test_utils_cluster_scaling.py +++ b/services/autoscaling/tests/unit/test_utils_cluster_scaling.py @@ -75,6 +75,24 @@ async def test_associate_ec2_instances_with_nodes_with_no_correspondence( assert len(non_associated_instances) == len(ec2_instances) +async def test_associate_ec2_instances_with_nodes_with_invalid_dns( + fake_ec2_instance_data: Callable[..., EC2InstanceData], + node: Callable[..., DockerNode], +): + nodes = [node() for _ in range(10)] + ec2_instances = [ + fake_ec2_instance_data(aws_private_dns="invalid-dns-name") for _ in range(10) + ] + + ( + associated_instances, + non_associated_instances, + ) = associate_ec2_instances_with_nodes(nodes, ec2_instances) + + assert not associated_instances + assert non_associated_instances + + async def test_associate_ec2_instances_with_corresponding_nodes( fake_ec2_instance_data: Callable[..., EC2InstanceData], node: Callable[..., DockerNode], From 
e3b998e711b13b1191d0662bb1e878aa52ff7208 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 20 Oct 2025 16:02:19 +0200 Subject: [PATCH 79/93] added test and a fix --- .../modules/instrumentation/_core.py | 2 +- .../unit/test_modules_instrumentation_core.py | 30 +++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) create mode 100644 services/autoscaling/tests/unit/test_modules_instrumentation_core.py diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/instrumentation/_core.py b/services/autoscaling/src/simcore_service_autoscaling/modules/instrumentation/_core.py index 9de65aac078f..af84e97bc01b 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/instrumentation/_core.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/instrumentation/_core.py @@ -35,7 +35,7 @@ async def on_shutdown() -> None: ... def get_instrumentation(app: FastAPI) -> AutoscalingInstrumentation: - if not app.state.instrumentation: + if not hasattr(app.state, "instrumentation"): raise ConfigurationError( msg="Instrumentation not setup. Please check the configuration." 
) diff --git a/services/autoscaling/tests/unit/test_modules_instrumentation_core.py b/services/autoscaling/tests/unit/test_modules_instrumentation_core.py new file mode 100644 index 000000000000..b3a843d8adbe --- /dev/null +++ b/services/autoscaling/tests/unit/test_modules_instrumentation_core.py @@ -0,0 +1,30 @@ +import pytest +from fastapi import FastAPI +from pytest_simcore.helpers.typing_env import EnvVarsDict +from simcore_service_autoscaling.core.errors import ConfigurationError +from simcore_service_autoscaling.modules.instrumentation._core import ( + get_instrumentation, + has_instrumentation, +) + + +@pytest.fixture +def disabled_instrumentation( + app_environment: EnvVarsDict, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setenv("AUTOSCALING_PROMETHEUS_INSTRUMENTATION_ENABLED", "false") + + +async def test_disabled_instrumentation( + disabled_rabbitmq: None, + disabled_ec2: None, + disabled_ssm: None, + disabled_instrumentation: None, + mocked_redis_server: None, + initialized_app: FastAPI, +): + # instrumentation disabled by default + assert not has_instrumentation(initialized_app) + + with pytest.raises(ConfigurationError): + get_instrumentation(initialized_app) From 3bb8fad16a83afe1f13b89acb0b52236e41597fd Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 20 Oct 2025 16:24:21 +0200 Subject: [PATCH 80/93] fix code --- .../autoscaling/src/simcore_service_autoscaling/modules/dask.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py index 4f72410a9f8a..546a7ad9917f 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py @@ -347,7 +347,7 @@ async def try_retire_nodes( def add_instance_generic_resources( settings: DaskMonitoringSettings, instance: 
EC2InstanceData ) -> None: - instance_threads = min(1, round(instance.resources.cpus)) + instance_threads = max(1, round(instance.resources.cpus)) if settings.DASK_NTHREADS > 0: # this overrides everything instance_threads = settings.DASK_NTHREADS From a6c322b2abec66dece6bfb91c177a4893ee7ed16 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 20 Oct 2025 17:15:16 +0200 Subject: [PATCH 81/93] adjust ram cpu --- .../resource_constraints.py | 21 ++++++++++++++ .../cluster_scaling/_auto_scaling_core.py | 2 +- .../_provider_computational.py | 15 +++++++++- .../cluster_scaling/_provider_dynamic.py | 28 +++++++++++++++++-- .../cluster_scaling/_provider_protocol.py | 2 +- .../db/repositories/comp_tasks/_utils.py | 16 ++++++----- 6 files changed, 72 insertions(+), 12 deletions(-) diff --git a/packages/dask-task-models-library/src/dask_task_models_library/resource_constraints.py b/packages/dask-task-models-library/src/dask_task_models_library/resource_constraints.py index 55c700541a73..68716c9d5b67 100644 --- a/packages/dask-task-models-library/src/dask_task_models_library/resource_constraints.py +++ b/packages/dask-task-models-library/src/dask_task_models_library/resource_constraints.py @@ -26,3 +26,24 @@ def get_ec2_instance_type_from_resources( if resource_name.startswith(DASK_TASK_EC2_RESOURCE_RESTRICTION_KEY): return resource_name.split(":")[-1] return None + + +_RAM_SAFE_MARGIN_RATIO: Final[float] = ( + 0.1 # NOTE: machines always have less available RAM than advertised +) +_CPUS_SAFE_MARGIN: Final[float] = 0.1 + + +def estimate_dask_worker_resources_from_ec2_instance( + cpus: float, ram: int +) -> tuple[float, float]: + """Estimates the resources available to a dask worker running in an EC2 instance, + taking into account safe margins for CPU and RAM. + + Returns: + tuple: Estimated resources for the dask worker (cpus, ram). 
+ """ + worker_cpus = min(0.1, cpus - _CPUS_SAFE_MARGIN) # ensure at least 0.1 CPU + worker_ram = int(ram * (1 - _RAM_SAFE_MARGIN_RATIO)) # apply safe margin + + return (worker_cpus, worker_ram) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_auto_scaling_core.py b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_auto_scaling_core.py index 6c34fdaa2f57..37c7dcd6102d 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_auto_scaling_core.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_auto_scaling_core.py @@ -379,7 +379,7 @@ def _as_selection(instance_type: EC2InstanceType) -> int: allowed_instance_types.sort(key=_as_selection) for instance_type in allowed_instance_types: - auto_scaling_mode.add_instance_type_generic_resource(app, instance_type) + auto_scaling_mode.adjust_instance_type_resources(app, instance_type) return allowed_instance_types diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_computational.py b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_computational.py index 585d503ab4ce..9701bc19fe79 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_computational.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_computational.py @@ -1,9 +1,13 @@ import collections +import dataclasses import logging from typing import Any, cast from aws_library.ec2 import EC2InstanceData, EC2Tags, Resources from aws_library.ec2._models import EC2InstanceType +from dask_task_models_library.resource_constraints import ( + estimate_dask_worker_resources_from_ec2_instance, +) from fastapi import FastAPI from models_library.clusters import ClusterAuthentication from models_library.docker import DockerLabelKey @@ -193,13 +197,22 @@ def 
add_instance_generic_resources( assert app_settings.AUTOSCALING_DASK # nosec dask.add_instance_generic_resources(app_settings.AUTOSCALING_DASK, instance) - def add_instance_type_generic_resource( + def adjust_instance_type_resources( self, app: FastAPI, instance_type: EC2InstanceType ) -> None: assert self # nosec assert app # nosec app_settings = get_application_settings(app) assert app_settings.AUTOSCALING_DASK # nosec + adjusted_cpus, adjusted_ram = estimate_dask_worker_resources_from_ec2_instance( + instance_type.resources.cpus, instance_type.resources.ram + ) + dataclasses.replace( + instance_type, + resources=instance_type.resources.model_copy( + update={"cpus": adjusted_cpus, "ram": ByteSize(adjusted_ram)} + ), + ) dask.add_instance_type_generic_resource( app_settings.AUTOSCALING_DASK, instance_type ) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_dynamic.py b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_dynamic.py index ac28d9e775f3..e6b40c22b14a 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_dynamic.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_dynamic.py @@ -1,8 +1,12 @@ +import dataclasses +from typing import Final + from aws_library.ec2 import EC2InstanceData, EC2Tags, Resources from aws_library.ec2._models import EC2InstanceType from fastapi import FastAPI from models_library.docker import DockerLabelKey from models_library.generated_models.docker_rest_api import Node, Task +from pydantic import ByteSize, TypeAdapter from types_aiobotocore_ec2.literals import InstanceTypeType from ...core.settings import get_application_settings @@ -10,6 +14,15 @@ from ...utils import utils_docker, utils_ec2 from ..docker import get_docker_client +_MACHINE_TOTAL_RAM_SAFE_MARGIN_RATIO: Final[float] = ( + 0.1 # NOTE: machines always have less available RAM than advertised +) 
+_SIDECARS_OPS_SAFE_RAM_MARGIN: Final[ByteSize] = TypeAdapter(ByteSize).validate_python( + "1GiB" +) +_CPUS_SAFE_MARGIN: Final[float] = 1.4 +_MIN_NUM_CPUS: Final[float] = 0.5 + class DynamicAutoscalingProvider: async def get_monitored_nodes(self, app: FastAPI) -> list[Node]: @@ -114,10 +127,21 @@ def add_instance_generic_resources( assert instance # nosec # nothing to do at the moment - def add_instance_type_generic_resource( + def adjust_instance_type_resources( self, app: FastAPI, instance_type: EC2InstanceType ) -> None: assert self # nosec assert app # nosec - assert instance_type # nosec # nothing to do at the moment + adjusted_cpus = float(instance_type.resources.cpus) - _CPUS_SAFE_MARGIN + adjusted_ram = int( + instance_type.resources.ram + - _MACHINE_TOTAL_RAM_SAFE_MARGIN_RATIO * instance_type.resources.ram + - _SIDECARS_OPS_SAFE_RAM_MARGIN + ) + dataclasses.replace( + instance_type, + resources=instance_type.resources.model_copy( + update={"cpus": adjusted_cpus, "ram": ByteSize(adjusted_ram)} + ), + ) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_protocol.py b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_protocol.py index e161893f71bf..62003854fef3 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_protocol.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_protocol.py @@ -53,6 +53,6 @@ def add_instance_generic_resources( self, app: FastAPI, instance: EC2InstanceData ) -> None: ... - def add_instance_type_generic_resource( + def adjust_instance_type_resources( self, app: FastAPI, instance_type: EC2InstanceType ) -> None: ... 
diff --git a/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_tasks/_utils.py b/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_tasks/_utils.py index 10103909a631..3446cb2f1497 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_tasks/_utils.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_tasks/_utils.py @@ -5,6 +5,9 @@ import arrow from dask_task_models_library.container_tasks.protocol import ContainerEnvsDict +from dask_task_models_library.resource_constraints import ( + estimate_dask_worker_resources_from_ec2_instance, +) from models_library.api_schemas_catalog.services import ServiceGet from models_library.api_schemas_clusters_keeper.ec2_instances import EC2InstanceTypeGet from models_library.api_schemas_directorv2.services import ( @@ -292,15 +295,14 @@ def _by_type_name(ec2: EC2InstanceTypeGet) -> bool: image_resources: ImageResources = node_resources[ DEFAULT_SINGLE_SERVICE_NAME ] - image_resources.resources["CPU"].set_value( - float(selected_ec2_instance_type.cpus) - _CPUS_SAFE_MARGIN - ) - image_resources.resources["RAM"].set_value( - int( - selected_ec2_instance_type.ram - - _RAM_SAFE_MARGIN_RATIO * selected_ec2_instance_type.ram + adjusted_cpus, adjusted_ram = ( + estimate_dask_worker_resources_from_ec2_instance( + float(selected_ec2_instance_type.cpus), + selected_ec2_instance_type.ram, ) ) + image_resources.resources["CPU"].set_value(adjusted_cpus) + image_resources.resources["RAM"].set_value(adjusted_ram) await project_nodes_repo.update( connection, From 2e713951ba6bdc38396bc339ffb03f745d33827a Mon Sep 17 00:00:00 2001 From: Sylvain <35365065+sanderegg@users.noreply.github.com> Date: Mon, 20 Oct 2025 17:49:41 +0200 Subject: [PATCH 82/93] Update packages/dask-task-models-library/src/dask_task_models_library/resource_constraints.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- 
.../src/dask_task_models_library/resource_constraints.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/dask-task-models-library/src/dask_task_models_library/resource_constraints.py b/packages/dask-task-models-library/src/dask_task_models_library/resource_constraints.py index 68716c9d5b67..715c037f107b 100644 --- a/packages/dask-task-models-library/src/dask_task_models_library/resource_constraints.py +++ b/packages/dask-task-models-library/src/dask_task_models_library/resource_constraints.py @@ -43,7 +43,7 @@ def estimate_dask_worker_resources_from_ec2_instance( Returns: tuple: Estimated resources for the dask worker (cpus, ram). """ - worker_cpus = min(0.1, cpus - _CPUS_SAFE_MARGIN) # ensure at least 0.1 CPU + worker_cpus = max(0.1, cpus - _CPUS_SAFE_MARGIN) # ensure at least 0.1 CPU worker_ram = int(ram * (1 - _RAM_SAFE_MARGIN_RATIO)) # apply safe margin return (worker_cpus, worker_ram) From 4cc9bb3a324d10afd309540daeca8ae8e9e04cf5 Mon Sep 17 00:00:00 2001 From: Sylvain <35365065+sanderegg@users.noreply.github.com> Date: Mon, 20 Oct 2025 17:49:56 +0200 Subject: [PATCH 83/93] Update services/autoscaling/src/simcore_service_autoscaling/modules/dask.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .../autoscaling/src/simcore_service_autoscaling/modules/dask.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py index 546a7ad9917f..cf16afc2b5fc 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py @@ -365,7 +365,7 @@ def add_instance_generic_resources( def add_instance_type_generic_resource( settings: DaskMonitoringSettings, instance_type: EC2InstanceType ) -> None: - instance_threads = min(1, round(instance_type.resources.cpus)) + instance_threads = max(1, 
round(instance_type.resources.cpus)) if settings.DASK_NTHREADS > 0: # this overrides everything instance_threads = settings.DASK_NTHREADS From ed8fcd0ededaf5cca614f634fd653aa691a17519 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 20 Oct 2025 17:55:24 +0200 Subject: [PATCH 84/93] fix return value --- .../modules/cluster_scaling/_auto_scaling_core.py | 6 +++--- .../modules/cluster_scaling/_provider_computational.py | 7 ++++--- .../modules/cluster_scaling/_provider_dynamic.py | 4 ++-- .../modules/cluster_scaling/_provider_protocol.py | 2 +- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_auto_scaling_core.py b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_auto_scaling_core.py index 37c7dcd6102d..535df02d3cf2 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_auto_scaling_core.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_auto_scaling_core.py @@ -377,10 +377,10 @@ def _as_selection(instance_type: EC2InstanceType) -> int: # NOTE: will raise ValueError if allowed_instance_types not in allowed_instance_type_names return allowed_instance_type_names.index(f"{instance_type.name}") - allowed_instance_types.sort(key=_as_selection) - for instance_type in allowed_instance_types: + return [ auto_scaling_mode.adjust_instance_type_resources(app, instance_type) - return allowed_instance_types + for instance_type in sorted(allowed_instance_types, key=_as_selection) + ] async def _activate_and_notify( diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_computational.py b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_computational.py index 9701bc19fe79..d580868627f2 100644 --- 
a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_computational.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_computational.py @@ -199,7 +199,7 @@ def add_instance_generic_resources( def adjust_instance_type_resources( self, app: FastAPI, instance_type: EC2InstanceType - ) -> None: + ) -> EC2InstanceType: assert self # nosec assert app # nosec app_settings = get_application_settings(app) @@ -207,12 +207,13 @@ def adjust_instance_type_resources( adjusted_cpus, adjusted_ram = estimate_dask_worker_resources_from_ec2_instance( instance_type.resources.cpus, instance_type.resources.ram ) - dataclasses.replace( + replaced_instance_type = dataclasses.replace( instance_type, resources=instance_type.resources.model_copy( update={"cpus": adjusted_cpus, "ram": ByteSize(adjusted_ram)} ), ) dask.add_instance_type_generic_resource( - app_settings.AUTOSCALING_DASK, instance_type + app_settings.AUTOSCALING_DASK, replaced_instance_type ) + return replaced_instance_type diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_dynamic.py b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_dynamic.py index e6b40c22b14a..7aba033ba1fb 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_dynamic.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_dynamic.py @@ -129,7 +129,7 @@ def add_instance_generic_resources( def adjust_instance_type_resources( self, app: FastAPI, instance_type: EC2InstanceType - ) -> None: + ) -> EC2InstanceType: assert self # nosec assert app # nosec # nothing to do at the moment @@ -139,7 +139,7 @@ def adjust_instance_type_resources( - _MACHINE_TOTAL_RAM_SAFE_MARGIN_RATIO * instance_type.resources.ram - _SIDECARS_OPS_SAFE_RAM_MARGIN ) - dataclasses.replace( + return dataclasses.replace( instance_type, 
resources=instance_type.resources.model_copy( update={"cpus": adjusted_cpus, "ram": ByteSize(adjusted_ram)} diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_protocol.py b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_protocol.py index 62003854fef3..d2f711229c4f 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_protocol.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_protocol.py @@ -55,4 +55,4 @@ def add_instance_generic_resources( def adjust_instance_type_resources( self, app: FastAPI, instance_type: EC2InstanceType - ) -> None: ... + ) -> EC2InstanceType: ... From e23a7897c83056666cc15e03daa9fbe06f54e773 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 21 Oct 2025 17:20:04 +0200 Subject: [PATCH 85/93] created a base function to compute resources --- .../src/servicelib/docker_utils.py | 30 +++++++++++++++++++ .../cluster_scaling/_provider_dynamic.py | 24 +++++---------- .../projects/_projects_service.py | 29 +++++++++--------- 3 files changed, 52 insertions(+), 31 deletions(-) diff --git a/packages/service-library/src/servicelib/docker_utils.py b/packages/service-library/src/servicelib/docker_utils.py index a919cb9487d7..374c05595beb 100644 --- a/packages/service-library/src/servicelib/docker_utils.py +++ b/packages/service-library/src/servicelib/docker_utils.py @@ -326,3 +326,33 @@ async def _pull_image_with_retry() -> None: ) await _pull_image_with_retry() + + +_CPUS_SAFE_MARGIN: Final[float] = ( + 1.4 # accounts for machine overhead (ops + sidecar itself) +) +_MACHINE_TOTAL_RAM_SAFE_MARGIN_RATIO: Final[float] = ( + 0.1 # NOTE: machines always have less available RAM than advertised +) +_SIDECARS_OPS_SAFE_RAM_MARGIN: Final[ByteSize] = TypeAdapter(ByteSize).validate_python( + "1GiB" +) +DYNAMIC_SIDECAR_MIN_CPUS: Final[float] = 
0.5 + + +def estimate_dynamic_sidecar_resources_from_ec2_instance( + cpus: float, ram: int +) -> tuple[float, int]: + """Estimates the resources available to a dynamic-sidecar running in an EC2 instance, + taking into account safe margins for CPU and RAM, as the EC2 full resources are not completely visible + + Returns: + tuple: Estimated resources for the dynamic-sidecar (cpus, ram). + """ + # dynamic-sidecar usually needs less CPU + sidecar_cpus = max(DYNAMIC_SIDECAR_MIN_CPUS, cpus - _CPUS_SAFE_MARGIN) + sidecar_ram = int( + ram - _MACHINE_TOTAL_RAM_SAFE_MARGIN_RATIO * ram - _SIDECARS_OPS_SAFE_RAM_MARGIN + ) + + return (sidecar_cpus, sidecar_ram) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_dynamic.py b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_dynamic.py index 7aba033ba1fb..d7499dc92e1a 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_dynamic.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_provider_dynamic.py @@ -1,12 +1,12 @@ import dataclasses -from typing import Final from aws_library.ec2 import EC2InstanceData, EC2Tags, Resources from aws_library.ec2._models import EC2InstanceType from fastapi import FastAPI from models_library.docker import DockerLabelKey from models_library.generated_models.docker_rest_api import Node, Task -from pydantic import ByteSize, TypeAdapter +from pydantic import ByteSize +from servicelib.docker_utils import estimate_dynamic_sidecar_resources_from_ec2_instance from types_aiobotocore_ec2.literals import InstanceTypeType from ...core.settings import get_application_settings @@ -14,15 +14,6 @@ from ...utils import utils_docker, utils_ec2 from ..docker import get_docker_client -_MACHINE_TOTAL_RAM_SAFE_MARGIN_RATIO: Final[float] = ( - 0.1 # NOTE: machines always have less available RAM than advertised -) -_SIDECARS_OPS_SAFE_RAM_MARGIN: 
Final[ByteSize] = TypeAdapter(ByteSize).validate_python( - "1GiB" -) -_CPUS_SAFE_MARGIN: Final[float] = 1.4 -_MIN_NUM_CPUS: Final[float] = 0.5 - class DynamicAutoscalingProvider: async def get_monitored_nodes(self, app: FastAPI) -> list[Node]: @@ -132,13 +123,12 @@ def adjust_instance_type_resources( ) -> EC2InstanceType: assert self # nosec assert app # nosec - # nothing to do at the moment - adjusted_cpus = float(instance_type.resources.cpus) - _CPUS_SAFE_MARGIN - adjusted_ram = int( - instance_type.resources.ram - - _MACHINE_TOTAL_RAM_SAFE_MARGIN_RATIO * instance_type.resources.ram - - _SIDECARS_OPS_SAFE_RAM_MARGIN + adjusted_cpus, adjusted_ram = ( + estimate_dynamic_sidecar_resources_from_ec2_instance( + instance_type.resources.cpus, instance_type.resources.ram + ) ) + return dataclasses.replace( instance_type, resources=instance_type.resources.model_copy( diff --git a/services/web/server/src/simcore_service_webserver/projects/_projects_service.py b/services/web/server/src/simcore_service_webserver/projects/_projects_service.py index 9bd620d3d23b..4d244d2c1f35 100644 --- a/services/web/server/src/simcore_service_webserver/projects/_projects_service.py +++ b/services/web/server/src/simcore_service_webserver/projects/_projects_service.py @@ -88,6 +88,10 @@ X_FORWARDED_PROTO, X_SIMCORE_USER_AGENT, ) +from servicelib.docker_utils import ( + DYNAMIC_SIDECAR_MIN_CPUS, + estimate_dynamic_sidecar_resources_from_ec2_instance, +) from servicelib.logging_utils import log_context from servicelib.rabbitmq import RemoteMethodNotRegisteredError, RPCServerError from servicelib.rabbitmq.rpc_interfaces.catalog import services as catalog_rpc @@ -652,12 +656,12 @@ def _by_type_name(ec2: EC2InstanceTypeGet) -> bool: app, user_id, project_id, node_id, service_key, service_version ) scalable_service_name = DEFAULT_SINGLE_SERVICE_NAME - new_cpus_value = float(selected_ec2_instance_type.cpus) - _CPUS_SAFE_MARGIN - new_ram_value = int( - selected_ec2_instance_type.ram - - 
_MACHINE_TOTAL_RAM_SAFE_MARGIN_RATIO * selected_ec2_instance_type.ram - - _SIDECARS_OPS_SAFE_RAM_MARGIN + new_cpus_value, new_ram_value = ( + estimate_dynamic_sidecar_resources_from_ec2_instance( + selected_ec2_instance_type.cpus, selected_ec2_instance_type.ram + ) ) + if DEFAULT_SINGLE_SERVICE_NAME not in node_resources: # NOTE: we go for the largest sub-service and scale it up/down scalable_service_name, hungry_service_resources = max( @@ -680,17 +684,14 @@ def _by_type_name(ec2: EC2InstanceTypeGet) -> bool: } ) new_cpus_value = max( - float(selected_ec2_instance_type.cpus) - - _CPUS_SAFE_MARGIN - - other_services_resources["CPU"], - _MIN_NUM_CPUS, + new_cpus_value - other_services_resources["CPU"], + DYNAMIC_SIDECAR_MIN_CPUS, ) - new_ram_value = int( - selected_ec2_instance_type.ram - - _MACHINE_TOTAL_RAM_SAFE_MARGIN_RATIO * selected_ec2_instance_type.ram - - other_services_resources["RAM"] - - _SIDECARS_OPS_SAFE_RAM_MARGIN + + new_ram_value = max( + int(new_ram_value - other_services_resources["RAM"]), 128 * 1024 * 1024 ) + # scale the service node_resources[scalable_service_name].resources["CPU"].set_value(new_cpus_value) node_resources[scalable_service_name].resources["RAM"].set_value(new_ram_value) From 525954cdf015b87bfbb9dd9eee713bed374339d0 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 21 Oct 2025 17:57:33 +0200 Subject: [PATCH 86/93] fixed tests --- .../unit/test_modules_cluster_scaling_dynamic.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/services/autoscaling/tests/unit/test_modules_cluster_scaling_dynamic.py b/services/autoscaling/tests/unit/test_modules_cluster_scaling_dynamic.py index bf9e42a1ec67..b16ac41233f0 100644 --- a/services/autoscaling/tests/unit/test_modules_cluster_scaling_dynamic.py +++ b/services/autoscaling/tests/unit/test_modules_cluster_scaling_dynamic.py @@ -978,7 +978,7 @@ async def _assert_wait_for_ec2_instances_terminated() -> None: 
_ScaleUpParams( imposed_instance_type=None, service_resources=Resources( - cpus=4, ram=TypeAdapter(ByteSize).validate_python("128Gib") + cpus=4, ram=TypeAdapter(ByteSize).validate_python("114Gib") ), num_services=1, expected_instance_type="r5n.4xlarge", @@ -990,7 +990,7 @@ async def _assert_wait_for_ec2_instances_terminated() -> None: _ScaleUpParams( imposed_instance_type="t2.xlarge", service_resources=Resources( - cpus=4, ram=TypeAdapter(ByteSize).validate_python("4Gib") + cpus=2.6, ram=TypeAdapter(ByteSize).validate_python("4Gib") ), num_services=1, expected_instance_type="t2.xlarge", @@ -1002,7 +1002,7 @@ async def _assert_wait_for_ec2_instances_terminated() -> None: _ScaleUpParams( imposed_instance_type="r5n.8xlarge", service_resources=Resources( - cpus=4, ram=TypeAdapter(ByteSize).validate_python("128Gib") + cpus=4, ram=TypeAdapter(ByteSize).validate_python("114Gib") ), num_services=1, expected_instance_type="r5n.8xlarge", @@ -1165,7 +1165,7 @@ async def test_cluster_scaling_up_and_down_against_aws( ), num_services=10, expected_instance_type="r5n.4xlarge", # 1 GPU, 16 CPUs, 128GiB - expected_num_instances=4, + expected_num_instances=5, ), id="sim4life-light", ), @@ -1254,7 +1254,7 @@ async def test_cluster_scaling_up_starts_multiple_instances( _ScaleUpParams( imposed_instance_type="g4dn.2xlarge", # 1 GPU, 8 CPUs, 32GiB service_resources=Resources( - cpus=8, ram=TypeAdapter(ByteSize).validate_python("15Gib") + cpus=6.6, ram=TypeAdapter(ByteSize).validate_python("15Gib") ), num_services=12, expected_instance_type="g4dn.2xlarge", # 1 GPU, 8 CPUs, 32GiB @@ -1263,7 +1263,7 @@ async def test_cluster_scaling_up_starts_multiple_instances( _ScaleUpParams( imposed_instance_type="g4dn.8xlarge", # 32CPUs, 128GiB service_resources=Resources( - cpus=32, ram=TypeAdapter(ByteSize).validate_python("20480MB") + cpus=30.6, ram=TypeAdapter(ByteSize).validate_python("20480MB") ), num_services=7, expected_instance_type="g4dn.8xlarge", # 32CPUs, 128GiB @@ -1556,7 +1556,7 @@ async 
def test_cluster_adapts_machines_on_the_fly( # noqa: PLR0915 _ScaleUpParams( imposed_instance_type=None, service_resources=Resources( - cpus=4, ram=TypeAdapter(ByteSize).validate_python("128Gib") + cpus=4, ram=TypeAdapter(ByteSize).validate_python("114Gib") ), num_services=1, expected_instance_type="r5n.4xlarge", From fc095151cbf99d17af9a32c97e432739ccad13a0 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 21 Oct 2025 17:58:50 +0200 Subject: [PATCH 87/93] pylint --- .../autoscaling/tests/unit/test_modules_instrumentation_core.py | 1 + 1 file changed, 1 insertion(+) diff --git a/services/autoscaling/tests/unit/test_modules_instrumentation_core.py b/services/autoscaling/tests/unit/test_modules_instrumentation_core.py index b3a843d8adbe..8b1eff250e9a 100644 --- a/services/autoscaling/tests/unit/test_modules_instrumentation_core.py +++ b/services/autoscaling/tests/unit/test_modules_instrumentation_core.py @@ -12,6 +12,7 @@ def disabled_instrumentation( app_environment: EnvVarsDict, monkeypatch: pytest.MonkeyPatch ) -> None: + assert app_environment monkeypatch.setenv("AUTOSCALING_PROMETHEUS_INSTRUMENTATION_ENABLED", "false") From a7d975be232fd9f65c0bcd1d1f6547869d31290c Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 21 Oct 2025 17:59:02 +0200 Subject: [PATCH 88/93] linter --- .../tests/unit/test_modules_instrumentation_core.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/services/autoscaling/tests/unit/test_modules_instrumentation_core.py b/services/autoscaling/tests/unit/test_modules_instrumentation_core.py index 8b1eff250e9a..ffc8d87bcb9d 100644 --- a/services/autoscaling/tests/unit/test_modules_instrumentation_core.py +++ b/services/autoscaling/tests/unit/test_modules_instrumentation_core.py @@ -1,3 +1,11 @@ +# pylint: disable=no-value-for-parameter +# pylint: disable=redefined-outer-name +# pylint: disable=too-many-arguments +# pylint: 
disable=too-many-positional-arguments +# pylint: disable=too-many-statements +# pylint: disable=unused-argument +# pylint: disable=unused-variable + import pytest from fastapi import FastAPI from pytest_simcore.helpers.typing_env import EnvVarsDict From c998a25f7a76e3d9d56033b8f2cf8f8cc01d7043 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 21 Oct 2025 18:29:26 +0200 Subject: [PATCH 89/93] fixed tests --- ...t_modules_cluster_scaling_computational.py | 29 +++++++++++++------ 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/services/autoscaling/tests/unit/test_modules_cluster_scaling_computational.py b/services/autoscaling/tests/unit/test_modules_cluster_scaling_computational.py index 5f51a4f34c19..bba8531e0032 100644 --- a/services/autoscaling/tests/unit/test_modules_cluster_scaling_computational.py +++ b/services/autoscaling/tests/unit/test_modules_cluster_scaling_computational.py @@ -22,7 +22,9 @@ import pytest from aws_library.ec2 import Resources from dask_task_models_library.resource_constraints import ( + DASK_WORKER_THREAD_RESOURCE_NAME, create_ec2_resource_constraint_key, + estimate_dask_worker_resources_from_ec2_instance, ) from faker import Faker from fastapi import FastAPI @@ -259,16 +261,25 @@ async def _create_task_with_resources( instance_types = await ec2_client.describe_instance_types( InstanceTypes=[dask_task_imposed_ec2_type] ) + assert instance_types assert "InstanceTypes" in instance_types assert instance_types["InstanceTypes"] assert "MemoryInfo" in instance_types["InstanceTypes"][0] assert "SizeInMiB" in instance_types["InstanceTypes"][0]["MemoryInfo"] + ec2_ram = TypeAdapter(ByteSize).validate_python( + f"{instance_types['InstanceTypes'][0]['MemoryInfo']['SizeInMiB']}MiB", + ) + assert "VCpuInfo" in instance_types["InstanceTypes"][0] + assert "DefaultVCpus" in instance_types["InstanceTypes"][0]["VCpuInfo"] + ec2_cpus = instance_types["InstanceTypes"][0]["VCpuInfo"]["DefaultVCpus"] 
+ required_cpus, required_ram = estimate_dask_worker_resources_from_ec2_instance( + ec2_cpus, ec2_ram + ) task_resources = Resources( - cpus=1, - ram=TypeAdapter(ByteSize).validate_python( - f"{instance_types['InstanceTypes'][0]['MemoryInfo']['SizeInMiB']}MiB", - ), + cpus=required_cpus, + ram=ByteSize(required_ram), + generic_resources={DASK_WORKER_THREAD_RESOURCE_NAME: 1}, ) assert task_resources @@ -443,7 +454,7 @@ async def test_cluster_scaling_with_task_with_too_much_resources_starts_nothing( _ScaleUpParams( imposed_instance_type=None, task_resources=Resources( - cpus=1, ram=TypeAdapter(ByteSize).validate_python("128Gib") + cpus=1, ram=TypeAdapter(ByteSize).validate_python("115Gib") ), num_tasks=1, expected_instance_type="r5n.4xlarge", @@ -465,7 +476,7 @@ async def test_cluster_scaling_with_task_with_too_much_resources_starts_nothing( _ScaleUpParams( imposed_instance_type="r5n.8xlarge", task_resources=Resources( - cpus=1, ram=TypeAdapter(ByteSize).validate_python("116Gib") + cpus=1, ram=TypeAdapter(ByteSize).validate_python("115Gib") ), num_tasks=1, expected_instance_type="r5n.8xlarge", @@ -1281,7 +1292,7 @@ async def test_cluster_scaling_up_more_than_allowed_with_multiple_types_max_star _ScaleUpParams( imposed_instance_type=None, task_resources=Resources( - cpus=1, ram=TypeAdapter(ByteSize).validate_python("128Gib") + cpus=1, ram=TypeAdapter(ByteSize).validate_python("115Gib") ), num_tasks=1, expected_instance_type="r5n.4xlarge", @@ -1456,7 +1467,7 @@ async def test_long_pending_ec2_is_detected_as_broken_terminated_and_restarted( _ScaleUpParams( imposed_instance_type="g4dn.2xlarge", # 1 GPU, 8 CPUs, 32GiB task_resources=Resources( - cpus=8, ram=TypeAdapter(ByteSize).validate_python("15Gib") + cpus=7.9, ram=TypeAdapter(ByteSize).validate_python("15Gib") ), num_tasks=12, expected_instance_type="g4dn.2xlarge", # 1 GPU, 8 CPUs, 32GiB @@ -1465,7 +1476,7 @@ async def test_long_pending_ec2_is_detected_as_broken_terminated_and_restarted( _ScaleUpParams( 
imposed_instance_type="g4dn.8xlarge", # 32CPUs, 128GiB task_resources=Resources( - cpus=32, ram=TypeAdapter(ByteSize).validate_python("20480MB") + cpus=31.9, ram=TypeAdapter(ByteSize).validate_python("20480MB") ), num_tasks=7, expected_instance_type="g4dn.8xlarge", # 32CPUs, 128GiB From 6371b25502a76809b6c511a86cbc68d7fde43c11 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 21 Oct 2025 18:29:33 +0200 Subject: [PATCH 90/93] fixed types --- .../src/dask_task_models_library/resource_constraints.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/dask-task-models-library/src/dask_task_models_library/resource_constraints.py b/packages/dask-task-models-library/src/dask_task_models_library/resource_constraints.py index 715c037f107b..7770ba74050a 100644 --- a/packages/dask-task-models-library/src/dask_task_models_library/resource_constraints.py +++ b/packages/dask-task-models-library/src/dask_task_models_library/resource_constraints.py @@ -36,7 +36,7 @@ def get_ec2_instance_type_from_resources( def estimate_dask_worker_resources_from_ec2_instance( cpus: float, ram: int -) -> tuple[float, float]: +) -> tuple[float, int]: """Estimates the resources available to a dask worker running in an EC2 instance, taking into account safe margins for CPU and RAM. 
From c773ac1cd8fa97da42a398995adda312c355b638 Mon Sep 17 00:00:00 2001 From: Sylvain <35365065+sanderegg@users.noreply.github.com> Date: Tue, 21 Oct 2025 21:36:23 +0200 Subject: [PATCH 91/93] Update services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_utils_computational.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .../modules/cluster_scaling/_utils_computational.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_utils_computational.py b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_utils_computational.py index f5ed682f6669..1b5225966809 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_utils_computational.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_utils_computational.py @@ -27,7 +27,7 @@ def resources_from_dask_task(task: DaskTask) -> Resources: task_resources = ( _DEFAULT_DASK_RESOURCES | task.required_resources - ) # merge with defaults to ensure there is always some minimal resource defined + ) # merge defaults with task resources (task resources override defaults) return Resources.from_flat_dict( cast(dict[str, GenericResourceValueType], task_resources), From 29d4211419cc72f64bef989cea35e30887b24347 Mon Sep 17 00:00:00 2001 From: Sylvain <35365065+sanderegg@users.noreply.github.com> Date: Tue, 21 Oct 2025 21:36:51 +0200 Subject: [PATCH 92/93] Update packages/aws-library/src/aws_library/ec2/_models.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- packages/aws-library/src/aws_library/ec2/_models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/aws-library/src/aws_library/ec2/_models.py b/packages/aws-library/src/aws_library/ec2/_models.py index 09f49bb6aae4..d5ff855ad355 100644 --- a/packages/aws-library/src/aws_library/ec2/_models.py +++ 
b/packages/aws-library/src/aws_library/ec2/_models.py @@ -292,8 +292,8 @@ def validate_bash_calls(cls, v): # NOTE: this will not capture runtime errors, but at least some syntax errors such as invalid quotes sh.bash( "-n", - temp_file.name, # pyright: ignore[reportCallIssue] - ) # sh is untyped, but this call is safe for bash syntax checking + temp_file.name, # pyright: ignore[reportCallIssue] - sh is untyped but safe for bash syntax checking + ) except sh.ErrorReturnCode as exc: msg = f"Invalid bash call in custom_boot_scripts: {v}, Error: {exc.stderr}" raise ValueError(msg) from exc From f2bcb526e12eb9d8966ff96e4dd8a6eff46016a4 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Wed, 22 Oct 2025 18:43:17 +0200 Subject: [PATCH 93/93] @pcrespov review: add some more string comparisons --- .../aws-library/src/aws_library/ec2/_models.py | 10 ++++++++++ packages/aws-library/tests/test_ec2_models.py | 15 +++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/packages/aws-library/src/aws_library/ec2/_models.py b/packages/aws-library/src/aws_library/ec2/_models.py index d5ff855ad355..3e5a2b00691d 100644 --- a/packages/aws-library/src/aws_library/ec2/_models.py +++ b/packages/aws-library/src/aws_library/ec2/_models.py @@ -1,3 +1,4 @@ +import contextlib import datetime import re import tempfile @@ -17,6 +18,8 @@ StrictFloat, StrictInt, StringConstraints, + TypeAdapter, + ValidationError, field_validator, ) from pydantic.config import JsonDict @@ -82,8 +85,15 @@ def __gt__(self, other: "Resources") -> bool: return False else: # remaining options is a is str and b is str or mixed types + # NOTE: we cannot compare strings unless they are equal or some kind of boolean (e.g. 
"true", "false", "yes", "no", "1", "0") assert isinstance(a, str) # nosec assert isinstance(b, int | float | str) # nosec + # let's try to get a boolean out of the values to compare them + with contextlib.suppress(ValidationError): + a_as_boolean = TypeAdapter(bool).validate_python(a) + b_as_boolean = TypeAdapter(bool).validate_python(b) + if not a_as_boolean and b_as_boolean: + return False # here we have either everything greater or equal or non-comparable strings diff --git a/packages/aws-library/tests/test_ec2_models.py b/packages/aws-library/tests/test_ec2_models.py index 0b1fa016fa79..22f03a0bd102 100644 --- a/packages/aws-library/tests/test_ec2_models.py +++ b/packages/aws-library/tests/test_ec2_models.py @@ -83,6 +83,16 @@ Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"SSE": "yes"}), True, ), + ( + Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"SSE": "yes"}), + Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"SSE": "no"}), + True, + ), + ( + Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"SSE": "no"}), + Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"SSE": "yes"}), + False, + ), ( Resources(cpus=0.1, ram=ByteSize(1)), Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"SSE": "yes"}), @@ -179,6 +189,11 @@ def test_resources_ge_operator( Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"SSE": "no"}), True, ), + ( + Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"SSE": "no"}), + Resources(cpus=0.1, ram=ByteSize(1), generic_resources={"SSE": "yes"}), + False, + ), ], ) def test_resources_gt_operator(a: Resources, b: Resources, a_greater_than_b: bool):