
Commit 9457419

✨ Autoscaling: scale down nodes (ITISFoundation#3655)
1 parent: 332cbb4

File tree: 14 files changed, +1245 / -403 lines


services/autoscaling/src/simcore_service_autoscaling/core/errors.py

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@ class Ec2NotConnectedError(AutoscalingRuntimeError):
 
 
 class Ec2InstanceNotFoundError(AutoscalingRuntimeError):
-    msg_template: str = "Needed instance was not found"
+    msg_template: str = "EC2 instance was not found"
 
 
 class Ec2TooManyInstancesError(AutoscalingRuntimeError):

services/autoscaling/src/simcore_service_autoscaling/core/settings.py

Lines changed: 15 additions & 1 deletion
@@ -40,7 +40,7 @@ class EC2InstancesSettings(BaseCustomSettings):
         description="Defines the AMI (Amazon Machine Image) ID used to start a new EC2 instance",
     )
     EC2_INSTANCES_MAX_INSTANCES: int = Field(
-        10,
+        default=10,
         description="Defines the maximum number of instances the autoscaling app may create",
     )
     EC2_INSTANCES_SECURITY_GROUP_IDS: list[str] = Field(
@@ -65,6 +65,20 @@ class EC2InstancesSettings(BaseCustomSettings):
         "this is required to start a new EC2 instance",
     )
 
+    EC2_INSTANCES_TIME_BEFORE_TERMINATION: datetime.timedelta = Field(
+        default=datetime.timedelta(minutes=55),
+        description="Time after which an EC2 instance may be terminated (repeat every hour, min 0, max 59 minutes)",
+    )
+
+    @validator("EC2_INSTANCES_TIME_BEFORE_TERMINATION")
+    @classmethod
+    def ensure_time_is_in_range(cls, value):
+        if value < datetime.timedelta(minutes=0):
+            value = datetime.timedelta(minutes=0)
+        elif value > datetime.timedelta(minutes=59):
+            value = datetime.timedelta(minutes=59)
+        return value
+
     @validator("EC2_INSTANCES_ALLOWED_TYPES")
     @classmethod
     def check_valid_intance_names(cls, value):
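
For context, the new EC2_INSTANCES_TIME_BEFORE_TERMINATION validator simply clamps whatever is configured into the [0, 59] minutes window, so the value can later be compared against the position inside the current EC2 billing hour. A minimal, runnable sketch of that behaviour (the model name here is made up; the field and validator mirror the diff, pydantic v1 style):

import datetime
from pydantic import BaseModel, validator

class TerminationSettingsSketch(BaseModel):  # hypothetical stand-in for EC2InstancesSettings
    EC2_INSTANCES_TIME_BEFORE_TERMINATION: datetime.timedelta = datetime.timedelta(minutes=55)

    @validator("EC2_INSTANCES_TIME_BEFORE_TERMINATION")
    @classmethod
    def ensure_time_is_in_range(cls, value):
        # clamp into [0 min, 59 min] so the comparison against the hourly cycle stays meaningful
        if value < datetime.timedelta(minutes=0):
            value = datetime.timedelta(minutes=0)
        elif value > datetime.timedelta(minutes=59):
            value = datetime.timedelta(minutes=59)
        return value

settings = TerminationSettingsSketch(EC2_INSTANCES_TIME_BEFORE_TERMINATION=datetime.timedelta(hours=2))
print(settings.EC2_INSTANCES_TIME_BEFORE_TERMINATION)  # 0:59:00 -> out-of-range values are clamped, not rejected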
Lines changed: 223 additions & 41 deletions
@@ -1,68 +1,203 @@
+import asyncio
 import json
 import logging
 import re
-from datetime import datetime
+from datetime import datetime, timedelta, timezone
 
 from fastapi import FastAPI
+from models_library.generated_models.docker_rest_api import Availability, Node, Task
 from pydantic import parse_obj_as
 from types_aiobotocore_ec2.literals import InstanceTypeType
 
 from ._meta import VERSION
 from .core.errors import Ec2InstanceNotFoundError
 from .core.settings import ApplicationSettings
+from .models import Resources
 from .modules.docker import get_docker_client
-from .modules.ec2 import get_ec2_client
+from .modules.ec2 import EC2InstanceData, get_ec2_client
 from .utils import ec2, rabbitmq, utils_docker
 
 logger = logging.getLogger(__name__)
 
 _EC2_INTERNAL_DNS_RE: re.Pattern = re.compile(r"^(?P<ip>ip-[0-9-]+).+$")
 
 
-async def check_dynamic_resources(app: FastAPI) -> None:
-    """Check that there are no pending tasks requiring additional resources in the cluster (docker swarm)
-    If there are such tasks, this method will allocate new machines in AWS to cope with
-    the additional load.
-    """
+async def _mark_empty_active_nodes_to_drain(
+    app: FastAPI,
+    monitored_nodes: list[Node],
+) -> None:
     app_settings: ApplicationSettings = app.state.settings
     assert app_settings.AUTOSCALING_NODES_MONITORING  # nosec
-
-    # 1. get monitored nodes information and resources
     docker_client = get_docker_client(app)
-    monitored_nodes = await utils_docker.get_monitored_nodes(
-        docker_client,
-        node_labels=app_settings.AUTOSCALING_NODES_MONITORING.NODES_MONITORING_NODE_LABELS,
+    active_empty_nodes = [
+        node
+        for node in monitored_nodes
+        if (
+            await utils_docker.compute_node_used_resources(
+                docker_client,
+                node,
+                service_labels=app_settings.AUTOSCALING_NODES_MONITORING.NODES_MONITORING_SERVICE_LABELS,
+            )
+            == Resources.create_as_empty()
+        )
+        and (node.Spec is not None)
+        and (node.Spec.Availability == Availability.active)
+    ]
+    await asyncio.gather(
+        *(
+            utils_docker.set_node_availability(
+                docker_client,
+                node,
+                available=False,
+            )
+            for node in active_empty_nodes
+            if (node.Spec) and (node.Spec.Labels is not None)
+        )
     )
+    if active_empty_nodes:
+        logger.info(
+            "The following nodes set to drain: '%s'",
+            f"{[node.Description.Hostname for node in active_empty_nodes if node.Description]}",
+        )
 
-    cluster_total_resources = await utils_docker.compute_cluster_total_resources(
-        monitored_nodes
-    )
-    logger.info("%s", f"{cluster_total_resources=}")
-    cluster_used_resources = await utils_docker.compute_cluster_used_resources(
-        docker_client, monitored_nodes
-    )
-    logger.info("%s", f"{cluster_used_resources=}")
 
-    # 2. Remove nodes that are gone
-    await utils_docker.remove_monitored_down_nodes(docker_client, monitored_nodes)
+async def _find_terminateable_nodes(
+    app: FastAPI, monitored_nodes: list[Node]
+) -> list[tuple[Node, EC2InstanceData]]:
+    app_settings: ApplicationSettings = app.state.settings
+    assert app_settings.AUTOSCALING_NODES_MONITORING  # nosec
+    docker_client = get_docker_client(app)
+
+    # NOTE: we want the drained nodes where no monitored service is running anymore
+    drained_empty_nodes = [
+        node
+        for node in monitored_nodes
+        if (
+            await utils_docker.compute_node_used_resources(
+                docker_client,
+                node,
+                service_labels=app_settings.AUTOSCALING_NODES_MONITORING.NODES_MONITORING_SERVICE_LABELS,
+            )
+            == Resources.create_as_empty()
+        )
+        and (node.Spec is not None)
+        and (node.Spec.Availability == Availability.drain)
+    ]
+    assert app_settings.AUTOSCALING_EC2_INSTANCES  # nosec
+    if not drained_empty_nodes:
+        # there is nothing to terminate here
+        return []
 
-    # 3. Scale up nodes if there are pending tasks
-    pending_tasks = await utils_docker.pending_service_tasks_with_insufficient_resources(
-        docker_client,
-        service_labels=app_settings.AUTOSCALING_NODES_MONITORING.NODES_MONITORING_SERVICE_LABELS,
-    )
-    await rabbitmq.post_state_message(
-        app,
-        monitored_nodes,
-        cluster_total_resources,
-        cluster_used_resources,
-        pending_tasks,
+    # get the corresponding ec2 instance data
+    # NOTE: some might be in the process of terminating and will not be found
+    ec2_client = get_ec2_client(app)
+    drained_empty_ec2_instances = await asyncio.gather(
+        *(
+            ec2_client.get_running_instance(
+                app_settings.AUTOSCALING_EC2_INSTANCES,
+                tag_keys=[
+                    "io.simcore.autoscaling.version",
+                ],
+                instance_host_name=node.Description.Hostname,
+            )
+            for node in drained_empty_nodes
+            if node.Description and node.Description.Hostname
+        ),
+        return_exceptions=True,
     )
 
+    terminateable_nodes: list[tuple[Node, EC2InstanceData]] = []
+    for node, ec2_instance_data in zip(
+        drained_empty_nodes, drained_empty_ec2_instances
+    ):
+        if isinstance(ec2_instance_data, Ec2InstanceNotFoundError):
+            # skip if already terminating
+            continue
+        # NOTE: AWS price is hourly based (e.g. same price for a machine used 2 minutes or 1 hour, so we wait until 55 minutes)
+        elapsed_time_since_launched = (
+            datetime.utcnow().replace(tzinfo=timezone.utc)
+            - ec2_instance_data.launch_time
+        )
+        elapsed_time_since_full_hour = elapsed_time_since_launched % timedelta(hours=1)
+        if (
+            elapsed_time_since_full_hour
+            >= app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_TIME_BEFORE_TERMINATION
+        ):
+            # let's terminate that one
+            terminateable_nodes.append((node, ec2_instance_data))
+    if terminateable_nodes:
+        logger.info(
+            "the following nodes were found to be terminateable: '%s'",
+            f"{[node.Description.Hostname for node,_ in terminateable_nodes if node.Description]}",
+        )
+    return terminateable_nodes
+
+
+async def _try_scale_down_cluster(app: FastAPI, monitored_nodes: list[Node]) -> None:
+    # 2. once it is in draining mode and we are nearing a modulo of an hour we can start the termination procedure
+    # NOTE: the nodes that were just changed to drain above will be eventually terminated on the next iteration
+    if terminateable_nodes := await _find_terminateable_nodes(app, monitored_nodes):
+        await asyncio.gather(
+            *(
+                get_ec2_client(app).terminate_instance(ec2_instance_data)
+                for _, ec2_instance_data in terminateable_nodes
+            )
+        )
+        logger.info(
+            "terminated the following machines: '%s'",
+            f"{[node.Description.Hostname for node,_ in terminateable_nodes if node.Description]}",
+        )
+        # since these nodes are being terminated, remove them from the swarm
+        await utils_docker.remove_nodes(
+            get_docker_client(app),
+            [node for node, _ in terminateable_nodes],
+            force=True,
+        )
+
+    # 3. we could ask on rabbit whether someone would like to keep that machine for something (like the agent for example), if that is the case, we wait another hour and ask again?
+    # 4.
+
+
+async def _try_scale_up_with_drained_nodes(
+    app: FastAPI,
+    monitored_nodes: list[Node],
+    pending_tasks: list[Task],
+) -> bool:
+    docker_client = get_docker_client(app)
     if not pending_tasks:
-        logger.debug("no pending tasks with insufficient resources at the moment")
-        return
+        return True
+    for task in pending_tasks:
+        # NOTE: currently we go one by one and break, next iteration
+        # will take care of next tasks if there are any
+
+        # check if there is some node with enough resources
+        for node in monitored_nodes:
+            assert node.Spec  # nosec
+            assert node.Description  # nosec
+            if (node.Spec.Availability == Availability.drain) and (
+                utils_docker.get_node_total_resources(node)
+                >= utils_docker.get_max_resources_from_docker_task(task)
+            ):
+                # let's make that node available again
+                await utils_docker.set_node_availability(
+                    docker_client, node, available=True
+                )
+                logger.info(
+                    "Activated former drained node '%s'", node.Description.Hostname
+                )
+                await rabbitmq.post_log_message(
+                    app,
+                    task,
+                    "cluster was scaled up and is now ready to run service",
+                    logging.INFO,
+                )
+                return True
+    logger.info("There are no available drained node for the pending tasks")
+    return False
+
 
+async def _scale_up_cluster(app: FastAPI, pending_tasks: list[Task]) -> None:
+    app_settings: ApplicationSettings = app.state.settings
     assert app_settings.AUTOSCALING_EC2_ACCESS  # nosec
     assert app_settings.AUTOSCALING_EC2_INSTANCES  # nosec
     ec2_client = get_ec2_client(app)
@@ -88,13 +223,12 @@ async def check_dynamic_resources(app: FastAPI) -> None:
             assert app_settings.AUTOSCALING_NODES_MONITORING  # nosec
 
             logger.debug("%s", f"{ec2_instances_needed[0]=}")
-            new_instance_dns_name = await ec2_client.start_aws_instance(
+            new_instance_data = await ec2_client.start_aws_instance(
                 app_settings.AUTOSCALING_EC2_INSTANCES,
                 instance_type=parse_obj_as(
                     InstanceTypeType, ec2_instances_needed[0].name
                 ),
                 tags={
-                    "io.simcore.autoscaling.created": f"{datetime.utcnow()}",
                     "io.simcore.autoscaling.version": f"{VERSION}",
                     "io.simcore.autoscaling.monitored_nodes_labels": json.dumps(
                         app_settings.AUTOSCALING_NODES_MONITORING.NODES_MONITORING_NODE_LABELS
@@ -107,13 +241,15 @@ async def check_dynamic_resources(app: FastAPI) -> None:
             )
 
             # NOTE: new_instance_dns_name is of type ip-123-23-23-3.ec2.internal and we need only the first part
-            if match := re.match(_EC2_INTERNAL_DNS_RE, new_instance_dns_name):
+            if match := re.match(
+                _EC2_INTERNAL_DNS_RE, new_instance_data.aws_private_dns
+            ):
                 new_instance_dns_name = match.group(1)
                 new_node = await utils_docker.wait_for_node(
-                    docker_client, new_instance_dns_name
+                    get_docker_client(app), new_instance_dns_name
                 )
                 await utils_docker.tag_node(
-                    docker_client,
+                    get_docker_client(app),
                     new_node,
                     tags={
                         tag_key: "true"
@@ -138,5 +274,51 @@ async def check_dynamic_resources(app: FastAPI) -> None:
             logger.error(
                 "Task %s needs more resources than any EC2 instance "
                 "can provide with the current configuration. Please check.",
-                {f"{task.Name=}:{task.ServiceID=}"},
+                {
+                    f"{task.Name if task.Name else 'unknown task name'}:{task.ServiceID if task.ServiceID else 'unknown service ID'}"
+                },
             )
+
+
+async def check_dynamic_resources(app: FastAPI) -> None:
+    """Check that there are no pending tasks requiring additional resources in the cluster (docker swarm)
+    If there are such tasks, this method will allocate new machines in AWS to cope with
+    the additional load.
+    """
+    app_settings: ApplicationSettings = app.state.settings
+    assert app_settings.AUTOSCALING_NODES_MONITORING  # nosec
+
+    # 1. get monitored nodes information and resources
+    docker_client = get_docker_client(app)
+
+    monitored_nodes = await utils_docker.get_monitored_nodes(
+        docker_client,
+        node_labels=app_settings.AUTOSCALING_NODES_MONITORING.NODES_MONITORING_NODE_LABELS,
+    )
+    cluster_total_resources = await utils_docker.compute_cluster_total_resources(
+        monitored_nodes
+    )
+    cluster_used_resources = await utils_docker.compute_cluster_used_resources(
+        docker_client, monitored_nodes
+    )
+    logger.info("Monitored nodes total resources: %s", f"{cluster_total_resources}")
+    logger.info(
+        "Monitored nodes current used resources: %s", f"{cluster_used_resources}"
+    )
+
+    # 2. Cleanup nodes that are gone
+    await utils_docker.remove_nodes(docker_client, monitored_nodes)
+
+    # 3. Scale up the cluster if there are pending tasks, else see if we shall scale down
+    if pending_tasks := await utils_docker.pending_service_tasks_with_insufficient_resources(
+        docker_client,
+        service_labels=app_settings.AUTOSCALING_NODES_MONITORING.NODES_MONITORING_SERVICE_LABELS,
+    ):
+        if not await _try_scale_up_with_drained_nodes(
+            app, monitored_nodes, pending_tasks
+        ):
+            # no? then scale up
+            await _scale_up_cluster(app, pending_tasks)
+    else:
+        await _mark_empty_active_nodes_to_drain(app, monitored_nodes)
+        await _try_scale_down_cluster(app, monitored_nodes)
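
The core of the new scale-down logic in _find_terminateable_nodes is the observation that EC2 instances are billed by the started hour, so a drained and empty node is only worth terminating when it is close to completing its current billing hour. A small standalone sketch of that modulo check (function and variable names are illustrative, not taken from the codebase):

from datetime import datetime, timedelta, timezone

TIME_BEFORE_TERMINATION = timedelta(minutes=55)  # default of the new setting

def close_to_billing_hour(launch_time: datetime, now: datetime) -> bool:
    # how far we are into the *current* hourly billing period
    elapsed_since_full_hour = (now - launch_time) % timedelta(hours=1)
    return elapsed_since_full_hour >= TIME_BEFORE_TERMINATION

now = datetime.now(timezone.utc)
print(close_to_billing_hour(now - timedelta(minutes=20), now))           # False: 20 min into the hour, keep it
print(close_to_billing_hour(now - timedelta(hours=2, minutes=57), now))  # True: 57 min into the 3rd hour, terminate

With the default of 55 minutes, a drained node therefore gets a roughly 5-minute window at the end of every paid hour in which it can be reclaimed before the next hour is billed.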

services/autoscaling/src/simcore_service_autoscaling/models.py

Lines changed: 15 additions & 0 deletions
@@ -9,6 +9,21 @@ class Resources(BaseModel):
     cpus: NonNegativeFloat
     ram: ByteSize
 
+    @classmethod
+    def create_as_empty(cls) -> "Resources":
+        return cls(cpus=0, ram=ByteSize(0))
+
+    def __ge__(self, other: "Resources") -> bool:
+        return self.cpus >= other.cpus and self.ram >= other.ram
+
+    def __add__(self, other: "Resources") -> "Resources":
+        return Resources.construct(
+            **{
+                key: a + b
+                for (key, a), b in zip(self.dict().items(), other.dict().values())
+            }
+        )
+
 
 class EC2Instance(BaseModel):
     name: str
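
These Resources helpers are what the checks above rely on: create_as_empty() is the "nothing is running on this node" marker, __ge__ answers "can this node host that task", and __add__ aggregates usage. A short usage sketch (the class body is copied from the diff; the node/task figures are made up):

from pydantic import BaseModel, ByteSize, NonNegativeFloat, parse_obj_as

class Resources(BaseModel):
    cpus: NonNegativeFloat
    ram: ByteSize

    @classmethod
    def create_as_empty(cls) -> "Resources":
        return cls(cpus=0, ram=ByteSize(0))

    def __ge__(self, other: "Resources") -> bool:
        return self.cpus >= other.cpus and self.ram >= other.ram

    def __add__(self, other: "Resources") -> "Resources":
        return Resources.construct(
            **{
                key: a + b
                for (key, a), b in zip(self.dict().items(), other.dict().values())
            }
        )

node = Resources(cpus=4, ram=parse_obj_as(ByteSize, "16GiB"))
task = Resources(cpus=2, ram=parse_obj_as(ByteSize, "4GiB"))

print(node >= task)                          # True: the node can take the task
print((task + task).dict())                  # {'cpus': 4.0, 'ram': 8589934592}
print(Resources.create_as_empty() >= task)   # False: an empty node offers no resources

Defining only __ge__ (and not a full ordering) is enough here, because the autoscaler only ever asks whether one Resources fully covers another.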
