From b2cd7f31dd348879c418c19c07b183dee277bc06 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 13 Sep 2024 18:38:53 +0200 Subject: [PATCH 01/14] refactor --- .../modules/clusters.py | 2 +- .../modules/clusters_management_core.py | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/clusters.py b/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/clusters.py index 38246f3008a0..62528f6b69e1 100644 --- a/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/clusters.py +++ b/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/clusters.py @@ -49,7 +49,7 @@ async def _get_primary_ec2_params( ec2_instance_types: list[ EC2InstanceType ] = await ec2_client.get_ec2_instance_capabilities( - instance_type_names=[ec2_type_name] + instance_type_names={ec2_type_name} ) assert ec2_instance_types # nosec assert len(ec2_instance_types) == 1 # nosec diff --git a/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/clusters_management_core.py b/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/clusters_management_core.py index a7c23143a0b7..fd905b1d18e0 100644 --- a/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/clusters_management_core.py +++ b/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/clusters_management_core.py @@ -42,8 +42,8 @@ def _get_instance_last_heartbeat(instance: EC2InstanceData) -> datetime.datetime async def _get_all_associated_worker_instances( app: FastAPI, primary_instances: Iterable[EC2InstanceData], -) -> list[EC2InstanceData]: - worker_instances = [] +) -> set[EC2InstanceData]: + worker_instances: set[EC2InstanceData] = set() for instance in primary_instances: assert "user_id" in instance.tags # nosec user_id = UserID(instance.tags[_USER_ID_TAG_KEY]) @@ -55,7 +55,7 @@ async def _get_all_associated_worker_instances( else None ) - worker_instances.extend( + worker_instances.update( await get_cluster_workers(app, user_id=user_id, wallet_id=wallet_id) ) return worker_instances @@ -63,12 +63,12 @@ async def _get_all_associated_worker_instances( async def _find_terminateable_instances( app: FastAPI, instances: Iterable[EC2InstanceData] -) -> list[EC2InstanceData]: +) -> set[EC2InstanceData]: app_settings = get_application_settings(app) assert app_settings.CLUSTERS_KEEPER_PRIMARY_EC2_INSTANCES # nosec # get the corresponding ec2 instance data - terminateable_instances: list[EC2InstanceData] = [] + terminateable_instances: set[EC2InstanceData] = set() time_to_wait_before_termination = ( app_settings.CLUSTERS_KEEPER_MAX_MISSED_HEARTBEATS_BEFORE_CLUSTER_TERMINATION @@ -82,7 +82,7 @@ async def _find_terminateable_instances( elapsed_time_since_heartbeat = arrow.utcnow().datetime - last_heartbeat allowed_time_to_wait = time_to_wait_before_termination if elapsed_time_since_heartbeat >= allowed_time_to_wait: - terminateable_instances.append(instance) + terminateable_instances.add(instance) else: _logger.info( "%s has still %ss before being terminateable", @@ -93,14 +93,14 @@ async def _find_terminateable_instances( elapsed_time_since_startup = arrow.utcnow().datetime - instance.launch_time allowed_time_to_wait = startup_delay if elapsed_time_since_startup >= allowed_time_to_wait: - terminateable_instances.append(instance) + terminateable_instances.add(instance) # get all terminateable instances associated worker instances worker_instances = await _get_all_associated_worker_instances( app, terminateable_instances ) - return terminateable_instances + worker_instances + return terminateable_instances.union(worker_instances) async def check_clusters(app: FastAPI) -> None: From aa11d6594df7ca553706db986897a29ba474987d Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 13 Sep 2024 18:54:17 +0200 Subject: [PATCH 02/14] ongoing --- .../modules/clusters_management_core.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/clusters_management_core.py b/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/clusters_management_core.py index fd905b1d18e0..c28aa3ca22d1 100644 --- a/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/clusters_management_core.py +++ b/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/clusters_management_core.py @@ -112,6 +112,7 @@ async def check_clusters(app: FastAPI) -> None: if await ping_scheduler(get_scheduler_url(instance), get_scheduler_auth(app)) } + # set intance heartbeat if scheduler is busy for instance in connected_intances: with log_catch(_logger, reraise=False): # NOTE: some connected instance could in theory break between these 2 calls, therefore this is silenced and will @@ -124,6 +125,7 @@ async def check_clusters(app: FastAPI) -> None: f"{instance.id=} for {instance.tags=}", ) await set_instance_heartbeat(app, instance=instance) + # clean any cluster that is not doing anything if terminateable_instances := await _find_terminateable_instances( app, connected_intances ): @@ -138,7 +140,7 @@ async def check_clusters(app: FastAPI) -> None: for instance in disconnected_instances if _get_instance_last_heartbeat(instance) is None } - + # remove instances that were starting for too long if terminateable_instances := await _find_terminateable_instances( app, starting_instances ): @@ -149,7 +151,14 @@ async def check_clusters(app: FastAPI) -> None: ) await delete_clusters(app, instances=terminateable_instances) - # the other instances are broken (they were at some point connected but now not anymore) + # TODO: transmit command to start docker swarm/stack if needed + # once the instance is connected to the SSM server, + # use ssm client to send the command to these instances, + # we send a command that contain: + # the docker-compose file in binary, + # the call to init the docker swarm and the call to deploy the stack + + # the remaining instances are broken (they were at some point connected but now not anymore) broken_instances = disconnected_instances - starting_instances if terminateable_instances := await _find_terminateable_instances( app, broken_instances From 01fdf4d76f909e75c4502260a361eba873b299f9 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 16 Sep 2024 08:50:36 +0200 Subject: [PATCH 03/14] added ssm client, use ssm client, on the way to deploying stack via ssm --- .../constants.py | 9 +++ .../core/application.py | 2 + .../core/settings.py | 32 ++++++++- .../modules/clusters.py | 8 --- .../modules/clusters_management_core.py | 67 ++++++++++++++++++- .../modules/clusters_management_task.py | 1 + .../modules/ssm.py | 56 ++++++++++++++++ .../utils/clusters.py | 35 ++++++---- services/docker-compose.yml | 2 + 9 files changed, 187 insertions(+), 25 deletions(-) create mode 100644 services/clusters-keeper/src/simcore_service_clusters_keeper/constants.py create mode 100644 services/clusters-keeper/src/simcore_service_clusters_keeper/modules/ssm.py diff --git a/services/clusters-keeper/src/simcore_service_clusters_keeper/constants.py b/services/clusters-keeper/src/simcore_service_clusters_keeper/constants.py new file mode 100644 index 000000000000..f34285726073 --- /dev/null +++ b/services/clusters-keeper/src/simcore_service_clusters_keeper/constants.py @@ -0,0 +1,9 @@ +from typing import Final + +from aws_library.ec2._models import AWSTagKey +from pydantic import parse_obj_as + +DOCKER_STACK_DEPLOY_COMMAND_NAME: Final[str] = "private cluster docker deploy" +DOCKER_STACK_DEPLOY_COMMAND_EC2_TAG_KEY: Final[AWSTagKey] = parse_obj_as( + AWSTagKey, "io.simcore.clusters-keeper.private_cluster_docker_deploy" +) diff --git a/services/clusters-keeper/src/simcore_service_clusters_keeper/core/application.py b/services/clusters-keeper/src/simcore_service_clusters_keeper/core/application.py index 14b3d344b701..5948715b0813 100644 --- a/services/clusters-keeper/src/simcore_service_clusters_keeper/core/application.py +++ b/services/clusters-keeper/src/simcore_service_clusters_keeper/core/application.py @@ -19,6 +19,7 @@ from ..modules.ec2 import setup as setup_ec2 from ..modules.rabbitmq import setup as setup_rabbitmq from ..modules.redis import setup as setup_redis +from ..modules.ssm import setup as setup_ssm from ..rpc.rpc_routes import setup_rpc_routes from .settings import ApplicationSettings @@ -55,6 +56,7 @@ def create_app(settings: ApplicationSettings) -> FastAPI: setup_rabbitmq(app) setup_rpc_routes(app) setup_ec2(app) + setup_ssm(app) setup_redis(app) setup_clusters_management(app) diff --git a/services/clusters-keeper/src/simcore_service_clusters_keeper/core/settings.py b/services/clusters-keeper/src/simcore_service_clusters_keeper/core/settings.py index 17a8ffcaae8c..7f53f2f97ce7 100644 --- a/services/clusters-keeper/src/simcore_service_clusters_keeper/core/settings.py +++ b/services/clusters-keeper/src/simcore_service_clusters_keeper/core/settings.py @@ -25,6 +25,7 @@ from settings_library.ec2 import EC2Settings from settings_library.rabbit import RabbitSettings from settings_library.redis import RedisSettings +from settings_library.ssm import SSMSettings from settings_library.tracing import TracingSettings from settings_library.utils_logging import MixinLoggingSettings from types_aiobotocore_ec2.literals import InstanceTypeType @@ -50,6 +51,19 @@ class Config(EC2Settings.Config): } +class ClustersKeeperSSMSettings(SSMSettings): + class Config(SSMSettings.Config): + env_prefix = CLUSTERS_KEEPER_ENV_PREFIX + prefixed_examples: ClassVar[list[dict[str, Any]]] = [ + {f"{CLUSTERS_KEEPER_ENV_PREFIX}{key}": var for key, var in example.items()} + for example in SSMSettings.Config.schema_extra["examples"] + ] + + schema_extra: ClassVar[dict[str, Any]] = { + "examples": prefixed_examples, + } + + class WorkersEC2InstancesSettings(BaseCustomSettings): WORKERS_EC2_INSTANCES_ALLOWED_TYPES: dict[str, EC2InstanceBootSpecific] = Field( ..., @@ -183,6 +197,12 @@ class PrimaryEC2InstancesSettings(BaseCustomSettings): "that take longer than this time will be terminated as sometimes it happens that EC2 machine fail on start.", ) + PRIMARY_EC2_INSTANCES_DOCKER_DEFAULT_ADDRESS_POOL: str = Field( + default="172.20.0.0/14", + description="defines the docker swarm default address pool in CIDR format " + "(see https://docs.docker.com/reference/cli/docker/swarm/init/)", + ) + @validator("PRIMARY_EC2_INSTANCES_ALLOWED_TYPES") @classmethod def check_valid_instance_names( @@ -250,6 +270,10 @@ class ApplicationSettings(BaseCustomSettings, MixinLoggingSettings): auto_default_from_env=True ) + CLUSTERS_KEEPER_SSM_ACCESS: ClustersKeeperSSMSettings | None = Field( + auto_default_from_env=True + ) + CLUSTERS_KEEPER_PRIMARY_EC2_INSTANCES: PrimaryEC2InstancesSettings | None = Field( auto_default_from_env=True ) @@ -285,9 +309,11 @@ class ApplicationSettings(BaseCustomSettings, MixinLoggingSettings): "(default to seconds, or see https://pydantic-docs.helpmanual.io/usage/types/#datetime-types for string formating)", ) - CLUSTERS_KEEPER_MAX_MISSED_HEARTBEATS_BEFORE_CLUSTER_TERMINATION: NonNegativeInt = Field( - default=5, - description="Max number of missed heartbeats before a cluster is terminated", + CLUSTERS_KEEPER_MAX_MISSED_HEARTBEATS_BEFORE_CLUSTER_TERMINATION: NonNegativeInt = ( + Field( + default=5, + description="Max number of missed heartbeats before a cluster is terminated", + ) ) CLUSTERS_KEEPER_COMPUTATIONAL_BACKEND_DOCKER_IMAGE_TAG: str = Field( diff --git a/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/clusters.py b/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/clusters.py index 62528f6b69e1..89860549fd3a 100644 --- a/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/clusters.py +++ b/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/clusters.py @@ -72,15 +72,7 @@ async def create_cluster( tags=creation_ec2_tags(app_settings, user_id=user_id, wallet_id=wallet_id), startup_script=create_startup_script( app_settings, - cluster_machines_name_prefix=get_cluster_name( - app_settings, user_id=user_id, wallet_id=wallet_id, is_manager=False - ), ec2_boot_specific=ec2_instance_boot_specs, - additional_custom_tags={ - AWSTagKey("user_id"): AWSTagValue(f"{user_id}"), - AWSTagKey("wallet_id"): AWSTagValue(f"{wallet_id}"), - AWSTagKey("role"): AWSTagValue("worker"), - }, ), ami_id=ec2_instance_boot_specs.ami_id, key_name=app_settings.CLUSTERS_KEEPER_PRIMARY_EC2_INSTANCES.PRIMARY_EC2_INSTANCES_KEY_NAME, diff --git a/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/clusters_management_core.py b/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/clusters_management_core.py index c28aa3ca22d1..20443f7488b3 100644 --- a/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/clusters_management_core.py +++ b/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/clusters_management_core.py @@ -5,12 +5,18 @@ import arrow from aws_library.ec2 import AWSTagKey, EC2InstanceData +from aws_library.ec2._models import AWSTagValue from fastapi import FastAPI from models_library.users import UserID from models_library.wallets import WalletID from pydantic import parse_obj_as from servicelib.logging_utils import log_catch +from servicelib.utils import limited_gather +from ..constants import ( + DOCKER_STACK_DEPLOY_COMMAND_EC2_TAG_KEY, + DOCKER_STACK_DEPLOY_COMMAND_NAME, +) from ..core.settings import get_application_settings from ..modules.clusters import ( delete_clusters, @@ -18,9 +24,12 @@ get_cluster_workers, set_instance_heartbeat, ) +from ..utils.clusters import create_deploy_cluster_stack_script from ..utils.dask import get_scheduler_auth, get_scheduler_url -from ..utils.ec2 import HEARTBEAT_TAG_KEY +from ..utils.ec2 import HEARTBEAT_TAG_KEY, get_cluster_name from .dask import is_scheduler_busy, ping_scheduler +from .ec2 import get_ec2_client +from .ssm import get_ssm_client _logger = logging.getLogger(__name__) @@ -157,6 +166,62 @@ async def check_clusters(app: FastAPI) -> None: # we send a command that contain: # the docker-compose file in binary, # the call to init the docker swarm and the call to deploy the stack + instances_in_need_of_deployment = { + i + for i in starting_instances - terminateable_instances + if DOCKER_STACK_DEPLOY_COMMAND_EC2_TAG_KEY not in i.tags + } + + if instances_in_need_of_deployment: + app_settings = get_application_settings(app) + ssm_client = get_ssm_client(app) + ec2_client = get_ec2_client(app) + instances_in_need_of_deployment_ssm_connection_state = await limited_gather( + *[ + ssm_client.is_instance_connected_to_ssm_server(i.id) + for i in instances_in_need_of_deployment + ], + reraise=False, + log=_logger, + limit=20, + ) + ec2_connected_to_ssm_server = [ + i + for i, c in zip( + instances_in_need_of_deployment, + instances_in_need_of_deployment_ssm_connection_state, + strict=True, + ) + if c is True + ] + started_instances_ready_for_command = ec2_connected_to_ssm_server + if started_instances_ready_for_command: + ssm_command = await ssm_client.send_command( + [i.id for i in started_instances_ready_for_command], + command=create_deploy_cluster_stack_script( + app_settings, + cluster_machines_name_prefix=get_cluster_name( + app_settings, + user_id=user_id, + wallet_id=wallet_id, + is_manager=False, + ), + additional_custom_tags={ + AWSTagKey("user_id"): AWSTagValue(f"{user_id}"), + AWSTagKey("wallet_id"): AWSTagValue(f"{wallet_id}"), + AWSTagKey("role"): AWSTagValue("worker"), + }, + ), + command_name=DOCKER_STACK_DEPLOY_COMMAND_NAME, + ) + await ec2_client.set_instances_tags( + started_instances_ready_for_command, + tags={ + DOCKER_STACK_DEPLOY_COMMAND_EC2_TAG_KEY: AWSTagValue( + ssm_command.command_id + ), + }, + ) # the remaining instances are broken (they were at some point connected but now not anymore) broken_instances = disconnected_instances - starting_instances diff --git a/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/clusters_management_task.py b/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/clusters_management_task.py index 806cb6d472c1..410edba1efbc 100644 --- a/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/clusters_management_task.py +++ b/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/clusters_management_task.py @@ -47,6 +47,7 @@ def setup(app: FastAPI): for s in [ app_settings.CLUSTERS_KEEPER_EC2_ACCESS, app_settings.CLUSTERS_KEEPER_PRIMARY_EC2_INSTANCES, + app_settings.CLUSTERS_KEEPER_SSM_ACCESS, ] ): logger.warning( diff --git a/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/ssm.py b/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/ssm.py new file mode 100644 index 000000000000..fb1c0e88b55b --- /dev/null +++ b/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/ssm.py @@ -0,0 +1,56 @@ +import logging +from typing import cast + +from aws_library.ssm import SimcoreSSMAPI +from aws_library.ssm._errors import SSMNotConnectedError +from fastapi import FastAPI +from settings_library.ssm import SSMSettings +from tenacity.asyncio import AsyncRetrying +from tenacity.before_sleep import before_sleep_log +from tenacity.stop import stop_after_delay +from tenacity.wait import wait_random_exponential + +from ..core.errors import ConfigurationError +from ..core.settings import get_application_settings + +_logger = logging.getLogger(__name__) + + +def setup(app: FastAPI) -> None: + async def on_startup() -> None: + app.state.ssm_client = None + settings: SSMSettings | None = get_application_settings( + app + ).AUTOSCALING_SSM_ACCESS + + if not settings: + _logger.warning("SSM client is de-activated in the settings") + return + + app.state.ssm_client = client = await SimcoreSSMAPI.create(settings) + + async for attempt in AsyncRetrying( + reraise=True, + stop=stop_after_delay(120), + wait=wait_random_exponential(max=30), + before_sleep=before_sleep_log(_logger, logging.WARNING), + ): + with attempt: + connected = await client.ping() + if not connected: + raise SSMNotConnectedError # pragma: no cover + + async def on_shutdown() -> None: + if app.state.ssm_client: + await cast(SimcoreSSMAPI, app.state.ssm_client).close() + + app.add_event_handler("startup", on_startup) + app.add_event_handler("shutdown", on_shutdown) + + +def get_ssm_client(app: FastAPI) -> SimcoreSSMAPI: + if not app.state.ssm_client: + raise ConfigurationError( + msg="SSM client is not available. Please check the configuration." + ) + return cast(SimcoreSSMAPI, app.state.ssm_client) diff --git a/services/clusters-keeper/src/simcore_service_clusters_keeper/utils/clusters.py b/services/clusters-keeper/src/simcore_service_clusters_keeper/utils/clusters.py index 48eb4dee380b..c9b4a32f4afa 100644 --- a/services/clusters-keeper/src/simcore_service_clusters_keeper/utils/clusters.py +++ b/services/clusters-keeper/src/simcore_service_clusters_keeper/utils/clusters.py @@ -8,6 +8,7 @@ import arrow import yaml from aws_library.ec2 import EC2InstanceBootSpecific, EC2InstanceData, EC2Tags +from aws_library.ec2._models import CommandStr from fastapi.encoders import jsonable_encoder from models_library.api_schemas_clusters_keeper.clusters import ( ClusterState, @@ -107,35 +108,43 @@ def _convert_to_env_dict(entries: dict[str, Any]) -> str: def create_startup_script( app_settings: ApplicationSettings, *, - cluster_machines_name_prefix: str, ec2_boot_specific: EC2InstanceBootSpecific, - additional_custom_tags: EC2Tags, ) -> str: assert app_settings.CLUSTERS_KEEPER_EC2_ACCESS # nosec assert app_settings.CLUSTERS_KEEPER_WORKERS_EC2_INSTANCES # nosec - environment_variables = _prepare_environment_variables( - app_settings, - cluster_machines_name_prefix=cluster_machines_name_prefix, - additional_custom_tags=additional_custom_tags, - ) - startup_commands = ec2_boot_specific.custom_boot_scripts.copy() + return "\n".join(startup_commands) + + +def create_deploy_cluster_stack_script( + app_settings: ApplicationSettings, + *, + cluster_machines_name_prefix: str, + additional_custom_tags: EC2Tags, +) -> str: + deploy_script: list[CommandStr] = [] assert app_settings.CLUSTERS_KEEPER_PRIMARY_EC2_INSTANCES # nosec if isinstance( app_settings.CLUSTERS_KEEPER_COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_AUTH, TLSAuthentication, ): - + # get the dask certificates download_certificates_commands = [ f"mkdir --parents {_HOST_CERTIFICATES_BASE_PATH}", f'aws ssm get-parameter --name "{app_settings.CLUSTERS_KEEPER_PRIMARY_EC2_INSTANCES.PRIMARY_EC2_INSTANCES_SSM_TLS_DASK_CA}" --region us-east-1 --with-decryption --query "Parameter.Value" --output text > {_HOST_TLS_CA_FILE_PATH}', f'aws ssm get-parameter --name "{app_settings.CLUSTERS_KEEPER_PRIMARY_EC2_INSTANCES.PRIMARY_EC2_INSTANCES_SSM_TLS_DASK_CERT}" --region us-east-1 --with-decryption --query "Parameter.Value" --output text > {_HOST_TLS_CERT_FILE_PATH}', f'aws ssm get-parameter --name "{app_settings.CLUSTERS_KEEPER_PRIMARY_EC2_INSTANCES.PRIMARY_EC2_INSTANCES_SSM_TLS_DASK_KEY}" --region us-east-1 --with-decryption --query "Parameter.Value" --output text > {_HOST_TLS_KEY_FILE_PATH}', ] - startup_commands.extend(download_certificates_commands) + deploy_script.extend(download_certificates_commands) + + environment_variables = _prepare_environment_variables( + app_settings, + cluster_machines_name_prefix=cluster_machines_name_prefix, + additional_custom_tags=additional_custom_tags, + ) - startup_commands.extend( + deploy_script.extend( [ # NOTE: https://stackoverflow.com/questions/41203492/solving-redis-warnings-on-overcommit-memory-and-transparent-huge-pages-for-ubunt "sysctl vm.overcommit_memory=1", @@ -143,11 +152,11 @@ def create_startup_script( f"echo '{_prometheus_yml_base64_encoded()}' | base64 -d > {_HOST_PROMETHEUS_PATH}", f"echo '{_prometheus_basic_auth_yml_base64_encoded(app_settings.CLUSTERS_KEEPER_PRIMARY_EC2_INSTANCES.PRIMARY_EC2_INSTANCES_PROMETHEUS_USERNAME, app_settings.CLUSTERS_KEEPER_PRIMARY_EC2_INSTANCES.PRIMARY_EC2_INSTANCES_PROMETHEUS_PASSWORD.get_secret_value())}' | base64 -d > {_HOST_PROMETHEUS_WEB_PATH}", # NOTE: --default-addr-pool is necessary in order to prevent conflicts with AWS node IPs - "docker swarm init --default-addr-pool 172.20.0.0/14", + f"docker swarm init --default-addr-pool {app_settings.CLUSTERS_KEEPER_PRIMARY_EC2_INSTANCES.PRIMARY_EC2_INSTANCES_DOCKER_DEFAULT_ADDRESS_POOL}", f"{' '.join(environment_variables)} docker stack deploy --with-registry-auth --compose-file={_HOST_DOCKER_COMPOSE_PATH} dask_stack", ] ) - return "\n".join(startup_commands) + return "\n".join(deploy_script) def _convert_ec2_state_to_cluster_state( diff --git a/services/docker-compose.yml b/services/docker-compose.yml index 60ce2c26b17f..7581962fb09e 100644 --- a/services/docker-compose.yml +++ b/services/docker-compose.yml @@ -204,6 +204,8 @@ services: PRIMARY_EC2_INSTANCES_SSM_TLS_DASK_KEY: ${PRIMARY_EC2_INSTANCES_SSM_TLS_DASK_KEY} PRIMARY_EC2_INSTANCES_PROMETHEUS_USERNAME: ${PRIMARY_EC2_INSTANCES_PROMETHEUS_USERNAME} PRIMARY_EC2_INSTANCES_PROMETHEUS_PASSWORD: ${PRIMARY_EC2_INSTANCES_PROMETHEUS_PASSWORD} + PRIMARY_EC2_INSTANCES_MAX_START_TIME: ${PRIMARY_EC2_INSTANCES_MAX_START_TIME} + PRIMARY_EC2_INSTANCES_DOCKER_DEFAULT_ADDRESS_POOL: ${PRIMARY_EC2_INSTANCES_DOCKER_DEFAULT_ADDRESS_POOL} RABBIT_HOST: ${RABBIT_HOST} RABBIT_PASSWORD: ${RABBIT_PASSWORD} RABBIT_PORT: ${RABBIT_PORT} From aa7e03e5a5909cd40356c73d0640f66dfaed8e39 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 16 Sep 2024 17:16:28 +0200 Subject: [PATCH 04/14] fixed mypy --- .../constants.py | 8 +++- .../core/settings.py | 14 +++--- .../modules/clusters_management_core.py | 45 ++++++++++++------- .../modules/ssm.py | 2 +- .../utils/ec2.py | 27 ++++++++--- 5 files changed, 66 insertions(+), 30 deletions(-) diff --git a/services/clusters-keeper/src/simcore_service_clusters_keeper/constants.py b/services/clusters-keeper/src/simcore_service_clusters_keeper/constants.py index f34285726073..7f970665f25e 100644 --- a/services/clusters-keeper/src/simcore_service_clusters_keeper/constants.py +++ b/services/clusters-keeper/src/simcore_service_clusters_keeper/constants.py @@ -1,9 +1,15 @@ from typing import Final -from aws_library.ec2._models import AWSTagKey +from aws_library.ec2._models import AWSTagKey, AWSTagValue from pydantic import parse_obj_as DOCKER_STACK_DEPLOY_COMMAND_NAME: Final[str] = "private cluster docker deploy" DOCKER_STACK_DEPLOY_COMMAND_EC2_TAG_KEY: Final[AWSTagKey] = parse_obj_as( AWSTagKey, "io.simcore.clusters-keeper.private_cluster_docker_deploy" ) + +USER_ID_TAG_KEY: Final[AWSTagKey] = parse_obj_as(AWSTagKey, "user_id") +WALLET_ID_TAG_KEY: Final[AWSTagKey] = parse_obj_as(AWSTagKey, "wallet_id") +ROLE_TAG_KEY: Final[AWSTagKey] = parse_obj_as(AWSTagKey, "role") +WORKER_ROLE_TAG_VALUE: Final[AWSTagValue] = parse_obj_as(AWSTagValue, "worker") +MANAGER_ROLE_TAG_VALUE: Final[AWSTagValue] = parse_obj_as(AWSTagValue, "manager") diff --git a/services/clusters-keeper/src/simcore_service_clusters_keeper/core/settings.py b/services/clusters-keeper/src/simcore_service_clusters_keeper/core/settings.py index 7f53f2f97ce7..07fd7deb8bfa 100644 --- a/services/clusters-keeper/src/simcore_service_clusters_keeper/core/settings.py +++ b/services/clusters-keeper/src/simcore_service_clusters_keeper/core/settings.py @@ -54,13 +54,15 @@ class Config(EC2Settings.Config): class ClustersKeeperSSMSettings(SSMSettings): class Config(SSMSettings.Config): env_prefix = CLUSTERS_KEEPER_ENV_PREFIX - prefixed_examples: ClassVar[list[dict[str, Any]]] = [ - {f"{CLUSTERS_KEEPER_ENV_PREFIX}{key}": var for key, var in example.items()} - for example in SSMSettings.Config.schema_extra["examples"] - ] - schema_extra: ClassVar[dict[str, Any]] = { - "examples": prefixed_examples, + schema_extra: ClassVar[dict[str, Any]] = { # type: ignore[misc] + "examples": [ + { + f"{CLUSTERS_KEEPER_ENV_PREFIX}{key}": var + for key, var in example.items() + } + for example in SSMSettings.Config.schema_extra["examples"] + ], } diff --git a/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/clusters_management_core.py b/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/clusters_management_core.py index 20443f7488b3..cf23a01e522a 100644 --- a/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/clusters_management_core.py +++ b/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/clusters_management_core.py @@ -16,6 +16,10 @@ from ..constants import ( DOCKER_STACK_DEPLOY_COMMAND_EC2_TAG_KEY, DOCKER_STACK_DEPLOY_COMMAND_NAME, + ROLE_TAG_KEY, + USER_ID_TAG_KEY, + WALLET_ID_TAG_KEY, + WORKER_ROLE_TAG_VALUE, ) from ..core.settings import get_application_settings from ..modules.clusters import ( @@ -26,7 +30,12 @@ ) from ..utils.clusters import create_deploy_cluster_stack_script from ..utils.dask import get_scheduler_auth, get_scheduler_url -from ..utils.ec2 import HEARTBEAT_TAG_KEY, get_cluster_name +from ..utils.ec2 import ( + HEARTBEAT_TAG_KEY, + get_cluster_name, + user_id_from_instance_tags, + wallet_id_from_instance_tags, +) from .dask import is_scheduler_busy, ping_scheduler from .ec2 import get_ec2_client from .ssm import get_ssm_client @@ -196,24 +205,26 @@ async def check_clusters(app: FastAPI) -> None: ] started_instances_ready_for_command = ec2_connected_to_ssm_server if started_instances_ready_for_command: - ssm_command = await ssm_client.send_command( - [i.id for i in started_instances_ready_for_command], - command=create_deploy_cluster_stack_script( - app_settings, - cluster_machines_name_prefix=get_cluster_name( + # we need to send 1 command per machine here, as the user_id/wallet_id changes + for i in started_instances_ready_for_command: + ssm_command = await ssm_client.send_command( + [i.id], + command=create_deploy_cluster_stack_script( app_settings, - user_id=user_id, - wallet_id=wallet_id, - is_manager=False, + cluster_machines_name_prefix=get_cluster_name( + app_settings, + user_id=user_id_from_instance_tags(i.tags), + wallet_id=wallet_id_from_instance_tags(i.tags), + is_manager=False, + ), + additional_custom_tags={ + USER_ID_TAG_KEY: i.tags[USER_ID_TAG_KEY], + WALLET_ID_TAG_KEY: i.tags[WALLET_ID_TAG_KEY], + ROLE_TAG_KEY: WORKER_ROLE_TAG_VALUE, + }, ), - additional_custom_tags={ - AWSTagKey("user_id"): AWSTagValue(f"{user_id}"), - AWSTagKey("wallet_id"): AWSTagValue(f"{wallet_id}"), - AWSTagKey("role"): AWSTagValue("worker"), - }, - ), - command_name=DOCKER_STACK_DEPLOY_COMMAND_NAME, - ) + command_name=DOCKER_STACK_DEPLOY_COMMAND_NAME, + ) await ec2_client.set_instances_tags( started_instances_ready_for_command, tags={ diff --git a/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/ssm.py b/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/ssm.py index fb1c0e88b55b..218812d55232 100644 --- a/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/ssm.py +++ b/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/ssm.py @@ -21,7 +21,7 @@ async def on_startup() -> None: app.state.ssm_client = None settings: SSMSettings | None = get_application_settings( app - ).AUTOSCALING_SSM_ACCESS + ).CLUSTERS_KEEPER_SSM_ACCESS if not settings: _logger.warning("SSM client is de-activated in the settings") diff --git a/services/clusters-keeper/src/simcore_service_clusters_keeper/utils/ec2.py b/services/clusters-keeper/src/simcore_service_clusters_keeper/utils/ec2.py index c74bbc554d9a..b48e1076e59d 100644 --- a/services/clusters-keeper/src/simcore_service_clusters_keeper/utils/ec2.py +++ b/services/clusters-keeper/src/simcore_service_clusters_keeper/utils/ec2.py @@ -7,6 +7,12 @@ from pydantic import parse_obj_as from .._meta import VERSION +from ..constants import ( + MANAGER_ROLE_TAG_VALUE, + ROLE_TAG_KEY, + USER_ID_TAG_KEY, + WALLET_ID_TAG_KEY, +) from ..core.settings import ApplicationSettings _APPLICATION_TAG_KEY: Final[str] = "io.simcore.clusters-keeper" @@ -50,9 +56,9 @@ def creation_ec2_tags( app_settings, user_id=user_id, wallet_id=wallet_id, is_manager=True ) ), - AWSTagKey("user_id"): AWSTagValue(f"{user_id}"), - AWSTagKey("wallet_id"): AWSTagValue(f"{wallet_id}"), - AWSTagKey("role"): AWSTagValue("manager"), + USER_ID_TAG_KEY: AWSTagValue(f"{user_id}"), + WALLET_ID_TAG_KEY: AWSTagValue(f"{wallet_id}"), + ROLE_TAG_KEY: MANAGER_ROLE_TAG_VALUE, } | app_settings.CLUSTERS_KEEPER_PRIMARY_EC2_INSTANCES.PRIMARY_EC2_INSTANCES_CUSTOM_TAGS ) @@ -67,8 +73,8 @@ def ec2_instances_for_user_wallet_filter( ) -> EC2Tags: return ( _minimal_identification_tag(app_settings) - | {AWSTagKey("user_id"): AWSTagValue(f"{user_id}")} - | {AWSTagKey("wallet_id"): AWSTagValue(f"{wallet_id}")} + | {USER_ID_TAG_KEY: AWSTagValue(f"{user_id}")} + | {WALLET_ID_TAG_KEY: AWSTagValue(f"{wallet_id}")} ) @@ -81,3 +87,14 @@ def compose_user_data(bash_command: str) -> str: echo "completed user data bash script" """ ) + + +def wallet_id_from_instance_tags(tags: EC2Tags) -> WalletID | None: + wallet_id_str = tags[WALLET_ID_TAG_KEY] + if wallet_id_str == "None": + return None + return WalletID(wallet_id_str) + + +def user_id_from_instance_tags(tags: EC2Tags) -> UserID: + return UserID(tags[USER_ID_TAG_KEY]) From 20bdb08086c04f7af90514fd76f8d56210e90094 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 16 Sep 2024 18:12:44 +0200 Subject: [PATCH 05/14] added SSM access ENVs --- .env-devel | 1 + services/docker-compose.yml | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/.env-devel b/.env-devel index 4a28dc33cf1b..af4ba2efdcc4 100644 --- a/.env-devel +++ b/.env-devel @@ -50,6 +50,7 @@ CLUSTERS_KEEPER_COMPUTATIONAL_BACKEND_DOCKER_IMAGE_TAG=master-github-latest CLUSTERS_KEEPER_DASK_NTHREADS=0 CLUSTERS_KEEPER_DASK_WORKER_SATURATION=inf CLUSTERS_KEEPER_EC2_ACCESS=null +CLUSTERS_KEEPER_SSM_ACCESS=null CLUSTERS_KEEPER_EC2_INSTANCES_PREFIX="" CLUSTERS_KEEPER_LOGLEVEL=WARNING CLUSTERS_KEEPER_MAX_MISSED_HEARTBEATS_BEFORE_CLUSTER_TERMINATION=5 diff --git a/services/docker-compose.yml b/services/docker-compose.yml index 7581962fb09e..af73de611b41 100644 --- a/services/docker-compose.yml +++ b/services/docker-compose.yml @@ -189,6 +189,11 @@ services: CLUSTERS_KEEPER_EC2_ENDPOINT: ${CLUSTERS_KEEPER_EC2_ENDPOINT} CLUSTERS_KEEPER_EC2_REGION_NAME: ${CLUSTERS_KEEPER_EC2_REGION_NAME} CLUSTERS_KEEPER_EC2_SECRET_ACCESS_KEY: ${CLUSTERS_KEEPER_EC2_SECRET_ACCESS_KEY} + CLUSTERS_KEEPER_SSM_ACCESS: ${CLUSTERS_KEEPER_SSM_ACCESS} + CLUSTERS_KEEPER_SSM_ACCESS_KEY_ID: ${CLUSTERS_KEEPER_SSM_ACCESS_KEY_ID} + CLUSTERS_KEEPER_SSM_ENDPOINT: ${CLUSTERS_KEEPER_SSM_ENDPOINT} + CLUSTERS_KEEPER_SSM_REGION_NAME: ${CLUSTERS_KEEPER_SSM_REGION_NAME} + CLUSTERS_KEEPER_SSM_SECRET_ACCESS_KEY: ${CLUSTERS_KEEPER_SSM_SECRET_ACCESS_KEY} CLUSTERS_KEEPER_EC2_INSTANCES_PREFIX: ${CLUSTERS_KEEPER_EC2_INSTANCES_PREFIX} LOG_FORMAT_LOCAL_DEV_ENABLED: ${LOG_FORMAT_LOCAL_DEV_ENABLED} CLUSTERS_KEEPER_PRIMARY_EC2_INSTANCES: ${CLUSTERS_KEEPER_PRIMARY_EC2_INSTANCES} From 42e17fd653bcbe91d3fb221eef4087775a17deb7 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 16 Sep 2024 18:20:18 +0200 Subject: [PATCH 06/14] healthcheck fixed --- .../api/health.py | 23 +++++++++++++------ .../clusters-keeper/tests/unit/conftest.py | 20 ++++++++++++++++ .../tests/unit/test_api_health.py | 7 ++++++ 3 files changed, 43 insertions(+), 7 deletions(-) diff --git a/services/clusters-keeper/src/simcore_service_clusters_keeper/api/health.py b/services/clusters-keeper/src/simcore_service_clusters_keeper/api/health.py index ad2882da3c84..a971a551e4e0 100644 --- a/services/clusters-keeper/src/simcore_service_clusters_keeper/api/health.py +++ b/services/clusters-keeper/src/simcore_service_clusters_keeper/api/health.py @@ -21,7 +21,7 @@ @router.get("/", include_in_schema=True, response_class=PlainTextResponse) async def health_check(): # NOTE: sync url in docker/healthcheck.py with this entrypoint! - return f"{__name__}.health_check@{datetime.datetime.now(datetime.timezone.utc).isoformat()}" + return f"{__name__}.health_check@{datetime.datetime.now(datetime.UTC).isoformat()}" class _ComponentStatus(BaseModel): @@ -33,6 +33,7 @@ class _StatusGet(BaseModel): rabbitmq: _ComponentStatus ec2: _ComponentStatus redis_client_sdk: _ComponentStatus + ssm: _ComponentStatus @router.get("/status", include_in_schema=True, response_model=_StatusGet) @@ -40,18 +41,26 @@ async def get_status(app: Annotated[FastAPI, Depends(get_app)]) -> _StatusGet: return _StatusGet( rabbitmq=_ComponentStatus( is_enabled=is_rabbitmq_enabled(app), - is_responsive=await get_rabbitmq_client(app).ping() - if is_rabbitmq_enabled(app) - else False, + is_responsive=( + await get_rabbitmq_client(app).ping() + if is_rabbitmq_enabled(app) + else False + ), ), ec2=_ComponentStatus( is_enabled=bool(app.state.ec2_client), - is_responsive=await app.state.ec2_client.ping() - if app.state.ec2_client - else False, + is_responsive=( + await app.state.ec2_client.ping() if app.state.ec2_client else False + ), ), redis_client_sdk=_ComponentStatus( is_enabled=bool(app.state.redis_client_sdk), is_responsive=await get_redis_client(app).ping(), ), + ssm=_ComponentStatus( + is_enabled=(app.state.ssm_client is not None), + is_responsive=( + await app.state.ssm_client.ping() if app.state.ssm_client else False + ), + ), ) diff --git a/services/clusters-keeper/tests/unit/conftest.py b/services/clusters-keeper/tests/unit/conftest.py index a8f4913d4bb5..62ecddce3562 100644 --- a/services/clusters-keeper/tests/unit/conftest.py +++ b/services/clusters-keeper/tests/unit/conftest.py @@ -22,11 +22,13 @@ from fastapi import FastAPI from models_library.users import UserID from models_library.wallets import WalletID +from pydantic import SecretStr from pytest_mock.plugin import MockerFixture from pytest_simcore.helpers.monkeypatch_envs import EnvVarsDict, setenvs_from_dict from servicelib.rabbitmq import RabbitMQRPCClient from settings_library.ec2 import EC2Settings from settings_library.rabbit import RabbitSettings +from settings_library.ssm import SSMSettings from simcore_service_clusters_keeper.core.application import create_app from simcore_service_clusters_keeper.core.settings import ( CLUSTERS_KEEPER_ENV_PREFIX, @@ -86,6 +88,21 @@ def mocked_ec2_server_envs( return setenvs_from_dict(monkeypatch, changed_envs) +@pytest.fixture +def mocked_ssm_server_envs( + mocked_ssm_server_settings: SSMSettings, + monkeypatch: pytest.MonkeyPatch, +) -> EnvVarsDict: + # NOTE: overrides the SSMSettings with what clusters-keeper expects + changed_envs: EnvVarsDict = { + f"{CLUSTERS_KEEPER_ENV_PREFIX}{k}": ( + v.get_secret_value() if isinstance(v, SecretStr) else v + ) + for k, v in mocked_ssm_server_settings.dict().items() + } + return setenvs_from_dict(monkeypatch, changed_envs) + + @pytest.fixture def ec2_settings(mocked_ec2_server_settings: EC2Settings) -> EC2Settings: return mocked_ec2_server_settings @@ -105,6 +122,9 @@ def app_environment( "CLUSTERS_KEEPER_EC2_ACCESS": "{}", "CLUSTERS_KEEPER_EC2_ACCESS_KEY_ID": faker.pystr(), "CLUSTERS_KEEPER_EC2_SECRET_ACCESS_KEY": faker.pystr(), + "CLUSTERS_KEEPER_SSM_ACCESS": "{}", + "CLUSTERS_KEEPER_SSM_ACCESS_KEY_ID": faker.pystr(), + "CLUSTERS_KEEPER_SSM_SECRET_ACCESS_KEY": faker.pystr(), "CLUSTERS_KEEPER_PRIMARY_EC2_INSTANCES": "{}", "CLUSTERS_KEEPER_EC2_INSTANCES_PREFIX": faker.pystr(), "CLUSTERS_KEEPER_DASK_NTHREADS": f"{faker.pyint(min_value=0)}", diff --git a/services/clusters-keeper/tests/unit/test_api_health.py b/services/clusters-keeper/tests/unit/test_api_health.py index 734620afa1b2..5bf72ccae8ef 100644 --- a/services/clusters-keeper/tests/unit/test_api_health.py +++ b/services/clusters-keeper/tests/unit/test_api_health.py @@ -21,6 +21,7 @@ def app_environment( app_environment: EnvVarsDict, enabled_rabbitmq: None, mocked_ec2_server_envs: EnvVarsDict, + mocked_ssm_server_envs: EnvVarsDict, mocked_redis_server: None, ) -> EnvVarsDict: return app_environment @@ -69,6 +70,9 @@ async def test_status( assert status_response.ec2.is_enabled is True assert status_response.ec2.is_responsive is False + assert status_response.ssm.is_enabled is True + assert status_response.ssm.is_responsive is False + # restart the server mocked_aws_server.start() @@ -83,3 +87,6 @@ async def test_status( assert status_response.ec2.is_enabled is True assert status_response.ec2.is_responsive is True + + assert status_response.ssm.is_enabled is True + assert status_response.ssm.is_responsive is True From 8ec7d587d780d675f404b92c8315bc64a764401a Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 16 Sep 2024 18:50:14 +0200 Subject: [PATCH 07/14] init ssm server --- services/clusters-keeper/tests/unit/test_modules_clusters.py | 1 + .../tests/unit/test_modules_clusters_management_core.py | 1 + 2 files changed, 2 insertions(+) diff --git a/services/clusters-keeper/tests/unit/test_modules_clusters.py b/services/clusters-keeper/tests/unit/test_modules_clusters.py index 16cfbde04b2c..497b9e447e74 100644 --- a/services/clusters-keeper/tests/unit/test_modules_clusters.py +++ b/services/clusters-keeper/tests/unit/test_modules_clusters.py @@ -49,6 +49,7 @@ def _base_configuration( mocked_redis_server: None, mocked_ec2_server_envs: EnvVarsDict, mocked_primary_ec2_instances_envs: EnvVarsDict, + mocked_ssm_server_envs: EnvVarsDict, ) -> None: ... diff --git a/services/clusters-keeper/tests/unit/test_modules_clusters_management_core.py b/services/clusters-keeper/tests/unit/test_modules_clusters_management_core.py index 09720632fd4b..438e69ee72eb 100644 --- a/services/clusters-keeper/tests/unit/test_modules_clusters_management_core.py +++ b/services/clusters-keeper/tests/unit/test_modules_clusters_management_core.py @@ -60,6 +60,7 @@ def _base_configuration( mocked_redis_server: None, mocked_ec2_server_envs: EnvVarsDict, mocked_primary_ec2_instances_envs: EnvVarsDict, + mocked_ssm_server_envs: EnvVarsDict, ) -> None: ... From 80ba15928b315b0976a2003ddfdcf72d4db63a98 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 16 Sep 2024 18:52:20 +0200 Subject: [PATCH 08/14] init ssm server --- services/clusters-keeper/tests/unit/test_rpc_clusters.py | 1 + services/clusters-keeper/tests/unit/test_rpc_ec2_instances.py | 1 + services/clusters-keeper/tests/unit/test_utils_clusters.py | 3 +++ services/clusters-keeper/tests/unit/test_utils_ec2.py | 2 ++ 4 files changed, 7 insertions(+) diff --git a/services/clusters-keeper/tests/unit/test_rpc_clusters.py b/services/clusters-keeper/tests/unit/test_rpc_clusters.py index 41146c827bd7..a280cbb5338f 100644 --- a/services/clusters-keeper/tests/unit/test_rpc_clusters.py +++ b/services/clusters-keeper/tests/unit/test_rpc_clusters.py @@ -43,6 +43,7 @@ def _base_configuration( mocked_redis_server: None, mocked_ec2_server_envs: EnvVarsDict, mocked_primary_ec2_instances_envs: EnvVarsDict, + mocked_ssm_server_envs: EnvVarsDict, initialized_app: FastAPI, ensure_run_in_sequence_context_is_empty: None, ) -> None: diff --git a/services/clusters-keeper/tests/unit/test_rpc_ec2_instances.py b/services/clusters-keeper/tests/unit/test_rpc_ec2_instances.py index d03b6b74502d..f4eea132cdf8 100644 --- a/services/clusters-keeper/tests/unit/test_rpc_ec2_instances.py +++ b/services/clusters-keeper/tests/unit/test_rpc_ec2_instances.py @@ -24,6 +24,7 @@ def _base_configuration( enabled_rabbitmq: None, mocked_redis_server: None, mocked_ec2_server_envs: EnvVarsDict, + mocked_ssm_server_envs: EnvVarsDict, initialized_app: FastAPI, ) -> None: ... diff --git a/services/clusters-keeper/tests/unit/test_utils_clusters.py b/services/clusters-keeper/tests/unit/test_utils_clusters.py index a6592ed1fa40..ae402ba8d395 100644 --- a/services/clusters-keeper/tests/unit/test_utils_clusters.py +++ b/services/clusters-keeper/tests/unit/test_utils_clusters.py @@ -69,6 +69,7 @@ def app_environment( def test_create_startup_script( disabled_rabbitmq: None, mocked_ec2_server_envs: EnvVarsDict, + mocked_ssm_server_envs: EnvVarsDict, mocked_redis_server: None, app_settings: ApplicationSettings, cluster_machines_name_prefix: str, @@ -160,6 +161,7 @@ def test_create_startup_script( def test_create_startup_script_script_size_below_16kb( disabled_rabbitmq: None, mocked_ec2_server_envs: EnvVarsDict, + mocked_ssm_server_envs: EnvVarsDict, mocked_redis_server: None, app_settings: ApplicationSettings, cluster_machines_name_prefix: str, @@ -187,6 +189,7 @@ def test_create_startup_script_script_size_below_16kb( def test_startup_script_defines_all_envs_for_docker_compose( disabled_rabbitmq: None, mocked_ec2_server_envs: EnvVarsDict, + mocked_ssm_server_envs: EnvVarsDict, mocked_redis_server: None, app_settings: ApplicationSettings, cluster_machines_name_prefix: str, diff --git a/services/clusters-keeper/tests/unit/test_utils_ec2.py b/services/clusters-keeper/tests/unit/test_utils_ec2.py index cc466d113ac4..c0ab11ba058a 100644 --- a/services/clusters-keeper/tests/unit/test_utils_ec2.py +++ b/services/clusters-keeper/tests/unit/test_utils_ec2.py @@ -49,6 +49,7 @@ def test_get_cluster_name( def test_creation_ec2_tags( mocked_ec2_server_envs: EnvVarsDict, + mocked_ssm_server_envs: EnvVarsDict, disabled_rabbitmq: None, mocked_redis_server: None, app_settings: ApplicationSettings, @@ -78,6 +79,7 @@ def test_creation_ec2_tags( def test_all_created_ec2_instances_filter( mocked_ec2_server_envs: EnvVarsDict, + mocked_ssm_server_envs: EnvVarsDict, disabled_rabbitmq: None, mocked_redis_server: None, app_settings: ApplicationSettings, From 570fd5d9fe5247f5b61bad6ede8860b12d1f0101 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 17 Sep 2024 10:32:35 +0200 Subject: [PATCH 09/14] missing fixtures --- .../clusters-keeper/tests/unit/conftest.py | 5 ++++ .../test_modules_clusters_management_task.py | 1 + .../tests/unit/test_modules_ec2.py | 7 +++++- .../tests/unit/test_modules_ssm.py | 25 +++++++++++++++++++ 4 files changed, 37 insertions(+), 1 deletion(-) create mode 100644 services/clusters-keeper/tests/unit/test_modules_ssm.py diff --git a/services/clusters-keeper/tests/unit/conftest.py b/services/clusters-keeper/tests/unit/conftest.py index 62ecddce3562..43805123c30e 100644 --- a/services/clusters-keeper/tests/unit/conftest.py +++ b/services/clusters-keeper/tests/unit/conftest.py @@ -226,6 +226,11 @@ def disabled_ec2(app_environment: EnvVarsDict, monkeypatch: pytest.MonkeyPatch): monkeypatch.setenv("CLUSTERS_KEEPER_EC2_ACCESS", "null") +@pytest.fixture +def disabled_ssm(app_environment: EnvVarsDict, monkeypatch: pytest.MonkeyPatch): + monkeypatch.setenv("CLUSTERS_KEEPER_SSM_ACCESS", "null") + + @pytest.fixture def enabled_rabbitmq( app_environment: EnvVarsDict, rabbit_service: RabbitSettings diff --git a/services/clusters-keeper/tests/unit/test_modules_clusters_management_task.py b/services/clusters-keeper/tests/unit/test_modules_clusters_management_task.py index 0c9c52eab4c3..d22bdce1f765 100644 --- a/services/clusters-keeper/tests/unit/test_modules_clusters_management_task.py +++ b/services/clusters-keeper/tests/unit/test_modules_clusters_management_task.py @@ -37,6 +37,7 @@ def mock_background_task(mocker: MockerFixture) -> mock.Mock: async def test_clusters_management_task_created_and_deleted( disabled_rabbitmq: None, mocked_ec2_server_envs: EnvVarsDict, + mocked_ssm_server_envs: EnvVarsDict, mocked_redis_server: None, mock_background_task: mock.Mock, initialized_app: FastAPI, diff --git a/services/clusters-keeper/tests/unit/test_modules_ec2.py b/services/clusters-keeper/tests/unit/test_modules_ec2.py index 0820ada58183..439e54aaa2d9 100644 --- a/services/clusters-keeper/tests/unit/test_modules_ec2.py +++ b/services/clusters-keeper/tests/unit/test_modules_ec2.py @@ -5,13 +5,16 @@ import pytest from fastapi import FastAPI +from pytest_simcore.helpers.monkeypatch_envs import EnvVarsDict from simcore_service_clusters_keeper.core.errors import ConfigurationError from simcore_service_clusters_keeper.modules.ec2 import get_ec2_client +from simcore_service_clusters_keeper.modules.ssm import get_ssm_client -async def test_ec2_does_not_initialize_if_deactivated( +async def test_ec2_does_not_initialize_if_ec2_deactivated( disabled_rabbitmq: None, disabled_ec2: None, + mocked_ssm_server_envs: EnvVarsDict, mocked_redis_server: None, initialized_app: FastAPI, ): @@ -19,3 +22,5 @@ async def test_ec2_does_not_initialize_if_deactivated( assert initialized_app.state.ec2_client is None with pytest.raises(ConfigurationError): get_ec2_client(initialized_app) + + assert get_ssm_client(initialized_app) diff --git a/services/clusters-keeper/tests/unit/test_modules_ssm.py b/services/clusters-keeper/tests/unit/test_modules_ssm.py new file mode 100644 index 000000000000..dd5465e57001 --- /dev/null +++ b/services/clusters-keeper/tests/unit/test_modules_ssm.py @@ -0,0 +1,25 @@ +# pylint: disable=redefined-outer-name +# pylint: disable=unused-argument +# pylint: disable=unused-variable + + +import pytest +from fastapi import FastAPI +from simcore_service_clusters_keeper.core.errors import ConfigurationError +from simcore_service_clusters_keeper.modules.ec2 import get_ec2_client +from simcore_service_clusters_keeper.modules.ssm import get_ssm_client + + +async def test_ssm_does_not_initialize_if_ssm_deactivated( + disabled_rabbitmq: None, + disabled_ec2: None, + disabled_ssm: None, + mocked_redis_server: None, + initialized_app: FastAPI, +): + assert hasattr(initialized_app.state, "ec2_client") + assert initialized_app.state.ec2_client is None + with pytest.raises(ConfigurationError): + get_ec2_client(initialized_app) + + assert get_ssm_client(initialized_app) From fe113e9ea8add6ea79bd82418973c32598382aa6 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 17 Sep 2024 10:36:35 +0200 Subject: [PATCH 10/14] missing disable ssm --- .../clusters-keeper/tests/unit/test_modules_rabbitmq.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/services/clusters-keeper/tests/unit/test_modules_rabbitmq.py b/services/clusters-keeper/tests/unit/test_modules_rabbitmq.py index a2c23ac06028..1bbd5683c769 100644 --- a/services/clusters-keeper/tests/unit/test_modules_rabbitmq.py +++ b/services/clusters-keeper/tests/unit/test_modules_rabbitmq.py @@ -43,8 +43,8 @@ def rabbit_log_message(faker: Faker) -> LoggerRabbitMessage: return LoggerRabbitMessage( user_id=faker.pyint(min_value=1), - project_id=faker.uuid4(), - node_id=faker.uuid4(), + project_id=faker.uuid4(cast_to=None), + node_id=faker.uuid4(cast_to=None), messages=faker.pylist(allowed_types=(str,)), ) @@ -62,6 +62,7 @@ def rabbit_message( def test_rabbitmq_does_not_initialize_if_deactivated( disabled_rabbitmq: None, disabled_ec2: None, + disabled_ssm: None, mocked_redis_server: None, initialized_app: FastAPI, ): @@ -78,6 +79,7 @@ def test_rabbitmq_does_not_initialize_if_deactivated( def test_rabbitmq_initializes( enabled_rabbitmq: RabbitSettings, disabled_ec2: None, + disabled_ssm: None, mocked_redis_server: None, initialized_app: FastAPI, ): @@ -95,6 +97,7 @@ def test_rabbitmq_initializes( async def test_post_message( enabled_rabbitmq: RabbitSettings, disabled_ec2: None, + disabled_ssm: None, mocked_redis_server: None, initialized_app: FastAPI, rabbit_message: RabbitMessageBase, @@ -124,6 +127,7 @@ async def test_post_message( async def test_post_message_with_disabled_rabbit_does_not_raise( disabled_rabbitmq: None, disabled_ec2: None, + disabled_ssm: None, mocked_redis_server: None, initialized_app: FastAPI, rabbit_message: RabbitMessageBase, @@ -135,6 +139,7 @@ async def test_post_message_when_rabbit_disconnected_does_not_raise( paused_container: Callable[[str], AbstractAsyncContextManager[None]], enabled_rabbitmq: RabbitSettings, disabled_ec2: None, + disabled_ssm: None, mocked_redis_server: None, initialized_app: FastAPI, rabbit_log_message: LoggerRabbitMessage, From 63a91a7898368b021d67b3135a7c545291eaacc0 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 17 Sep 2024 14:27:59 +0200 Subject: [PATCH 11/14] added example --- packages/models-library/src/models_library/clusters.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/packages/models-library/src/models_library/clusters.py b/packages/models-library/src/models_library/clusters.py index c51598b06ee1..1856dc5c287d 100644 --- a/packages/models-library/src/models_library/clusters.py +++ b/packages/models-library/src/models_library/clusters.py @@ -96,6 +96,9 @@ class Config(BaseAuthentication.Config): class NoAuthentication(BaseAuthentication): type: Literal["none"] = "none" + class Config(BaseAuthentication.Config): + schema_extra: ClassVar[dict[str, Any]] = {"examples": [{"type": "none"}]} + class TLSAuthentication(BaseAuthentication): type: Literal["tls"] = "tls" From aded14bbdc7ad7cafbd65632a679334d08350c3f Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 17 Sep 2024 14:28:30 +0200 Subject: [PATCH 12/14] missing fixture --- .../clusters-keeper/tests/unit/test_modules_redis.py | 1 + .../tests/unit/test_modules_remote_debug.py | 1 + .../clusters-keeper/tests/unit/test_modules_ssm.py | 9 +++------ .../clusters-keeper/tests/unit/test_utils_ec2.py | 12 ++++++++++++ 4 files changed, 17 insertions(+), 6 deletions(-) diff --git a/services/clusters-keeper/tests/unit/test_modules_redis.py b/services/clusters-keeper/tests/unit/test_modules_redis.py index f6b760f27fbc..44fb9a9f6ace 100644 --- a/services/clusters-keeper/tests/unit/test_modules_redis.py +++ b/services/clusters-keeper/tests/unit/test_modules_redis.py @@ -10,6 +10,7 @@ async def test_redis_raises_if_missing( disabled_rabbitmq: None, disabled_ec2: None, + disabled_ssm: None, mocked_redis_server: None, initialized_app: FastAPI, ): diff --git a/services/clusters-keeper/tests/unit/test_modules_remote_debug.py b/services/clusters-keeper/tests/unit/test_modules_remote_debug.py index dbb5a91922e1..3fe8b823d136 100644 --- a/services/clusters-keeper/tests/unit/test_modules_remote_debug.py +++ b/services/clusters-keeper/tests/unit/test_modules_remote_debug.py @@ -23,6 +23,7 @@ def app_environment( def test_application_with_debug_enabled( disabled_rabbitmq: None, disabled_ec2: None, + disabled_ssm: None, mocked_redis_server: None, initialized_app: FastAPI, ): diff --git a/services/clusters-keeper/tests/unit/test_modules_ssm.py b/services/clusters-keeper/tests/unit/test_modules_ssm.py index dd5465e57001..3bcffb72661b 100644 --- a/services/clusters-keeper/tests/unit/test_modules_ssm.py +++ b/services/clusters-keeper/tests/unit/test_modules_ssm.py @@ -6,7 +6,6 @@ import pytest from fastapi import FastAPI from simcore_service_clusters_keeper.core.errors import ConfigurationError -from simcore_service_clusters_keeper.modules.ec2 import get_ec2_client from simcore_service_clusters_keeper.modules.ssm import get_ssm_client @@ -17,9 +16,7 @@ async def test_ssm_does_not_initialize_if_ssm_deactivated( mocked_redis_server: None, initialized_app: FastAPI, ): - assert hasattr(initialized_app.state, "ec2_client") - assert initialized_app.state.ec2_client is None + assert hasattr(initialized_app.state, "ssm_client") + assert initialized_app.state.ssm_client is None with pytest.raises(ConfigurationError): - get_ec2_client(initialized_app) - - assert get_ssm_client(initialized_app) + get_ssm_client(initialized_app) diff --git a/services/clusters-keeper/tests/unit/test_utils_ec2.py b/services/clusters-keeper/tests/unit/test_utils_ec2.py index c0ab11ba058a..125670475dba 100644 --- a/services/clusters-keeper/tests/unit/test_utils_ec2.py +++ b/services/clusters-keeper/tests/unit/test_utils_ec2.py @@ -25,6 +25,7 @@ def wallet_id(faker: Faker) -> WalletID: def test_get_cluster_name( disabled_rabbitmq: None, disabled_ec2: None, + disabled_ssm: None, mocked_redis_server: None, app_settings: ApplicationSettings, user_id: UserID, @@ -46,6 +47,17 @@ def test_get_cluster_name( == f"{app_settings.CLUSTERS_KEEPER_EC2_INSTANCES_PREFIX}osparc-computational-cluster-worker-{app_settings.SWARM_STACK_NAME}-user_id:{user_id}-wallet_id:{wallet_id}" ) + assert ( + get_cluster_name(app_settings, user_id=user_id, wallet_id=None, is_manager=True) + == f"{app_settings.CLUSTERS_KEEPER_EC2_INSTANCES_PREFIX}osparc-computational-cluster-manager-{app_settings.SWARM_STACK_NAME}-user_id:{user_id}-wallet_id:None" + ) + assert ( + get_cluster_name( + app_settings, user_id=user_id, wallet_id=None, is_manager=False + ) + == f"{app_settings.CLUSTERS_KEEPER_EC2_INSTANCES_PREFIX}osparc-computational-cluster-worker-{app_settings.SWARM_STACK_NAME}-user_id:{user_id}-wallet_id:None" + ) + def test_creation_ec2_tags( mocked_ec2_server_envs: EnvVarsDict, From fceee974dd8b852f373d69df0907418b0631ebbd Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 17 Sep 2024 14:28:48 +0200 Subject: [PATCH 13/14] adapted test to SSM --- .../tests/unit/test_utils_clusters.py | 86 ++++++++++++++----- 1 file changed, 64 insertions(+), 22 deletions(-) diff --git a/services/clusters-keeper/tests/unit/test_utils_clusters.py b/services/clusters-keeper/tests/unit/test_utils_clusters.py index ae402ba8d395..1c4a7760d5fb 100644 --- a/services/clusters-keeper/tests/unit/test_utils_clusters.py +++ b/services/clusters-keeper/tests/unit/test_utils_clusters.py @@ -29,6 +29,7 @@ from simcore_service_clusters_keeper.utils.clusters import ( _prepare_environment_variables, create_cluster_from_ec2_instance, + create_deploy_cluster_stack_script, create_startup_script, ) from types_aiobotocore_ec2.literals import InstanceStateNameType @@ -51,16 +52,26 @@ def ec2_boot_specs(app_settings: ApplicationSettings) -> EC2InstanceBootSpecific return ec2_boot_specs +@pytest.fixture(params=[TLSAuthentication, NoAuthentication]) +def backend_cluster_auth( + request: pytest.FixtureRequest, +) -> InternalClusterAuthentication: + return request.param + + @pytest.fixture def app_environment( app_environment: EnvVarsDict, monkeypatch: pytest.MonkeyPatch, + backend_cluster_auth: InternalClusterAuthentication, ) -> EnvVarsDict: return app_environment | setenvs_from_dict( monkeypatch, { "CLUSTERS_KEEPER_COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_AUTH": json_dumps( TLSAuthentication.Config.schema_extra["examples"][0] + if isinstance(backend_cluster_auth, TLSAuthentication) + else NoAuthentication.Config.schema_extra["examples"][0] ) }, ) @@ -72,36 +83,49 @@ def test_create_startup_script( mocked_ssm_server_envs: EnvVarsDict, mocked_redis_server: None, app_settings: ApplicationSettings, - cluster_machines_name_prefix: str, - clusters_keeper_docker_compose: dict[str, Any], ec2_boot_specs: EC2InstanceBootSpecific, ): - additional_custom_tags = { - AWSTagKey("pytest-tag-key"): AWSTagValue("pytest-tag-value") - } startup_script = create_startup_script( app_settings, - cluster_machines_name_prefix=cluster_machines_name_prefix, ec2_boot_specific=ec2_boot_specs, - additional_custom_tags=additional_custom_tags, ) assert isinstance(startup_script, str) assert len(ec2_boot_specs.custom_boot_scripts) > 0 for boot_script in ec2_boot_specs.custom_boot_scripts: assert boot_script in startup_script + + +def test_create_deploy_cluster_stack_script( + disabled_rabbitmq: None, + mocked_ec2_server_envs: EnvVarsDict, + mocked_ssm_server_envs: EnvVarsDict, + mocked_redis_server: None, + app_settings: ApplicationSettings, + cluster_machines_name_prefix: str, + clusters_keeper_docker_compose: dict[str, Any], +): + additional_custom_tags = { + AWSTagKey("pytest-tag-key"): AWSTagValue("pytest-tag-value") + } + deploy_script = create_deploy_cluster_stack_script( + app_settings, + cluster_machines_name_prefix=cluster_machines_name_prefix, + additional_custom_tags=additional_custom_tags, + ) + assert isinstance(deploy_script, str) # we have commands to pipe into a docker-compose file - assert " | base64 -d > /docker-compose.yml" in startup_script + assert " | base64 -d > /docker-compose.yml" in deploy_script # we have commands to init a docker-swarm - assert "docker swarm init" in startup_script + assert "docker swarm init --default-addr-pool" in deploy_script # we have commands to deploy a stack assert ( "docker stack deploy --with-registry-auth --compose-file=/docker-compose.yml dask_stack" - in startup_script + in deploy_script ) # before that we have commands that setup ENV variables, let's check we have all of them as defined in the docker-compose # let's get what was set in the startup script and compare with the expected one of the docker-compose startup_script_envs_definition = ( - startup_script.splitlines()[-1].split("docker stack deploy")[0].strip() + deploy_script.splitlines()[-1].split("docker stack deploy")[0].strip() ) assert startup_script_envs_definition # Use regular expression to split the string into key-value pairs (courtesy of chatGPT) @@ -138,7 +162,7 @@ def test_create_startup_script( "WORKERS_EC2_INSTANCES_SECURITY_GROUP_IDS", ] assert all( - re.search(rf"{i}=\[(\\\".+\\\")*\]", startup_script) for i in list_settings + re.search(rf"{i}=\[(\\\".+\\\")*\]", deploy_script) for i in list_settings ) # check dicts have \' in front @@ -147,18 +171,18 @@ def test_create_startup_script( "WORKERS_EC2_INSTANCES_CUSTOM_TAGS", ] assert all( - re.search(rf"{i}=\'{{(\".+\":\s\".*\")+}}\'", startup_script) + re.search(rf"{i}=\'{{(\".+\":\s\".*\")+}}\'", deploy_script) for i in dict_settings ) # check the additional tags are in assert all( - f'"{key}": "{value}"' in startup_script + f'"{key}": "{value}"' in deploy_script for key, value in additional_custom_tags.items() ) -def test_create_startup_script_script_size_below_16kb( +def test_create_deploy_cluster_stack_script_below_64kb( disabled_rabbitmq: None, mocked_ec2_server_envs: EnvVarsDict, mocked_ssm_server_envs: EnvVarsDict, @@ -166,17 +190,36 @@ def test_create_startup_script_script_size_below_16kb( app_settings: ApplicationSettings, cluster_machines_name_prefix: str, clusters_keeper_docker_compose: dict[str, Any], - ec2_boot_specs: EC2InstanceBootSpecific, ): additional_custom_tags = { AWSTagKey("pytest-tag-key"): AWSTagValue("pytest-tag-value") } - startup_script = create_startup_script( + deploy_script = create_deploy_cluster_stack_script( app_settings, cluster_machines_name_prefix=cluster_machines_name_prefix, - ec2_boot_specific=ec2_boot_specs, additional_custom_tags=additional_custom_tags, ) + deploy_script_size_in_bytes = len(deploy_script.encode("utf-8")) + assert deploy_script_size_in_bytes < 64000, ( + f"script size is {deploy_script_size_in_bytes} bytes that exceeds the SSM command of 64KB. " + "TIP: split commands or reduce size." + ) + + +def test_create_startup_script_script_size_below_16kb( + disabled_rabbitmq: None, + mocked_ec2_server_envs: EnvVarsDict, + mocked_ssm_server_envs: EnvVarsDict, + mocked_redis_server: None, + app_settings: ApplicationSettings, + cluster_machines_name_prefix: str, + clusters_keeper_docker_compose: dict[str, Any], + ec2_boot_specs: EC2InstanceBootSpecific, +): + startup_script = create_startup_script( + app_settings, + ec2_boot_specific=ec2_boot_specs, + ) script_size_in_bytes = len(startup_script.encode("utf-8")) print( @@ -186,14 +229,13 @@ def test_create_startup_script_script_size_below_16kb( assert script_size_in_bytes < 15 * 1024 -def test_startup_script_defines_all_envs_for_docker_compose( +def test__prepare_environment_variables_defines_all_envs_for_docker_compose( disabled_rabbitmq: None, mocked_ec2_server_envs: EnvVarsDict, mocked_ssm_server_envs: EnvVarsDict, mocked_redis_server: None, app_settings: ApplicationSettings, cluster_machines_name_prefix: str, - ec2_boot_specs: EC2InstanceBootSpecific, clusters_keeper_docker_compose_file: Path, ): additional_custom_tags = { @@ -205,8 +247,8 @@ def test_startup_script_defines_all_envs_for_docker_compose( additional_custom_tags=additional_custom_tags, ) assert environment_variables - process = subprocess.run( - [ # noqa: S603, S607 + process = subprocess.run( # noqa: S603 + [ # noqa: S607 "docker", "compose", "--dry-run", From 8d03db61b29a6d6133c9d87594ed4d009426a0b4 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 20 Sep 2024 16:11:25 +0200 Subject: [PATCH 14/14] @pcrespov review: TODO --- .../modules/clusters_management_core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/clusters_management_core.py b/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/clusters_management_core.py index cf23a01e522a..871ad8bd242b 100644 --- a/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/clusters_management_core.py +++ b/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/clusters_management_core.py @@ -169,7 +169,7 @@ async def check_clusters(app: FastAPI) -> None: ) await delete_clusters(app, instances=terminateable_instances) - # TODO: transmit command to start docker swarm/stack if needed + # NOTE: transmit command to start docker swarm/stack if needed # once the instance is connected to the SSM server, # use ssm client to send the command to these instances, # we send a command that contain: