diff --git a/packages/models-library/src/models_library/api_schemas_directorv2/health.py b/packages/models-library/src/models_library/api_schemas__common/health.py similarity index 100% rename from packages/models-library/src/models_library/api_schemas_directorv2/health.py rename to packages/models-library/src/models_library/api_schemas__common/health.py diff --git a/packages/models-library/src/models_library/api_schemas_directorv2/services.py b/packages/models-library/src/models_library/api_schemas_directorv2/services.py index 6e429be4d500..2bd0084b7b32 100644 --- a/packages/models-library/src/models_library/api_schemas_directorv2/services.py +++ b/packages/models-library/src/models_library/api_schemas_directorv2/services.py @@ -1,4 +1,4 @@ -from typing import Any, ClassVar +from typing import Any, ClassVar, Final from pydantic import BaseModel, Field, validator from pydantic.types import ByteSize, NonNegativeInt @@ -90,3 +90,6 @@ class Config: for node_example in NodeRequirements.Config.schema_extra["examples"] ] } + + +CHARS_IN_VOLUME_NAME_BEFORE_DIR_NAME: Final[NonNegativeInt] = 89 diff --git a/packages/service-library/src/servicelib/fastapi/app_state.py b/packages/service-library/src/servicelib/fastapi/app_state.py index b15cbcb261e7..79e2bea4123a 100644 --- a/packages/service-library/src/servicelib/fastapi/app_state.py +++ b/packages/service-library/src/servicelib/fastapi/app_state.py @@ -1,8 +1,8 @@ -import logging +from typing import TypeVar from fastapi import FastAPI -_logger = logging.getLogger(__name__) +T = TypeVar("T", bound="SingletonInAppStateMixin") class SingletonInAppStateMixin: @@ -14,8 +14,8 @@ class SingletonInAppStateMixin: frozen: bool = True # Will raise if set multiple times @classmethod - def get_from_app_state(cls, app: FastAPI): - return getattr(app.state, cls.app_state_name) + def get_from_app_state(cls: type[T], app: FastAPI) -> T: + return getattr(app.state, cls.app_state_name) # type:ignore[no-any-return] def set_to_app_state(self, 
app: FastAPI): if (exists := getattr(app.state, self.app_state_name, None)) and self.frozen: @@ -26,11 +26,11 @@ def set_to_app_state(self, app: FastAPI): return self.get_from_app_state(app) @classmethod - def pop_from_app_state(cls, app: FastAPI): + def pop_from_app_state(cls: type[T], app: FastAPI) -> T: """ Raises: AttributeError: if instance is not in app.state """ - old = getattr(app.state, cls.app_state_name) + old = cls.get_from_app_state(app) delattr(app.state, cls.app_state_name) return old diff --git a/services/agent/src/simcore_service_agent/modules/__init__.py b/packages/service-library/src/servicelib/rabbitmq/rpc_interfaces/agent/__init__.py similarity index 100% rename from services/agent/src/simcore_service_agent/modules/__init__.py rename to packages/service-library/src/servicelib/rabbitmq/rpc_interfaces/agent/__init__.py diff --git a/packages/service-library/src/servicelib/rabbitmq/rpc_interfaces/agent/errors.py b/packages/service-library/src/servicelib/rabbitmq/rpc_interfaces/agent/errors.py new file mode 100644 index 000000000000..ae21b8f09a70 --- /dev/null +++ b/packages/service-library/src/servicelib/rabbitmq/rpc_interfaces/agent/errors.py @@ -0,0 +1,12 @@ +from pydantic.errors import PydanticErrorMixin + + +class BaseAgentRPCError(PydanticErrorMixin, Exception): + ... 
+ + +class NoServiceVolumesFoundRPCError(BaseAgentRPCError): + msg_template: str = ( + "Could not detect any unused volumes after waiting '{period}' seconds for " + "volumes to be released after closing all container for service='{node_id}'" + ) diff --git a/packages/service-library/src/servicelib/rabbitmq/rpc_interfaces/agent/volumes.py b/packages/service-library/src/servicelib/rabbitmq/rpc_interfaces/agent/volumes.py new file mode 100644 index 000000000000..d414cd6b979c --- /dev/null +++ b/packages/service-library/src/servicelib/rabbitmq/rpc_interfaces/agent/volumes.py @@ -0,0 +1,57 @@ +import logging +from datetime import timedelta +from typing import Final + +from models_library.projects_nodes_io import NodeID +from models_library.rabbitmq_basic_types import RPCMethodName, RPCNamespace +from pydantic import NonNegativeInt, parse_obj_as +from servicelib.logging_utils import log_decorator +from servicelib.rabbitmq import RabbitMQRPCClient + +_logger = logging.getLogger(__name__) + +_REQUEST_TIMEOUT: Final[NonNegativeInt] = int(timedelta(minutes=60).total_seconds()) + + +@log_decorator(_logger, level=logging.DEBUG) +async def remove_volumes_without_backup_for_service( + rabbitmq_rpc_client: RabbitMQRPCClient, + *, + docker_node_id: str, + swarm_stack_name: str, + node_id: NodeID, +) -> None: + result = await rabbitmq_rpc_client.request( + RPCNamespace.from_entries( + { + "service": "agent", + "docker_node_id": docker_node_id, + "swarm_stack_name": swarm_stack_name, + } + ), + parse_obj_as(RPCMethodName, "remove_volumes_without_backup_for_service"), + node_id=node_id, + timeout_s=_REQUEST_TIMEOUT, + ) + assert result is None # nosec + + +@log_decorator(_logger, level=logging.DEBUG) +async def backup_and_remove_volumes_for_all_services( + rabbitmq_rpc_client: RabbitMQRPCClient, + *, + docker_node_id: str, + swarm_stack_name: str, +) -> None: + result = await rabbitmq_rpc_client.request( + RPCNamespace.from_entries( + { + "service": "agent", + "docker_node_id": 
docker_node_id, + "swarm_stack_name": swarm_stack_name, + } + ), + parse_obj_as(RPCMethodName, "backup_and_remove_volumes_for_all_services"), + timeout_s=_REQUEST_TIMEOUT, + ) + assert result is None # nosec diff --git a/services/agent/VERSION b/services/agent/VERSION index 8acdd82b765e..3eefcb9dd5b3 100644 --- a/services/agent/VERSION +++ b/services/agent/VERSION @@ -1 +1 @@ -0.0.1 +1.0.0 diff --git a/services/agent/requirements/_base.in b/services/agent/requirements/_base.in index 7064a5518981..e44904bfced2 100644 --- a/services/agent/requirements/_base.in +++ b/services/agent/requirements/_base.in @@ -8,11 +8,12 @@ # intra-repo required dependencies --requirement ../../../packages/models-library/requirements/_base.in --requirement ../../../packages/settings-library/requirements/_base.in +# service-library[fastapi] +--requirement ../../../packages/service-library/requirements/_base.in --requirement ../../../packages/service-library/requirements/_fastapi.in aiodocker fastapi packaging pydantic -python-dotenv uvicorn diff --git a/services/agent/requirements/_base.txt b/services/agent/requirements/_base.txt index e3cb0445b7d0..83a70125e2fc 100644 --- a/services/agent/requirements/_base.txt +++ b/services/agent/requirements/_base.txt @@ -1,6 +1,18 @@ -aiodocker==0.21.0 - # via -r requirements/_base.in -aiohttp==3.8.5 +aio-pika==9.4.3 + # via -r requirements/../../../packages/service-library/requirements/_base.in +aiocache==0.12.3 + # via -r requirements/../../../packages/service-library/requirements/_base.in +aiodebug==2.3.0 + # via -r requirements/../../../packages/service-library/requirements/_base.in +aiodocker==0.23.0 + # via + # -r requirements/../../../packages/service-library/requirements/_base.in + # -r requirements/_base.in +aiofiles==24.1.0 + # via -r requirements/../../../packages/service-library/requirements/_base.in +aiohappyeyeballs==2.4.0 + # via aiohttp +aiohttp==3.10.6 # via # -c 
requirements/../../../packages/models-library/requirements/../../../requirements/constraints.txt # -c requirements/../../../packages/service-library/requirements/../../../packages/models-library/requirements/../../../requirements/constraints.txt @@ -9,23 +21,29 @@ aiohttp==3.8.5 # -c requirements/../../../packages/settings-library/requirements/../../../requirements/constraints.txt # -c requirements/../../../requirements/constraints.txt # aiodocker -aiosignal==1.2.0 +aiormq==6.8.1 + # via aio-pika +aiosignal==1.3.1 # via aiohttp -anyio==3.6.2 +anyio==4.6.0 # via + # fast-depends + # faststream # httpx # starlette -arrow==1.2.3 - # via -r requirements/../../../packages/models-library/requirements/_base.in +arrow==1.3.0 + # via + # -r requirements/../../../packages/models-library/requirements/_base.in + # -r requirements/../../../packages/service-library/requirements/../../../packages/models-library/requirements/_base.in + # -r requirements/../../../packages/service-library/requirements/_base.in asgiref==3.8.1 # via opentelemetry-instrumentation-asgi -async-timeout==4.0.2 - # via aiohttp -attrs==21.4.0 +attrs==24.2.0 # via # aiohttp # jsonschema -certifi==2023.11.17 + # referencing +certifi==2024.8.30 # via # -c requirements/../../../packages/models-library/requirements/../../../requirements/constraints.txt # -c requirements/../../../packages/service-library/requirements/../../../packages/models-library/requirements/../../../requirements/constraints.txt @@ -35,21 +53,26 @@ certifi==2023.11.17 # -c requirements/../../../requirements/constraints.txt # httpcore # httpx -charset-normalizer==2.1.1 - # via aiohttp -click==8.1.3 + # requests +charset-normalizer==3.3.2 + # via requests +click==8.1.7 # via # typer # uvicorn deprecated==1.2.14 # via # opentelemetry-api + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http # opentelemetry-semantic-conventions -dnspython==2.2.1 +dnspython==2.6.1 # via email-validator -email-validator==1.3.0 
+email-validator==2.2.0 # via pydantic -fastapi==0.96.0 +fast-depends==2.4.11 + # via faststream +fastapi==0.99.1 # via # -c requirements/../../../packages/models-library/requirements/../../../requirements/constraints.txt # -c requirements/../../../packages/service-library/requirements/../../../packages/models-library/requirements/../../../requirements/constraints.txt @@ -60,17 +83,25 @@ fastapi==0.96.0 # -r requirements/../../../packages/service-library/requirements/_fastapi.in # -r requirements/_base.in # prometheus-fastapi-instrumentator -frozenlist==1.3.1 +faststream==0.5.25 + # via -r requirements/../../../packages/service-library/requirements/_base.in +frozenlist==1.4.1 # via # aiohttp # aiosignal +googleapis-common-protos==1.65.0 + # via + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +grpcio==1.66.1 + # via opentelemetry-exporter-otlp-proto-grpc h11==0.14.0 # via # httpcore # uvicorn -httpcore==1.0.2 +httpcore==1.0.5 # via httpx -httpx==0.26.0 +httpx==0.27.2 # via # -c requirements/../../../packages/models-library/requirements/../../../requirements/constraints.txt # -c requirements/../../../packages/service-library/requirements/../../../packages/models-library/requirements/../../../requirements/constraints.txt @@ -79,47 +110,83 @@ httpx==0.26.0 # -c requirements/../../../packages/settings-library/requirements/../../../requirements/constraints.txt # -c requirements/../../../requirements/constraints.txt # -r requirements/../../../packages/service-library/requirements/_fastapi.in -idna==3.4 +idna==3.10 # via # anyio # email-validator # httpx + # requests # yarl -importlib-metadata==8.0.0 +importlib-metadata==8.4.0 # via opentelemetry-api -jsonschema==3.2.0 - # via -r requirements/../../../packages/models-library/requirements/_base.in +jsonschema==4.23.0 + # via + # -r requirements/../../../packages/models-library/requirements/_base.in + # -r 
requirements/../../../packages/service-library/requirements/../../../packages/models-library/requirements/_base.in +jsonschema-specifications==2023.7.1 + # via jsonschema markdown-it-py==3.0.0 # via rich mdurl==0.1.2 # via markdown-it-py -multidict==6.0.2 +multidict==6.1.0 # via # aiohttp # yarl -opentelemetry-api==1.26.0 +opentelemetry-api==1.27.0 # via + # -r requirements/../../../packages/service-library/requirements/_base.in + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http # opentelemetry-instrumentation # opentelemetry-instrumentation-asgi # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-requests + # opentelemetry-sdk # opentelemetry-semantic-conventions -opentelemetry-instrumentation==0.47b0 +opentelemetry-exporter-otlp==1.27.0 + # via -r requirements/../../../packages/service-library/requirements/_base.in +opentelemetry-exporter-otlp-proto-common==1.27.0 + # via + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +opentelemetry-exporter-otlp-proto-grpc==1.27.0 + # via opentelemetry-exporter-otlp +opentelemetry-exporter-otlp-proto-http==1.27.0 + # via opentelemetry-exporter-otlp +opentelemetry-instrumentation==0.48b0 # via # opentelemetry-instrumentation-asgi # opentelemetry-instrumentation-fastapi -opentelemetry-instrumentation-asgi==0.47b0 + # opentelemetry-instrumentation-requests +opentelemetry-instrumentation-asgi==0.48b0 # via opentelemetry-instrumentation-fastapi -opentelemetry-instrumentation-fastapi==0.47b0 +opentelemetry-instrumentation-fastapi==0.48b0 # via -r requirements/../../../packages/service-library/requirements/_fastapi.in -opentelemetry-semantic-conventions==0.47b0 +opentelemetry-instrumentation-requests==0.48b0 + # via -r requirements/../../../packages/service-library/requirements/_base.in +opentelemetry-proto==1.27.0 + # via + # opentelemetry-exporter-otlp-proto-common + # opentelemetry-exporter-otlp-proto-grpc + # 
opentelemetry-exporter-otlp-proto-http +opentelemetry-sdk==1.27.0 + # via + # -r requirements/../../../packages/service-library/requirements/_base.in + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +opentelemetry-semantic-conventions==0.48b0 # via # opentelemetry-instrumentation-asgi # opentelemetry-instrumentation-fastapi -opentelemetry-util-http==0.47b0 + # opentelemetry-instrumentation-requests + # opentelemetry-sdk +opentelemetry-util-http==0.48b0 # via # opentelemetry-instrumentation-asgi # opentelemetry-instrumentation-fastapi -orjson==3.10.0 + # opentelemetry-instrumentation-requests +orjson==3.10.7 # via # -c requirements/../../../packages/models-library/requirements/../../../requirements/constraints.txt # -c requirements/../../../packages/service-library/requirements/../../../packages/models-library/requirements/../../../requirements/constraints.txt @@ -128,15 +195,24 @@ orjson==3.10.0 # -c requirements/../../../packages/settings-library/requirements/../../../requirements/constraints.txt # -c requirements/../../../requirements/constraints.txt # -r requirements/../../../packages/models-library/requirements/_base.in -packaging==23.1 + # -r requirements/../../../packages/service-library/requirements/../../../packages/models-library/requirements/_base.in +packaging==24.1 # via -r requirements/_base.in -prometheus-client==0.19.0 +pamqp==3.3.0 + # via aiormq +prometheus-client==0.21.0 # via # -r requirements/../../../packages/service-library/requirements/_fastapi.in # prometheus-fastapi-instrumentator prometheus-fastapi-instrumentator==6.1.0 # via -r requirements/../../../packages/service-library/requirements/_fastapi.in -pydantic==1.10.2 +protobuf==4.25.5 + # via + # googleapis-common-protos + # opentelemetry-proto +psutil==6.0.0 + # via -r requirements/../../../packages/service-library/requirements/_base.in +pydantic==1.10.18 # via # -c 
requirements/../../../packages/models-library/requirements/../../../requirements/constraints.txt # -c requirements/../../../packages/service-library/requirements/../../../packages/models-library/requirements/../../../requirements/constraints.txt @@ -146,32 +222,60 @@ pydantic==1.10.2 # -c requirements/../../../packages/settings-library/requirements/../../../requirements/constraints.txt # -c requirements/../../../requirements/constraints.txt # -r requirements/../../../packages/models-library/requirements/_base.in + # -r requirements/../../../packages/service-library/requirements/../../../packages/models-library/requirements/_base.in + # -r requirements/../../../packages/service-library/requirements/../../../packages/settings-library/requirements/_base.in + # -r requirements/../../../packages/service-library/requirements/_base.in # -r requirements/../../../packages/settings-library/requirements/_base.in # -r requirements/_base.in + # fast-depends # fastapi -pygments==2.15.1 +pygments==2.18.0 # via rich -pyrsistent==0.19.2 - # via jsonschema -python-dateutil==2.8.2 +pyinstrument==4.7.3 + # via -r requirements/../../../packages/service-library/requirements/_base.in +python-dateutil==2.9.0.post0 # via arrow -python-dotenv==1.0.0 - # via -r requirements/_base.in -rich==13.4.2 +pyyaml==6.0.2 + # via + # -c requirements/../../../packages/models-library/requirements/../../../requirements/constraints.txt + # -c requirements/../../../packages/service-library/requirements/../../../packages/models-library/requirements/../../../requirements/constraints.txt + # -c requirements/../../../packages/service-library/requirements/../../../packages/settings-library/requirements/../../../requirements/constraints.txt + # -c requirements/../../../packages/service-library/requirements/../../../requirements/constraints.txt + # -c requirements/../../../packages/settings-library/requirements/../../../requirements/constraints.txt + # -c requirements/../../../requirements/constraints.txt + # -r 
requirements/../../../packages/service-library/requirements/_base.in +redis==5.0.8 + # via + # -c requirements/../../../packages/models-library/requirements/../../../requirements/constraints.txt + # -c requirements/../../../packages/service-library/requirements/../../../packages/models-library/requirements/../../../requirements/constraints.txt + # -c requirements/../../../packages/service-library/requirements/../../../packages/settings-library/requirements/../../../requirements/constraints.txt + # -c requirements/../../../packages/service-library/requirements/../../../requirements/constraints.txt + # -c requirements/../../../packages/settings-library/requirements/../../../requirements/constraints.txt + # -c requirements/../../../requirements/constraints.txt + # -r requirements/../../../packages/service-library/requirements/_base.in +referencing==0.29.3 # via + # -c requirements/../../../packages/service-library/requirements/./constraints.txt + # jsonschema + # jsonschema-specifications +requests==2.32.3 + # via opentelemetry-exporter-otlp-proto-http +rich==13.8.1 + # via + # -r requirements/../../../packages/service-library/requirements/../../../packages/settings-library/requirements/_base.in # -r requirements/../../../packages/settings-library/requirements/_base.in # typer -setuptools==69.2.0 +rpds-py==0.20.0 # via # jsonschema - # opentelemetry-instrumentation + # referencing +setuptools==75.1.0 + # via opentelemetry-instrumentation shellingham==1.5.4 # via typer six==1.16.0 - # via - # jsonschema - # python-dateutil -sniffio==1.3.0 + # via python-dateutil +sniffio==1.3.1 # via # anyio # httpx @@ -184,14 +288,36 @@ starlette==0.27.0 # -c requirements/../../../packages/settings-library/requirements/../../../requirements/constraints.txt # -c requirements/../../../requirements/constraints.txt # fastapi -typer==0.12.3 - # via -r requirements/../../../packages/settings-library/requirements/_base.in -typing-extensions==4.4.0 +tenacity==9.0.0 + # via -r 
requirements/../../../packages/service-library/requirements/_base.in +toolz==0.12.1 + # via -r requirements/../../../packages/service-library/requirements/_base.in +tqdm==4.66.5 + # via -r requirements/../../../packages/service-library/requirements/_base.in +typer==0.12.5 # via - # aiodocker + # -r requirements/../../../packages/service-library/requirements/../../../packages/settings-library/requirements/_base.in + # -r requirements/../../../packages/settings-library/requirements/_base.in +types-python-dateutil==2.9.0.20240906 + # via arrow +typing-extensions==4.12.2 + # via + # aiodebug + # fastapi + # faststream + # opentelemetry-sdk # pydantic # typer -uvicorn==0.19.0 +urllib3==2.2.3 + # via + # -c requirements/../../../packages/models-library/requirements/../../../requirements/constraints.txt + # -c requirements/../../../packages/service-library/requirements/../../../packages/models-library/requirements/../../../requirements/constraints.txt + # -c requirements/../../../packages/service-library/requirements/../../../packages/settings-library/requirements/../../../requirements/constraints.txt + # -c requirements/../../../packages/service-library/requirements/../../../requirements/constraints.txt + # -c requirements/../../../packages/settings-library/requirements/../../../requirements/constraints.txt + # -c requirements/../../../requirements/constraints.txt + # requests +uvicorn==0.30.6 # via # -r requirements/../../../packages/service-library/requirements/_fastapi.in # -r requirements/_base.in @@ -199,7 +325,10 @@ wrapt==1.16.0 # via # deprecated # opentelemetry-instrumentation -yarl==1.9.2 - # via aiohttp -zipp==3.20.1 +yarl==1.12.1 + # via + # aio-pika + # aiohttp + # aiormq +zipp==3.20.2 # via importlib-metadata diff --git a/services/agent/requirements/_test.in b/services/agent/requirements/_test.in index 04f619082e08..ff76fcd01f53 100644 --- a/services/agent/requirements/_test.in +++ b/services/agent/requirements/_test.in @@ -11,6 +11,7 @@ --constraint 
_base.txt aioboto3 +asgi-lifespan coverage faker httpx @@ -20,3 +21,4 @@ pytest-asyncio pytest-cov pytest-mock pytest-runner +python-dotenv diff --git a/services/agent/requirements/_test.txt b/services/agent/requirements/_test.txt index fbcf8d854094..912fae4819f4 100644 --- a/services/agent/requirements/_test.txt +++ b/services/agent/requirements/_test.txt @@ -1,50 +1,59 @@ -aioboto3==12.4.0 +aioboto3==13.1.1 # via -r requirements/_test.in -aiobotocore==2.12.3 +aiobotocore==2.13.1 # via aioboto3 -aiohttp==3.8.5 +aiofiles==24.1.0 + # via + # -c requirements/_base.txt + # aioboto3 +aiohappyeyeballs==2.4.0 + # via + # -c requirements/_base.txt + # aiohttp +aiohttp==3.10.6 # via # -c requirements/../../../requirements/constraints.txt # -c requirements/_base.txt # aiobotocore aioitertools==0.12.0 # via aiobotocore -aiosignal==1.2.0 +aiosignal==1.3.1 # via # -c requirements/_base.txt # aiohttp -anyio==3.6.2 +antlr4-python3-runtime==4.13.2 + # via moto +anyio==4.6.0 # via # -c requirements/_base.txt # httpx -async-timeout==4.0.2 - # via - # -c requirements/_base.txt - # aiohttp -attrs==21.4.0 +asgi-lifespan==2.1.0 + # via -r requirements/_test.in +attrs==24.2.0 # via # -c requirements/_base.txt # aiohttp # jsonschema + # referencing aws-sam-translator==1.91.0 # via cfn-lint aws-xray-sdk==2.14.0 # via moto blinker==1.8.2 # via flask -boto3==1.34.69 +boto3==1.34.131 # via # aiobotocore # aws-sam-translator # moto -botocore==1.34.69 +botocore==1.34.131 # via # aiobotocore # aws-xray-sdk # boto3 # moto # s3transfer -certifi==2023.11.17 +certifi==2024.8.30 # via # -c requirements/../../../requirements/constraints.txt # -c requirements/_base.txt @@ -53,14 +62,13 @@ certifi==2023.11.17 # requests cffi==1.17.1 # via cryptography -cfn-lint==1.15.0 +cfn-lint==1.15.1 # via moto -charset-normalizer==2.1.1 +charset-normalizer==3.3.2 # via # -c requirements/_base.txt - # aiohttp # requests -click==8.1.3 +click==8.1.7 # via # -c requirements/_base.txt # flask @@ -71,17 +79,11 @@ 
coverage==7.6.1 cryptography==43.0.1 # via # -c requirements/../../../requirements/constraints.txt + # joserfc # moto - # python-jose - # sshpubkeys docker==7.1.0 # via moto -ecdsa==0.19.0 - # via - # moto - # python-jose - # sshpubkeys -faker==29.0.0 +faker==30.0.0 # via -r requirements/_test.in flask==3.0.3 # via @@ -89,7 +91,7 @@ flask==3.0.3 # moto flask-cors==5.0.0 # via moto -frozenlist==1.3.1 +frozenlist==1.4.1 # via # -c requirements/_base.txt # aiohttp @@ -100,16 +102,16 @@ h11==0.14.0 # via # -c requirements/_base.txt # httpcore -httpcore==1.0.2 +httpcore==1.0.5 # via # -c requirements/_base.txt # httpx -httpx==0.26.0 +httpx==0.27.2 # via # -c requirements/../../../requirements/constraints.txt # -c requirements/_base.txt # -r requirements/_test.in -idna==3.4 +idna==3.10 # via # -c requirements/_base.txt # anyio @@ -129,64 +131,71 @@ jmespath==1.0.1 # via # boto3 # botocore +joserfc==1.0.0 + # via moto jsondiff==2.2.1 # via moto jsonpatch==1.33 # via cfn-lint +jsonpath-ng==1.6.1 + # via moto jsonpointer==3.0.0 # via jsonpatch -jsonschema==3.2.0 +jsonschema==4.23.0 # via # -c requirements/_base.txt # aws-sam-translator # openapi-schema-validator # openapi-spec-validator +jsonschema-path==0.3.3 + # via openapi-spec-validator +jsonschema-specifications==2023.7.1 + # via + # -c requirements/_base.txt + # jsonschema + # openapi-schema-validator +lazy-object-proxy==1.10.0 + # via openapi-spec-validator markupsafe==2.1.5 # via # jinja2 # werkzeug -moto==4.2.6 +moto==5.0.15 # via -r requirements/_test.in mpmath==1.3.0 # via sympy -multidict==6.0.2 +multidict==6.1.0 # via # -c requirements/_base.txt # aiohttp # yarl networkx==3.3 # via cfn-lint -openapi-schema-validator==0.2.3 +openapi-schema-validator==0.6.2 # via openapi-spec-validator -openapi-spec-validator==0.4.0 - # via - # -c requirements/./constraints.txt - # moto -packaging==23.1 +openapi-spec-validator==0.7.1 + # via moto +packaging==24.1 # via # -c requirements/_base.txt # pytest +pathable==0.4.3 + # via 
jsonschema-path pluggy==1.5.0 # via pytest -py-partiql-parser==0.4.0 +ply==3.11 + # via jsonpath-ng +py-partiql-parser==0.5.6 # via moto -pyasn1==0.6.1 - # via - # python-jose - # rsa pycparser==2.22 # via cffi -pydantic==1.10.2 +pydantic==1.10.18 # via # -c requirements/../../../requirements/constraints.txt # -c requirements/_base.txt # aws-sam-translator pyparsing==3.1.4 # via moto -pyrsistent==0.19.2 - # via - # -c requirements/_base.txt - # jsonschema pytest==8.3.3 # via # -r requirements/_test.in @@ -203,59 +212,67 @@ pytest-mock==3.14.0 # via -r requirements/_test.in pytest-runner==6.0.1 # via -r requirements/_test.in -python-dateutil==2.8.2 +python-dateutil==2.9.0.post0 # via # -c requirements/_base.txt # botocore # faker # moto -python-jose==3.3.0 - # via moto +python-dotenv==1.0.1 + # via -r requirements/_test.in pyyaml==6.0.2 # via # -c requirements/../../../requirements/constraints.txt + # -c requirements/_base.txt # cfn-lint # jsondiff + # jsonschema-path # moto - # openapi-spec-validator # responses +referencing==0.29.3 + # via + # -c requirements/_base.txt + # jsonschema + # jsonschema-path + # jsonschema-specifications regex==2024.9.11 # via cfn-lint requests==2.32.3 # via + # -c requirements/_base.txt # docker + # jsonschema-path # moto # responses responses==0.25.3 # via moto -rsa==4.9 +rfc3339-validator==0.1.4 + # via openapi-schema-validator +rpds-py==0.20.0 # via - # -c requirements/../../../requirements/constraints.txt - # python-jose + # -c requirements/_base.txt + # jsonschema + # referencing s3transfer==0.10.2 # via boto3 -setuptools==69.2.0 +setuptools==75.1.0 # via # -c requirements/_base.txt - # jsonschema # moto - # openapi-spec-validator six==1.16.0 # via # -c requirements/_base.txt - # ecdsa - # jsonschema # python-dateutil -sniffio==1.3.0 + # rfc3339-validator +sniffio==1.3.1 # via # -c requirements/_base.txt # anyio + # asgi-lifespan # httpx -sshpubkeys==3.3.1 - # via moto sympy==1.13.3 # via cfn-lint -typing-extensions==4.4.0 
+typing-extensions==4.12.2 # via # -c requirements/_base.txt # aws-sam-translator @@ -264,6 +281,7 @@ typing-extensions==4.4.0 urllib3==2.2.3 # via # -c requirements/../../../requirements/constraints.txt + # -c requirements/_base.txt # botocore # docker # requests @@ -279,7 +297,7 @@ wrapt==1.16.0 # aws-xray-sdk xmltodict==0.13.0 # via moto -yarl==1.9.2 +yarl==1.12.1 # via # -c requirements/_base.txt # aiohttp diff --git a/services/agent/requirements/_tools.in b/services/agent/requirements/_tools.in index 8e7d4eb265ee..1def82c12a30 100644 --- a/services/agent/requirements/_tools.in +++ b/services/agent/requirements/_tools.in @@ -3,7 +3,3 @@ --constraint _test.txt --requirement ../../../requirements/devenv.txt - -black -isort -watchdog[watchmedo] diff --git a/services/agent/requirements/_tools.txt b/services/agent/requirements/_tools.txt index 6937ce6b8b5c..360f0628998a 100644 --- a/services/agent/requirements/_tools.txt +++ b/services/agent/requirements/_tools.txt @@ -1,16 +1,14 @@ astroid==3.3.4 # via pylint black==24.8.0 - # via - # -r requirements/../../../requirements/devenv.txt - # -r requirements/_tools.in + # via -r requirements/../../../requirements/devenv.txt build==1.2.2 # via pip-tools bump2version==1.0.1 # via -r requirements/../../../requirements/devenv.txt cfgv==3.4.0 # via pre-commit -click==8.1.3 +click==8.1.7 # via # -c requirements/_base.txt # -c requirements/_test.txt @@ -27,11 +25,10 @@ identify==2.6.1 isort==5.13.2 # via # -r requirements/../../../requirements/devenv.txt - # -r requirements/_tools.in # pylint mccabe==0.7.0 # via pylint -mypy==1.10.1 +mypy==1.11.2 # via -r requirements/../../../requirements/devenv.txt mypy-extensions==1.0.0 # via @@ -39,7 +36,7 @@ mypy-extensions==1.0.0 # mypy nodeenv==1.9.1 # via pre-commit -packaging==23.1 +packaging==24.1 # via # -c requirements/_base.txt # -c requirements/_test.txt @@ -58,7 +55,7 @@ platformdirs==4.3.6 # virtualenv pre-commit==3.8.0 # via -r requirements/../../../requirements/devenv.txt 
-pylint==3.3.0 +pylint==3.3.1 # via -r requirements/../../../requirements/devenv.txt pyproject-hooks==1.1.0 # via @@ -67,26 +64,24 @@ pyproject-hooks==1.1.0 pyyaml==6.0.2 # via # -c requirements/../../../requirements/constraints.txt + # -c requirements/_base.txt # -c requirements/_test.txt # pre-commit - # watchdog ruff==0.6.7 # via -r requirements/../../../requirements/devenv.txt -setuptools==69.2.0 +setuptools==75.1.0 # via # -c requirements/_base.txt # -c requirements/_test.txt # pip-tools tomlkit==0.13.2 # via pylint -typing-extensions==4.4.0 +typing-extensions==4.12.2 # via # -c requirements/_base.txt # -c requirements/_test.txt # mypy virtualenv==20.26.5 # via pre-commit -watchdog==5.0.2 - # via -r requirements/_tools.in wheel==0.44.0 # via pip-tools diff --git a/services/agent/requirements/ci.txt b/services/agent/requirements/ci.txt index 394361bc0ae6..8c52cca7e954 100644 --- a/services/agent/requirements/ci.txt +++ b/services/agent/requirements/ci.txt @@ -13,7 +13,7 @@ # installs this repo's packages simcore-models-library @ ../../packages/models-library pytest-simcore @ ../../packages/pytest-simcore -simcore-service-library @ ../../packages/service-library +simcore-service-library[fastapi] @ ../../packages/service-library simcore-settings-library @ ../../packages/settings-library # installs current package diff --git a/services/agent/requirements/constraints.txt b/services/agent/requirements/constraints.txt index b52ac1ee492d..e69de29bb2d1 100644 --- a/services/agent/requirements/constraints.txt +++ b/services/agent/requirements/constraints.txt @@ -1,11 +0,0 @@ - -# -# CONSTRAINTS DUE TO TEST LIBRARIES -# - -# There are incompatible versions in the resolved dependencies: -# jsonschema==3.2.0 (from -c requirements/_base.txt (line 159)) -# jsonschema~=3.2 (from -c requirements/./constraints.txt (line 12)) -# jsonschema<5,>=3.0 (from cfn-lint==0.64.1->moto[server]==4.0.1->-r requirements/_test.in (line 21)) -# jsonschema<5.0.0,>=4.0.0 (from 
openapi-spec-validator==0.5.1->moto[server]==4.0.1->-r requirements/_test.in (line 21)) -openapi-spec-validator<0.5.0 diff --git a/services/agent/requirements/dev.txt b/services/agent/requirements/dev.txt index a20e1ab941de..3793504486c6 100644 --- a/services/agent/requirements/dev.txt +++ b/services/agent/requirements/dev.txt @@ -14,7 +14,7 @@ # installs this repo's packages --editable ../../packages/models-library --editable ../../packages/pytest-simcore ---editable ../../packages/service-library +--editable ../../packages/service-library[fastapi] --editable ../../packages/settings-library # installs current package diff --git a/services/agent/requirements/prod.txt b/services/agent/requirements/prod.txt index 26afb420d402..aad1cc7a2bb2 100644 --- a/services/agent/requirements/prod.txt +++ b/services/agent/requirements/prod.txt @@ -11,7 +11,8 @@ # installs this repo's packages simcore-models-library @ ../../packages/models-library -simcore-service-library @ ../../packages/service-library +simcore-service-library[fastapi] @ ../../packages/service-library simcore-settings-library @ ../../packages/settings-library + # installs current package simcore-service-agent @ . 
diff --git a/services/agent/src/simcore_service_agent/api/__init__.py b/services/agent/src/simcore_service_agent/api/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/services/agent/src/simcore_service_agent/api/rest/__init__.py b/services/agent/src/simcore_service_agent/api/rest/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/services/agent/src/simcore_service_agent/api/rest/_dependencies.py b/services/agent/src/simcore_service_agent/api/rest/_dependencies.py new file mode 100644 index 000000000000..a02971d996a1 --- /dev/null +++ b/services/agent/src/simcore_service_agent/api/rest/_dependencies.py @@ -0,0 +1,27 @@ +""" Free functions to inject dependencies in routes handlers +""" + +from typing import Annotated, cast + +from fastapi import Depends, FastAPI, Request +from servicelib.rabbitmq._client_rpc import RabbitMQRPCClient + +from ...core.settings import ApplicationSettings + + +def get_application(request: Request) -> FastAPI: + return cast(FastAPI, request.app) + + +def get_settings( + app: Annotated[FastAPI, Depends(get_application)] +) -> ApplicationSettings: + assert isinstance(app.state.settings, ApplicationSettings) # nosec + return app.state.settings + + +def get_rabbitmq_client( + app: Annotated[FastAPI, Depends(get_application)] +) -> RabbitMQRPCClient: + assert isinstance(app.state.rabbitmq_rpc_server, RabbitMQRPCClient) # nosec + return app.state.rabbitmq_rpc_server diff --git a/services/agent/src/simcore_service_agent/api/rest/_health.py b/services/agent/src/simcore_service_agent/api/rest/_health.py new file mode 100644 index 000000000000..600de2467221 --- /dev/null +++ b/services/agent/src/simcore_service_agent/api/rest/_health.py @@ -0,0 +1,25 @@ +from typing import Annotated + +import arrow +from fastapi import APIRouter, Depends +from models_library.api_schemas__common.health import HealthCheckGet +from models_library.errors import RABBITMQ_CLIENT_UNHEALTHY_MSG +from 
servicelib.rabbitmq import RabbitMQClient + +from ._dependencies import get_rabbitmq_client + +router = APIRouter() + + +class HealthCheckError(RuntimeError): + """Failed a health check""" + + +@router.get("/health", response_model=HealthCheckGet) +async def check_service_health( + rabbitmq_client: Annotated[RabbitMQClient, Depends(get_rabbitmq_client)] +): + if not rabbitmq_client.healthy: + raise HealthCheckError(RABBITMQ_CLIENT_UNHEALTHY_MSG) + + return HealthCheckGet(timestamp=f"{__name__}@{arrow.utcnow().datetime.isoformat()}") diff --git a/services/agent/src/simcore_service_agent/api/rest/routes.py b/services/agent/src/simcore_service_agent/api/rest/routes.py new file mode 100644 index 000000000000..18688cf2f4d7 --- /dev/null +++ b/services/agent/src/simcore_service_agent/api/rest/routes.py @@ -0,0 +1,14 @@ +from fastapi import FastAPI, HTTPException +from servicelib.fastapi.exceptions_utils import ( + handle_errors_as_500, + http_exception_as_json_response, +) + +from . import _health + + +def setup_rest_api(app: FastAPI): + app.include_router(_health.router) + + app.add_exception_handler(Exception, handle_errors_as_500) + app.add_exception_handler(HTTPException, http_exception_as_json_response) diff --git a/services/agent/src/simcore_service_agent/api/rpc/__init__.py b/services/agent/src/simcore_service_agent/api/rpc/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/services/agent/src/simcore_service_agent/api/rpc/_volumes.py b/services/agent/src/simcore_service_agent/api/rpc/_volumes.py new file mode 100644 index 000000000000..96edb817e622 --- /dev/null +++ b/services/agent/src/simcore_service_agent/api/rpc/_volumes.py @@ -0,0 +1,28 @@ +import logging + +from fastapi import FastAPI +from models_library.projects_nodes_io import NodeID +from servicelib.logging_utils import log_context +from servicelib.rabbitmq import RPCRouter +from servicelib.rabbitmq.rpc_interfaces.agent.errors import ( + NoServiceVolumesFoundRPCError, +) +from 
simcore_service_agent.services.volumes_manager import VolumesManager + +_logger = logging.getLogger(__name__) + +router = RPCRouter() + + +@router.expose(reraise_if_error_type=(NoServiceVolumesFoundRPCError,)) +async def remove_volumes_without_backup_for_service( + app: FastAPI, *, node_id: NodeID +) -> None: + with log_context(_logger, logging.INFO, f"removing volumes for service: {node_id}"): + await VolumesManager.get_from_app_state(app).remove_service_volumes(node_id) + + +@router.expose() +async def backup_and_remove_volumes_for_all_services(app: FastAPI) -> None: + with log_context(_logger, logging.INFO, "removing all service volumes from node"): + await VolumesManager.get_from_app_state(app).remove_all_volumes() diff --git a/services/agent/src/simcore_service_agent/api/rpc/routes.py b/services/agent/src/simcore_service_agent/api/rpc/routes.py new file mode 100644 index 000000000000..7a658ae52806 --- /dev/null +++ b/services/agent/src/simcore_service_agent/api/rpc/routes.py @@ -0,0 +1,28 @@ +from fastapi import FastAPI +from models_library.rabbitmq_basic_types import RPCNamespace +from servicelib.rabbitmq import RPCRouter +from simcore_service_agent.core.settings import ApplicationSettings + +from ...services.rabbitmq import get_rabbitmq_rpc_server +from . 
import _volumes + +ROUTERS: list[RPCRouter] = [ + _volumes.router, +] + + +def setup_rpc_api_routes(app: FastAPI) -> None: + async def startup() -> None: + rpc_server = get_rabbitmq_rpc_server(app) + settings: ApplicationSettings = app.state.settings + rpc_namespace = RPCNamespace.from_entries( + { + "service": "agent", + "docker_node_id": settings.AGENT_DOCKER_NODE_ID, + "swarm_stack_name": settings.AGENT_VOLUMES_CLEANUP_TARGET_SWARM_STACK_NAME, + } + ) + for router in ROUTERS: + await rpc_server.register_router(router, rpc_namespace, app) + + app.add_event_handler("startup", startup) diff --git a/services/agent/src/simcore_service_agent/core/_dependencies.py b/services/agent/src/simcore_service_agent/core/_dependencies.py deleted file mode 100644 index d48e806851af..000000000000 --- a/services/agent/src/simcore_service_agent/core/_dependencies.py +++ /dev/null @@ -1,23 +0,0 @@ -""" Free functions to inject dependencies in routes handlers -""" - -from typing import cast - -from fastapi import Depends, FastAPI, Request - -from ..modules.task_monitor import TaskMonitor -from .settings import ApplicationSettings - - -def get_application(request: Request) -> FastAPI: - return cast(FastAPI, request.app) - - -def get_settings(app: FastAPI = Depends(get_application)) -> ApplicationSettings: - assert isinstance(app.state.settings, ApplicationSettings) # nosec - return app.state.settings - - -def get_task_monitor(app: FastAPI = Depends(get_application)) -> TaskMonitor: - assert isinstance(app.state.task_monitor, TaskMonitor) # nosec - return app.state.task_monitor diff --git a/services/agent/src/simcore_service_agent/core/_routes.py b/services/agent/src/simcore_service_agent/core/_routes.py deleted file mode 100644 index 6f3486d06627..000000000000 --- a/services/agent/src/simcore_service_agent/core/_routes.py +++ /dev/null @@ -1,12 +0,0 @@ -from fastapi import APIRouter, Depends, HTTPException, status - -from ..modules.task_monitor import TaskMonitor -from ._dependencies 
import get_task_monitor - -router = APIRouter() - - -@router.get("/health") -def health(task_monitor: TaskMonitor = Depends(get_task_monitor)) -> None: - if not task_monitor.was_started or task_monitor.are_tasks_hanging: - raise HTTPException(status.HTTP_503_SERVICE_UNAVAILABLE, detail="unhealthy") diff --git a/services/agent/src/simcore_service_agent/core/application.py b/services/agent/src/simcore_service_agent/core/application.py index 1c2211b16f3a..777c22a422cc 100644 --- a/services/agent/src/simcore_service_agent/core/application.py +++ b/services/agent/src/simcore_service_agent/core/application.py @@ -5,9 +5,6 @@ get_common_oas_options, override_fastapi_openapi_method, ) -from servicelib.fastapi.prometheus_instrumentation import ( - setup_prometheus_instrumentation, -) from servicelib.logging_utils import config_all_loggers from .._meta import ( @@ -18,8 +15,11 @@ SUMMARY, VERSION, ) -from ..modules import task_monitor -from ._routes import router +from ..api.rest.routes import setup_rest_api +from ..api.rpc.routes import setup_rpc_api_routes +from ..services.instrumentation import setup_instrumentation +from ..services.rabbitmq import setup_rabbitmq +from ..services.volumes_manager import setup_volume_manager from .settings import ApplicationSettings logger = logging.getLogger(__name__) @@ -35,7 +35,6 @@ def _setup_logger(settings: ApplicationSettings): def create_app() -> FastAPI: - # SETTINGS settings = ApplicationSettings.create_from_envs() _setup_logger(settings) logger.debug(settings.json(indent=2)) @@ -52,20 +51,18 @@ def create_app() -> FastAPI: override_fastapi_openapi_method(app) app.state.settings = settings - if app.state.settings.AGENT_PROMETHEUS_INSTRUMENTATION_ENABLED: - setup_prometheus_instrumentation(app) - - # ROUTERS - app.include_router(router) + setup_instrumentation(app) - # EVENTS - task_monitor.setup(app) + setup_rabbitmq(app) + setup_volume_manager(app) + setup_rest_api(app) + setup_rpc_api_routes(app) async def _on_startup() -> 
None: - print(APP_STARTED_BANNER_MSG, flush=True) + print(APP_STARTED_BANNER_MSG, flush=True) # noqa: T201 async def _on_shutdown() -> None: - print(APP_FINISHED_BANNER_MSG, flush=True) + print(APP_FINISHED_BANNER_MSG, flush=True) # noqa: T201 app.add_event_handler("startup", _on_startup) app.add_event_handler("shutdown", _on_shutdown) diff --git a/services/agent/src/simcore_service_agent/core/settings.py b/services/agent/src/simcore_service_agent/core/settings.py index 882217f9a5ff..96545d0355db 100644 --- a/services/agent/src/simcore_service_agent/core/settings.py +++ b/services/agent/src/simcore_service_agent/core/settings.py @@ -1,13 +1,12 @@ -from typing import Final +from datetime import timedelta from models_library.basic_types import BootModeEnum, LogLevel -from pydantic import AnyHttpUrl, Field, NonNegativeInt, validator +from pydantic import AnyHttpUrl, Field, validator from settings_library.base import BaseCustomSettings from settings_library.r_clone import S3Provider +from settings_library.rabbit import RabbitSettings from settings_library.utils_logging import MixinLoggingSettings -_MINUTE: Final[NonNegativeInt] = 60 - class ApplicationSettings(BaseCustomSettings, MixinLoggingSettings): LOGLEVEL: LogLevel = Field( @@ -21,12 +20,15 @@ class ApplicationSettings(BaseCustomSettings, MixinLoggingSettings): "AGENT_VOLUMES_LOG_FORMAT_LOCAL_DEV_ENABLED", "LOG_FORMAT_LOCAL_DEV_ENABLED", ], - description="Enables local development log format. WARNING: make sure it is disabled if you want to have structured logs!", + description=( + "Enables local development log format. WARNING: make sure it is " + "disabled if you want to have structured logs!" 
+ ), ) AGENT_VOLUMES_CLEANUP_TARGET_SWARM_STACK_NAME: str = Field( ..., description="Exactly the same as director-v2's `SWARM_STACK_NAME` env var" ) - AGENT_VOLUMES_CLEANUP_S3_ENDPOINT: AnyHttpUrl | None + AGENT_VOLUMES_CLEANUP_S3_ENDPOINT: AnyHttpUrl AGENT_VOLUMES_CLEANUP_S3_ACCESS_KEY: str AGENT_VOLUMES_CLEANUP_S3_SECRET_KEY: str AGENT_VOLUMES_CLEANUP_S3_BUCKET: str @@ -42,11 +44,33 @@ class ApplicationSettings(BaseCustomSettings, MixinLoggingSettings): [".hidden_do_not_remove", "key_values.json"], description="Files to ignore when syncing to s3", ) - AGENT_VOLUMES_CLEANUP_INTERVAL_S: NonNegativeInt = Field( - 60 * _MINUTE, description="interval at which to repeat volumes cleanup" + AGENT_VOLUMES_CLEANUP_INTERVAL: timedelta = Field( + timedelta(minutes=1), description="interval for running volumes removal" + ) + AGENT_VOLUMES_CLENUP_BOOK_KEEPING_INTERVAL: timedelta = Field( + timedelta(minutes=1), + description=( + "interval at which to scan for unused volumes and keep track since " + "they were detected as being unused" + ), ) + AGENT_VOLUMES_CLENUP_REMOVE_VOLUMES_INACTIVE_FOR: timedelta = Field( + timedelta(minutes=65), + description=( + "if a volume is unused for more than this interval it can be removed. 
" + "The default is set to a health 60+ miunutes since it might take upto " + "60 minutes for the dy-sidecar to properly save data form the volumes" + ), + ) + AGENT_PROMETHEUS_INSTRUMENTATION_ENABLED: bool = True + AGENT_DOCKER_NODE_ID: str = Field(..., description="used by the rabbitmq module") + + AGENT_RABBITMQ: RabbitSettings = Field( + auto_default_from_env=True, description="settings for service/rabbitmq" + ) + @validator("LOGLEVEL") @classmethod def valid_log_level(cls, value) -> LogLevel: diff --git a/services/agent/src/simcore_service_agent/models/__init__.py b/services/agent/src/simcore_service_agent/models/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/services/agent/src/simcore_service_agent/models/volumes.py b/services/agent/src/simcore_service_agent/models/volumes.py new file mode 100644 index 000000000000..ceb310486502 --- /dev/null +++ b/services/agent/src/simcore_service_agent/models/volumes.py @@ -0,0 +1,28 @@ +from pathlib import Path + +from models_library.api_schemas_directorv2.services import ( + CHARS_IN_VOLUME_NAME_BEFORE_DIR_NAME, +) +from models_library.projects import ProjectID +from models_library.projects_nodes_io import NodeID +from models_library.services_types import RunID +from models_library.users import UserID +from pydantic import BaseModel, Field + + +class DynamicServiceVolumeLabels(BaseModel): + node_uuid: NodeID + run_id: RunID + source: str + study_id: ProjectID + swarm_stack_name: str + user_id: UserID + + @property + def directory_name(self) -> str: + return self.source[CHARS_IN_VOLUME_NAME_BEFORE_DIR_NAME:][::-1].strip("_") + + +class VolumeDetails(BaseModel): + mountpoint: Path = Field(alias="Mountpoint") + labels: DynamicServiceVolumeLabels = Field(alias="Labels") diff --git a/services/agent/src/simcore_service_agent/modules/task_monitor.py b/services/agent/src/simcore_service_agent/modules/task_monitor.py deleted file mode 100644 index 8f4e3cd2ab3e..000000000000 --- 
a/services/agent/src/simcore_service_agent/modules/task_monitor.py +++ /dev/null @@ -1,169 +0,0 @@ -import asyncio -import logging -from collections import deque -from contextlib import suppress -from dataclasses import dataclass, field -from time import time -from typing import Any, Awaitable, Callable, Final, Optional - -from fastapi import FastAPI -from pydantic import PositiveFloat, PositiveInt -from servicelib.logging_utils import log_context - -from ..core.settings import ApplicationSettings -from .volumes_cleanup import backup_and_remove_volumes - -logger = logging.getLogger(__name__) - -DEFAULT_TASK_WAIT_ON_ERROR: Final[PositiveInt] = 10 - - -@dataclass -class _TaskData: - target: Callable - args: Any - repeat_interval_s: Optional[PositiveFloat] - _start_time: Optional[PositiveFloat] = None - - @property - def name(self) -> str: - return self.target.__name__ - - async def run(self) -> None: - coroutine = self.target(*self.args) - - self._start_time = time() - - try: - await coroutine - finally: - self._start_time = None - - def is_hanging(self) -> bool: - # NOTE: tasks with no repeat_interval_s are design to run forever - if self.repeat_interval_s is None: - return False - - if self._start_time is None: - return False - - return (time() - self._start_time) > self.repeat_interval_s - - -async def _task_runner(task_data: _TaskData) -> None: - with log_context(logger, logging.INFO, msg=f"'{task_data.name}'"): - while True: - try: - await task_data.run() - except Exception: # pylint: disable=broad-except - logger.exception("Had an error while running '%s'", task_data.name) - - if task_data.repeat_interval_s is None: - logger.warning( - "Unexpected termination of '%s'; it will be restarted", - task_data.name, - ) - - logger.info( - "Will run '%s' again in %s seconds", - task_data.name, - task_data.repeat_interval_s, - ) - await asyncio.sleep( - DEFAULT_TASK_WAIT_ON_ERROR - if task_data.repeat_interval_s is None - else task_data.repeat_interval_s - ) - - 
-@dataclass -class TaskMonitor: - _was_started: bool = False - _tasks: set[asyncio.Task] = field(default_factory=set) - _to_start: dict[str, _TaskData] = field(default_factory=dict) - - @property - def was_started(self) -> bool: - return self._was_started - - @property - def are_tasks_hanging(self) -> bool: - hanging_tasks_detected = False - for name, task_data in self._to_start.items(): - if task_data.is_hanging(): - logger.warning("Task '%s' is hanging", name) - hanging_tasks_detected = True - return hanging_tasks_detected - - def register_job( - self, - target: Callable, - *args: Any, - repeat_interval_s: Optional[PositiveFloat] = None, - ) -> None: - if self._was_started: - raise RuntimeError( - "Cannot add more tasks, monitor already running with: " - f"{[x.get_name() for x in self._tasks]}" - ) - - task_data = _TaskData(target, args, repeat_interval_s) - if task_data.name in self._to_start: - raise RuntimeError(f"{target.__name__} is already registered") - - self._to_start[target.__name__] = task_data - - async def start(self) -> None: - self._was_started = True - for name, task_data in self._to_start.items(): - logger.info("Starting task '%s'", name) - self._tasks.add( - asyncio.create_task(_task_runner(task_data), name=f"task_{name}") - ) - - async def shutdown(self): - async def _wait_for_task(task: asyncio.Task) -> None: - with suppress(asyncio.CancelledError): - await task - - tasks_to_wait: deque[Awaitable] = deque() - for task in set(self._tasks): - logger.info("Cancel and stop task '%s'", task.get_name()) - - task.cancel() - tasks_to_wait.append(_wait_for_task(task)) - self._tasks.remove(task) - - await asyncio.gather(*tasks_to_wait, return_exceptions=True) - self._was_started = False - self._to_start = {} - - -def setup(app: FastAPI) -> None: - async def _on_startup() -> None: - task_monitor = app.state.task_monitor = TaskMonitor() - settings: ApplicationSettings = app.state.settings - - # setup all relative jobs - task_monitor.register_job( - 
backup_and_remove_volumes, - settings, - repeat_interval_s=settings.AGENT_VOLUMES_CLEANUP_INTERVAL_S, - ) - - await task_monitor.start() - logger.info("Started 🔍 task_monitor") - - async def _on_shutdown() -> None: - task_monitor: TaskMonitor = app.state.task_monitor - await task_monitor.shutdown() - logger.info("Stopped 🔍 task_monitor") - - app.add_event_handler("startup", _on_startup) - app.add_event_handler("shutdown", _on_shutdown) - - -__all__: tuple[str, ...] = ( - "setup", - "TaskMonitor", -) diff --git a/services/agent/src/simcore_service_agent/modules/volumes_cleanup/__init__.py b/services/agent/src/simcore_service_agent/modules/volumes_cleanup/__init__.py deleted file mode 100644 index 00002f118b7d..000000000000 --- a/services/agent/src/simcore_service_agent/modules/volumes_cleanup/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from ._core import backup_and_remove_volumes - -__all__: tuple[str, ...] = ("backup_and_remove_volumes",) diff --git a/services/agent/src/simcore_service_agent/modules/volumes_cleanup/_core.py b/services/agent/src/simcore_service_agent/modules/volumes_cleanup/_core.py deleted file mode 100644 index 6a4e63e3ce93..000000000000 --- a/services/agent/src/simcore_service_agent/modules/volumes_cleanup/_core.py +++ /dev/null @@ -1,61 +0,0 @@ -import logging - -from ...core.settings import ApplicationSettings -from ._docker import delete_volume, docker_client, get_dyv_volumes, is_volume_used -from ._s3 import store_to_s3 - -logger = logging.getLogger(__name__) - - -async def backup_and_remove_volumes(settings: ApplicationSettings) -> None: - async with docker_client() as client: - dyv_volumes: list[dict] = await get_dyv_volumes( - client, settings.AGENT_VOLUMES_CLEANUP_TARGET_SWARM_STACK_NAME - ) - - if len(dyv_volumes) == 0: - return - - cleaned_up_volumes_count = 0 - logger.info("Beginning cleanup.") - for dyv_volume in dyv_volumes: - volume_name = dyv_volume["Name"] - - if await is_volume_used(client, volume_name): - logger.debug("Skipped in 
use docker volume: '%s'", volume_name) - continue - - try: - await store_to_s3( - volume_name=volume_name, - dyv_volume=dyv_volume, - s3_endpoint=settings.AGENT_VOLUMES_CLEANUP_S3_ENDPOINT, - s3_access_key=settings.AGENT_VOLUMES_CLEANUP_S3_ACCESS_KEY, - s3_secret_key=settings.AGENT_VOLUMES_CLEANUP_S3_SECRET_KEY, - s3_bucket=settings.AGENT_VOLUMES_CLEANUP_S3_BUCKET, - s3_region=settings.AGENT_VOLUMES_CLEANUP_S3_REGION, - s3_provider=settings.AGENT_VOLUMES_CLEANUP_S3_PROVIDER, - s3_retries=settings.AGENT_VOLUMES_CLEANUP_RETRIES, - s3_parallelism=settings.AGENT_VOLUMES_CLEANUP_PARALLELISM, - exclude_files=settings.AGENT_VOLUMES_CLEANUP_EXCLUDE_FILES, - ) - except Exception as e: # pylint:disable=broad-except - logger.error("%s", e) - continue - - logger.info("Successfully cleaned up docker volume: '%s'", volume_name) - - await delete_volume(client, volume_name) - logger.info("Removed docker volume: '%s'", volume_name) - cleaned_up_volumes_count += 1 - - if cleaned_up_volumes_count > 0: - logger.info( - ( - "The dy-sidecar volume cleanup detected %s " - "zombie volumes on the current machine." 
- ), - cleaned_up_volumes_count, - ) - else: - logger.info("Found no zombie dy-sidecar volumes to cleanup.") diff --git a/services/agent/src/simcore_service_agent/modules/volumes_cleanup/_docker.py b/services/agent/src/simcore_service_agent/modules/volumes_cleanup/_docker.py deleted file mode 100644 index 26d1475fdc19..000000000000 --- a/services/agent/src/simcore_service_agent/modules/volumes_cleanup/_docker.py +++ /dev/null @@ -1,37 +0,0 @@ -from collections import deque -from contextlib import asynccontextmanager -from typing import Any, AsyncIterator - -from aiodocker import Docker -from aiodocker.utils import clean_filters -from aiodocker.volumes import DockerVolume -from servicelib.docker_constants import PREFIX_DYNAMIC_SIDECAR_VOLUMES - - -@asynccontextmanager -async def docker_client() -> AsyncIterator[Docker]: - async with Docker() as docker: - yield docker - - -async def get_dyv_volumes(docker: Docker, target_swarm_stack_name: str) -> list[dict]: - dyv_volumes: deque[dict] = deque() - volumes = await docker.volumes.list() - for volume in volumes["Volumes"]: - volume_labels: dict[str, Any] = volume.get("Labels") or {} - if ( - volume["Name"].startswith(f"{PREFIX_DYNAMIC_SIDECAR_VOLUMES}_") - and volume_labels.get("swarm_stack_name") == target_swarm_stack_name - ): - dyv_volumes.append(volume) - return list(dyv_volumes) - - -async def delete_volume(docker: Docker, volume_name: str) -> None: - await DockerVolume(docker, volume_name).delete() - - -async def is_volume_used(docker: Docker, volume_name: str) -> bool: - filters = clean_filters({"volume": volume_name}) - containers = await docker.containers.list(all=True, filters=filters) - return len(containers) > 0 diff --git a/services/agent/src/simcore_service_agent/services/__init__.py b/services/agent/src/simcore_service_agent/services/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/services/agent/src/simcore_service_agent/modules/volumes_cleanup/_s3.py 
b/services/agent/src/simcore_service_agent/services/backup.py similarity index 56% rename from services/agent/src/simcore_service_agent/modules/volumes_cleanup/_s3.py rename to services/agent/src/simcore_service_agent/services/backup.py index 0494f614a1d7..fb6de148eef3 100644 --- a/services/agent/src/simcore_service_agent/modules/volumes_cleanup/_s3.py +++ b/services/agent/src/simcore_service_agent/services/backup.py @@ -1,17 +1,24 @@ import asyncio import logging +import tempfile from asyncio.streams import StreamReader from pathlib import Path from textwrap import dedent from typing import Final +from uuid import uuid4 -from pydantic import AnyHttpUrl -from settings_library.r_clone import S3Provider +from fastapi import FastAPI from settings_library.utils_r_clone import resolve_provider -logger = logging.getLogger(__name__) +from ..core.settings import ApplicationSettings +from ..models.volumes import DynamicServiceVolumeLabels, VolumeDetails -R_CLONE_CONFIG = """ +_logger = logging.getLogger(__name__) + + +_R_CLONE_CONFIG: Final[ + str +] = """ [dst] type = s3 provider = {destination_provider} @@ -21,46 +28,32 @@ region = {destination_region} acl = private """ -VOLUME_NAME_FIXED_PORTION: Final[int] = 78 - - -def get_config_file_path( - s3_endpoint: AnyHttpUrl | None, - s3_access_key: str, - s3_secret_key: str, - s3_region: str, - s3_provider: S3Provider, -) -> Path: - config_content = R_CLONE_CONFIG.format( - destination_provider=resolve_provider(s3_provider), - destination_access_key=s3_access_key, - destination_secret_key=s3_secret_key, - destination_endpoint=s3_endpoint, - destination_region=s3_region, - ) - conf_path = Path("/tmp/rclone_config.ini") # NOSONAR - conf_path.write_text(config_content) # pylint:disable=unspecified-encoding - return conf_path -def _get_dir_name(volume_name: str) -> str: - # from: "dyv_a0430d06-40d2-4c92-9490-6aca30e00fc7_898fff63-d402-5566-a99b-091522dd2ae9_stuptuo_krow_nayvoj_emoh_" - # gets: "home_jovyan_work_outputs" - return 
volume_name[VOLUME_NAME_FIXED_PORTION:][::-1].strip("_") +def _get_config_file_path(settings: ApplicationSettings) -> Path: + config_content = _R_CLONE_CONFIG.format( + destination_provider=resolve_provider( + settings.AGENT_VOLUMES_CLEANUP_S3_PROVIDER + ), + destination_access_key=settings.AGENT_VOLUMES_CLEANUP_S3_ACCESS_KEY, + destination_secret_key=settings.AGENT_VOLUMES_CLEANUP_S3_SECRET_KEY, + destination_endpoint=settings.AGENT_VOLUMES_CLEANUP_S3_ENDPOINT, + destination_region=settings.AGENT_VOLUMES_CLEANUP_S3_REGION, + ) + conf_path = Path(tempfile.gettempdir()) / f"rclone_config_{uuid4()}.ini" + conf_path.write_text(config_content) + return conf_path -def _get_s3_path(s3_bucket: str, labels: dict[str, str], volume_name: str) -> Path: - joint_key = "/".join( - ( - s3_bucket, - labels["swarm_stack_name"], - labels["study_id"], - labels["node_uuid"], - labels["run_id"], - _get_dir_name(volume_name), - ) +def _get_s3_path(s3_bucket: str, labels: DynamicServiceVolumeLabels) -> Path: + return ( + Path(s3_bucket) + / labels.swarm_stack_name + / f"{labels.study_id}" + / f"{labels.node_uuid}" + / labels.run_id + / labels.directory_name ) - return Path(f"/{joint_key}") async def _read_stream(stream: StreamReader) -> str: @@ -68,7 +61,7 @@ async def _read_stream(stream: StreamReader) -> str: while line := await stream.readline(): message = line.decode() output += message - logger.debug(message.strip("\n")) + _logger.debug(message.strip("\n")) return output @@ -79,12 +72,12 @@ def _get_r_clone_str_command(command: list[str], exclude_files: list[str]) -> st command.append(to_exclude) str_command = " ".join(command) - logger.info(str_command) + _logger.info(str_command) return str_command def _log_expected_operation( - dyv_volume_labels: dict[str, str], + labels: DynamicServiceVolumeLabels, s3_path: Path, r_clone_ls_output: str, volume_name: str, @@ -101,50 +94,38 @@ def _log_expected_operation( --- volume_name {volume_name} destination_path {s3_path} - study_id: 
{dyv_volume_labels['study_id']} - node_id: {dyv_volume_labels['node_uuid']} - user_id: {dyv_volume_labels['user_id']} - run_id: {dyv_volume_labels['run_id']} + study_id: {labels.study_id} + node_id: {labels.node_uuid} + user_id: {labels.user_id} + run_id: {labels.run_id} --- Files to sync by rclone ---\n{r_clone_ls_output.rstrip()} --- """ ) - logger.log(log_level, formatted_message) + _logger.log(log_level, formatted_message) -async def store_to_s3( # pylint:disable=too-many-locals,too-many-arguments - volume_name: str, - dyv_volume: dict, - s3_endpoint: AnyHttpUrl | None, - s3_access_key: str, - s3_secret_key: str, - s3_bucket: str, - s3_region: str, - s3_provider: S3Provider, - s3_retries: int, - s3_parallelism: int, - exclude_files: list[str], +async def _store_in_s3( + settings: ApplicationSettings, volume_name: str, volume_details: VolumeDetails ) -> None: - config_file_path = get_config_file_path( - s3_endpoint=s3_endpoint, - s3_access_key=s3_access_key, - s3_secret_key=s3_secret_key, - s3_region=s3_region, - s3_provider=s3_provider, - ) + exclude_files = settings.AGENT_VOLUMES_CLEANUP_EXCLUDE_FILES - source_dir = dyv_volume["Mountpoint"] + config_file_path = _get_config_file_path(settings) + + source_dir = volume_details.mountpoint if not Path(source_dir).exists(): - logger.info( + _logger.info( "Volume mountpoint %s does not exist. 
Skipping backup, volume %s will be removed.", source_dir, volume_name, ) return - s3_path = _get_s3_path(s3_bucket, dyv_volume["Labels"], volume_name) + s3_path = _get_s3_path( + settings.AGENT_VOLUMES_CLEANUP_S3_BUCKET, volume_details.labels + ) # listing files rclone will sync r_clone_ls = [ @@ -159,11 +140,12 @@ async def store_to_s3( # pylint:disable=too-many-locals,too-many-arguments stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.STDOUT, ) + assert process.stdout # nosec r_clone_ls_output = await _read_stream(process.stdout) await process.wait() _log_expected_operation( - dyv_volume["Labels"], s3_path, r_clone_ls_output, volume_name + volume_details.labels, s3_path, r_clone_ls_output, volume_name ) # sync files via rclone @@ -174,9 +156,9 @@ async def store_to_s3( # pylint:disable=too-many-locals,too-many-arguments "--low-level-retries", "3", "--retries", - f"{s3_retries}", + f"{settings.AGENT_VOLUMES_CLEANUP_RETRIES}", "--transfers", - f"{s3_parallelism}", + f"{settings.AGENT_VOLUMES_CLEANUP_PARALLELISM}", # below two options reduce to a minimum the memory footprint # https://forum.rclone.org/t/how-to-set-a-memory-limit/10230/4 "--use-mmap", # docs https://rclone.org/docs/#use-mmap @@ -197,13 +179,24 @@ async def store_to_s3( # pylint:disable=too-many-locals,too-many-arguments stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.STDOUT, ) + assert process.stdout # nosec r_clone_sync_output = await _read_stream(process.stdout) await process.wait() - logger.info("Sync result:\n%s", r_clone_sync_output) + _logger.info("Sync result:\n%s", r_clone_sync_output) if process.returncode != 0: - raise RuntimeError( + msg = ( f"Shell subprocesses yielded nonzero error code {process.returncode} " f"for command {str_r_clone_sync}\n{r_clone_sync_output}" ) + raise RuntimeError(msg) + + +async def backup_volume( + app: FastAPI, volume_details: VolumeDetails, volume_name: str +) -> None: + settings: ApplicationSettings = app.state.settings + await 
_store_in_s3( + settings=settings, volume_name=volume_name, volume_details=volume_details + ) diff --git a/services/agent/src/simcore_service_agent/services/docker_utils.py b/services/agent/src/simcore_service_agent/services/docker_utils.py new file mode 100644 index 000000000000..181fe13a2759 --- /dev/null +++ b/services/agent/src/simcore_service_agent/services/docker_utils.py @@ -0,0 +1,108 @@ +import logging +from collections.abc import Iterator +from contextlib import contextmanager +from typing import Final + +from aiodocker import DockerError +from aiodocker.docker import Docker +from aiodocker.volumes import DockerVolume +from fastapi import FastAPI +from models_library.api_schemas_directorv2.services import ( + CHARS_IN_VOLUME_NAME_BEFORE_DIR_NAME, +) +from servicelib.docker_constants import PREFIX_DYNAMIC_SIDECAR_VOLUMES +from servicelib.logging_utils import log_catch, log_context +from simcore_service_agent.core.settings import ApplicationSettings +from starlette import status + +from ..models.volumes import VolumeDetails +from .backup import backup_volume +from .instrumentation import get_instrumentation + +_logger = logging.getLogger(__name__) + + +def _reverse_string(to_reverse: str) -> str: + return to_reverse[::-1] + + +_VOLUMES_NOT_TO_BACKUP: Final[tuple[str, ...]] = ( + _reverse_string("inputs"), + _reverse_string("shared-store"), +) + + +def _does_volume_require_backup(volume_name: str) -> bool: + # from `dyv_1726228407_891aa1a7-eb31-459f-8aed-8c902f5f5fb0_dd84f39e-7154-4a13-ba1d-50068d723104_stupni_www_` + # returns `stupni_www_` + inverse_name_part = volume_name[CHARS_IN_VOLUME_NAME_BEFORE_DIR_NAME:] + return not inverse_name_part.startswith(_VOLUMES_NOT_TO_BACKUP) + + +async def get_unused_dynamc_sidecar_volumes(docker: Docker) -> set[str]: + """Returns all volumes unused by sidecars""" + volumes = await docker.volumes.list() + all_volumes: set[str] = {volume["Name"] for volume in volumes["Volumes"]} + + containers = await
docker.containers.list(all=True) + + used_volumes: set[str] = set() + for container in containers: + container_info = await container.show() + mounts = container_info.get("Mounts", []) + for mount in mounts: + if mount["Type"] == "volume": + used_volumes.add(mount["Name"]) + + unused_volumes = all_volumes - used_volumes + return {v for v in unused_volumes if v.startswith(PREFIX_DYNAMIC_SIDECAR_VOLUMES)} + + +async def get_volume_details(docker: Docker, *, volume_name: str) -> VolumeDetails: + volume_details = await DockerVolume(docker, volume_name).show() + return VolumeDetails.parse_obj(volume_details) + + +@contextmanager +def _log_volume_not_found(volume_name: str) -> Iterator[None]: + try: + yield + except DockerError as e: + if e.status == status.HTTP_404_NOT_FOUND: + _logger.info("Volume not found '%s'", volume_name) + else: + raise + + +async def _backup_volume(app: FastAPI, docker: Docker, *, volume_name: str) -> None: + """Backs up only volumes which require a backup""" + if _does_volume_require_backup(volume_name): + with log_context( + _logger, logging.INFO, f"backup '{volume_name}'", log_duration=True + ): + volume_details = await get_volume_details(docker, volume_name=volume_name) + settings: ApplicationSettings = app.state.settings + get_instrumentation(app).agent_metrics.backedup_volumes( + settings.AGENT_DOCKER_NODE_ID + ) + await backup_volume(app, volume_details, volume_name) + else: + _logger.debug("No backup is required for '%s'", volume_name) + + +async def remove_volume( + app: FastAPI, docker: Docker, *, volume_name: str, requires_backup: bool +) -> None: + """Removes a volume and backs data up if required""" + with log_context( + _logger, logging.DEBUG, f"removing '{volume_name}'", log_duration=True + ), log_catch(_logger, reraise=False), _log_volume_not_found(volume_name): + if requires_backup: + await _backup_volume(app, docker, volume_name=volume_name) + + await DockerVolume(docker, volume_name).delete() + + settings: ApplicationSettings 
= app.state.settings + get_instrumentation(app).agent_metrics.remove_volumes( + settings.AGENT_DOCKER_NODE_ID + ) diff --git a/services/agent/src/simcore_service_agent/services/instrumentation/__init__.py b/services/agent/src/simcore_service_agent/services/instrumentation/__init__.py new file mode 100644 index 000000000000..49d7b66b0793 --- /dev/null +++ b/services/agent/src/simcore_service_agent/services/instrumentation/__init__.py @@ -0,0 +1,6 @@ +from ._setup import get_instrumentation, setup_instrumentation + +__all__: tuple[str, ...] = ( + "get_instrumentation", + "setup_instrumentation", +) diff --git a/services/agent/src/simcore_service_agent/services/instrumentation/_models.py b/services/agent/src/simcore_service_agent/services/instrumentation/_models.py new file mode 100644 index 000000000000..bf5543745954 --- /dev/null +++ b/services/agent/src/simcore_service_agent/services/instrumentation/_models.py @@ -0,0 +1,52 @@ +from dataclasses import dataclass, field +from typing import Final + +from prometheus_client import CollectorRegistry, Counter +from servicelib.instrumentation import MetricsBase, get_metrics_namespace + +from ..._meta import APP_NAME + +_METRICS_NAMESPACE: Final[str] = get_metrics_namespace(APP_NAME) +_LABELS_COUNTERS: Final[tuple[str, ...]] = ("docker_node_id",) + + +@dataclass(slots=True, kw_only=True) +class AgentMetrics(MetricsBase): + volumes_removed: Counter = field(init=False) + volumes_backedup: Counter = field(init=False) + + def __post_init__(self) -> None: + self.volumes_removed = Counter( + "volumes_removed_total", + "Number of removed volumes by the agent", + labelnames=_LABELS_COUNTERS, + namespace=_METRICS_NAMESPACE, + subsystem=self.subsystem, + registry=self.registry, + ) + + self.volumes_backedup = Counter( + "volumes_backedup_total", + "Number of removed volumes who's content was uplaoded by the agent", + labelnames=_LABELS_COUNTERS, + namespace=_METRICS_NAMESPACE, + subsystem=self.subsystem, + registry=self.registry, + ) 
+ + def remove_volumes(self, docker_node_id: str) -> None: + self.volumes_removed.labels(docker_node_id=docker_node_id).inc() + + def backedup_volumes(self, docker_node_id: str) -> None: + self.volumes_backedup.labels(docker_node_id=docker_node_id).inc() + + +@dataclass(slots=True, kw_only=True) +class AgentInstrumentation: + registry: CollectorRegistry + agent_metrics: AgentMetrics = field(init=False) + + def __post_init__(self) -> None: + self.agent_metrics = AgentMetrics( # pylint: disable=unexpected-keyword-arg + subsystem="agent", registry=self.registry + ) diff --git a/services/agent/src/simcore_service_agent/services/instrumentation/_setup.py b/services/agent/src/simcore_service_agent/services/instrumentation/_setup.py new file mode 100644 index 000000000000..ad4e2f3cf2eb --- /dev/null +++ b/services/agent/src/simcore_service_agent/services/instrumentation/_setup.py @@ -0,0 +1,30 @@ +from fastapi import FastAPI +from servicelib.fastapi.prometheus_instrumentation import ( + setup_prometheus_instrumentation, +) +from simcore_service_agent.core.settings import ApplicationSettings + +from ._models import AgentInstrumentation + + +def setup_instrumentation(app: FastAPI) -> None: + settings: ApplicationSettings = app.state.settings + if not settings.AGENT_PROMETHEUS_INSTRUMENTATION_ENABLED: + return + + instrumentator = setup_prometheus_instrumentation(app) + + async def on_startup() -> None: + app.state.instrumentation = AgentInstrumentation( + registry=instrumentator.registry + ) + + app.add_event_handler("startup", on_startup) + + +def get_instrumentation(app: FastAPI) -> AgentInstrumentation: + assert ( + app.state.instrumentation + ), "Instrumentation not setup. 
Please check the configuration" # nosec + instrumentation: AgentInstrumentation = app.state.instrumentation + return instrumentation diff --git a/services/agent/src/simcore_service_agent/services/rabbitmq.py b/services/agent/src/simcore_service_agent/services/rabbitmq.py new file mode 100644 index 000000000000..3c548fb0b241 --- /dev/null +++ b/services/agent/src/simcore_service_agent/services/rabbitmq.py @@ -0,0 +1,29 @@ +from typing import cast + +from fastapi import FastAPI +from servicelib.rabbitmq import RabbitMQRPCClient, wait_till_rabbitmq_responsive +from settings_library.rabbit import RabbitSettings + + +def setup_rabbitmq(app: FastAPI) -> None: + settings: RabbitSettings = app.state.settings.AGENT_RABBITMQ + app.state.rabbitmq_rpc_server = None + + async def _on_startup() -> None: + await wait_till_rabbitmq_responsive(settings.dsn) + + app.state.rabbitmq_rpc_server = await RabbitMQRPCClient.create( + client_name="dynamic_scheduler_rpc_server", settings=settings + ) + + async def _on_shutdown() -> None: + if app.state.rabbitmq_rpc_server: + await app.state.rabbitmq_rpc_server.close() + + app.add_event_handler("startup", _on_startup) + app.add_event_handler("shutdown", _on_shutdown) + + +def get_rabbitmq_rpc_server(app: FastAPI) -> RabbitMQRPCClient: + assert app.state.rabbitmq_rpc_server # nosec + return cast(RabbitMQRPCClient, app.state.rabbitmq_rpc_server) diff --git a/services/agent/src/simcore_service_agent/services/volumes_manager.py b/services/agent/src/simcore_service_agent/services/volumes_manager.py new file mode 100644 index 000000000000..526589a2c9cd --- /dev/null +++ b/services/agent/src/simcore_service_agent/services/volumes_manager.py @@ -0,0 +1,188 @@ +import logging +from asyncio import Lock, Task +from dataclasses import dataclass, field +from datetime import datetime, timedelta +from typing import Final + +import arrow +from aiodocker.docker import Docker +from fastapi import FastAPI +from models_library.projects_nodes_io import NodeID 
+from pydantic import NonNegativeFloat +from servicelib.background_task import start_periodic_task, stop_periodic_task +from servicelib.fastapi.app_state import SingletonInAppStateMixin +from servicelib.logging_utils import log_context +from servicelib.rabbitmq.rpc_interfaces.agent.errors import ( + NoServiceVolumesFoundRPCError, +) +from tenacity import AsyncRetrying, before_sleep_log, stop_after_delay, wait_fixed + +from ..core.settings import ApplicationSettings +from .docker_utils import get_unused_dynamc_sidecar_volumes, remove_volume + +_logger = logging.getLogger(__name__) + +_WAIT_FOR_UNUSED_SERVICE_VOLUMES: Final[timedelta] = timedelta(minutes=1) + + +@dataclass +class VolumesManager( # pylint:disable=too-many-instance-attributes + SingletonInAppStateMixin +): + app: FastAPI + book_keeping_interval: timedelta + volume_cleanup_interval: timedelta + remove_volumes_inactive_for: NonNegativeFloat + + docker: Docker = field(default_factory=Docker) + removal_lock: Lock = field(default_factory=Lock) + + _task_bookkeeping: Task | None = None + _unused_volumes: dict[str, datetime] = field(default_factory=dict) + + _task_periodic_volume_cleanup: Task | None = None + + app_state_name: str = "volumes_manager" + + async def setup(self) -> None: + self._task_bookkeeping = start_periodic_task( + self._bookkeeping_task, + interval=self.book_keeping_interval, + task_name="volumes bookkeeping", + ) + self._task_periodic_volume_cleanup = start_periodic_task( + self._bookkeeping_task, + interval=self.volume_cleanup_interval, + task_name="volume cleanup", + ) + + async def shutdown(self) -> None: + await self.docker.close() + + if self._task_bookkeeping: + await stop_periodic_task(self._task_bookkeeping) + + if self._task_periodic_volume_cleanup: + await stop_periodic_task(self._task_periodic_volume_cleanup) + + async def _bookkeeping_task(self) -> None: + with log_context(_logger, logging.DEBUG, "volume bookkeeping"): + current_unused_volumes = await 
get_unused_dynamc_sidecar_volumes( + self.docker + ) + old_unused_volumes = set(self._unused_volumes.keys()) + + # remove + to_remove = old_unused_volumes - current_unused_volumes + for volume in to_remove: + self._unused_volumes.pop(volume, None) + + # volumes which have just been detected as inactive + to_add = current_unused_volumes - old_unused_volumes + for volume in to_add: + self._unused_volumes[volume] = arrow.utcnow().datetime + + async def _remove_volume_safe( + self, *, volume_name: str, requires_backup: bool + ) -> None: + # NOTE: to avoid race conditions only one volume can be removed + # also avoids issues with accessing the docker API in parallel + async with self.removal_lock: + await remove_volume( + self.app, + self.docker, + volume_name=volume_name, + requires_backup=requires_backup, + ) + + async def _periodic_volmue_cleanup_task(self) -> None: + with log_context(_logger, logging.DEBUG, "volume cleanup"): + volumes_to_remove: set[str] = set() + for volume_name, inactive_since in self._unused_volumes.items(): + volume_inactive_sicne = ( + arrow.utcnow().datetime - inactive_since + ).total_seconds() + if volume_inactive_sicne > self.remove_volumes_inactive_for: + volumes_to_remove.add(volume_name) + + for volume in volumes_to_remove: + await self._remove_volume_safe(volume_name=volume, requires_backup=True) + + async def _wait_for_service_volumes_to_become_unused( + self, node_id: NodeID + ) -> set[str]: + # NOTE: it usually takes a few seconds for volumes to become unused, + # if agent does not wait for this operation to finish, + # volumes will be removed and backed up by the background task + # causing unncecessary data transfer to S3 + async for attempt in AsyncRetrying( + reraise=True, + stop=stop_after_delay(_WAIT_FOR_UNUSED_SERVICE_VOLUMES.total_seconds()), + wait=wait_fixed(1), + before_sleep=before_sleep_log(_logger, logging.DEBUG), + ): + with attempt: + current_unused_volumes = await get_unused_dynamc_sidecar_volumes( + self.docker + ) 
+ + service_volumes = { + v for v in current_unused_volumes if f"{node_id}" in v + } + _logger.debug( + "service %s found volumes to remove: %s", node_id, service_volumes + ) + if len(service_volumes) == 0: + raise NoServiceVolumesFoundRPCError( + period=_WAIT_FOR_UNUSED_SERVICE_VOLUMES.total_seconds(), + node_id=node_id, + ) + + return service_volumes + + async def remove_service_volumes(self, node_id: NodeID) -> None: + # bookkept volumes might not be up to date + service_volumes = await self._wait_for_service_volumes_to_become_unused(node_id) + _logger.debug( + "will remove volumes for %s from service_volumes=%s", + node_id, + service_volumes, + ) + + for volume_name in service_volumes: + # volumes already saved to S3 by the sidecar and no longer require backup + await self._remove_volume_safe( + volume_name=volume_name, requires_backup=False + ) + + async def remove_all_volumes(self) -> None: + # bookkept volumes might not be up to date + current_unused_volumes = await get_unused_dynamc_sidecar_volumes(self.docker) + + with log_context(_logger, logging.INFO, "remove all volumes"): + for volume in current_unused_volumes: + await self._remove_volume_safe(volume_name=volume, requires_backup=True) + + +def get_volumes_manager(app: FastAPI) -> VolumesManager: + return VolumesManager.get_from_app_state(app) + + +def setup_volume_manager(app: FastAPI) -> None: + async def _on_startup() -> None: + settings: ApplicationSettings = app.state.settings + + volumes_manager = VolumesManager( + app=app, + book_keeping_interval=settings.AGENT_VOLUMES_CLENUP_BOOK_KEEPING_INTERVAL, + volume_cleanup_interval=settings.AGENT_VOLUMES_CLEANUP_INTERVAL, + remove_volumes_inactive_for=settings.AGENT_VOLUMES_CLENUP_REMOVE_VOLUMES_INACTIVE_FOR.total_seconds(), + ) + volumes_manager.set_to_app_state(app) + await volumes_manager.setup() + + async def _on_shutdown() -> None: + await VolumesManager.get_from_app_state(app).shutdown() + + app.add_event_handler("startup", _on_startup) + 
app.add_event_handler("shutdown", _on_shutdown) diff --git a/services/agent/tests/conftest.py b/services/agent/tests/conftest.py index bd0d1a8964f1..4632ca841023 100644 --- a/services/agent/tests/conftest.py +++ b/services/agent/tests/conftest.py @@ -1,189 +1,65 @@ # pylint: disable=redefined-outer-name # pylint: disable=unused-argument -import contextlib -import logging -from pathlib import Path -from typing import AsyncIterator, Iterable -from uuid import uuid4 -import aiodocker import pytest -import simcore_service_agent -from aiodocker.volumes import DockerVolume +from faker import Faker from models_library.basic_types import BootModeEnum -from models_library.services import RunID from moto.server import ThreadedMotoServer from pydantic import HttpUrl, parse_obj_as +from pytest_simcore.helpers.monkeypatch_envs import EnvVarsDict, setenvs_from_dict from settings_library.r_clone import S3Provider -from simcore_service_agent.core.settings import ApplicationSettings pytest_plugins = [ "pytest_simcore.aws_server", + "pytest_simcore.docker_compose", + "pytest_simcore.docker_swarm", + "pytest_simcore.rabbit_service", "pytest_simcore.repository_paths", ] -@pytest.fixture(scope="session") -def project_slug_dir(osparc_simcore_root_dir: Path) -> Path: - # fixtures in pytest_simcore.environs - service_folder = osparc_simcore_root_dir / "services" / "agent" - assert service_folder.exists() - assert any(service_folder.glob("src/simcore_service_agent")) - return service_folder - - -@pytest.fixture(scope="session") -def installed_package_dir() -> Path: - dirpath = Path(simcore_service_agent.__file__).resolve().parent - assert dirpath.exists() - return dirpath - - @pytest.fixture def swarm_stack_name() -> str: return "test-simcore" @pytest.fixture -def study_id() -> str: - return f"{uuid4()}" - - -@pytest.fixture -def node_uuid() -> str: - return f"{uuid4()}" - - -@pytest.fixture -def run_id() -> RunID: - return RunID.create() - - -@pytest.fixture -def bucket() -> str: - return 
f"test-bucket-{uuid4()}" - - -@pytest.fixture -def used_volume_path(tmp_path: Path) -> Path: - return tmp_path / "used_volume" - - -@pytest.fixture -def unused_volume_path(tmp_path: Path) -> Path: - return tmp_path / "unused_volume" - - -def _get_source(run_id: str, node_uuid: str, volume_path: Path) -> str: - reversed_path = f"{volume_path}"[::-1].replace("/", "_") - return f"dyv_{run_id}_{node_uuid}_{reversed_path}" +def docker_node_id() -> str: + return "test-node-id" @pytest.fixture -async def unused_volume( - swarm_stack_name: str, - study_id: str, - node_uuid: str, - run_id: RunID, - unused_volume_path: Path, -) -> AsyncIterator[DockerVolume]: - async with aiodocker.Docker() as docker_client: - source = _get_source(run_id, node_uuid, unused_volume_path) - volume = await docker_client.volumes.create( - { - "Name": source, - "Labels": { - "node_uuid": node_uuid, - "run_id": run_id, - "source": source, - "study_id": study_id, - "swarm_stack_name": swarm_stack_name, - "user_id": "1", - }, - } - ) - - # attach to volume and create some files!!! 
- - yield volume - - with contextlib.suppress(aiodocker.DockerError): - await volume.delete() +def bucket(faker: Faker) -> str: + return f"test-bucket-{faker.uuid4()}" @pytest.fixture -async def used_volume( - swarm_stack_name: str, - study_id: str, - node_uuid: str, - run_id: RunID, - used_volume_path: Path, -) -> AsyncIterator[DockerVolume]: - async with aiodocker.Docker() as docker_client: - source = _get_source(run_id, node_uuid, used_volume_path) - volume = await docker_client.volumes.create( - { - "Name": source, - "Labels": { - "node_uuid": node_uuid, - "run_id": run_id, - "source": source, - "study_id": study_id, - "swarm_stack_name": swarm_stack_name, - "user_id": "1", - }, - } - ) - - container = await docker_client.containers.run( - config={ - "Cmd": ["/bin/ash", "-c", "sleep 10000"], - "Image": "alpine:latest", - "HostConfig": {"Binds": [f"{volume.name}:{used_volume_path}"]}, - }, - name=f"using_volume_{volume.name}", - ) - await container.start() - - yield volume - - await container.delete(force=True) - await volume.delete() - - -@pytest.fixture -def env( # noqa: PT004 +def mock_environment( monkeypatch: pytest.MonkeyPatch, mocked_s3_server_url: HttpUrl, bucket: str, swarm_stack_name: str, -) -> None: - mock_dict = { - "LOGLEVEL": "DEBUG", - "SC_BOOT_MODE": BootModeEnum.DEBUG, - "AGENT_VOLUMES_CLEANUP_TARGET_SWARM_STACK_NAME": swarm_stack_name, - "AGENT_VOLUMES_CLEANUP_S3_ENDPOINT": mocked_s3_server_url, - "AGENT_VOLUMES_CLEANUP_S3_ACCESS_KEY": "xxx", - "AGENT_VOLUMES_CLEANUP_S3_SECRET_KEY": "xxx", - "AGENT_VOLUMES_CLEANUP_S3_BUCKET": bucket, - "AGENT_VOLUMES_CLEANUP_S3_PROVIDER": S3Provider.MINIO, - } - for key, value in mock_dict.items(): - monkeypatch.setenv(key, value) - - -@pytest.fixture -def settings(env: None) -> ApplicationSettings: - return ApplicationSettings.create_from_envs() - - -@pytest.fixture() -def caplog_info_debug( - caplog: pytest.LogCaptureFixture, -) -> Iterable[pytest.LogCaptureFixture]: - with caplog.at_level(logging.DEBUG): - 
yield caplog + docker_node_id: str, +) -> EnvVarsDict: + return setenvs_from_dict( + monkeypatch, + { + "LOGLEVEL": "DEBUG", + "SC_BOOT_MODE": BootModeEnum.DEBUG, + "AGENT_VOLUMES_CLEANUP_TARGET_SWARM_STACK_NAME": swarm_stack_name, + "AGENT_VOLUMES_CLEANUP_S3_ENDPOINT": mocked_s3_server_url, + "AGENT_VOLUMES_CLEANUP_S3_ACCESS_KEY": "xxx", + "AGENT_VOLUMES_CLEANUP_S3_SECRET_KEY": "xxx", + "AGENT_VOLUMES_CLEANUP_S3_BUCKET": bucket, + "AGENT_VOLUMES_CLEANUP_S3_PROVIDER": S3Provider.MINIO, + "RABBIT_HOST": "test", + "RABBIT_PASSWORD": "test", + "RABBIT_SECURE": "false", + "RABBIT_USER": "test", + "AGENT_DOCKER_NODE_ID": docker_node_id, + }, + ) @pytest.fixture(scope="module") diff --git a/services/agent/tests/unit/conftest.py b/services/agent/tests/unit/conftest.py new file mode 100644 index 000000000000..1a49ce6ba576 --- /dev/null +++ b/services/agent/tests/unit/conftest.py @@ -0,0 +1,144 @@ +# pylint: disable=redefined-outer-name +# pylint: disable=unused-argument + +from collections.abc import AsyncIterable, AsyncIterator, Awaitable, Callable +from contextlib import suppress +from pathlib import Path +from uuid import uuid4 + +import aiodocker +import pytest +from aiodocker.containers import DockerContainer +from aiodocker.volumes import DockerVolume +from asgi_lifespan import LifespanManager +from fastapi import FastAPI +from fastapi.testclient import TestClient +from models_library.projects import ProjectID +from models_library.projects_nodes_io import NodeID +from models_library.services_types import RunID +from models_library.users import UserID +from pytest_simcore.helpers.monkeypatch_envs import EnvVarsDict, setenvs_from_dict +from settings_library.rabbit import RabbitSettings +from simcore_service_agent.core.application import create_app +from utils import VOLUMES_TO_CREATE, get_source + + +@pytest.fixture +def service_env( + monkeypatch: pytest.MonkeyPatch, + mock_environment: EnvVarsDict, + rabbit_service: RabbitSettings, +) -> EnvVarsDict: + return 
setenvs_from_dict( + monkeypatch, + { + **mock_environment, + "RABBIT_HOST": rabbit_service.RABBIT_HOST, + "RABBIT_PASSWORD": rabbit_service.RABBIT_PASSWORD.get_secret_value(), + "RABBIT_PORT": f"{rabbit_service.RABBIT_PORT}", + "RABBIT_SECURE": f"{rabbit_service.RABBIT_SECURE}", + "RABBIT_USER": rabbit_service.RABBIT_USER, + }, + ) + + +@pytest.fixture +async def initialized_app(service_env: EnvVarsDict) -> AsyncIterator[FastAPI]: + app: FastAPI = create_app() + + async with LifespanManager(app): + yield app + + +@pytest.fixture +def test_client(initialized_app: FastAPI) -> TestClient: + return TestClient(initialized_app) + + +@pytest.fixture +def run_id() -> RunID: + return RunID.create() + + +@pytest.fixture +def project_id() -> ProjectID: + return uuid4() + + +@pytest.fixture +def user_id() -> UserID: + return 1 + + +@pytest.fixture +def volumes_path(tmp_path: Path) -> Path: + return tmp_path / "volumes" + + +@pytest.fixture +async def create_dynamic_sidecar_volume( + run_id: RunID, + project_id: ProjectID, + swarm_stack_name: str, + user_id: UserID, + volumes_path: Path, +) -> AsyncIterable[Callable[[NodeID, bool, str], Awaitable[str]]]: + volumes_to_cleanup: list[DockerVolume] = [] + containers_to_cleanup: list[DockerContainer] = [] + + async with aiodocker.Docker() as docker_client: + + async def _(node_id: NodeID, in_use: bool, volume_name: str) -> str: + source = get_source(run_id, node_id, volumes_path / volume_name) + volume = await docker_client.volumes.create( + { + "Name": source, + "Labels": { + "node_uuid": f"{node_id}", + "run_id": run_id, + "source": source, + "study_id": f"{project_id}", + "swarm_stack_name": swarm_stack_name, + "user_id": f"{user_id}", + }, + } + ) + volumes_to_cleanup.append(volume) + + if in_use: + container = await docker_client.containers.run( + config={ + "Cmd": ["/bin/ash", "-c", "sleep 10000"], + "Image": "alpine:latest", + "HostConfig": {"Binds": [f"{volume.name}:{volumes_path}"]}, + }, + 
name=f"using_volume_{volume.name}", + ) + await container.start() + containers_to_cleanup.append(container) + + return source + + yield _ + + for container in containers_to_cleanup: + with suppress(aiodocker.DockerError): + await container.delete(force=True) + for volume in volumes_to_cleanup: + with suppress(aiodocker.DockerError): + await volume.delete() + + +@pytest.fixture +def create_dynamic_sidecar_volumes( + create_dynamic_sidecar_volume: Callable[[NodeID, bool, str], Awaitable[str]] +) -> Callable[[NodeID, bool], Awaitable[set[str]]]: + async def _(node_id: NodeID, in_use: bool) -> set[str]: + volume_names: set[str] = set() + for volume_name in VOLUMES_TO_CREATE: + name = await create_dynamic_sidecar_volume(node_id, in_use, volume_name) + volume_names.add(name) + + return volume_names + + return _ diff --git a/services/agent/tests/unit/test_api_rest__health.py b/services/agent/tests/unit/test_api_rest__health.py new file mode 100644 index 000000000000..6e690daa7884 --- /dev/null +++ b/services/agent/tests/unit/test_api_rest__health.py @@ -0,0 +1,17 @@ +# pylint: disable=protected-access +# pylint: disable=redefined-outer-name + + +from fastapi import status +from fastapi.testclient import TestClient +from models_library.api_schemas__common.health import HealthCheckGet + +pytest_simcore_core_services_selection = [ + "rabbit", +] + + +def test_health_ok(test_client: TestClient): + response = test_client.get("/health") + assert response.status_code == status.HTTP_200_OK + assert HealthCheckGet.parse_obj(response.json()) diff --git a/services/agent/tests/unit/test_api_rpc__volumes.py b/services/agent/tests/unit/test_api_rpc__volumes.py new file mode 100644 index 000000000000..df7121d1418d --- /dev/null +++ b/services/agent/tests/unit/test_api_rpc__volumes.py @@ -0,0 +1,67 @@ +# pylint:disable=redefined-outer-name +# pylint:disable=unused-argument + +from collections.abc import Awaitable, Callable +from unittest.mock import AsyncMock +from uuid import uuid4 + 
+import pytest +import pytest_mock +from fastapi import FastAPI +from servicelib.rabbitmq import RabbitMQRPCClient +from servicelib.rabbitmq.rpc_interfaces.agent import volumes + +pytest_simcore_core_services_selection = [ + "rabbit", +] + + +@pytest.fixture +async def rpc_client( + initialized_app: FastAPI, + rabbitmq_rpc_client: Callable[[str], Awaitable[RabbitMQRPCClient]], +) -> RabbitMQRPCClient: + return await rabbitmq_rpc_client("client") + + +@pytest.fixture +def mocked_remove_service_volumes(mocker: pytest_mock.MockerFixture) -> AsyncMock: + return mocker.patch( + "simcore_service_agent.services.volumes_manager.VolumesManager.remove_service_volumes" + ) + + +@pytest.fixture +def mocked_remove_all_volumes(mocker: pytest_mock.MockerFixture) -> AsyncMock: + return mocker.patch( + "simcore_service_agent.services.volumes_manager.VolumesManager.remove_all_volumes" + ) + + +async def test_backup_and_remove_volumes_for_all_services( + rpc_client: RabbitMQRPCClient, + swarm_stack_name: str, + docker_node_id: str, + mocked_remove_all_volumes: AsyncMock, +): + assert mocked_remove_all_volumes.call_count == 0 + await volumes.backup_and_remove_volumes_for_all_services( + rpc_client, docker_node_id=docker_node_id, swarm_stack_name=swarm_stack_name + ) + assert mocked_remove_all_volumes.call_count == 1 + + +async def test_remove_volumes_without_backup_for_service( + rpc_client: RabbitMQRPCClient, + swarm_stack_name: str, + docker_node_id: str, + mocked_remove_service_volumes: AsyncMock, +): + assert mocked_remove_service_volumes.call_count == 0 + await volumes.remove_volumes_without_backup_for_service( + rpc_client, + docker_node_id=docker_node_id, + swarm_stack_name=swarm_stack_name, + node_id=uuid4(), + ) + assert mocked_remove_service_volumes.call_count == 1 diff --git a/services/agent/tests/unit/test_cli.py b/services/agent/tests/unit/test_cli.py index 97de609dd92b..a205dadb47b4 100644 --- a/services/agent/tests/unit/test_cli.py +++ 
b/services/agent/tests/unit/test_cli.py @@ -5,12 +5,13 @@ import pytest from click.testing import Result +from pytest_simcore.helpers.monkeypatch_envs import EnvVarsDict from simcore_service_agent.cli import main from typer.testing import CliRunner @pytest.fixture -def cli_runner() -> CliRunner: +def cli_runner(mock_environment: EnvVarsDict) -> CliRunner: return CliRunner() @@ -20,7 +21,7 @@ def _format_cli_error(result: Result) -> str: return f"Below exception was raised by the cli:\n{tb_message}" -def test_process_cli_options(env: None, cli_runner: CliRunner): +def test_process_cli_options(cli_runner: CliRunner): result = cli_runner.invoke(main, ["--help"]) print(result.stdout) assert result.exit_code == 0, _format_cli_error(result) diff --git a/services/agent/tests/unit/test_core_routes.py b/services/agent/tests/unit/test_core_routes.py deleted file mode 100644 index c20b87147578..000000000000 --- a/services/agent/tests/unit/test_core_routes.py +++ /dev/null @@ -1,58 +0,0 @@ -# pylint: disable=redefined-outer-name -# pylint: disable=unused-argument -# pylint: disable=protected-access - -from time import time -from typing import AsyncIterator - -import pytest -from fastapi import FastAPI, status -from fastapi.testclient import TestClient -from simcore_service_agent.core.application import create_app -from simcore_service_agent.modules.task_monitor import TaskMonitor - - -@pytest.fixture -async def initialized_app() -> AsyncIterator[FastAPI]: - app: FastAPI = create_app() - - await app.router.startup() - yield app - await app.router.shutdown() - - -@pytest.fixture -def test_client(initialized_app: FastAPI) -> TestClient: - return TestClient(initialized_app) - - -def test_health_ok(env: None, test_client: TestClient): - response = test_client.get("/health") - assert response.status_code == status.HTTP_200_OK - assert response.json() is None - - -def test_health_fails_not_started( - env: None, initialized_app: FastAPI, test_client: TestClient -): - task_monitor: 
TaskMonitor = initialized_app.state.task_monitor - # emulate monitor not being started - task_monitor._was_started = False # noqa: SLF001 - - response = test_client.get("/health") - assert response.status_code == status.HTTP_503_SERVICE_UNAVAILABLE - assert response.json() == {"detail": "unhealthy"} - - -def test_health_fails_hanging_tasks( - env: None, initialized_app: FastAPI, test_client: TestClient -): - task_monitor: TaskMonitor = initialized_app.state.task_monitor - - # emulate tasks hanging - for task_data in task_monitor._to_start.values(): # noqa: SLF001 - task_data._start_time = time() - 1e6 # noqa: SLF001 - - response = test_client.get("/health") - assert response.status_code == status.HTTP_503_SERVICE_UNAVAILABLE - assert response.json() == {"detail": "unhealthy"} diff --git a/services/agent/tests/unit/test_modules_task_monitor.py b/services/agent/tests/unit/test_modules_task_monitor.py deleted file mode 100644 index e13b73c5d007..000000000000 --- a/services/agent/tests/unit/test_modules_task_monitor.py +++ /dev/null @@ -1,91 +0,0 @@ -# pylint:disable=protected-access - -import asyncio -from typing import Final - -import pytest -from pydantic import PositiveFloat -from simcore_service_agent.modules.task_monitor import TaskMonitor - -REPEAT_TASK_INTERVAL_S: Final[PositiveFloat] = 0.05 - - -async def _job_which_raises_error() -> None: - raise RuntimeError("raised expected error") - - -async def _job_which_hangs() -> None: - print("I will be hanging....") - await asyncio.sleep(REPEAT_TASK_INTERVAL_S * 10000) - - -@pytest.mark.parametrize("repeat_interval_s", [REPEAT_TASK_INTERVAL_S, None]) -async def test_task_monitor_recovers_from_error( - caplog_info_debug: pytest.LogCaptureFixture, - repeat_interval_s: PositiveFloat | None, -): - - task_monitor = TaskMonitor() - task_monitor.register_job( - _job_which_raises_error, repeat_interval_s=repeat_interval_s - ) - - await task_monitor.start() - - await asyncio.sleep(REPEAT_TASK_INTERVAL_S * 2) - - await 
task_monitor.shutdown() - assert len(task_monitor._tasks) == 0 - assert len(task_monitor._to_start) == 0 - - log_messages = caplog_info_debug.text - print(log_messages) - - assert f"Starting '{_job_which_raises_error.__name__}' ..." in log_messages - assert 'RuntimeError("raised expected error")' in log_messages - assert ( - f"Will run '{_job_which_raises_error.__name__}' again in {repeat_interval_s} seconds" - in log_messages - ) - if repeat_interval_s is None: - assert ( - f"Unexpected termination of '{_job_which_raises_error.__name__}'; it will be restarted" - in log_messages - ) - - -async def test_add_same_task_fails(): - task_monitor = TaskMonitor() - task_monitor.register_job(_job_which_raises_error, repeat_interval_s=1) - with pytest.raises(RuntimeError) as exe_info: - task_monitor.register_job(_job_which_raises_error, repeat_interval_s=1) - assert ( - f"{exe_info.value}" - == f"{_job_which_raises_error.__name__} is already registered" - ) - - -async def test_add_task_after_start_fails(): - task_monitor = TaskMonitor() - await task_monitor.start() - - with pytest.raises(RuntimeError) as exe_info: - task_monitor.register_job(_job_which_raises_error, repeat_interval_s=1) - assert ( - f"{exe_info.value}" == "Cannot add more tasks, monitor already running with: []" - ) - await task_monitor.shutdown() - - -async def test_hanging_jobs_are_detected(): - task_monitor = TaskMonitor() - task_monitor.register_job( - _job_which_hangs, repeat_interval_s=REPEAT_TASK_INTERVAL_S - ) - await task_monitor.start() - - assert task_monitor.are_tasks_hanging is False - - await asyncio.sleep(REPEAT_TASK_INTERVAL_S * 2) - - assert task_monitor.are_tasks_hanging is True diff --git a/services/agent/tests/unit/test_modules_volumes_cleanup.py b/services/agent/tests/unit/test_modules_volumes_cleanup.py deleted file mode 100644 index f6f25945d5b8..000000000000 --- a/services/agent/tests/unit/test_modules_volumes_cleanup.py +++ /dev/null @@ -1,92 +0,0 @@ -# pylint: 
disable=redefined-outer-name -# pylint: disable=unused-argument - - -from pathlib import Path - -import pytest -from aiodocker.volumes import DockerVolume -from pytest import LogCaptureFixture -from pytest_mock.plugin import MockerFixture -from simcore_service_agent.core.settings import ApplicationSettings -from simcore_service_agent.modules.volumes_cleanup import backup_and_remove_volumes - - -@pytest.fixture -async def mock_volumes_folders( - mocker: MockerFixture, - unused_volume: DockerVolume, - used_volume: DockerVolume, - unused_volume_path: Path, - used_volume_path: Path, -) -> None: - - unused_volume_path.mkdir(parents=True, exist_ok=True) - used_volume_path.mkdir(parents=True, exist_ok=True) - - # root permissions are required to access the /var/docker data - # overwriting with a mocked path for this test - unused_volume_data = await unused_volume.show() - unused_volume_data["Mountpoint"] = f"{unused_volume_path}" - used_volume_data = await used_volume.show() - used_volume_data["Mountpoint"] = f"{used_volume_path}" - - volumes_inspect = [unused_volume_data, used_volume_data] - - # patch the function here - mocker.patch( - "aiodocker.volumes.DockerVolumes.list", - return_value={"Volumes": volumes_inspect}, - ) - - -@pytest.fixture -async def used_volume_name(used_volume: DockerVolume) -> str: - return (await used_volume.show())["Name"] - - -@pytest.fixture -async def unused_volume_name(unused_volume: DockerVolume) -> str: - return (await unused_volume.show())["Name"] - - -async def test_workflow( - mock_volumes_folders: None, - caplog_info_debug: pytest.LogCaptureFixture, - settings: ApplicationSettings, - used_volume_name: str, - unused_volume_name: str, -): - await backup_and_remove_volumes(settings) - - log_messages = caplog_info_debug.messages - assert f"Removed docker volume: '{unused_volume_name}'" in log_messages - assert f"Skipped in use docker volume: '{used_volume_name}'" in log_messages - - -@pytest.mark.parametrize( - "error_class, 
error_message", - [ - (RuntimeError, "this was already handled"), - (Exception, "also capture all other generic errors"), - ], -) -async def test_regression_error_handling( - mock_volumes_folders: None, - caplog_info_debug: LogCaptureFixture, - settings: ApplicationSettings, - used_volume_name: str, - unused_volume_name: str, - mocker: MockerFixture, - error_class: type[BaseException], - error_message: str, -): - mocker.patch( - "simcore_service_agent.modules.volumes_cleanup._core.store_to_s3", - side_effect=error_class(error_message), - ) - - await backup_and_remove_volumes(settings) - - log_messages = caplog_info_debug.messages - assert error_message in log_messages diff --git a/services/agent/tests/unit/test_modules_volumes_cleanup_docker.py b/services/agent/tests/unit/test_modules_volumes_cleanup_docker.py deleted file mode 100644 index e2e74088bd67..000000000000 --- a/services/agent/tests/unit/test_modules_volumes_cleanup_docker.py +++ /dev/null @@ -1,133 +0,0 @@ -# pylint: disable=redefined-outer-name) - -from typing import Any, AsyncIterator - -import aiodocker -import pytest -from aiodocker.volumes import DockerVolume -from pytest_mock import MockerFixture -from servicelib.docker_constants import PREFIX_DYNAMIC_SIDECAR_VOLUMES -from simcore_service_agent.modules.volumes_cleanup._docker import ( - docker_client, - get_dyv_volumes, - is_volume_used, -) - -# UTILS - - -async def _create_volume( - docker_client: aiodocker.Docker, - swarm_stack_name: str, - study_id: str, - node_uuid: str, - run_id: str, -) -> DockerVolume: - mocked_source = f"{PREFIX_DYNAMIC_SIDECAR_VOLUMES}_a_test_ok" - volume = await docker_client.volumes.create( - { - "Name": mocked_source, - "Labels": { - "node_uuid": node_uuid, - "run_id": run_id, - "source": mocked_source, - "study_id": study_id, - "swarm_stack_name": swarm_stack_name, - "user_id": "1", - }, - } - ) - return volume - - -# FIXTURES - - -@pytest.fixture -async def volume_with_correct_target( - swarm_stack_name: str, - 
study_id: str, - node_uuid: str, - run_id: str, -) -> AsyncIterator[dict[str, Any]]: - async with aiodocker.Docker() as docker_client: - volume = await _create_volume( - docker_client, swarm_stack_name, study_id, node_uuid, run_id - ) - - yield await volume.show() - - try: - await volume.delete() - except aiodocker.DockerError: - pass - - -@pytest.fixture -def wrong_swarm_stack_name() -> str: - return "a_different_swarm_stack_name" - - -@pytest.fixture -async def volume_with_wrong_target( - study_id: str, node_uuid: str, run_id: str, wrong_swarm_stack_name: str -) -> None: - async with aiodocker.Docker() as docker_client: - volume = await _create_volume( - docker_client, wrong_swarm_stack_name, study_id, node_uuid, run_id - ) - - yield await volume.show() - - try: - await volume.delete() - except aiodocker.DockerError: - pass - - -# TESTS - - -async def test_get_dyv_volumes_expect_a_volume( - volume_with_correct_target: dict[str, Any], swarm_stack_name: str -): - async with aiodocker.Docker() as docker_client: - volumes = await get_dyv_volumes(docker_client, swarm_stack_name) - assert len(volumes) == 1 - assert volumes[0] == volume_with_correct_target - - -async def test_get_dyv_volumes_expect_no_volume( - volume_with_wrong_target: dict[str, Any], - swarm_stack_name: str, - wrong_swarm_stack_name: str, -): - async with aiodocker.Docker() as docker_client: - volumes = await get_dyv_volumes(docker_client, swarm_stack_name) - assert len(volumes) == 0 - - async with aiodocker.Docker() as docker_client: - volumes = await get_dyv_volumes(docker_client, wrong_swarm_stack_name) - assert len(volumes) == 1 - assert volumes[0] == volume_with_wrong_target - - -async def test_is_volume_mounted_true_(used_volume: DockerVolume): - async with docker_client() as client: - assert await is_volume_used(client, used_volume.name) is True - - -async def test_is_volume_mounted_false(unused_volume: DockerVolume): - async with docker_client() as client: - assert await is_volume_used(client, 
unused_volume.name) is False - - -async def test_regression_volume_labels_are_none(mocker: MockerFixture): - mocked_volumes = { - "Volumes": [{"Name": f"{PREFIX_DYNAMIC_SIDECAR_VOLUMES}_test", "Labels": None}] - } - - async with docker_client() as client: - mocker.patch.object(client.volumes, "list", return_value=mocked_volumes) - - await get_dyv_volumes(client, "test") diff --git a/services/agent/tests/unit/test_modules_volumes_cleanup_s3.py b/services/agent/tests/unit/test_modules_volumes_cleanup_s3.py deleted file mode 100644 index 1728d0bb0292..000000000000 --- a/services/agent/tests/unit/test_modules_volumes_cleanup_s3.py +++ /dev/null @@ -1,236 +0,0 @@ -# pylint: disable=redefined-outer-name -# pylint: disable=unused-argument -# pylint: disable=too-many-locals - -import hashlib -from pathlib import Path - -import aioboto3 -import pytest -from aiodocker.volumes import DockerVolume -from pydantic import HttpUrl -from pytest import LogCaptureFixture -from simcore_service_agent.core.settings import ApplicationSettings -from simcore_service_agent.modules.volumes_cleanup._s3 import ( - S3Provider, - _get_dir_name, - _get_s3_path, - store_to_s3, -) - -# UTILS - - -def _get_file_hashes_in_path( - path_to_hash: Path, exclude_files: set[Path] | None = None -) -> set[tuple[Path, str]]: - def _hash_path(path: Path): - sha256_hash = hashlib.sha256() - with path.open("rb") as file: - # Read and update hash string value in blocks of 4K - for byte_block in iter(lambda: file.read(4096), b""): - sha256_hash.update(byte_block) - return sha256_hash.hexdigest() - - if path_to_hash.is_file(): - return {(path_to_hash.relative_to(path_to_hash), _hash_path(path_to_hash))} - - if exclude_files is None: - exclude_files = set() - - return { - (path.relative_to(path_to_hash), _hash_path(path)) - for path in path_to_hash.rglob("*") - if path.is_file() and path.relative_to(path_to_hash) not in exclude_files - } - - -async def _download_files_from_bucket( - endpoint: str, - access_key: str, 
- secret_key: str, - bucket_name: str, - save_to: Path, - swarm_stack_name: str, - study_id: str, - node_uuid: str, - run_id: str, -) -> None: - session = aioboto3.Session( - aws_access_key_id=access_key, aws_secret_access_key=secret_key - ) - async with session.resource("s3", endpoint_url=endpoint, use_ssl=False) as s_3: - bucket = await s_3.Bucket(bucket_name) - async for s3_object in bucket.objects.all(): - key_path = f"{swarm_stack_name}/{study_id}/{node_uuid}/{run_id}/" - if s3_object.key.startswith(key_path): - file_object = await s3_object.get() - file_path: Path = save_to / s3_object.key.replace(key_path, "") - file_path.parent.mkdir(parents=True, exist_ok=True) - print(f"Saving file to {file_path}") - file_content = await file_object["Body"].read() - file_path.write_bytes(file_content) - - -def _create_data(folder: Path) -> None: - for file in { # pylint:disable=use-sequence-for-iteration - ".hidden_do_not_remove", - "key_values.json", - "f1.txt", - "f2.txt", - "f3.txt", - "d1/f1.txt", - "d1/f2.txt", - "d1/f3.txt", - "d1/sd1/f1.txt", - "d1/sd1/f2.txt", - "d1/sd1/f3.txt", - }: - file_path = folder / file - file_path.parent.mkdir(parents=True, exist_ok=True) - file_path.write_text("test") - - -# FIXTURES - - -@pytest.fixture -def save_to(tmp_path: Path) -> Path: - return tmp_path / "save_to" - - -# TESTS - - -async def test_get_s3_path( - unused_volume: DockerVolume, - swarm_stack_name: str, - study_id: str, - node_uuid: str, - run_id: str, - bucket: str, -): - volume_data = await unused_volume.show() - assert _get_s3_path(bucket, volume_data["Labels"], unused_volume.name) == Path( - f"/{bucket}/{swarm_stack_name}/{study_id}/{node_uuid}/{run_id}/{_get_dir_name(unused_volume.name)}" - ) - - -async def test_store_to_s3( - unused_volume: DockerVolume, - mocked_s3_server_url: HttpUrl, - unused_volume_path: Path, - save_to: Path, - study_id: str, - node_uuid: str, - run_id: str, - bucket: str, - settings: ApplicationSettings, -): - 
_create_data(unused_volume_path) - dyv_volume = await unused_volume.show() - - # overwrite to test locally not against volume - # root permissions are required to access this - dyv_volume["Mountpoint"] = unused_volume_path - - await store_to_s3( - volume_name=unused_volume.name, - dyv_volume=dyv_volume, - s3_access_key="xxx", - s3_secret_key="xxx", - s3_bucket=bucket, - s3_endpoint=mocked_s3_server_url, - s3_region="us-east-1", - s3_provider=S3Provider.MINIO, - s3_parallelism=3, - s3_retries=1, - exclude_files=settings.AGENT_VOLUMES_CLEANUP_EXCLUDE_FILES, - ) - - await _download_files_from_bucket( - endpoint=mocked_s3_server_url, - access_key="xxx", - secret_key="xxx", - bucket_name=bucket, - save_to=save_to, - swarm_stack_name=dyv_volume["Labels"]["swarm_stack_name"], - study_id=study_id, - node_uuid=node_uuid, - run_id=run_id, - ) - - hashes_on_disk = _get_file_hashes_in_path( - unused_volume_path, set(map(Path, settings.AGENT_VOLUMES_CLEANUP_EXCLUDE_FILES)) - ) - volume_path_without_source_dir = save_to / _get_dir_name(unused_volume.name) - hashes_in_s3 = _get_file_hashes_in_path(volume_path_without_source_dir) - assert len(hashes_on_disk) > 0 - assert len(hashes_in_s3) > 0 - assert hashes_on_disk == hashes_in_s3 - - -@pytest.mark.parametrize("provider", [S3Provider.CEPH, S3Provider.MINIO]) -async def test_regression_non_aws_providers( - unused_volume: DockerVolume, - mocked_s3_server_url: HttpUrl, - unused_volume_path: Path, - bucket: str, - settings: ApplicationSettings, - caplog_info_debug: pytest.LogCaptureFixture, - provider: S3Provider, -): - _create_data(unused_volume_path) - dyv_volume = await unused_volume.show() - - # overwrite to test locally not against volume - # root permissions are required to access this - dyv_volume["Mountpoint"] = unused_volume_path - - await store_to_s3( - volume_name=unused_volume.name, - dyv_volume=dyv_volume, - s3_access_key="xxx", - s3_secret_key="xxx", - s3_bucket=bucket, - s3_endpoint=mocked_s3_server_url, - 
s3_region="us-east-1", - s3_provider=provider, - s3_parallelism=3, - s3_retries=1, - exclude_files=settings.AGENT_VOLUMES_CLEANUP_EXCLUDE_FILES, - ) - - assert f'provider "{provider}" not known' not in caplog_info_debug.text - - -async def test_regression_store_to_s3_volume_mountpoint_not_found( - unused_volume: DockerVolume, - mocked_s3_server_url: HttpUrl, - unused_volume_path: Path, - bucket: str, - settings: ApplicationSettings, - caplog_info_debug: LogCaptureFixture, -): - dyv_volume = await unused_volume.show() - assert unused_volume_path.exists() is False - - # overwrite to test locally not against volume - # root permissions are required to access this - dyv_volume["Mountpoint"] = unused_volume_path - - await store_to_s3( - volume_name=unused_volume.name, - dyv_volume=dyv_volume, - s3_access_key="xxx", - s3_secret_key="xxx", - s3_bucket=bucket, - s3_endpoint=mocked_s3_server_url, - s3_region="us-east-1", - s3_provider=S3Provider.MINIO, - s3_parallelism=3, - s3_retries=1, - exclude_files=settings.AGENT_VOLUMES_CLEANUP_EXCLUDE_FILES, - ) - assert f"mountpoint {unused_volume_path} does not exist" in caplog_info_debug.text - assert f"{unused_volume.name}" in caplog_info_debug.text diff --git a/services/agent/tests/unit/test_services_backup.py b/services/agent/tests/unit/test_services_backup.py new file mode 100644 index 000000000000..67a1203ea796 --- /dev/null +++ b/services/agent/tests/unit/test_services_backup.py @@ -0,0 +1,105 @@ +# pylint: disable=redefined-outer-name + +import asyncio +from collections.abc import Awaitable, Callable +from pathlib import Path +from typing import Final +from uuid import uuid4 + +import aioboto3 +import pytest +from fastapi import FastAPI +from models_library.projects import ProjectID +from models_library.projects_nodes_io import NodeID +from models_library.services_types import RunID +from pydantic import NonNegativeInt +from simcore_service_agent.core.settings import ApplicationSettings +from 
simcore_service_agent.services.backup import backup_volume +from simcore_service_agent.services.docker_utils import get_volume_details +from simcore_service_agent.services.volumes_manager import VolumesManager +from utils import VOLUMES_TO_CREATE + +pytest_simcore_core_services_selection = [ + "rabbit", +] + +_FILES_TO_CREATE_IN_VOLUME: Final[NonNegativeInt] = 10 + + +@pytest.fixture +def volume_content(tmpdir: Path) -> Path: + path = Path(tmpdir) / "to_copy" + path.mkdir(parents=True, exist_ok=True) + + for i in range(_FILES_TO_CREATE_IN_VOLUME): + (path / f"f{i}").write_text(f"some text for file {i}\n" * (i + 1)) + + return path + + +@pytest.fixture +def downlaoded_from_s3(tmpdir: Path) -> Path: + path = Path(tmpdir) / "downloaded_from_s3" + path.mkdir(parents=True, exist_ok=True) + return path + + +async def test_backup_volume( + volume_content: Path, + project_id: ProjectID, + swarm_stack_name: str, + run_id: RunID, + downlaoded_from_s3: Path, + create_dynamic_sidecar_volumes: Callable[[NodeID, bool], Awaitable[set[str]]], + initialized_app: FastAPI, +): + node_id = uuid4() + volumes: set[str] = await create_dynamic_sidecar_volumes( + node_id, True # noqa: FBT003 + ) + + for volume in volumes: + volume_details = await get_volume_details( + VolumesManager.get_from_app_state(initialized_app).docker, + volume_name=volume, + ) + # root permissions are required to access the /var/docker data + # overwriting with a mocked path for this test + volume_details.mountpoint = volume_content + await backup_volume(initialized_app, volume_details, volume) + + settings: ApplicationSettings = initialized_app.state.settings + + session = aioboto3.Session( + aws_access_key_id=settings.AGENT_VOLUMES_CLEANUP_S3_ACCESS_KEY, + aws_secret_access_key=settings.AGENT_VOLUMES_CLEANUP_S3_SECRET_KEY, + ) + + expected_files = _FILES_TO_CREATE_IN_VOLUME * len(VOLUMES_TO_CREATE) + + async with session.client("s3", endpoint_url=settings.AGENT_VOLUMES_CLEANUP_S3_ENDPOINT) as s3_client: # type: 
ignore + list_response = await s3_client.list_objects_v2( + Bucket=settings.AGENT_VOLUMES_CLEANUP_S3_BUCKET, + Prefix=f"{swarm_stack_name}/{project_id}/{node_id}/{run_id}", + ) + synced_keys: list[str] = [o["Key"] for o in list_response["Contents"]] + + assert len(synced_keys) == expected_files + + async def _download_file(key: str) -> None: + key_path = Path(key) + (downlaoded_from_s3 / key_path.parent.name).mkdir( + parents=True, exist_ok=True + ) + await s3_client.download_file( + settings.AGENT_VOLUMES_CLEANUP_S3_BUCKET, + key, + downlaoded_from_s3 / key_path.parent.name / key_path.name, + ) + + await asyncio.gather(*[_download_file(key) for key in synced_keys]) + + assert ( + len([x for x in downlaoded_from_s3.rglob("*") if x.is_file()]) + == expected_files + ) diff --git a/services/agent/tests/unit/test_services_docker_utils.py b/services/agent/tests/unit/test_services_docker_utils.py new file mode 100644 index 000000000000..40f86529edb6 --- /dev/null +++ b/services/agent/tests/unit/test_services_docker_utils.py @@ -0,0 +1,148 @@ +# pylint: disable=protected-access +# pylint: disable=redefined-outer-name + +from collections.abc import Awaitable, Callable +from pathlib import Path +from unittest.mock import AsyncMock +from uuid import uuid4 + +import pytest +from aiodocker.docker import Docker +from fastapi import FastAPI +from models_library.projects_nodes_io import NodeID +from models_library.services_types import RunID +from pytest_mock import MockerFixture +from servicelib.docker_constants import PREFIX_DYNAMIC_SIDECAR_VOLUMES +from simcore_service_agent.services.docker_utils import ( + _VOLUMES_NOT_TO_BACKUP, + _does_volume_require_backup, + _reverse_string, + get_unused_dynamc_sidecar_volumes, + get_volume_details, + remove_volume, +) +from simcore_service_agent.services.volumes_manager import VolumesManager +from utils import VOLUMES_TO_CREATE, get_source + +pytest_simcore_core_services_selection = [ + "rabbit", +] + + +def test__reverse_string(): + 
assert _reverse_string("abcd") == "dcba" + + +@pytest.mark.parametrize( + "volume_path_part, expected", + [ + ("inputs", False), + ("shared-store", False), + ("outputs", True), + ("workdir", True), + ], +) +def test__does_volume_require_backup( + run_id: RunID, volume_path_part: str, expected: bool +) -> None: + volume_name = get_source(run_id, uuid4(), Path("/apath") / volume_path_part) + print(volume_name) + assert _does_volume_require_backup(volume_name) is expected + + +@pytest.fixture +def volumes_manager_docker_client(initialized_app: FastAPI) -> Docker: + volumes_manager = VolumesManager.get_from_app_state(initialized_app) + return volumes_manager.docker + + +@pytest.fixture +def mock_backup_volume(mocker: MockerFixture) -> AsyncMock: + return mocker.patch("simcore_service_agent.services.docker_utils.backup_volume") + + +@pytest.mark.parametrize("volume_count", [2]) +@pytest.mark.parametrize("requires_backup", [True, False]) +async def test_doclker_utils_workflow( + volume_count: int, + requires_backup: bool, + initialized_app: FastAPI, + volumes_manager_docker_client: Docker, + create_dynamic_sidecar_volumes: Callable[[NodeID, bool], Awaitable[set[str]]], + mock_backup_volume: AsyncMock, +): + created_volumes: set[str] = set() + for _ in range(volume_count): + created_volume = await create_dynamic_sidecar_volumes( + uuid4(), False # noqa: FBT003 + ) + created_volumes.update(created_volume) + + volumes = await get_unused_dynamc_sidecar_volumes(volumes_manager_docker_client) + assert volumes == created_volumes, ( + "Most likely you have a dirty working state, please check " + "that there are no previous docker volumes named `dyv_...` " + "currently present on the machine" + ) + + assert len(volumes) == len(VOLUMES_TO_CREATE) * volume_count + + count_vloumes_to_backup = 0 + count_volumes_to_skip = 0 + + for volume in volumes: + if _does_volume_require_backup(volume): + count_vloumes_to_backup += 1 + else: + count_volumes_to_skip += 1 + + assert 
volume.startswith(PREFIX_DYNAMIC_SIDECAR_VOLUMES) + await remove_volume( + initialized_app, + volumes_manager_docker_client, + volume_name=volume, + requires_backup=requires_backup, + ) + + assert ( + count_vloumes_to_backup + == (len(VOLUMES_TO_CREATE) - len(_VOLUMES_NOT_TO_BACKUP)) * volume_count + ) + assert count_volumes_to_skip == len(_VOLUMES_NOT_TO_BACKUP) * volume_count + + assert mock_backup_volume.call_count == ( + count_vloumes_to_backup if requires_backup else 0 + ) + + volumes = await get_unused_dynamc_sidecar_volumes(volumes_manager_docker_client) + assert len(volumes) == 0 + + +@pytest.mark.parametrize("requires_backup", [True, False]) +async def test_remove_misisng_volume_does_not_raise_error( + requires_backup: bool, + initialized_app: FastAPI, + volumes_manager_docker_client: Docker, +): + await remove_volume( + initialized_app, + volumes_manager_docker_client, + volume_name="this-volume-does-not-exist", + requires_backup=requires_backup, + ) + + +async def test_get_volume_details( + volumes_path: Path, + volumes_manager_docker_client: Docker, + create_dynamic_sidecar_volumes: Callable[[NodeID, bool], Awaitable[set[str]]], +): + + volume_names = await create_dynamic_sidecar_volumes(uuid4(), False) # noqa: FBT003 + for volume_name in volume_names: + volume_details = await get_volume_details( + volumes_manager_docker_client, volume_name=volume_name + ) + print(volume_details) + volume_prefix = f"{volumes_path}".replace("/", "_").strip("_") + assert volume_details.labels.directory_name.startswith(volume_prefix) diff --git a/services/agent/tests/unit/test_services_volumes_manager.py b/services/agent/tests/unit/test_services_volumes_manager.py new file mode 100644 index 000000000000..0dfc29ceb83a --- /dev/null +++ b/services/agent/tests/unit/test_services_volumes_manager.py @@ -0,0 +1,185 @@ +# pylint: disable=protected-access +# pylint: disable=redefined-outer-name +# pylint: disable=unused-argument + +from copy import deepcopy +from dataclasses 
import dataclass, field +from datetime import timedelta +from pathlib import Path +from unittest.mock import AsyncMock +from uuid import uuid4 + +import pytest +import pytest_mock +from aiodocker.docker import Docker +from fastapi import FastAPI +from models_library.projects_nodes_io import NodeID +from models_library.services_types import RunID +from servicelib.rabbitmq.rpc_interfaces.agent.errors import ( + NoServiceVolumesFoundRPCError, +) +from simcore_service_agent.services.volumes_manager import VolumesManager +from tenacity import ( + AsyncRetrying, + retry_if_exception_type, + stop_after_delay, + wait_fixed, +) +from utils import VOLUMES_TO_CREATE, get_source + + +@dataclass +class MockedVolumesProxy: + run_id: RunID + volumes: set[str] = field(default_factory=set) + + def add_unused_volumes_for_service(self, node_id: NodeID) -> None: + for folder_name in VOLUMES_TO_CREATE: + volume_name = get_source(self.run_id, node_id, Path("/apath") / folder_name) + self.volumes.add(volume_name) + + def remove_volume(self, volume_name: str) -> None: + self.volumes.remove(volume_name) + + def get_unused_dynamc_sidecar_volumes(self) -> set[str]: + return deepcopy(self.volumes) + + +@pytest.fixture +async def mock_docker_utils( + mocker: pytest_mock.MockerFixture, run_id: RunID +) -> MockedVolumesProxy: + proxy = MockedVolumesProxy(run_id) + + async def _remove_volume( + app: FastAPI, docker: Docker, *, volume_name: str, requires_backup: bool + ) -> None: + proxy.remove_volume(volume_name) + + async def _get_unused_dynamc_sidecar_volumes(app: FastAPI) -> set[str]: + return proxy.get_unused_dynamc_sidecar_volumes() + + mocker.patch( + "simcore_service_agent.services.volumes_manager.remove_volume", + side_effect=_remove_volume, + ) + + mocker.patch( + "simcore_service_agent.services.volumes_manager.get_unused_dynamc_sidecar_volumes", + side_effect=_get_unused_dynamc_sidecar_volumes, + ) + + return proxy + + +@pytest.fixture +def spy_remove_volume( + mocker: 
pytest_mock.MockerFixture, mock_docker_utils: MockedVolumesProxy +) -> AsyncMock: + return mocker.spy(mock_docker_utils, "remove_volume") + + +@pytest.fixture +async def volumes_manager() -> VolumesManager: + # NOTE: background tasks are disabled on purpose + return VolumesManager( + app=FastAPI(), + book_keeping_interval=timedelta(seconds=1), + volume_cleanup_interval=timedelta(seconds=1), + remove_volumes_inactive_for=timedelta(seconds=0.1).total_seconds(), + ) + + +@pytest.mark.parametrize("service_count", [1, 3]) +async def test_volumes_manager_remove_all_volumes( + service_count: int, + mock_docker_utils: MockedVolumesProxy, + spy_remove_volume: AsyncMock, + volumes_manager: VolumesManager, +): + assert spy_remove_volume.call_count == 0 + + for _ in range(service_count): + mock_docker_utils.add_unused_volumes_for_service(uuid4()) + assert spy_remove_volume.call_count == 0 + assert ( + len(mock_docker_utils.get_unused_dynamc_sidecar_volumes()) + == len(VOLUMES_TO_CREATE) * service_count + ) + + await volumes_manager.remove_all_volumes() + assert spy_remove_volume.call_count == len(VOLUMES_TO_CREATE) * service_count + assert len(mock_docker_utils.get_unused_dynamc_sidecar_volumes()) == 0 + + +async def test_volumes_manager_remove_service_volumes( + mock_docker_utils: MockedVolumesProxy, + spy_remove_volume: AsyncMock, + volumes_manager: VolumesManager, +): + assert spy_remove_volume.call_count == 0 + mock_docker_utils.add_unused_volumes_for_service(uuid4()) + node_id_to_remvoe = uuid4() + mock_docker_utils.add_unused_volumes_for_service(node_id_to_remvoe) + + assert spy_remove_volume.call_count == 0 + assert ( + len(mock_docker_utils.get_unused_dynamc_sidecar_volumes()) + == len(VOLUMES_TO_CREATE) * 2 + ) + + await volumes_manager.remove_service_volumes(node_id_to_remvoe) + + assert spy_remove_volume.call_count == len(VOLUMES_TO_CREATE) + unused_volumes = mock_docker_utils.get_unused_dynamc_sidecar_volumes() + assert len(unused_volumes) == len(VOLUMES_TO_CREATE) 
+ for volume_name in unused_volumes: + assert f"{node_id_to_remvoe}" not in volume_name + + +@pytest.fixture +async def mock_wait_for_unused_service_volumes( + mocker: pytest_mock.MockerFixture, +) -> None: + mocker.patch( + "simcore_service_agent.services.volumes_manager._WAIT_FOR_UNUSED_SERVICE_VOLUMES", + timedelta(seconds=2), + ) + + +async def test_volumes_manager_remove_service_volumes_when_volume_does_not_exist( + mock_wait_for_unused_service_volumes: None, + volumes_manager: VolumesManager, +): + not_existing_service = uuid4() + with pytest.raises(NoServiceVolumesFoundRPCError): + await volumes_manager.remove_service_volumes(not_existing_service) + + +async def test_volumes_manager_periodic_task_cleanup( + mock_docker_utils: MockedVolumesProxy, + spy_remove_volume: AsyncMock, + volumes_manager: VolumesManager, +): + async def _run_volumes_clennup() -> None: + await volumes_manager._bookkeeping_task() # noqa: SLF001 + await volumes_manager._periodic_volmue_cleanup_task() # noqa: SLF001 + + await _run_volumes_clennup() + assert spy_remove_volume.call_count == 0 + + mock_docker_utils.add_unused_volumes_for_service(uuid4()) + await _run_volumes_clennup() + assert spy_remove_volume.call_count == 0 + + # wait for the amount of time to pass + async for attempt in AsyncRetrying( + wait=wait_fixed(0.1), + stop=stop_after_delay(1), + reraise=True, + retry=retry_if_exception_type(AssertionError), + ): + with attempt: + await _run_volumes_clennup() + assert spy_remove_volume.call_count == len(VOLUMES_TO_CREATE) + assert len(mock_docker_utils.get_unused_dynamc_sidecar_volumes()) == 0 diff --git a/services/agent/tests/unit/utils.py b/services/agent/tests/unit/utils.py new file mode 100644 index 000000000000..8eeb23138d47 --- /dev/null +++ b/services/agent/tests/unit/utils.py @@ -0,0 +1,19 @@ +from pathlib import Path +from typing import Final + +from models_library.projects_nodes_io import NodeID + + +def get_source(run_id: str, node_id: NodeID, full_volume_path: Path) 
-> str: + # NOTE: volume name is not trimmed here, but it's ok for the tests + reversed_path = f"{full_volume_path}"[::-1].replace("/", "_") + return f"dyv_{run_id}_{node_id}_{reversed_path}" + + +VOLUMES_TO_CREATE: Final[list[str]] = [ + "inputs", + "outputs", + "workspace", + "work", + "shared-store", +] diff --git a/services/director-v2/openapi.json b/services/director-v2/openapi.json index b2e27ac6a703..cdd6d4eca051 100644 --- a/services/director-v2/openapi.json +++ b/services/director-v2/openapi.json @@ -2608,6 +2608,15 @@ "description": "set True if the dy-sidecar saves the state and uploads the outputs", "default": false }, + "instrumentation": { + "allOf": [ + { + "$ref": "#/components/schemas/ServicesInstrumentation" + } + ], + "title": "Instrumentation", + "description": "keeps track times for various operations" + }, "dynamic_sidecar_id": { "type": "string", "maxLength": 25, @@ -3613,17 +3622,36 @@ }, "ServiceState": { "enum": [ + "failed", "pending", "pulling", "starting", "running", + "stopping", "complete", - "failed", - "stopping" + "idle" ], "title": "ServiceState", "description": "An enumeration." 
}, + "ServicesInstrumentation": { + "properties": { + "start_requested_at": { + "type": "string", + "format": "date-time", + "title": "Start Requested At", + "description": "moment in which the process of starting the service was requested" + }, + "close_requested_at": { + "type": "string", + "format": "date-time", + "title": "Close Requested At", + "description": "moment in which the process of stopping the service was requested" + } + }, + "type": "object", + "title": "ServicesInstrumentation" + }, "SimpleAuthentication": { "properties": { "type": { diff --git a/services/director-v2/src/simcore_service_director_v2/api/routes/health.py b/services/director-v2/src/simcore_service_director_v2/api/routes/health.py index 79c954c44edd..9ce8dc97ef6c 100644 --- a/services/director-v2/src/simcore_service_director_v2/api/routes/health.py +++ b/services/director-v2/src/simcore_service_director_v2/api/routes/health.py @@ -2,7 +2,7 @@ from typing import Annotated from fastapi import APIRouter, Depends -from models_library.api_schemas_directorv2.health import HealthCheckGet +from models_library.api_schemas__common.health import HealthCheckGet from models_library.errors import RABBITMQ_CLIENT_UNHEALTHY_MSG from servicelib.rabbitmq import RabbitMQClient diff --git a/services/director-v2/src/simcore_service_director_v2/constants.py b/services/director-v2/src/simcore_service_director_v2/constants.py index 424ac151acb6..fc700254ed0b 100644 --- a/services/director-v2/src/simcore_service_director_v2/constants.py +++ b/services/director-v2/src/simcore_service_director_v2/constants.py @@ -4,7 +4,6 @@ DYNAMIC_SIDECAR_SERVICE_PREFIX: Final[str] = "dy-sidecar" DYNAMIC_PROXY_SERVICE_PREFIX: Final[str] = "dy-proxy" -DYNAMIC_VOLUME_REMOVER_PREFIX: Final[str] = "dy-volrm" # label storing scheduler_data to allow service # monitoring recovery after director-v2 reboots diff --git a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_api/__init__.py 
b/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_api/__init__.py index 99cab1542c53..5fb63db124bb 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_api/__init__.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_api/__init__.py @@ -16,7 +16,6 @@ try_to_remove_network, update_scheduler_data_label, ) -from ._volume import remove_pending_volume_removal_services, remove_volumes_from_node __all__: tuple[str, ...] = ( "are_sidecar_and_proxy_services_present", @@ -29,12 +28,10 @@ "get_or_create_networks_ids", "get_projects_networks_containers", "get_swarm_network", - "is_sidecar_running", "is_dynamic_sidecar_stack_missing", + "is_sidecar_running", "remove_dynamic_sidecar_network", "remove_dynamic_sidecar_stack", - "remove_pending_volume_removal_services", - "remove_volumes_from_node", "try_to_remove_network", "update_scheduler_data_label", ) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_api/_volume.py b/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_api/_volume.py deleted file mode 100644 index e5891bd9f6f3..000000000000 --- a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_api/_volume.py +++ /dev/null @@ -1,159 +0,0 @@ -import logging -from datetime import datetime, timezone - -from fastapi.encoders import jsonable_encoder -from models_library.projects import ProjectID -from models_library.projects_nodes_io import NodeID -from models_library.users import UserID -from servicelib.docker_utils import to_datetime -from servicelib.logging_utils import log_context -from tenacity import TryAgain -from tenacity.asyncio import AsyncRetrying -from tenacity.retry import retry_if_exception_type -from tenacity.stop import stop_after_delay -from tenacity.wait import wait_fixed - -from ....constants import DYNAMIC_VOLUME_REMOVER_PREFIX 
-from ..docker_service_specs.volume_remover import spec_volume_removal_service -from ._utils import docker_client - -_logger = logging.getLogger(__name__) - - -# FROM https://docs.docker.com/engine/swarm/how-swarm-mode-works/swarm-task-states/ -SERVICE_FINISHED_STATES: set[str] = { - "complete", - "failed", - "shutdown", - "rejected", - "orphaned", - "remove", -} - - -async def remove_volumes_from_node( - swarm_stack_name: str, - volume_names: list[str], - docker_node_id: str, - user_id: UserID, - project_id: ProjectID, - node_uuid: NodeID, - *, - volume_removal_attempts: int = 15, - sleep_between_attempts_s: int = 2, -) -> bool: - """ - Starts a service at target docker node which will remove - all entries in the `volumes_names` list. - """ - - async with docker_client() as client: - # Timeout for the runtime of the service is calculated based on the amount - # of attempts required to remove each individual volume, - # in the worst case scenario when all volumes are do not exit. - volume_removal_timeout_s = volume_removal_attempts * sleep_between_attempts_s - service_timeout_s = volume_removal_timeout_s * len(volume_names) - - service_spec = spec_volume_removal_service( - swarm_stack_name=swarm_stack_name, - docker_node_id=docker_node_id, - user_id=user_id, - project_id=project_id, - node_uuid=node_uuid, - volume_names=volume_names, - volume_removal_attempts=volume_removal_attempts, - sleep_between_attempts_s=sleep_between_attempts_s, - service_timeout_s=service_timeout_s, - ) - - volume_removal_service = await client.services.create( - **jsonable_encoder(service_spec, by_alias=True, exclude_unset=True) - ) - - service_id = volume_removal_service["ID"] - try: - async for attempt in AsyncRetrying( - stop=stop_after_delay(service_timeout_s), - wait=wait_fixed(0.5), - retry=retry_if_exception_type(TryAgain), - reraise=True, - ): - with attempt: - _logger.debug( - "Waiting for removal of %s, with service id %s", - node_uuid, - service_id, - ) - tasks = await 
client.tasks.list(filters={"service": service_id}) - # NOTE: the service will have at most 1 task, since there is no restart - # policy present - if len(tasks) != 1: - # Docker swarm needs a bit of time to startup the tasks - raise TryAgain( - f"Expected 1 task for service {service_id} on node {node_uuid}, found {tasks=}" - ) - - task = tasks[0] - task_status = task["Status"] - _logger.debug("Service %s, %s", service_id, f"{task_status=}") - task_state = task_status["State"] - if task_state not in SERVICE_FINISHED_STATES: - raise TryAgain( - f"Waiting for task to finish for service {service_id} on node {node_uuid}: {task_status=}" - ) - - if not ( - task_state == "complete" - and task_status["ContainerStatus"]["ExitCode"] == 0 - ): - _logger.error( - "Service %s on node %s status: %s", - service_id, - node_uuid, - f"{task_status=}", - ) - # NOTE: above implies the volumes will remain in the system and - # have to be manually removed. - return False - finally: - # NOTE: services created in swarm need to be removed, there is no way - # to instruct swarm to remove a service after it's created - # container/task finished - with log_context( - _logger, - logging.DEBUG, - f"deleting service {service_id} on node {node_uuid}", - ): - await client.services.delete(service_id) - - _logger.debug("Finished removing volumes for service %s", node_uuid) - return True - - -async def remove_pending_volume_removal_services(swarm_stack_name: str) -> None: - """ - Removes all pending volume removal services. Such a service - will be considered pending if it is running for longer than its - intended duration (defined in the `service_timeout_s` label). 
- """ - service_filters = { - "label": [f"swarm_stack_name={swarm_stack_name}"], - "name": [f"{DYNAMIC_VOLUME_REMOVER_PREFIX}"], - } - async with docker_client() as client: - volume_removal_services = await client.services.list(filters=service_filters) - - for volume_removal_service in volume_removal_services: - service_timeout_s = int( - volume_removal_service["Spec"]["Labels"]["service_timeout_s"] - ) - created_at = to_datetime(volume_removal_services[0]["CreatedAt"]) - time_diff = datetime.now(tz=timezone.utc) - created_at - service_timed_out = time_diff.seconds > (service_timeout_s * 10) - if service_timed_out: - service_id = volume_removal_service["ID"] - service_name = volume_removal_service["Spec"]["Name"] - _logger.debug( - "Removing pending volume removal service %s", service_name - ) - await client.services.delete(service_id) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_service_specs/volume_remover.py b/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_service_specs/volume_remover.py deleted file mode 100644 index cefbe0156ec1..000000000000 --- a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_service_specs/volume_remover.py +++ /dev/null @@ -1,185 +0,0 @@ -import json -import re -from asyncio.log import logger -from typing import Final -from uuid import uuid4 - -from models_library.aiodocker_api import AioDockerServiceSpec -from models_library.projects import ProjectID -from models_library.projects_nodes_io import NodeID -from models_library.services_resources import ( - CPU_10_PERCENT, - CPU_100_PERCENT, - MEMORY_50MB, - MEMORY_250MB, -) -from models_library.users import UserID -from pydantic import parse_obj_as - -from ....constants import DYNAMIC_VOLUME_REMOVER_PREFIX - - -class DockerVersion(str): - """ - Extracts `XX.XX.XX` where X is a range [0-9] from - a given docker version - """ - - @classmethod - def 
__get_validators__(cls): - yield cls.validate_docker_version - - @classmethod - def validate_docker_version(cls, docker_version: str) -> str: - try: - search_result = re.search(r"^\d\d.(\d\d|\d).(\d\d|\d)", docker_version) - assert search_result # nosec - return search_result.group() - except AttributeError: - raise ValueError( # pylint: disable=raise-missing-from - f"{docker_version} appears not to be a valid docker version" - ) - - -DIND_VERSION: Final[DockerVersion] = parse_obj_as(DockerVersion, "20.10.14") - -# NOTE: below `retry` function is inspired by -# https://gist.github.com/sj26/88e1c6584397bb7c13bd11108a579746 -SH_SCRIPT_REMOVE_VOLUMES = """ -set -e; - -error_counter=0 - -function retry {{ - local retries=$1 - shift - - local count=0 - while true; - do - - local command_result - set +e - $($@ > /tmp/command_result 2>&1) - exit_code=$? - set -e - - command_result=$(cat /tmp/command_result) - echo "$command_result" - volume_name=$4 - - case "$command_result" in - *"Error: No such volume: $volume_name"*) - return 0 - ;; - esac - - if [ $exit_code -eq 0 ]; then - return 0 - fi - - count=$(($count + 1)) - if [ $count -lt $retries ]; then - echo "Retry $count/$retries exited $exit_code, retrying in {sleep} seconds..." - sleep {sleep} - else - echo "Retry $count/$retries exited $exit_code, no more retries left." - let error_counter=error_counter+1 - return 0 - fi - done - return 0 -}} - -for volume_name in {volume_names_seq} -do - retry {retries} docker volume rm "$volume_name" -done - -if [ "$error_counter" -ne "0" ]; then - echo "ERROR: Please check above logs, there was/were $error_counter error/s." 
- exit 1 -fi -""" - - -def spec_volume_removal_service( - swarm_stack_name: str, - docker_node_id: str, - user_id: UserID, - project_id: ProjectID, - node_uuid: NodeID, - volume_names: list[str], - docker_version: DockerVersion = DIND_VERSION, - *, - volume_removal_attempts: int, - sleep_between_attempts_s: int, - service_timeout_s: int, -) -> AioDockerServiceSpec: - """ - Generates a service spec for with base image - `docker:{docker_version}-dind` running the above bash script. - - The bash script will attempt to remove each individual volume - a few times before giving up. - The script will exit with error if it is not capable of - removing the volume. - - NOTE: expect the container of the service to exit with code 0, - otherwise there was an error. - NOTE: the bash script will exit 1 if it cannot find a - volume to remove. - NOTE: service must be removed once it finishes or it will - remain in the system. - NOTE: when running docker-in-docker https://hub.docker.com/_/docker - selecting the same version as the actual docker engine running - on the current node allows to avoid possible incompatible - versions. It is assumed that the same version of docker - will be running in the entire swarm. 
- """ - - volume_names_seq = " ".join(volume_names) - formatted_command = SH_SCRIPT_REMOVE_VOLUMES.format( - volume_names_seq=volume_names_seq, - retries=volume_removal_attempts, - sleep=sleep_between_attempts_s, - ) - logger.debug("Service will run:\n%s", formatted_command) - command = ["sh", "-c", formatted_command] - - create_service_params = { - "labels": { - "volume_names": json.dumps(volume_names), - "volume_removal_attempts": f"{volume_removal_attempts}", - "sleep_between_attempts_s": f"{sleep_between_attempts_s}", - "service_timeout_s": f"{service_timeout_s}", - "swarm_stack_name": swarm_stack_name, - "user_id": f"{user_id}", - "study_id": f"{project_id}", - "node_id": f"{node_uuid}", - }, - "name": f"{DYNAMIC_VOLUME_REMOVER_PREFIX}_{uuid4()}", - "task_template": { - "ContainerSpec": { - "Command": command, - "Image": f"docker:{docker_version}-dind", - "Mounts": [ - { - "Source": "/var/run/docker.sock", - "Target": "/var/run/docker.sock", - "Type": "bind", - } - ], - }, - "Placement": {"Constraints": [f"node.id == {docker_node_id}"]}, - "RestartPolicy": {"Condition": "none"}, - "Resources": { - "Reservations": { - "MemoryBytes": MEMORY_50MB, - "NanoCPUs": CPU_10_PERCENT, - }, - "Limits": {"MemoryBytes": MEMORY_250MB, "NanoCPUs": CPU_100_PERCENT}, - }, - }, - } - return AioDockerServiceSpec.parse_obj(create_service_params) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/scheduler/_core/_events_utils.py b/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/scheduler/_core/_events_utils.py index febf47040c0e..9dbe2763bc93 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/scheduler/_core/_events_utils.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/scheduler/_core/_events_utils.py @@ -27,6 +27,13 @@ from servicelib.fastapi.long_running_tasks.server import TaskProgress from servicelib.logging_utils import log_context 
from servicelib.rabbitmq import RabbitMQClient +from servicelib.rabbitmq._client_rpc import RabbitMQRPCClient +from servicelib.rabbitmq.rpc_interfaces.agent.errors import ( + NoServiceVolumesFoundRPCError, +) +from servicelib.rabbitmq.rpc_interfaces.agent.volumes import ( + remove_volumes_without_backup_for_service, +) from servicelib.utils import limited_gather, logged_gather from simcore_postgres_database.models.comp_tasks import NodeClass from tenacity import RetryError, TryAgain @@ -67,11 +74,9 @@ get_projects_networks_containers, remove_dynamic_sidecar_network, remove_dynamic_sidecar_stack, - remove_volumes_from_node, try_to_remove_network, ) from ...errors import EntrypointContainerNotFoundError -from ...volumes import DY_SIDECAR_SHARED_STORE_PATH, DynamicSidecarVolumesPathsResolver if TYPE_CHECKING: # NOTE: TYPE_CHECKING is True when static type checkers are running, @@ -230,30 +235,17 @@ async def service_remove_sidecar_proxy_docker_networks_and_volumes( task_progress.update( message="removing volumes", percent=ProgressPercent(0.3) ) - unique_volume_names = [ - DynamicSidecarVolumesPathsResolver.source( - path=volume_path, - node_uuid=scheduler_data.node_uuid, - run_id=scheduler_data.run_id, - ) - for volume_path in [ - DY_SIDECAR_SHARED_STORE_PATH, - scheduler_data.paths_mapping.inputs_path, - scheduler_data.paths_mapping.outputs_path, - *scheduler_data.paths_mapping.state_paths, - ] - ] - with log_context( - _logger, logging.DEBUG, f"removing volumes via service for {node_uuid}" - ): - await remove_volumes_from_node( - swarm_stack_name=swarm_stack_name, - volume_names=unique_volume_names, - docker_node_id=scheduler_data.dynamic_sidecar.docker_node_id, - user_id=scheduler_data.user_id, - project_id=scheduler_data.project_id, - node_uuid=scheduler_data.node_uuid, - ) + with log_context(_logger, logging.DEBUG, f"removing volumes '{node_uuid}'"): + rabbit_rpc_client: RabbitMQRPCClient = app.state.rabbitmq_rpc_client + try: + await 
remove_volumes_without_backup_for_service( + rabbit_rpc_client, + docker_node_id=scheduler_data.dynamic_sidecar.docker_node_id, + swarm_stack_name=swarm_stack_name, + node_id=scheduler_data.node_uuid, + ) + except NoServiceVolumesFoundRPCError as e: + _logger.info("Could not remove volumes, reason: %s", e) _logger.debug( "Removed dynamic-sidecar services and crated container for '%s'", diff --git a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/scheduler/_core/_scheduler.py b/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/scheduler/_core/_scheduler.py index 41031a60318d..1e66fd82c527 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/scheduler/_core/_scheduler.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/scheduler/_core/_scheduler.py @@ -93,7 +93,6 @@ class Scheduler( # pylint: disable=too-many-instance-attributes, too-many-publi ) _inverse_search_mapping: dict[NodeID, ServiceName] = field(default_factory=dict) _scheduler_task: Task | None = None - _cleanup_volume_removal_services_task: Task | None = None _trigger_observation_queue_task: Task | None = None _trigger_observation_queue: Queue = field(default_factory=Queue) _observation_counter: int = 0 @@ -123,10 +122,6 @@ async def start(self) -> None: name="dynamic-scheduler-trigger-obs-queue", ) - self._cleanup_volume_removal_services_task = asyncio.create_task( - _scheduler_utils.cleanup_volume_removal_services(self.app), - name="dynamic-scheduler-cleanup-volume-removal-services", - ) await _scheduler_utils.discover_running_services(self) async def shutdown(self) -> None: @@ -134,12 +129,6 @@ async def shutdown(self) -> None: self._inverse_search_mapping = {} self._to_observe = {} - if self._cleanup_volume_removal_services_task is not None: - self._cleanup_volume_removal_services_task.cancel() - with contextlib.suppress(asyncio.CancelledError): - await 
self._cleanup_volume_removal_services_task - self._cleanup_volume_removal_services_task = None - if self._scheduler_task is not None: await stop_periodic_task(self._scheduler_task, timeout=5) self._scheduler_task = None diff --git a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/scheduler/_core/_scheduler_utils.py b/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/scheduler/_core/_scheduler_utils.py index 7438e9d996f7..b03356770845 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/scheduler/_core/_scheduler_utils.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/scheduler/_core/_scheduler_utils.py @@ -1,4 +1,3 @@ -import asyncio import logging from typing import Final @@ -15,11 +14,7 @@ ) from .....models.dynamic_services_scheduler import DynamicSidecarStatus, SchedulerData from ...api_client import SidecarsClient, get_sidecars_client -from ...docker_api import ( - get_dynamic_sidecar_state, - get_dynamic_sidecars_to_observe, - remove_pending_volume_removal_services, -) +from ...docker_api import get_dynamic_sidecar_state, get_dynamic_sidecars_to_observe from ...docker_states import extract_containers_minimum_statuses from ...errors import DockerServiceNotFoundError from ._events_utils import service_push_outputs @@ -60,32 +55,6 @@ async def service_awaits_manual_interventions(scheduler_data: SchedulerData) -> return service_awaits_intervention -async def cleanup_volume_removal_services(app: FastAPI) -> None: - settings: DynamicServicesSchedulerSettings = ( - app.state.settings.DYNAMIC_SERVICES.DYNAMIC_SCHEDULER - ) - - _logger.debug( - "dynamic-sidecars cleanup pending volume removal services every %s seconds", - settings.DIRECTOR_V2_DYNAMIC_SCHEDULER_PENDING_VOLUME_REMOVAL_INTERVAL_S, - ) - while await asyncio.sleep( - settings.DIRECTOR_V2_DYNAMIC_SCHEDULER_PENDING_VOLUME_REMOVAL_INTERVAL_S, - result=True, - ): - 
_logger.debug("Removing pending volume removal services...") - - try: - await remove_pending_volume_removal_services(settings.SWARM_STACK_NAME) - except asyncio.CancelledError: - _logger.info("Stopped pending volume removal services task") - raise - except Exception: # pylint: disable=broad-except - _logger.exception( - "Unexpected error while cleaning up pending volume removal services" - ) - - async def discover_running_services(scheduler: "Scheduler") -> None: # type: ignore # noqa: F821 """discover all services which were started before and add them to the scheduler""" settings: DynamicServicesSchedulerSettings = ( diff --git a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/volumes.py b/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/volumes.py index 8a6d85c906be..d003eec60e60 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/volumes.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/volumes.py @@ -2,6 +2,9 @@ from pathlib import Path from typing import Any +from models_library.api_schemas_directorv2.services import ( + CHARS_IN_VOLUME_NAME_BEFORE_DIR_NAME, +) from models_library.projects import ProjectID from models_library.projects_nodes_io import NodeID from models_library.services import RunID @@ -133,7 +136,12 @@ def source(cls, path: Path, node_uuid: NodeID, run_id: RunID) -> str: # and state folders are very long and share the same subdirectory path. # Reversing volume name to prevent these issues from happening. 
reversed_volume_name = cls.volume_name(path)[::-1] - unique_name = f"{PREFIX_DYNAMIC_SIDECAR_VOLUMES}_{run_id}_{node_uuid}_{reversed_volume_name}" + + # ensure prefix size does not change + prefix = f"{PREFIX_DYNAMIC_SIDECAR_VOLUMES}_{run_id}_{node_uuid}" + assert len(prefix) == CHARS_IN_VOLUME_NAME_BEFORE_DIR_NAME - 1 # nosec + + unique_name = f"{prefix}_{reversed_volume_name}" return unique_name[:255] @classmethod diff --git a/services/director-v2/tests/conftest.py b/services/director-v2/tests/conftest.py index 937ba4a3f307..db64158d6d57 100644 --- a/services/director-v2/tests/conftest.py +++ b/services/director-v2/tests/conftest.py @@ -188,7 +188,7 @@ def mock_env( "REGISTRY_USER": "test", "SC_BOOT_MODE": "production", "SIMCORE_SERVICES_NETWORK_NAME": "test_network_name", - "SWARM_STACK_NAME": "test_swarm_name", + "SWARM_STACK_NAME": "pytest-simcore", "TRAEFIK_SIMCORE_ZONE": "test_traefik_zone", }, ) diff --git a/services/director-v2/tests/integration/02/test_dynamic_services_routes.py b/services/director-v2/tests/integration/02/test_dynamic_services_routes.py index dda042565376..4a4340338649 100644 --- a/services/director-v2/tests/integration/02/test_dynamic_services_routes.py +++ b/services/director-v2/tests/integration/02/test_dynamic_services_routes.py @@ -52,14 +52,15 @@ logger = logging.getLogger(__name__) pytest_simcore_core_services_selection = [ + "agent", "catalog", "director", "migration", "postgres", "rabbit", "redis", - "storage", "redis", + "storage", ] pytest_simcore_ops_services_selection = [ "adminer", diff --git a/services/director-v2/tests/integration/02/test_dynamic_sidecar_nodeports_integration.py b/services/director-v2/tests/integration/02/test_dynamic_sidecar_nodeports_integration.py index cd6f8bc22235..17d3fe4bcca8 100644 --- a/services/director-v2/tests/integration/02/test_dynamic_sidecar_nodeports_integration.py +++ b/services/director-v2/tests/integration/02/test_dynamic_sidecar_nodeports_integration.py @@ -100,6 +100,7 @@ from yarl 
import URL pytest_simcore_core_services_selection = [ + "agent", "catalog", "dask-scheduler", "dask-sidecar", @@ -381,7 +382,7 @@ def mock_env( "DYNAMIC_SIDECAR_IMAGE": image_name, "DYNAMIC_SIDECAR_PROMETHEUS_SERVICE_LABELS": "{}", "TRAEFIK_SIMCORE_ZONE": "test_traefik_zone", - "SWARM_STACK_NAME": "test_swarm_name", + "SWARM_STACK_NAME": "pytest-simcore", "SC_BOOT_MODE": "production", "DYNAMIC_SIDECAR_EXPOSE_PORT": "true", "DYNAMIC_SIDECAR_LOG_LEVEL": "DEBUG", diff --git a/services/director-v2/tests/integration/02/test_mixed_dynamic_sidecar_and_legacy_project.py b/services/director-v2/tests/integration/02/test_mixed_dynamic_sidecar_and_legacy_project.py index 4780c2f7a6f4..4d7c348a336a 100644 --- a/services/director-v2/tests/integration/02/test_mixed_dynamic_sidecar_and_legacy_project.py +++ b/services/director-v2/tests/integration/02/test_mixed_dynamic_sidecar_and_legacy_project.py @@ -40,6 +40,7 @@ pytest_simcore_core_services_selection = [ + "agent", "catalog", "director", "migration", @@ -81,7 +82,7 @@ def mock_env( env_vars: EnvVarsDict = { "DYNAMIC_SIDECAR_PROMETHEUS_SERVICE_LABELS": "{}", "TRAEFIK_SIMCORE_ZONE": "test_traefik_zone", - "SWARM_STACK_NAME": "test_swarm_name", + "SWARM_STACK_NAME": "pytest-simcore", "DYNAMIC_SIDECAR_LOG_LEVEL": "DEBUG", "SC_BOOT_MODE": "production", "DYNAMIC_SIDECAR_EXPOSE_PORT": "true", diff --git a/services/director-v2/tests/unit/conftest.py b/services/director-v2/tests/unit/conftest.py index f08ffd47337a..ecd7da595445 100644 --- a/services/director-v2/tests/unit/conftest.py +++ b/services/director-v2/tests/unit/conftest.py @@ -35,10 +35,6 @@ from simcore_service_director_v2.constants import DYNAMIC_SIDECAR_SCHEDULER_DATA_LABEL from simcore_service_director_v2.core.settings import AppSettings from simcore_service_director_v2.models.dynamic_services_scheduler import SchedulerData -from simcore_service_director_v2.modules.dynamic_sidecar.docker_service_specs.volume_remover import ( - DIND_VERSION, - DockerVersion, -) 
@pytest.fixture @@ -341,8 +337,3 @@ def mock_docker_api(mocker: MockerFixture) -> None: async def async_docker_client() -> AsyncIterable[aiodocker.Docker]: async with aiodocker.Docker() as docker_client: yield docker_client - - -@pytest.fixture -async def docker_version() -> DockerVersion: - return parse_obj_as(DockerVersion, DIND_VERSION) diff --git a/services/director-v2/tests/unit/test_modules_dynamic_sidecar_docker_service_specs_volume_remover.py b/services/director-v2/tests/unit/test_modules_dynamic_sidecar_docker_service_specs_volume_remover.py deleted file mode 100644 index 4f5672b4a533..000000000000 --- a/services/director-v2/tests/unit/test_modules_dynamic_sidecar_docker_service_specs_volume_remover.py +++ /dev/null @@ -1,230 +0,0 @@ -# pylint: disable=redefined-outer-name - -import contextlib -from pathlib import Path -from typing import AsyncIterator - -import pytest -from aiodocker import Docker, DockerError -from aiodocker.volumes import DockerVolume -from faker import Faker -from models_library.services import RunID -from pydantic import parse_obj_as -from simcore_service_director_v2.modules.dynamic_sidecar.docker_service_specs.volume_remover import ( - SH_SCRIPT_REMOVE_VOLUMES, - DockerVersion, -) - -# UTILS - - -def _get_source(run_id: RunID, node_uuid: str, volume_path: Path) -> str: - reversed_path = f"{volume_path}"[::-1].replace("/", "_") - return f"dyv_{run_id}_{node_uuid}_{reversed_path}" - - -async def run_command( - async_docker_client: Docker, docker_version: DockerVersion, volume_names: list[str] -) -> str: - volume_names_seq = " ".join(volume_names) - formatted_command = SH_SCRIPT_REMOVE_VOLUMES.format( - volume_names_seq=volume_names_seq, retries=3, sleep=0.1 - ) - print("Container will run:\n%s", formatted_command) - command = ["sh", "-c", formatted_command] - - container = await async_docker_client.containers.run( - config={ - "Cmd": command, - "Image": f"docker:{docker_version}-dind", - "HostConfig": {"Binds": 
["/var/run/docker.sock:/var/run/docker.sock"]}, - }, - ) - await container.start() - await container.wait() - - logs = await container.log(stderr=True, stdout=True) - - await container.delete(force=True) - - return "".join(logs) - - -# FIXTURES - - -@pytest.fixture -def swarm_stack_name() -> str: - return "test_stack" - - -@pytest.fixture -def study_id(faker: Faker) -> str: - return faker.uuid4() - - -@pytest.fixture -def node_uuid(faker: Faker) -> str: - return faker.uuid4() - - -@pytest.fixture -def run_id() -> RunID: - return RunID.create() - - -@pytest.fixture -def used_volume_path(tmp_path: Path) -> Path: - return tmp_path / "used_volume" - - -@pytest.fixture -def unused_volume_path(tmp_path: Path) -> Path: - return tmp_path / "unused_volume" - - -@pytest.fixture -async def unused_volume( - async_docker_client: Docker, - swarm_stack_name: str, - study_id: str, - node_uuid: str, - run_id: RunID, - unused_volume_path: Path, -) -> AsyncIterator[DockerVolume]: - source = _get_source(run_id, node_uuid, unused_volume_path) - volume = await async_docker_client.volumes.create( - { - "Name": source, - "Labels": { - "node_uuid": node_uuid, - "run_id": run_id, - "source": source, - "study_id": study_id, - "swarm_stack_name": swarm_stack_name, - "user_id": "1", - }, - } - ) - - yield volume - - with contextlib.suppress(DockerError): - await volume.delete() - - -@pytest.fixture -async def used_volume( - async_docker_client: Docker, - swarm_stack_name: str, - study_id: str, - node_uuid: str, - run_id: RunID, - used_volume_path: Path, -) -> AsyncIterator[DockerVolume]: - source = _get_source(run_id, node_uuid, used_volume_path) - volume = await async_docker_client.volumes.create( - { - "Name": source, - "Labels": { - "node_uuid": node_uuid, - "run_id": run_id, - "source": source, - "study_id": study_id, - "swarm_stack_name": swarm_stack_name, - "user_id": "1", - }, - } - ) - - container = await async_docker_client.containers.run( - config={ - "Cmd": ["/bin/ash", "-c", "sleep 
10000"], - "Image": "alpine:latest", - "HostConfig": {"Binds": [f"{volume.name}:{used_volume_path}"]}, - }, - name=f"using_volume_{volume.name}", - ) - await container.start() - - yield volume - - await container.delete(force=True) - await volume.delete() - - -@pytest.fixture -async def used_volume_name(used_volume: DockerVolume) -> str: - volume = await used_volume.show() - return volume["Name"] - - -@pytest.fixture -async def unused_volume_name(unused_volume: DockerVolume) -> str: - volume = await unused_volume.show() - return volume["Name"] - - -@pytest.fixture -def missing_volume_name(run_id: RunID, node_uuid: str) -> str: - return _get_source(run_id, node_uuid, Path("/MISSING/PATH")) - - -# TESTS - - -async def test_sh_script_error_if_volume_is_used( - async_docker_client: Docker, used_volume_name: str, docker_version: DockerVersion -): - command_stdout = await run_command( - async_docker_client, docker_version, volume_names=[used_volume_name] - ) - print(command_stdout) - assert "ERROR: Please check above logs, there was/were 1 error/s." 
in command_stdout - - -async def test_sh_script_removes_unused_volume( - async_docker_client: Docker, unused_volume_name: str, docker_version: DockerVersion -): - command_stdout = await run_command( - async_docker_client, docker_version, volume_names=[unused_volume_name] - ) - print(command_stdout) - assert "ERROR: Please check above logs, there was/were" not in command_stdout - assert command_stdout == f"{unused_volume_name}\n" - - -async def test_sh_script_no_error_if_volume_does_not_exist( - async_docker_client: Docker, missing_volume_name: str, docker_version: DockerVersion -): - command_stdout = await run_command( - async_docker_client, docker_version, volume_names=[missing_volume_name] - ) - print(command_stdout) - assert "ERROR: Please check above logs, there was/were" not in command_stdout - - -@pytest.mark.parametrize( - "docker_version", - [ - "20.10.17", - "20.10.17+azure-1-dind", # github workers - "20.10.17.", - "20.10.17asdjasjsaddas", - ], -) -def test_docker_version_strips_unwanted(docker_version: str): - assert parse_obj_as(DockerVersion, docker_version) == "20.10.17" - - -@pytest.mark.parametrize( - "invalid_docker_version", - [ - "nope", - ".20.10.17.", - ".20.10.17", - ], -) -def test_docker_version_invalid(invalid_docker_version: str): - with pytest.raises(ValueError): - parse_obj_as(DockerVersion, invalid_docker_version) diff --git a/services/director-v2/tests/unit/test_modules_dynamic_sidecar_scheduler_task.py b/services/director-v2/tests/unit/test_modules_dynamic_sidecar_scheduler_task.py index 124b156ff0e3..5410c37f2039 100644 --- a/services/director-v2/tests/unit/test_modules_dynamic_sidecar_scheduler_task.py +++ b/services/director-v2/tests/unit/test_modules_dynamic_sidecar_scheduler_task.py @@ -212,7 +212,7 @@ async def action(cls, app: FastAPI, scheduler_data: SchedulerData) -> None: @pytest.fixture def mock_remove_calls(mocker: MockerFixture) -> None: - mocker.patch.object(_events_utils, "remove_volumes_from_node") + 
mocker.patch.object(_events_utils, "remove_volumes_without_backup_for_service") @pytest.fixture(params=[True, False]) diff --git a/services/director-v2/tests/unit/test_modules_dynamic_sidecar_volumes_resolver.py b/services/director-v2/tests/unit/test_modules_dynamic_sidecar_volumes_resolver.py index cc64e2fd541a..b617c3da6375 100644 --- a/services/director-v2/tests/unit/test_modules_dynamic_sidecar_volumes_resolver.py +++ b/services/director-v2/tests/unit/test_modules_dynamic_sidecar_volumes_resolver.py @@ -9,6 +9,9 @@ import aiodocker import pytest from faker import Faker +from models_library.api_schemas_directorv2.services import ( + CHARS_IN_VOLUME_NAME_BEFORE_DIR_NAME, +) from models_library.projects import ProjectID from models_library.services import RunID from models_library.users import UserID @@ -144,6 +147,11 @@ def test_volumes_get_truncated_as_expected(faker: Faker): node_uuid=node_uuid, run_id=run_id, ) + + # if below fails the agent will have issues please check + constant_part = unique_volume_name[: CHARS_IN_VOLUME_NAME_BEFORE_DIR_NAME - 1] + assert constant_part == f"dyv_{run_id}_{node_uuid}" + assert len(unique_volume_name) == 255 assert f"{run_id}" in unique_volume_name assert f"{node_uuid}" in unique_volume_name diff --git a/services/director-v2/tests/unit/with_dbs/test_modules_dynamic_sidecar_docker_api.py b/services/director-v2/tests/unit/with_dbs/test_modules_dynamic_sidecar_docker_api.py index b08c5c0c00c9..0536261ed629 100644 --- a/services/director-v2/tests/unit/with_dbs/test_modules_dynamic_sidecar_docker_api.py +++ b/services/director-v2/tests/unit/with_dbs/test_modules_dynamic_sidecar_docker_api.py @@ -3,7 +3,6 @@ # pylint: disable=protected-access import asyncio -import contextlib import datetime import logging import sys @@ -14,9 +13,7 @@ import aiodocker import pytest from aiodocker.utils import clean_filters -from aiodocker.volumes import DockerVolume from faker import Faker -from fastapi.encoders import jsonable_encoder from 
models_library.docker import to_simcore_runtime_docker_label_key from models_library.projects import ProjectID from models_library.projects_nodes_io import NodeID @@ -27,7 +24,6 @@ DYNAMIC_PROXY_SERVICE_PREFIX, DYNAMIC_SIDECAR_SCHEDULER_DATA_LABEL, DYNAMIC_SIDECAR_SERVICE_PREFIX, - DYNAMIC_VOLUME_REMOVER_PREFIX, ) from simcore_service_director_v2.core.dynamic_services_settings.scheduler import ( DynamicServicesSchedulerSettings, @@ -47,10 +43,6 @@ from simcore_service_director_v2.modules.dynamic_sidecar.docker_api._utils import ( docker_client, ) -from simcore_service_director_v2.modules.dynamic_sidecar.docker_service_specs.volume_remover import ( - DockerVersion, - spec_volume_removal_service, -) from simcore_service_director_v2.modules.dynamic_sidecar.errors import ( DynamicSidecarError, GenericDockerError, @@ -798,194 +790,3 @@ async def test_constrain_service_to_node( label, value = node_id_constraint.split("==") assert label.strip() == "node.id" assert value.strip() == target_node_id - - -@pytest.fixture -async def named_volumes( - async_docker_client: aiodocker.Docker, faker: Faker -) -> AsyncIterator[list[str]]: - named_volumes: list[DockerVolume] = [] - volume_names: list[str] = [] - for _ in range(10): - named_volume: DockerVolume = await async_docker_client.volumes.create( - {"Name": f"named-volume-{faker.uuid4()}"} - ) - volume_names.append(named_volume.name) - named_volumes.append(named_volume) - - yield volume_names - - # remove volume if still present - for named_volume in named_volumes: - with contextlib.suppress(aiodocker.DockerError): - await named_volume.delete() - - -async def is_volume_present( - async_docker_client: aiodocker.Docker, volume_name: str -) -> bool: - list_of_volumes = await async_docker_client.volumes.list() - for volume in list_of_volumes.get("Volumes", []): - if volume["Name"] == volume_name: - return True - return False - - -async def test_remove_volume_from_node_ok( - docker_swarm: None, - async_docker_client: 
aiodocker.Docker, - named_volumes: list[str], - target_node_id: str, - user_id: UserID, - project_id: ProjectID, - node_uuid: NodeID, - dynamic_sidecar_settings: DynamicSidecarSettings, - dynamic_services_scheduler_settings: DynamicServicesSchedulerSettings, -): - for named_volume in named_volumes: - assert await is_volume_present(async_docker_client, named_volume) is True - - volume_removal_result = await docker_api.remove_volumes_from_node( - swarm_stack_name=dynamic_services_scheduler_settings.SWARM_STACK_NAME, - volume_names=named_volumes, - docker_node_id=target_node_id, - user_id=user_id, - project_id=project_id, - node_uuid=node_uuid, - ) - assert volume_removal_result is True - - for named_volume in named_volumes: - assert await is_volume_present(async_docker_client, named_volume) is False - - -async def test_remove_volume_from_node_no_volume_found( - docker_swarm: None, - async_docker_client: aiodocker.Docker, - named_volumes: list[str], - target_node_id: str, - user_id: UserID, - project_id: ProjectID, - node_uuid: NodeID, - dynamic_sidecar_settings: DynamicSidecarSettings, - dynamic_services_scheduler_settings: DynamicServicesSchedulerSettings, -): - missing_volume_name = "nope-i-am-fake-and-do-not-exist" - assert await is_volume_present(async_docker_client, missing_volume_name) is False - - # put the missing one in the middle of the sequence - volumes_to_remove = named_volumes[:1] + [missing_volume_name] + named_volumes[1:] - - volume_removal_result = await docker_api.remove_volumes_from_node( - swarm_stack_name=dynamic_services_scheduler_settings.SWARM_STACK_NAME, - volume_names=volumes_to_remove, - docker_node_id=target_node_id, - user_id=user_id, - project_id=project_id, - node_uuid=node_uuid, - volume_removal_attempts=2, - sleep_between_attempts_s=1, - ) - assert volume_removal_result is True - assert await is_volume_present(async_docker_client, missing_volume_name) is False - for named_volume in named_volumes: - assert await 
is_volume_present(async_docker_client, named_volume) is False - - -@pytest.fixture -def volume_removal_services_names(faker: Faker) -> set[str]: - return {f"{DYNAMIC_VOLUME_REMOVER_PREFIX}_{faker.uuid4()}" for _ in range(10)} - - -@pytest.fixture(params=[0, 2]) -def service_timeout_s(request: pytest.FixtureRequest) -> int: - return request.param # type: ignore - - -@pytest.fixture -async def ensure_fake_volume_removal_services( - async_docker_client: aiodocker.Docker, - docker_version: DockerVersion, - target_node_id: str, - user_id: UserID, - project_id: ProjectID, - node_uuid: NodeID, - volume_removal_services_names: list[str], - dynamic_services_scheduler_settings: DynamicServicesSchedulerSettings, - service_timeout_s: int, - docker_swarm: None, -) -> AsyncIterator[None]: - started_services_ids: list[str] = [] - - for service_name in volume_removal_services_names: - service_spec = spec_volume_removal_service( - swarm_stack_name=dynamic_services_scheduler_settings.SWARM_STACK_NAME, - docker_node_id=target_node_id, - user_id=user_id, - project_id=project_id, - node_uuid=node_uuid, - volume_names=[], - docker_version=docker_version, - volume_removal_attempts=0, - sleep_between_attempts_s=0, - service_timeout_s=service_timeout_s, - ) - - # replace values - service_spec.Name = service_name - # use very long sleep command - service_spec.TaskTemplate.ContainerSpec.Command = ["sh", "-c", "sleep 3600"] - - started_service = await async_docker_client.services.create( - **jsonable_encoder(service_spec, by_alias=True, exclude_unset=True) - ) - started_services_ids.append(started_service["ID"]) - - yield None - - for service_id in started_services_ids: - try: - await async_docker_client.services.delete(service_id) - except aiodocker.exceptions.DockerError as e: - assert e.message == f"service {service_id} not found" - - -async def _get_pending_services(async_docker_client: aiodocker.Docker) -> list[str]: - service_filters = {"name": [f"{DYNAMIC_VOLUME_REMOVER_PREFIX}"]} - 
return [ - x["Spec"]["Name"] - for x in await async_docker_client.services.list(filters=service_filters) - ] - - -async def test_get_volume_removal_services( - ensure_fake_volume_removal_services: None, - async_docker_client: aiodocker.Docker, - volume_removal_services_names: set[str], - dynamic_services_scheduler_settings: DynamicServicesSchedulerSettings, - service_timeout_s: int, -): - # services will be detected as timed out after 1 second - sleep_for = 1.01 - await asyncio.sleep(sleep_for) - - pending_service_names = await _get_pending_services(async_docker_client) - assert len(pending_service_names) == len(volume_removal_services_names) - - # check services are present before removing timed out services - for service_name in pending_service_names: - assert service_name in volume_removal_services_names - - await docker_api.remove_pending_volume_removal_services( - dynamic_services_scheduler_settings.SWARM_STACK_NAME - ) - - # check that timed out services have been removed - pending_service_names = await _get_pending_services(async_docker_client) - services_have_timed_out = sleep_for > service_timeout_s - if services_have_timed_out: - assert len(pending_service_names) == 0 - else: - assert len(pending_service_names) == len(volume_removal_services_names) - for service_name in pending_service_names: - assert service_name in volume_removal_services_names diff --git a/services/docker-compose.yml b/services/docker-compose.yml index 8e8f02db8a23..62b1c928456e 100644 --- a/services/docker-compose.yml +++ b/services/docker-compose.yml @@ -1005,6 +1005,12 @@ services: AGENT_VOLUMES_CLEANUP_S3_SECRET_KEY: ${AGENT_VOLUMES_CLEANUP_S3_SECRET_KEY} AGENT_VOLUMES_CLEANUP_S3_BUCKET: ${AGENT_VOLUMES_CLEANUP_S3_BUCKET} AGENT_VOLUMES_CLEANUP_S3_PROVIDER: ${AGENT_VOLUMES_CLEANUP_S3_PROVIDER} + AGENT_DOCKER_NODE_ID: "{{.Node.ID}}" + RABBIT_HOST: ${RABBIT_HOST} + RABBIT_PASSWORD: ${RABBIT_PASSWORD} + RABBIT_PORT: ${RABBIT_PORT} + RABBIT_USER: ${RABBIT_USER} + RABBIT_SECURE: 
${RABBIT_SECURE} dask-sidecar: image: ${DOCKER_REGISTRY:-itisfoundation}/dask-sidecar:${DOCKER_IMAGE_TAG:-latest} diff --git a/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/api/rpc/_services.py b/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/api/rpc/_services.py index 0687c58bac12..65fc96dd6601 100644 --- a/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/api/rpc/_services.py +++ b/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/api/rpc/_services.py @@ -50,16 +50,13 @@ async def run_dynamic_service( ) async def stop_dynamic_service( app: FastAPI, *, dynamic_service_stop: DynamicServiceStop -) -> NodeGet | DynamicServiceGet: +) -> None: director_v2_client = DirectorV2Client.get_from_app_state(app) settings: ApplicationSettings = app.state.settings - response: NodeGet | DynamicServiceGet = ( - await director_v2_client.stop_dynamic_service( - node_id=dynamic_service_stop.node_id, - simcore_user_agent=dynamic_service_stop.simcore_user_agent, - save_state=dynamic_service_stop.save_state, - timeout=settings.DYNAMIC_SCHEDULER_STOP_SERVICE_TIMEOUT, - ) + await director_v2_client.stop_dynamic_service( + node_id=dynamic_service_stop.node_id, + simcore_user_agent=dynamic_service_stop.simcore_user_agent, + save_state=dynamic_service_stop.save_state, + timeout=settings.DYNAMIC_SCHEDULER_STOP_SERVICE_TIMEOUT, ) await set_request_as_stopped(app, dynamic_service_stop) - return response diff --git a/services/payments/src/simcore_service_payments/api/rest/_dependencies.py b/services/payments/src/simcore_service_payments/api/rest/_dependencies.py index 913122a7854b..cc0ead808945 100644 --- a/services/payments/src/simcore_service_payments/api/rest/_dependencies.py +++ b/services/payments/src/simcore_service_payments/api/rest/_dependencies.py @@ -1,7 +1,7 @@ # mypy: disable-error-code=truthy-function import logging from collections.abc import AsyncGenerator, Callable -from typing import Annotated, cast +from typing 
import Annotated from fastapi import Depends, FastAPI, Request from fastapi.security import OAuth2PasswordBearer @@ -40,9 +40,7 @@ def get_settings(request: Request) -> ApplicationSettings: def get_rut_api(request: Request) -> ResourceUsageTrackerApi: - return cast( - ResourceUsageTrackerApi, ResourceUsageTrackerApi.get_from_app_state(request.app) - ) + return ResourceUsageTrackerApi.get_from_app_state(request.app) def get_from_app_state(