diff --git a/packages/pytest-simcore/src/pytest_simcore/helpers/docker.py b/packages/pytest-simcore/src/pytest_simcore/helpers/docker.py index dcccfa551416..58ef7d0919c3 100644 --- a/packages/pytest-simcore/src/pytest_simcore/helpers/docker.py +++ b/packages/pytest-simcore/src/pytest_simcore/helpers/docker.py @@ -37,7 +37,7 @@ class ContainerStatus(str, Enum): _COLOR_ENCODING_RE = re.compile(r"\x1B\[([0-9]{1,2}(;[0-9]{1,2})?)?[mGK]") _MAX_PATH_CHAR_LEN_ALLOWED = 260 -_kFILENAME_TOO_LONG = 36 +_FILENAME_TOO_LONG = 36 _NORMPATH_COUNT = 0 @@ -94,7 +94,7 @@ def get_service_published_port( ) for target_port in ports_to_look_for: - target_port = int(target_port) + target_port = int(target_port) # noqa: PLW2901 for p in service_ports: if p["TargetPort"] == target_port: published_port = p["PublishedPort"] @@ -158,7 +158,7 @@ def run_docker_compose_config( args = [f"{docker_compose_path}", *bash_options] print(" ".join(args)) - process = subprocess.run( + process = subprocess.run( # noqa: S603 args, cwd=project_dir, capture_output=True, @@ -189,7 +189,7 @@ def shorten_path(filename: str) -> Path: # This helper function tries to normalize the path # Another possibility would be that the path has some # problematic characters but so far we did not find any case ... - global _NORMPATH_COUNT # pylint: disable=global-statement + global _NORMPATH_COUNT # pylint: disable=global-statement # noqa: PLW0603 if len(filename) > _MAX_PATH_CHAR_LEN_ALLOWED: _NORMPATH_COUNT += 1 @@ -215,7 +215,7 @@ def safe_artifact_name(name: str) -> str: return BANNED_CHARS_FOR_ARTIFACTS.sub("_", name) -def save_docker_infos(destination_dir: Path): +def save_docker_infos(destination_dir: Path): # noqa: C901 client = docker.from_env() # Includes stop containers, which might be e.g. failing tasks @@ -228,7 +228,7 @@ def save_docker_infos(destination_dir: Path): destination_dir.mkdir(parents=True, exist_ok=True) except OSError as err: - if err.errno == _kFILENAME_TOO_LONG: + if err.errno == _FILENAME_TOO_LONG: destination_dir = shorten_path(err.filename) destination_dir.mkdir(parents=True, exist_ok=True) @@ -245,7 +245,7 @@ def save_docker_infos(destination_dir: Path): ) except OSError as err: - if err.errno == _kFILENAME_TOO_LONG: + if err.errno == _FILENAME_TOO_LONG: shorten_path(err.filename).write_text( _COLOR_ENCODING_RE.sub("", logs) ) @@ -256,12 +256,12 @@ def save_docker_infos(destination_dir: Path): json.dumps(container.attrs, indent=2) ) except OSError as err: - if err.errno == _kFILENAME_TOO_LONG: + if err.errno == _FILENAME_TOO_LONG: shorten_path(err.filename).write_text( json.dumps(container.attrs, indent=2) ) - except Exception as err: # pylint: disable=broad-except # noqa: PERF203 + except Exception as err: # pylint: disable=broad-except if container.status != ContainerStatus.created: print( f"Error while dumping {container.name=}, {container.status=}.\n\t{err=}" diff --git a/packages/pytest-simcore/src/pytest_simcore/helpers/paused_container.py b/packages/pytest-simcore/src/pytest_simcore/helpers/paused_container.py new file mode 100644 index 000000000000..20caed87f78c --- /dev/null +++ b/packages/pytest-simcore/src/pytest_simcore/helpers/paused_container.py @@ -0,0 +1,64 @@ +from collections.abc import AsyncIterator, Callable +from contextlib import AbstractAsyncContextManager, asynccontextmanager +from typing import TYPE_CHECKING, Protocol + +from tenacity.asyncio import AsyncRetrying +from tenacity.retry import retry_if_exception_type +from tenacity.stop import stop_after_delay +from tenacity.wait import 
wait_fixed + +if TYPE_CHECKING: + from servicelib.rabbitmq import RabbitMQClient + from servicelib.redis import RedisClientSDK + + +class _ClientWithPingProtocol(Protocol): + async def ping(self) -> bool: ... + + +@asynccontextmanager +async def _paused_container( + paused_container: Callable[[str], AbstractAsyncContextManager[None]], + container_name: str, + client: _ClientWithPingProtocol, +) -> AsyncIterator[None]: + async with paused_container(container_name): + yield + + async for attempt in AsyncRetrying( + wait=wait_fixed(0.1), + stop=stop_after_delay(10), + reraise=True, + retry=retry_if_exception_type(AssertionError), + ): + with attempt: + assert await client.ping() is True + + +@asynccontextmanager +async def pause_rabbit( + paused_container: Callable[[str], AbstractAsyncContextManager[None]], + rabbit_client: "RabbitMQClient", +) -> AsyncIterator[None]: + """ + Pause RabbitMQ container during the context block, + ensuring it's fully down before and back up after. + """ + async with _paused_container(paused_container, "rabbit", rabbit_client): + yield + + +@asynccontextmanager +async def pause_redis( + paused_container: Callable[[str], AbstractAsyncContextManager[None]], + redis_client: "RedisClientSDK", +) -> AsyncIterator[None]: + """ + Pause Redis container during the context block, + saving a DB snapshot first for a clean restore point. + Ensures Redis is down before yielding, and back up after. + """ + await redis_client.redis.save() + + async with _paused_container(paused_container, "redis", redis_client): + yield diff --git a/packages/service-library/tests/deferred_tasks/test_deferred_tasks.py b/packages/service-library/tests/deferred_tasks/test_deferred_tasks.py index 0bb6254542e9..04d73e22b5d1 100644 --- a/packages/service-library/tests/deferred_tasks/test_deferred_tasks.py +++ b/packages/service-library/tests/deferred_tasks/test_deferred_tasks.py @@ -2,15 +2,15 @@ # pylint:disable=unused-argument import asyncio -import contextlib +import datetime import itertools import json import random import sys -from collections.abc import AsyncIterable, AsyncIterator, Awaitable, Callable +from collections.abc import AsyncIterable, Awaitable, Callable from contextlib import AbstractAsyncContextManager, AsyncExitStack, suppress from pathlib import Path -from typing import Any, Protocol +from typing import Any import psutil import pytest @@ -18,6 +18,8 @@ from common_library.json_serialization import json_dumps from common_library.serialization import model_dump_with_secrets from pydantic import NonNegativeFloat, NonNegativeInt +from pytest_mock import MockerFixture +from pytest_simcore.helpers.paused_container import pause_rabbit, pause_redis from servicelib.rabbitmq import RabbitMQClient from servicelib.redis import RedisClientSDK from servicelib.sequences_utils import partition_gen @@ -330,57 +332,12 @@ async def rabbit_client( return create_rabbitmq_client("pinger") -class ClientWithPingProtocol(Protocol): - async def ping(self) -> bool: ... 
- - -class ServiceManager: - def __init__( - self, - redis_client: RedisClientSDK, - rabbit_client: RabbitMQClient, - paused_container: Callable[[str], AbstractAsyncContextManager[None]], - ) -> None: - self.redis_client = redis_client - self.rabbit_client = rabbit_client - self.paused_container = paused_container - - @contextlib.asynccontextmanager - async def _paused_container( - self, container_name: str, client: ClientWithPingProtocol - ) -> AsyncIterator[None]: - async with self.paused_container(container_name): - async for attempt in AsyncRetrying( - wait=wait_fixed(0.1), - stop=stop_after_delay(10), - reraise=True, - retry=retry_if_exception_type(AssertionError), - ): - with attempt: - assert await client.ping() is False - yield - - async for attempt in AsyncRetrying( - wait=wait_fixed(0.1), - stop=stop_after_delay(10), - reraise=True, - retry=retry_if_exception_type(AssertionError), - ): - with attempt: - assert await client.ping() is True - - @contextlib.asynccontextmanager - async def pause_rabbit(self) -> AsyncIterator[None]: - async with self._paused_container("rabbit", self.rabbit_client): - yield - - @contextlib.asynccontextmanager - async def pause_redis(self) -> AsyncIterator[None]: - # save db for clean restore point - await self.redis_client.redis.save() - - async with self._paused_container("redis", self.redis_client): - yield +@pytest.fixture +def mock_default_socket_timeout(mocker: MockerFixture) -> None: + mocker.patch( + "servicelib.redis._client.DEFAULT_SOCKET_TIMEOUT", + datetime.timedelta(seconds=0.25), + ) @pytest.mark.parametrize("max_workers", [10]) @@ -397,9 +354,6 @@ async def test_workflow_with_third_party_services_outages( deferred_tasks_to_start: int, service: str, ): - service_manager = ServiceManager( - redis_client_sdk_deferred_tasks, rabbit_client, paused_container - ) async with _RemoteProcessLifecycleManager( await get_remote_process(), @@ -423,14 +377,16 @@ async def test_workflow_with_third_party_services_outages( match service: case "rabbit": print("[rabbit]: pausing") - async with service_manager.pause_rabbit(): + async with pause_rabbit(paused_container, rabbit_client): print("[rabbit]: paused") await _sleep_in_interval(0.2, 0.4) print("[rabbit]: resumed") case "redis": print("[redis]: pausing") - async with service_manager.pause_redis(): + async with pause_redis( + paused_container, redis_client_sdk_deferred_tasks + ): print("[redis]: paused") await _sleep_in_interval(0.2, 0.4) print("[redis]: resumed") diff --git a/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/api/rest/_ops.py b/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/api/rest/_ops.py index 6af7b8f88ca7..80f5cad49de6 100644 --- a/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/api/rest/_ops.py +++ b/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/api/rest/_ops.py @@ -5,7 +5,7 @@ DynamicServiceGet, ) -from ...services import scheduler_interface +from ...services import common_interface from ._dependencies import ( get_app, ) @@ -19,6 +19,6 @@ async def running_services( ) -> list[DynamicServiceGet]: """returns all running dynamic services. 
Used by ops internall to determine when it is safe to shutdown the platform""" - return await scheduler_interface.list_tracked_dynamic_services( + return await common_interface.list_tracked_dynamic_services( app, user_id=None, project_id=None ) diff --git a/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/api/rpc/_services.py b/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/api/rpc/_services.py index b90ed821bfaf..e3a7fc3286b7 100644 --- a/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/api/rpc/_services.py +++ b/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/api/rpc/_services.py @@ -20,7 +20,7 @@ ServiceWasNotFoundError, ) -from ...services import scheduler_interface +from ...services import common_interface router = RPCRouter() @@ -29,7 +29,7 @@ async def list_tracked_dynamic_services( app: FastAPI, *, user_id: UserID | None = None, project_id: ProjectID | None = None ) -> list[DynamicServiceGet]: - return await scheduler_interface.list_tracked_dynamic_services( + return await common_interface.list_tracked_dynamic_services( app, user_id=user_id, project_id=project_id ) @@ -38,14 +38,14 @@ async def list_tracked_dynamic_services( async def get_service_status( app: FastAPI, *, node_id: NodeID ) -> NodeGet | DynamicServiceGet | NodeGetIdle: - return await scheduler_interface.get_service_status(app, node_id=node_id) + return await common_interface.get_service_status(app, node_id=node_id) @router.expose() async def run_dynamic_service( app: FastAPI, *, dynamic_service_start: DynamicServiceStart ) -> NodeGet | DynamicServiceGet: - return await scheduler_interface.run_dynamic_service( + return await common_interface.run_dynamic_service( app, dynamic_service_start=dynamic_service_start ) @@ -59,7 +59,7 @@ async def run_dynamic_service( async def stop_dynamic_service( app: FastAPI, *, dynamic_service_stop: DynamicServiceStop ) -> None: - return await scheduler_interface.stop_dynamic_service( + return await common_interface.stop_dynamic_service( app, dynamic_service_stop=dynamic_service_stop ) @@ -68,25 +68,25 @@ async def stop_dynamic_service( async def get_project_inactivity( app: FastAPI, *, project_id: ProjectID, max_inactivity_seconds: NonNegativeInt ) -> GetProjectInactivityResponse: - return await scheduler_interface.get_project_inactivity( + return await common_interface.get_project_inactivity( app, project_id=project_id, max_inactivity_seconds=max_inactivity_seconds ) @router.expose() async def restart_user_services(app: FastAPI, *, node_id: NodeID) -> None: - await scheduler_interface.restart_user_services(app, node_id=node_id) + await common_interface.restart_user_services(app, node_id=node_id) @router.expose() async def retrieve_inputs( app: FastAPI, *, node_id: NodeID, port_keys: list[ServicePortKey] ) -> RetrieveDataOutEnveloped: - return await scheduler_interface.retrieve_inputs( + return await common_interface.retrieve_inputs( app, node_id=node_id, port_keys=port_keys ) @router.expose() async def update_projects_networks(app: FastAPI, *, project_id: ProjectID) -> None: - await scheduler_interface.update_projects_networks(app, project_id=project_id) + await common_interface.update_projects_networks(app, project_id=project_id) diff --git a/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/core/events.py b/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/core/events.py index 1c293e78e710..d2b557401a42 100644 --- 
a/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/core/events.py +++ b/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/core/events.py @@ -23,6 +23,7 @@ from ..services.deferred_manager import deferred_manager_lifespan from ..services.director_v0 import director_v0_lifespan from ..services.director_v2 import director_v2_lifespan +from ..services.generic_scheduler import generic_scheduler_lifespan from ..services.notifier import get_notifier_lifespans from ..services.rabbitmq import rabbitmq_lifespan from ..services.redis import redis_lifespan @@ -79,6 +80,8 @@ def create_app_lifespan( for lifespan in get_notifier_lifespans(): app_lifespan.add(lifespan) + app_lifespan.add(generic_scheduler_lifespan) + app_lifespan.add(service_tracker_lifespan) app_lifespan.add(deferred_manager_lifespan) app_lifespan.add(status_monitor_lifespan) diff --git a/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/services/scheduler_interface.py b/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/services/common_interface.py similarity index 99% rename from services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/services/scheduler_interface.py rename to services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/services/common_interface.py index ff279fb75c98..7510fbb60d6f 100644 --- a/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/services/scheduler_interface.py +++ b/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/services/common_interface.py @@ -50,6 +50,8 @@ async def get_service_status( async def run_dynamic_service( app: FastAPI, *, dynamic_service_start: DynamicServiceStart ) -> NodeGet | DynamicServiceGet: + await set_request_as_running(app, dynamic_service_start) + settings: ApplicationSettings = app.state.settings if settings.DYNAMIC_SCHEDULER_USE_INTERNAL_SCHEDULER: raise NotImplementedError @@ -59,13 +61,14 @@ async def run_dynamic_service( await director_v2_client.run_dynamic_service(dynamic_service_start) ) - await set_request_as_running(app, dynamic_service_start) return response async def stop_dynamic_service( app: FastAPI, *, dynamic_service_stop: DynamicServiceStop ) -> None: + await set_request_as_stopped(app, dynamic_service_stop) + settings: ApplicationSettings = app.state.settings if settings.DYNAMIC_SCHEDULER_USE_INTERNAL_SCHEDULER: raise NotImplementedError @@ -78,8 +81,6 @@ async def stop_dynamic_service( timeout=settings.DYNAMIC_SCHEDULER_STOP_SERVICE_TIMEOUT, ) - await set_request_as_stopped(app, dynamic_service_stop) - async def get_project_inactivity( app: FastAPI, *, project_id: ProjectID, max_inactivity_seconds: NonNegativeInt diff --git a/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/services/generic_scheduler/__init__.py b/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/services/generic_scheduler/__init__.py new file mode 100644 index 000000000000..a294fccc2273 --- /dev/null +++ b/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/services/generic_scheduler/__init__.py @@ -0,0 +1,49 @@ +from ._core import ( + cancel_operation, + restart_operation_step_stuck_during_undo, + restart_operation_step_stuck_in_manual_intervention_during_create, + start_operation, +) +from ._deferred_runner import ( + get_operation_context_proxy, + get_step_group_proxy, + get_step_store_proxy, +) +from ._lifespan import generic_scheduler_lifespan +from ._models import ( + OperationName, + ProvidedOperationContext, + RequiredOperationContext, + 
ScheduleId, +) +from ._operation import ( + BaseStep, + Operation, + OperationRegistry, + ParallelStepGroup, + SingleStepGroup, +) +from ._store import OperationContextProxy, StepGroupProxy, StepStoreProxy + +__all__: tuple[str, ...] = ( + "BaseStep", + "cancel_operation", + "generic_scheduler_lifespan", + "get_operation_context_proxy", + "get_step_group_proxy", + "get_step_store_proxy", + "Operation", + "OperationContextProxy", + "OperationName", + "OperationRegistry", + "ParallelStepGroup", + "ProvidedOperationContext", + "RequiredOperationContext", + "restart_operation_step_stuck_during_undo", + "restart_operation_step_stuck_in_manual_intervention_during_create", + "ScheduleId", + "SingleStepGroup", + "start_operation", + "StepGroupProxy", + "StepStoreProxy", +) diff --git a/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/services/generic_scheduler/_core.py b/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/services/generic_scheduler/_core.py new file mode 100644 index 000000000000..8ee52189eefb --- /dev/null +++ b/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/services/generic_scheduler/_core.py @@ -0,0 +1,682 @@ +import asyncio +import logging +from collections.abc import Iterable +from contextlib import suppress +from datetime import timedelta +from typing import Final +from uuid import uuid4 + +from fastapi import FastAPI +from pydantic import NonNegativeInt +from servicelib.fastapi.app_state import SingletonInAppStateMixin +from servicelib.logging_utils import log_context +from servicelib.utils import limited_gather + +from ._core_utils import ( + PARALLEL_REQUESTS, + are_any_steps_in_a_progress_status, + cleanup_after_finishing, + get_group_step_proxies, + get_requires_manual_intervention, + get_step_error_traceback, + get_steps_statuses, + raise_if_overwrites_any_operation_provided_key, + safe_event, + set_unexpected_opration_state, + start_and_mark_as_started, + start_steps_which_were_not_started, +) +from ._deferred_runner import DeferredRunner +from ._errors import ( + CannotCancelWhileWaitingForManualInterventionError, + NoDataFoundError, + StepNameNotInCurrentGroupError, + StepNotInErrorStateError, + StepNotWaitingForManualInterventionError, + UnexpectedStepHandlingError, +) +from ._event import enqueue_schedule_event +from ._models import ( + OperationContext, + OperationErrorType, + OperationName, + ScheduleId, + StepName, + StepStatus, +) +from ._operation import ( + BaseStepGroup, + Operation, + OperationRegistry, +) +from ._store import ( + DeleteStepKeys, + OperationContextProxy, + ScheduleDataStoreProxy, + StepGroupProxy, + StepStoreProxy, + Store, +) + +_DEFAULT_UNKNOWN_STATUS_MAX_RETRY: Final[NonNegativeInt] = 3 +_DEFAULT_UNKNOWN_STATUS_WAIT_BEFORE_RETRY: Final[timedelta] = timedelta(seconds=1) + +_logger = logging.getLogger(__name__) + + +class Core(SingletonInAppStateMixin): + app_state_name: str = "generic_scheduler_core" + + def __init__( + self, + app: FastAPI, + unknown_status_max_retry: NonNegativeInt = _DEFAULT_UNKNOWN_STATUS_MAX_RETRY, + unknown_status_wait_before_retry: timedelta = _DEFAULT_UNKNOWN_STATUS_WAIT_BEFORE_RETRY, + ) -> None: + self.app = app + self.unknown_status_max_retry = unknown_status_max_retry + self.unknown_status_wait_before_retry = unknown_status_wait_before_retry + self._store: Store = Store.get_from_app_state(app) + + async def start_operation( + self, operation_name: OperationName, initial_operation_context: OperationContext + ) -> ScheduleId: + """start an operation by it's given 
name and providing an initial context""" + schedule_id: ScheduleId = f"{uuid4()}" + + # check if operation is registered + operation = OperationRegistry.get_operation(operation_name) + + # NOTE: to ensure reproducibility of operations, the + # operation steps cannot overwrite keys in the + # initial context with their results + raise_if_overwrites_any_operation_provided_key( + operation, initial_operation_context + ) + + schedule_data_proxy = ScheduleDataStoreProxy( + store=self._store, schedule_id=schedule_id + ) + await schedule_data_proxy.create_or_update_multiple( + { + "operation_name": operation_name, + "group_index": 0, + "is_creating": True, + } + ) + + operation_context_proxy = OperationContextProxy( + store=self._store, + schedule_id=schedule_id, + operation_name=operation_name, + ) + await operation_context_proxy.create_or_update(initial_operation_context) + + await enqueue_schedule_event(self.app, schedule_id) + return schedule_id + + async def cancel_operation(self, schedule_id: ScheduleId) -> None: + """ + Sets the operation to undo from the point at which it arrived: + - when is_creating=True: cancels all steps & moves operation to undo + - when is_creating=False: does nothing, since undo is already running + + # NOTE: SEE `_on_schedule_event` for more details + """ + schedule_data_proxy = ScheduleDataStoreProxy( + store=self._store, schedule_id=schedule_id + ) + + is_creating = await schedule_data_proxy.read("is_creating") + + if is_creating is False: + _logger.warning( + "Cannot cancel steps for schedule_id='%s' since UNDO is running", + schedule_id, + ) + return + + operation_name = await schedule_data_proxy.read("operation_name") + group_index = await schedule_data_proxy.read("group_index") + + operation = OperationRegistry.get_operation(operation_name) + group = operation[group_index] + + group_step_proxies = get_group_step_proxies( + self._store, + schedule_id=schedule_id, + operation_name=operation_name, + group_index=group_index, + step_group=group, + is_creating=is_creating, + ) + + # not allowed to cancel while waiting for manual intervention + if any( + await limited_gather( + *( + get_requires_manual_intervention(step) + for step in group_step_proxies.values() + ), + limit=PARALLEL_REQUESTS, + ) + ): + raise CannotCancelWhileWaitingForManualInterventionError( + schedule_id=schedule_id + ) + + async def _cancel_step(step_name: StepName, step_proxy: StepStoreProxy) -> None: + with log_context( # noqa: SIM117 + _logger, + logging.DEBUG, + f"Cancelling step {step_name=} of {operation_name=} for {schedule_id=}", + ): + with suppress(NoDataFoundError): + deferred_task_uid = await step_proxy.read("deferred_task_uid") + await DeferredRunner.cancel(deferred_task_uid) + await step_proxy.create_or_update("status", StepStatus.CANCELLED) + + await limited_gather( + *( + _cancel_step(step_name, step_proxy) + for step_name, step_proxy in group_step_proxies.items() + ), + limit=PARALLEL_REQUESTS, + ) + + async def restart_operation_step_stuck_in_error( + self, + schedule_id: ScheduleId, + step_name: StepName, + *, + in_manual_intervention: bool, + ) -> None: + """ + Force a step stuck in an error state to retry. + Will raise errors if step cannot be retried. 
+ """ + schedule_data_proxy = ScheduleDataStoreProxy( + store=self._store, schedule_id=schedule_id + ) + is_creating = await schedule_data_proxy.read("is_creating") + operation_name = await schedule_data_proxy.read("operation_name") + group_index = await schedule_data_proxy.read("group_index") + + operation = OperationRegistry.get_operation(operation_name) + step_group = operation[group_index] + step_group_name = step_group.get_step_group_name(index=group_index) + + if step_name not in { + step.get_step_name() for step in step_group.get_step_subgroup_to_run() + }: + raise StepNameNotInCurrentGroupError( + step_name=step_name, + step_group_name=step_group_name, + operation_name=operation_name, + ) + + step_proxy = StepStoreProxy( + store=self._store, + schedule_id=schedule_id, + operation_name=operation_name, + step_group_name=step_group_name, + step_name=step_name, + is_creating=is_creating, + ) + + try: + await step_proxy.read("error_traceback") + except NoDataFoundError as exc: + raise StepNotInErrorStateError(step_name=step_name) from exc + + step_keys_to_remove: list[DeleteStepKeys] = [ + "deferred_created", + "error_traceback", + "deferred_task_uid", + ] + if in_manual_intervention: + requires_manual_intervention: bool = False + with suppress(NoDataFoundError): + requires_manual_intervention = await step_proxy.read( + "requires_manual_intervention" + ) + + if requires_manual_intervention is False: + raise StepNotWaitingForManualInterventionError(step_name=step_name) + + step_keys_to_remove.append("requires_manual_intervention") + + # restart the step + schedule_data_proxy = ScheduleDataStoreProxy( + store=self._store, schedule_id=schedule_id + ) + group_proxy = StepGroupProxy( + store=self._store, + schedule_id=schedule_id, + operation_name=operation_name, + step_group_name=step_group_name, + is_creating=is_creating, + ) + + # remove previous entries for the step + await step_proxy.delete_keys(*step_keys_to_remove) + await schedule_data_proxy.delete_keys( + "operation_error_type", "operation_error_message" + ) + await group_proxy.decrement_and_get_done_steps_count() + + _logger.debug( + "Restarting step_name='%s' of operation_name='%s' for schedule_id='%s' after '%s'", + step_name, + operation_name, + schedule_id, + "manual intervention" if in_manual_intervention else "error in undo", + ) + # restart only this step + await start_and_mark_as_started( + step_proxy, + is_creating=is_creating, + expected_steps_count=len(step_group), + ) + + async def safe_on_schedule_event(self, schedule_id: ScheduleId) -> None: + async with safe_event(self._store, schedule_id): + with log_context( + _logger, + logging.DEBUG, + f"processing schedule_event for schedule_id={schedule_id}", + log_duration=True, + ): + await self._on_schedule_event(schedule_id) + + async def _on_schedule_event(self, schedule_id: ScheduleId) -> None: + """ + A schedule event is what advances the `operation` processing. Multiple schedule events + are required to complete an operation. + + An `operation` is moved from one `step group` to the next one until all `steps` are done. + Steps: always finish, automatically retry and are guaranteed to be in a final state + (SUCCESS, FAILED, CANCELLED). + Processing continues when all steps are in a final state. 
+ + From this point onwards an `operation` can be advanced in one of the following modes: + - `CREATING`: default mode when starting an operation + - runs the `create()` of each step in each group (`first` -> `last` group) + - when done, it removes all operation data + - `UNDOING`: undo the actions of `create()` in reverse order with respect to CREATING + - runs the `undo()` of each step in each group (`current` -> `first` group) + - when done, it removes all operation data + - `REPEATING`: repeats the `create()` of all steps in a group + - waits and runs the `create()` of all the steps in the last group of the operation + - never completes, unless operation is cancelled + + NOTE: `REPEATING` is triggered by setting `BaseStepGroup(repeat_steps=True)` during definition + of an `operation`. + NOTE: `UNDOING` is triggered by calling `cancel_operation()` or when a step finishes with + status `FAILED` or `CANCELLED` (except in manual intervention). + + There are 3 reasons why an operation will hang: + - MANUAL_INTERVENTION: step failed during `create()` and flagged for manual intervention + -> requires support intervention + - STEP_ISSUE: a step failed during `undo()` due to an error in the step's undo code + -> unexpected behaviour / requires developer intervention + - FRAMEWORK_ISSUE: a step failed during `undo()` because it was cancelled + -> unexpected behaviour / requires developer intervention + + NOTE: only MANUAL_INTERVENTION is allowed to happen; all other failures are to be treated + as bugs and reported. + """ + schedule_data_proxy = ScheduleDataStoreProxy( + store=self._store, schedule_id=schedule_id + ) + + operation_name = await schedule_data_proxy.read("operation_name") + is_creating = await schedule_data_proxy.read("is_creating") + group_index = await schedule_data_proxy.read("group_index") + + operation = OperationRegistry.get_operation(operation_name) + step_group = operation[group_index] + + group_step_proxies = get_group_step_proxies( + self._store, + schedule_id=schedule_id, + operation_name=operation_name, + group_index=group_index, + step_group=step_group, + is_creating=is_creating, + ) + + # 1) ensure all operation steps in the group are started before advancing + if await start_steps_which_were_not_started( + group_step_proxies, + is_creating=is_creating, + group_step_count=len(step_group), + ): + return + + # 2) wait for all steps to finish before advancing + steps_statuses = await get_steps_statuses(group_step_proxies.values()) + _logger.debug( + "DETECTED: steps_statuses=%s in operation=%s for schedule_id=%s", + steps_statuses, + operation_name, + schedule_id, + ) + if are_any_steps_in_a_progress_status(steps_statuses): + _logger.debug( + "operation_name='%s' has steps still in progress steps_statuses='%s'", + operation_name, + steps_statuses, + ) + return + + # 3) advance the operation according to its mode + step_group_name = step_group.get_step_group_name(index=group_index) + base_message = f"{step_group_name=} in {operation_name=} for {schedule_id=}" + + if step_group.repeat_steps is True and is_creating: + with log_context(_logger, logging.DEBUG, f"REPEATING {base_message}"): + await self._advance_as_repeating( + schedule_data_proxy, + schedule_id, + operation_name, + group_index, + step_group, + group_step_proxies, + ) + + elif is_creating: + with log_context(_logger, logging.DEBUG, f"CREATING {base_message}"): + await self._advance_as_creating( + steps_statuses, + schedule_data_proxy, + schedule_id, + operation_name, + group_index, + step_group, + operation, + ) + + else: 
with log_context(_logger, logging.DEBUG, f"UNDOING {base_message}"): + await self._advance_as_undoing( + steps_statuses, + schedule_data_proxy, + schedule_id, + operation_name, + group_index, + step_group, + ) + + async def _advance_as_repeating( + self, + schedule_data_proxy: ScheduleDataStoreProxy, + schedule_id: ScheduleId, + operation_name: OperationName, + group_index: NonNegativeInt, + current_step_group: BaseStepGroup, + group_step_proxies: dict[StepName, StepStoreProxy], + ) -> None: + # REPEATING logic: + # 1) sleep before repeating + # 2) if any of the repeating steps was cancelled -> move to undo + # 3) -> restart all steps in the group + + step_proxies: Iterable[StepStoreProxy] = group_step_proxies.values() + + # 1) sleep before repeating + await asyncio.sleep(current_step_group.wait_before_repeat.total_seconds()) + + # 2) if any of the repeating steps was cancelled -> move to undo + + # since some time passed, query all steps statuses again, + # a cancellation might have been requested + steps_statuses = await get_steps_statuses(step_proxies) + if any(status == StepStatus.CANCELLED for status in steps_statuses.values()): + # NOTE: a cancelled step moves the operation to undo + await schedule_data_proxy.create_or_update("is_creating", value=False) + await enqueue_schedule_event(self.app, schedule_id) + return + + # 3) -> restart all steps in the group + await limited_gather( + *(x.delete() for x in step_proxies), limit=PARALLEL_REQUESTS + ) + group_proxy = StepGroupProxy( + store=self._store, + schedule_id=schedule_id, + operation_name=operation_name, + step_group_name=current_step_group.get_step_group_name(index=group_index), + is_creating=True, + ) + await group_proxy.delete() + await enqueue_schedule_event(self.app, schedule_id) + + async def _advance_as_creating( + self, + steps_statuses: dict[StepName, StepStatus], + schedule_data_proxy: ScheduleDataStoreProxy, + schedule_id: ScheduleId, + operation_name: OperationName, + group_index: NonNegativeInt, + current_step_group: BaseStepGroup, + operation: Operation, + ) -> None: + # CREATION logic: + # 1) if all steps in group in SUCCESS + # - 1a) -> move to next group + # - 1b) if reached the end of the CREATE operation -> remove all created data + # 2) if manual intervention is required -> do nothing else + # 3) if any step in CANCELLED or FAILED (and not in manual intervention) -> move to undo + + # 1) if all steps in group in SUCCESS + if all(status == StepStatus.SUCCESS for status in steps_statuses.values()): + + # 1a) -> move to next group + try: + next_group_index = group_index + 1 + # does a next group exist? 
+ _ = operation[next_group_index] + await schedule_data_proxy.create_or_update( + "group_index", value=next_group_index + ) + await enqueue_schedule_event(self.app, schedule_id) + except IndexError: + + # 1b) if reached the end of the CREATE operation -> remove all created data + await cleanup_after_finishing( + self._store, schedule_id=schedule_id, is_creating=True + ) + + return + + # 2) if manual intervention is required -> do nothing else + manual_intervention_step_names: set[StepName] = set() + for step in current_step_group.get_step_subgroup_to_run(): + step_status = steps_statuses.get(step.get_step_name(), None) + if step_status == StepStatus.FAILED and step.wait_for_manual_intervention(): + step_proxy = StepStoreProxy( + store=self._store, + schedule_id=schedule_id, + operation_name=operation_name, + step_group_name=current_step_group.get_step_group_name( + index=group_index + ), + step_name=step.get_step_name(), + is_creating=True, + ) + await step_proxy.create_or_update( + "requires_manual_intervention", value=True + ) + manual_intervention_step_names.add(step.get_step_name()) + + if manual_intervention_step_names: + message = ( + f"Operation '{operation_name}' for schedule_id='{schedule_id}' " + f"requires manual intervention for steps: {manual_intervention_step_names}" + ) + _logger.warning(message) + await set_unexpected_opration_state( + self._store, schedule_id, OperationErrorType.STEP_ISSUE, message=message + ) + return + + # 3) if any step in CANCELLED or FAILED (and not in manual intervention) -> move to undo + if any( + s in {StepStatus.FAILED, StepStatus.CANCELLED} + for s in steps_statuses.values() + ): + with log_context( + _logger, + logging.DEBUG, + f"{operation_name=} was not successful: {steps_statuses=}, moving to undo", + ): + await schedule_data_proxy.create_or_update("is_creating", value=False) + await enqueue_schedule_event(self.app, schedule_id) + return + + raise UnexpectedStepHandlingError( + direction="creation", steps_statuses=steps_statuses, schedule_id=schedule_id + ) + + async def _advance_as_undoing( + self, + steps_statuses: dict[StepName, StepStatus], + schedule_data_proxy: ScheduleDataStoreProxy, + schedule_id: ScheduleId, + operation_name: OperationName, + group_index: NonNegativeInt, + current_step_group: BaseStepGroup, + ) -> None: + # UNDO logic: + # 1) if all steps in group in SUCCESS + # - 1a) if reached the end of the UNDO operation -> remove all created data + # - 1b) -> move to previous group + # 2) it is unexpected to have a FAILED step -> do nothing else + # 3) it is unexpected to have a CANCELLED step -> do nothing else + + # 1) if all steps in group in SUCCESS + if all(s == StepStatus.SUCCESS for s in steps_statuses.values()): + previous_group_index = group_index - 1 + if previous_group_index < 0: + + # 1a) if reached the end of the UNDO operation -> remove all created data + await cleanup_after_finishing( + self._store, schedule_id=schedule_id, is_creating=False + ) + return + + # 1b) -> move to previous group + await schedule_data_proxy.create_or_update( + "group_index", value=previous_group_index + ) + await enqueue_schedule_event(self.app, schedule_id) + return + + # 2) it is unexpected to have a FAILED step -> do nothing else + if failed_step_names := [ + n for n, s in steps_statuses.items() if s == StepStatus.FAILED + ]: + error_tracebacks: list[tuple[StepName, str]] = await limited_gather( + *( + get_step_error_traceback( + self._store, + schedule_id=schedule_id, + 
operation_name=operation_name, + current_step_group=current_step_group, + group_index=group_index, + step_name=step_name, + ) + for step_name in failed_step_names + ), + limit=PARALLEL_REQUESTS, + ) + + formatted_tracebacks = "\n".join( + f"Step '{step_name}':\n{traceback}" + for step_name, traceback in error_tracebacks + ) + message = ( + f"Operation 'undo' for schedule_id='{schedule_id}' failed for steps: " + f"'{failed_step_names}'. Step code should never fail during destruction, " + f"please report to developers:\n{formatted_tracebacks}" + ) + _logger.error(message) + await set_unexpected_opration_state( + self._store, schedule_id, OperationErrorType.STEP_ISSUE, message=message + ) + return + + # 3) it is unexpected to have a CANCELLED step -> do nothing else + if cancelled_step_names := [ + n for n, s in steps_statuses.items() if s == StepStatus.CANCELLED + ]: + message = ( + f"Operation 'undo' for schedule_id='{schedule_id}' was cancelled for steps: " + f"{cancelled_step_names}. This should not happen, and should be addressed." + ) + _logger.error(message) + await set_unexpected_opration_state( + self._store, + schedule_id, + OperationErrorType.FRAMEWORK_ISSUE, + message=message, + ) + return + + raise UnexpectedStepHandlingError( + direction="undo", steps_statuses=steps_statuses, schedule_id=schedule_id + ) + + +async def start_operation( + app: FastAPI, + operation_name: OperationName, + initial_operation_context: OperationContext, +) -> ScheduleId: + return await Core.get_from_app_state(app).start_operation( + operation_name, initial_operation_context + ) + + +async def cancel_operation(app: FastAPI, schedule_id: ScheduleId) -> None: + """ + Instruct the scheduler to undo all steps completed until + now for the running operation. + + `undoing` refers to the act of undoing the effects of a step + that has already been completed (eg: remove a created network) + """ + await Core.get_from_app_state(app).cancel_operation(schedule_id) + + +async def restart_operation_step_stuck_in_manual_intervention_during_create( + app: FastAPI, schedule_id: ScheduleId, step_name: StepName +) -> None: + """ + restarts a step waiting for manual intervention + NOTE: to be used only with steps where `wait_for_manual_intervention()` is True + + `waiting for manual intervention` refers to a step that has failed and exhausted + all retries and is now waiting for a human to fix the issue (eg: storage service + is reachable once again) + """ + await Core.get_from_app_state(app).restart_operation_step_stuck_in_error( + schedule_id, step_name, in_manual_intervention=True + ) + + +async def restart_operation_step_stuck_during_undo( + app: FastAPI, schedule_id: ScheduleId, step_name: StepName +) -> None: + """ + Restarts a `stuck step` while the operation is being undone + + `stuck step` is a step that has failed and exhausted all retries + `undoing` refers to the act of undoing the effects of a step + that has already been completed (eg: remove a created network) + """ + await Core.get_from_app_state(app).restart_operation_step_stuck_in_error( + schedule_id, step_name, in_manual_intervention=False + ) diff --git a/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/services/generic_scheduler/_core_utils.py b/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/services/generic_scheduler/_core_utils.py new file mode 100644 index 000000000000..bef6ef6fa6a1 --- /dev/null +++ b/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/services/generic_scheduler/_core_utils.py @@ -0,0 
+1,256 @@ +import logging +from collections.abc import AsyncIterator, Iterable +from contextlib import asynccontextmanager +from typing import Final + +from common_library.error_codes import create_error_code +from common_library.logging.logging_errors import create_troubleshooting_log_kwargs +from pydantic import NonNegativeInt +from servicelib.logging_utils import log_context +from servicelib.utils import limited_gather + +from ._deferred_runner import DeferredRunner +from ._errors import ( + InitialOperationContextKeyNotAllowedError, + NoDataFoundError, +) +from ._models import ( + OperationContext, + OperationErrorType, + OperationName, + ScheduleId, + StepName, + StepStatus, +) +from ._operation import ( + BaseStepGroup, + Operation, + get_operation_provided_context_keys, +) +from ._store import ( + OperationRemovalProxy, + ScheduleDataStoreProxy, + StepStoreProxy, + Store, +) + +_logger = logging.getLogger(__name__) + + +PARALLEL_REQUESTS: Final[NonNegativeInt] = 5 + + +_IN_PROGRESS_STATUSES: Final[set[StepStatus]] = { + StepStatus.SCHEDULED, + StepStatus.CREATED, + StepStatus.RUNNING, +} + + +def are_any_steps_in_a_progress_status( + steps_statuses: dict[StepName, StepStatus], +) -> bool: + return any(status in _IN_PROGRESS_STATUSES for status in steps_statuses.values()) + + +async def _get_step_status(step_proxy: StepStoreProxy) -> tuple[StepName, StepStatus]: + try: + status = await step_proxy.read("status") + except NoDataFoundError: + status = StepStatus.UNKNOWN + + return step_proxy.step_name, status + + +async def get_steps_statuses( + step_proxies: Iterable[StepStoreProxy], +) -> dict[StepName, StepStatus]: + result: list[tuple[StepName, StepStatus]] = await limited_gather( + *(_get_step_status(step) for step in step_proxies), + limit=PARALLEL_REQUESTS, + ) + return dict(result) + + +async def start_and_mark_as_started( + step_proxy: StepStoreProxy, + *, + is_creating: bool, + expected_steps_count: NonNegativeInt, +) -> None: + await DeferredRunner.start( + schedule_id=step_proxy.schedule_id, + operation_name=step_proxy.operation_name, + step_group_name=step_proxy.step_group_name, + step_name=step_proxy.step_name, + is_creating=is_creating, + expected_steps_count=expected_steps_count, + ) + await step_proxy.create_or_update_multiple( + {"deferred_created": True, "status": StepStatus.SCHEDULED} + ) + + +def raise_if_overwrites_any_operation_provided_key( + operation: Operation, initial_operation_context: OperationContext +) -> None: + operation_provided_context_keys = get_operation_provided_context_keys(operation) + for key in initial_operation_context: + if key in operation_provided_context_keys: + raise InitialOperationContextKeyNotAllowedError( + key=key, operation=operation + ) + + +async def get_step_error_traceback( + store: Store, + *, + schedule_id: ScheduleId, + operation_name: OperationName, + current_step_group: BaseStepGroup, + group_index: NonNegativeInt, + step_name: StepName, +) -> tuple[StepName, str]: + step_proxy = StepStoreProxy( + store=store, + schedule_id=schedule_id, + operation_name=operation_name, + step_group_name=current_step_group.get_step_group_name(index=group_index), + step_name=step_name, + is_creating=False, + ) + return step_name, await step_proxy.read("error_traceback") + + +def get_group_step_proxies( + store: Store, + *, + schedule_id: ScheduleId, + operation_name: OperationName, + group_index: NonNegativeInt, + step_group: BaseStepGroup, + is_creating: bool, +) -> dict[StepName, StepStoreProxy]: + return { + step.get_step_name(): 
StepStoreProxy( + store=store, + schedule_id=schedule_id, + operation_name=operation_name, + step_group_name=step_group.get_step_group_name(index=group_index), + step_name=step.get_step_name(), + is_creating=is_creating, + ) + for step in step_group.get_step_subgroup_to_run() + } + + +async def _get_was_step_started( + step_proxy: StepStoreProxy, +) -> tuple[bool, StepStoreProxy]: + try: + was_started = (await step_proxy.read("deferred_created")) is True + except NoDataFoundError: + was_started = False + + return was_started, step_proxy + + +async def _get_steps_to_start( + step_proxies: Iterable[StepStoreProxy], +) -> list[StepStoreProxy]: + result: list[tuple[bool, StepStoreProxy]] = await limited_gather( + *(_get_was_step_started(step) for step in step_proxies), + limit=PARALLEL_REQUESTS, + ) + return [proxy for was_started, proxy in result if was_started is False] + + +async def start_steps_which_were_not_started( + group_step_proxies: dict[StepName, StepStoreProxy], + *, + is_creating: bool, + group_step_count: NonNegativeInt, +) -> bool: + """returns True if any step was started""" + started_count: NonNegativeInt = 0 + if to_start_step_proxies := await _get_steps_to_start(group_step_proxies.values()): + steps_to_start_names = [ + step_proxy.step_name for step_proxy in to_start_step_proxies + ] + with log_context( + _logger, + logging.DEBUG, + f"starting steps: {steps_to_start_names=}", + ): + await limited_gather( + *( + start_and_mark_as_started( + step_proxy, + is_creating=is_creating, + expected_steps_count=group_step_count, + ) + for step_proxy in to_start_step_proxies + ), + limit=PARALLEL_REQUESTS, + ) + started_count = len(to_start_step_proxies) + return started_count > 0 + + +async def cleanup_after_finishing( + store: Store, *, schedule_id: ScheduleId, is_creating: bool +) -> None: + removal_proxy = OperationRemovalProxy(store=store, schedule_id=schedule_id) + await removal_proxy.delete() + verb = "COMPLETED" if is_creating else "UNDONE" + _logger.debug("Operation for schedule_id='%s' %s successfully", schedule_id, verb) + + +async def get_requires_manual_intervention(step_proxy: StepStoreProxy) -> bool: + try: + return await step_proxy.read("requires_manual_intervention") + except NoDataFoundError: + return False + + +async def set_unexpected_opration_state( + store: Store, + schedule_id: ScheduleId, + operation_error_type: OperationErrorType, + message: str, +) -> None: + schedule_data_proxy = ScheduleDataStoreProxy(store=store, schedule_id=schedule_id) + await schedule_data_proxy.create_or_update_multiple( + { + "operation_error_type": operation_error_type, + "operation_error_message": message, + } + ) + + +@asynccontextmanager +async def safe_event(store: Store, schedule_id: ScheduleId) -> AsyncIterator[None]: + try: + yield + except NoDataFoundError as err: + _logger.debug( + "Cannot process schedule_id='%s' since its data was not found: %s", + schedule_id, + err, + ) + except Exception as err: # pylint:disable=broad-exception-caught + error_code = create_error_code(err) + log_kwargs = create_troubleshooting_log_kwargs( + "Unexpected error during scheduling", + error=err, + error_code=error_code, + error_context={"schedule_id": schedule_id}, + tip="This is a bug, please report it to the developers", + ) + _logger.exception(**log_kwargs) + await set_unexpected_opration_state( + store, + schedule_id, + OperationErrorType.FRAMEWORK_ISSUE, + message=log_kwargs["msg"], + ) diff --git 
a/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/services/generic_scheduler/_deferred_runner.py b/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/services/generic_scheduler/_deferred_runner.py new file mode 100644 index 000000000000..f3ee304c1923 --- /dev/null +++ b/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/services/generic_scheduler/_deferred_runner.py @@ -0,0 +1,232 @@ +from datetime import timedelta + +from fastapi import FastAPI +from pydantic import NonNegativeInt +from servicelib.deferred_tasks import BaseDeferredHandler, DeferredContext, TaskUID +from servicelib.deferred_tasks._models import TaskResultError + +from ._errors import ( + OperationContextValueIsNoneError, + ProvidedOperationContextKeysAreMissingError, +) +from ._event import enqueue_schedule_event +from ._models import ( + OperationContext, + OperationName, + ProvidedOperationContext, + ScheduleId, + StepGroupName, + StepName, + StepStatus, +) +from ._operation import BaseStep, OperationRegistry +from ._store import ( + OperationContextProxy, + StepGroupProxy, + StepStoreProxy, + Store, +) + + +def get_step_store_proxy(context: DeferredContext) -> StepStoreProxy: + app: FastAPI = context["app"] + schedule_id: ScheduleId = context["schedule_id"] + operation_name: OperationName = context["operation_name"] + step_group_name: StepGroupName = context["step_group_name"] + step_name: StepName = context["step_name"] + is_creating = context["is_creating"] + + return StepStoreProxy( + store=Store.get_from_app_state(app), + schedule_id=schedule_id, + operation_name=operation_name, + step_group_name=step_group_name, + step_name=step_name, + is_creating=is_creating, + ) + + +def get_step_group_proxy(context: DeferredContext) -> StepGroupProxy: + app: FastAPI = context["app"] + schedule_id: ScheduleId = context["schedule_id"] + operation_name: OperationName = context["operation_name"] + step_group_name: StepGroupName = context["step_group_name"] + is_creating = context["is_creating"] + + return StepGroupProxy( + store=Store.get_from_app_state(app), + schedule_id=schedule_id, + operation_name=operation_name, + step_group_name=step_group_name, + is_creating=is_creating, + ) + + +def get_operation_context_proxy(context: DeferredContext) -> OperationContextProxy: + app: FastAPI = context["app"] + schedule_id: ScheduleId = context["schedule_id"] + operation_name: OperationName = context["operation_name"] + + return OperationContextProxy( + store=Store.get_from_app_state(app), + schedule_id=schedule_id, + operation_name=operation_name, + ) + + +def _get_step(context: DeferredContext) -> type[BaseStep]: + operation_name: OperationName = context["operation_name"] + step_name: StepName = context["step_name"] + return OperationRegistry.get_step(operation_name, step_name) + + +async def _enqueue_schedule_event_if_group_is_done(context: DeferredContext) -> None: + # used to avoid concurrency issues when multiples steps finish "at the same time" + app: FastAPI = context["app"] + schedule_id: ScheduleId = context["schedule_id"] + expected_steps_count: NonNegativeInt = context["expected_steps_count"] + + if ( + await get_step_group_proxy(context).increment_and_get_done_steps_count() + == expected_steps_count + ): + await enqueue_schedule_event(app, schedule_id) + + +def _raise_if_any_context_value_is_none( + operation_context: OperationContext, +) -> None: + if any(value is None for value in operation_context.values()): + raise 
OperationContextValueIsNoneError(operation_context=operation_context) + + +def _raise_if_provided_context_keys_are_missing_or_none( + provided_context: ProvidedOperationContext, + expected_keys: set[str], +) -> None: + missing_keys = expected_keys - provided_context.keys() + if missing_keys: + raise ProvidedOperationContextKeysAreMissingError( + provided_context=provided_context, + missing_keys=missing_keys, + expected_keys=expected_keys, + ) + + _raise_if_any_context_value_is_none(provided_context) + + +class DeferredRunner(BaseDeferredHandler[None]): + @classmethod + async def start( # type:ignore[override] # pylint:disable=arguments-differ + cls, + *, + schedule_id: ScheduleId, + operation_name: OperationName, + step_group_name: StepGroupName, + step_name: StepName, + is_creating: bool, + expected_steps_count: NonNegativeInt, + ) -> DeferredContext: + return { + "schedule_id": schedule_id, + "operation_name": operation_name, + "step_group_name": step_group_name, + "step_name": step_name, + "is_creating": is_creating, + "expected_steps_count": expected_steps_count, + } + + @classmethod + async def get_retries(cls, context: DeferredContext) -> int: + is_creating = context["is_creating"] + step = _get_step(context) + return ( + await step.get_create_retries(context) + if is_creating + else await step.get_undo_retries(context) + ) + + @classmethod + async def get_timeout(cls, context: DeferredContext) -> timedelta: + is_creating = context["is_creating"] + step = _get_step(context) + return ( + await step.get_create_wait_between_attempts(context) + if is_creating + else await step.get_undo_wait_between_attempts(context) + ) + + @classmethod + async def on_created(cls, task_uid: TaskUID, context: DeferredContext) -> None: + await get_step_store_proxy(context).create_or_update_multiple( + {"deferred_task_uid": task_uid, "status": StepStatus.CREATED} + ) + + @classmethod + async def run(cls, context: DeferredContext) -> None: + app = context["app"] + is_creating = context["is_creating"] + + await get_step_store_proxy(context).create_or_update( + "status", StepStatus.RUNNING + ) + + step = _get_step(context) + + operation_context_proxy = get_operation_context_proxy(context) + + if is_creating: + required_context = await operation_context_proxy.read( + *step.get_create_requires_context_keys() + ) + _raise_if_any_context_value_is_none(required_context) + + step_provided_operation_context = await step.create(app, required_context) + provided_operation_context = step_provided_operation_context or {} + create_provides_keys = step.get_create_provides_context_keys() + + _raise_if_provided_context_keys_are_missing_or_none( + provided_operation_context, create_provides_keys + ) + else: + required_context = await operation_context_proxy.read( + *step.get_undo_requires_context_keys() + ) + _raise_if_any_context_value_is_none(required_context) + + step_provided_operation_context = await step.undo(app, required_context) + provided_operation_context = step_provided_operation_context or {} + undo_provides_keys = step.get_undo_provides_context_keys() + + _raise_if_provided_context_keys_are_missing_or_none( + provided_operation_context, undo_provides_keys + ) + + await operation_context_proxy.create_or_update(provided_operation_context) + + @classmethod + async def on_result(cls, result: None, context: DeferredContext) -> None: + _ = result + await get_step_store_proxy(context).create_or_update( + "status", StepStatus.SUCCESS + ) + + await _enqueue_schedule_event_if_group_is_done(context) + + @classmethod + 
async def on_finished_with_error( + cls, error: TaskResultError, context: DeferredContext + ) -> None: + await get_step_store_proxy(context).create_or_update_multiple( + {"status": StepStatus.FAILED, "error_traceback": error.format_error()} + ) + + await _enqueue_schedule_event_if_group_is_done(context) + + @classmethod + async def on_cancelled(cls, context: DeferredContext) -> None: + await get_step_store_proxy(context).create_or_update( + "status", StepStatus.CANCELLED + ) + + await _enqueue_schedule_event_if_group_is_done(context) diff --git a/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/services/generic_scheduler/_errors.py b/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/services/generic_scheduler/_errors.py new file mode 100644 index 000000000000..bb24cac569e2 --- /dev/null +++ b/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/services/generic_scheduler/_errors.py @@ -0,0 +1,71 @@ +from common_library.errors_classes import OsparcErrorMixin + + +class BaseGenericSchedulerError(OsparcErrorMixin, Exception): + """base exception for this module""" + + +class NoDataFoundError(BaseGenericSchedulerError): + msg_template: str = "Key '{key}' not found in hash '{hash_key}'" + + +class OperationAlreadyRegisteredError(BaseGenericSchedulerError): + msg_template: str = "Operation '{operation_name}' already registered" + + +class OperationNotFoundError(BaseGenericSchedulerError): + msg_template: str = ( + "Operation '{operation_name}' was not found, registered_operations='{registered_operations}'" + ) + + +class StepNotFoundInoperationError(BaseGenericSchedulerError): + msg_template: str = ( + "Step '{step_name}' not found in steps_names='{steps_names}' for operation '{operation_name}'" + ) + + +class UnexpectedStepHandlingError(BaseGenericSchedulerError): + msg_template: str = ( + "During '{direction}' of steps_statuses='{steps_statuses}' for schedule_id='{schedule_id}' " + "reached the end of the handler. This should not happen." + ) + + +class OperationContextValueIsNoneError(BaseGenericSchedulerError): + msg_template: str = "Values of context cannot be None: {operation_context}" + + +class ProvidedOperationContextKeysAreMissingError(BaseGenericSchedulerError): + msg_template: str = ( + "Provided context {provided_context} is missing keys {missing_keys}, was expecting {expected_keys}" + ) + + +class InitialOperationContextKeyNotAllowedError(BaseGenericSchedulerError): + msg_template: str = ( + "Initial operation context cannot contain key '{key}' that would " + "be overwritten by a step in the operation: {operation}" + ) + + +class CannotCancelWhileWaitingForManualInterventionError(BaseGenericSchedulerError): + msg_template: str = ( + "Cannot cancel schedule_id='{schedule_id}' while one or more steps are waiting for manual intervention." 
+ ) + + +class StepNameNotInCurrentGroupError(BaseGenericSchedulerError): + msg_template: str = ( + "step_name='{step_name}' not in current step_group_name='{step_group_name}' of operation_name='{operation_name}'" + ) + + +class StepNotInErrorStateError(BaseGenericSchedulerError): + msg_template: str = ( + "step_name='{step_name}' is not in an error state and cannot be restarted" + ) + + +class StepNotWaitingForManualInterventionError(BaseGenericSchedulerError): + msg_template: str = "step_name='{step_name}' is not waiting for manual intervention" diff --git a/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/services/generic_scheduler/_event.py b/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/services/generic_scheduler/_event.py new file mode 100644 index 000000000000..ed0a64456fcb --- /dev/null +++ b/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/services/generic_scheduler/_event.py @@ -0,0 +1,13 @@ +from typing import TYPE_CHECKING + +from fastapi import FastAPI + +from ._models import ScheduleId + +if TYPE_CHECKING: + from ._event_scheduler import EventScheduler + + +async def enqueue_schedule_event(app: FastAPI, schedule_id: ScheduleId) -> None: + event_scheduler: EventScheduler = app.state.generic_scheduler_event_scheduler + await event_scheduler.enqueue_schedule_event(schedule_id) diff --git a/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/services/generic_scheduler/_event_scheduler.py b/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/services/generic_scheduler/_event_scheduler.py new file mode 100644 index 000000000000..12c36d15131e --- /dev/null +++ b/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/services/generic_scheduler/_event_scheduler.py @@ -0,0 +1,112 @@ +import functools +import logging +from typing import Final + +from fastapi import FastAPI +from faststream.exceptions import FastStreamException, RejectMessage +from faststream.rabbit import ( + ExchangeType, + RabbitBroker, + RabbitExchange, + RabbitQueue, + RabbitRouter, +) +from faststream.rabbit.schemas.queue import ClassicQueueArgs +from servicelib.fastapi.app_state import SingletonInAppStateMixin + +from ...core.settings import ApplicationSettings +from ._core import Core +from ._lifecycle_protocol import SupportsLifecycle +from ._models import ScheduleId + +_logger = logging.getLogger(__name__) + + +_EXCHANGE_NAME: Final[str] = __name__ + + +def _get_global_queue( + queue_name: str, arguments: ClassicQueueArgs | None = None +) -> RabbitQueue: + return RabbitQueue( + f"{_EXCHANGE_NAME}_{queue_name}", durable=True, arguments=arguments + ) + + +def _stop_retry_for_unintended_errors(func): + """ + Stops FastStream's retry chain when an unexpected error is raised (bug or otherwise). + This is especially important when the subscribers have ``retry=True``. + + Only propagate FastStream error that handle message acknowledgement. + """ + + @functools.wraps(func) + async def wrapper(*args, **kwargs): + try: + return await func(*args, **kwargs) + except Exception as e: + if isinstance(e, FastStreamException): + # if there are issues with Redis or FastStream (core dependencies) + # message is always retried + raise + + msg = ( + "Unexpected error. Aborting message retry. 
" + f"Please check code at: '{func.__module__}.{func.__name__}'" + ) + _logger.exception(msg) + raise RejectMessage from e + + return wrapper + + +class EventScheduler(SingletonInAppStateMixin, SupportsLifecycle): + """Handles scheduling of single events for a given schedule_id""" + + app_state_name: str = "generic_scheduler_event_scheduler" + + def __init__(self, app: FastAPI) -> None: + self.app = app + + settings: ApplicationSettings = app.state.settings + + self._broker: RabbitBroker = RabbitBroker( + settings.DYNAMIC_SCHEDULER_RABBITMQ.dsn, log_level=logging.DEBUG + ) + self._router: RabbitRouter = RabbitRouter() + self._exchange = RabbitExchange( + _EXCHANGE_NAME, durable=True, type=ExchangeType.DIRECT + ) + self._queue_schedule_event = _get_global_queue(queue_name="schedule_queue") + + @_stop_retry_for_unintended_errors + async def _on_safe_on_schedule_event( # pylint:disable=method-hidden + self, schedule_id: ScheduleId + ) -> None: + await Core.get_from_app_state(self.app).safe_on_schedule_event(schedule_id) + + async def enqueue_schedule_event(self, schedule_id: ScheduleId) -> None: + await self._broker.publish( + schedule_id, + queue=self._queue_schedule_event, + exchange=self._exchange, + ) + + def _register_subscribers(self) -> None: + # pylint:disable=unexpected-keyword-arg + # pylint:disable=no-value-for-parameter + self._on_safe_on_schedule_event = self._router.subscriber( + queue=self._queue_schedule_event, + exchange=self._exchange, + retry=True, + )(self._on_safe_on_schedule_event) + + async def setup(self) -> None: + self._register_subscribers() + self._broker.include_router(self._router) + + await self._broker.start() + + async def shutdown(self) -> None: + await self._broker.close() diff --git a/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/services/generic_scheduler/_lifecycle_protocol.py b/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/services/generic_scheduler/_lifecycle_protocol.py new file mode 100644 index 000000000000..13c9b12ec8b3 --- /dev/null +++ b/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/services/generic_scheduler/_lifecycle_protocol.py @@ -0,0 +1,9 @@ +from typing import Protocol + + +class SupportsLifecycle(Protocol): + async def setup(self) -> None: + """initialize resource or compoennts""" + + async def shutdown(self) -> None: + """clean resource or components""" diff --git a/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/services/generic_scheduler/_lifespan.py b/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/services/generic_scheduler/_lifespan.py new file mode 100644 index 000000000000..5e801014b159 --- /dev/null +++ b/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/services/generic_scheduler/_lifespan.py @@ -0,0 +1,34 @@ +from collections.abc import AsyncIterator + +from fastapi import FastAPI +from fastapi_lifespan_manager import State + +from ...core.settings import ApplicationSettings +from ._core import Core +from ._event_scheduler import EventScheduler +from ._lifecycle_protocol import SupportsLifecycle +from ._store import Store + + +async def generic_scheduler_lifespan(app: FastAPI) -> AsyncIterator[State]: + # store + settings: ApplicationSettings = app.state.settings + store = Store(settings.DYNAMIC_SCHEDULER_REDIS) + store.set_to_app_state(app) + + # core + Core(app).set_to_app_state(app) + + # event scheduler + event_scheduler = EventScheduler(app) + event_scheduler.set_to_app_state(app) + + supports_lifecycle: 
list[SupportsLifecycle] = [event_scheduler, store] + + for instance in supports_lifecycle: + await instance.setup() + + yield {} + + for instance in supports_lifecycle: + await instance.shutdown() diff --git a/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/services/generic_scheduler/_models.py b/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/services/generic_scheduler/_models.py new file mode 100644 index 000000000000..15a084b3ae12 --- /dev/null +++ b/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/services/generic_scheduler/_models.py @@ -0,0 +1,41 @@ +from enum import auto +from typing import Annotated, Any, Final, TypeAlias + +from models_library.basic_types import UUIDStr +from models_library.utils.enums import StrAutoEnum +from pydantic import StringConstraints + +_NAME_PATTERN: Final[str] = r"^[a-zA-Z0-9_]\w*$" + +ScheduleId: TypeAlias = UUIDStr + +OperationName: TypeAlias = Annotated[str, StringConstraints(pattern=_NAME_PATTERN)] +StepGroupName: TypeAlias = Annotated[str, StringConstraints(pattern=_NAME_PATTERN)] +StepName: TypeAlias = Annotated[str, StringConstraints(pattern=_NAME_PATTERN)] + +# contains all inputs and outpus of each step in the operation +OperationContext: TypeAlias = dict[str, Any] +# the inputs of `create` or `undo` of a step +RequiredOperationContext: TypeAlias = dict[str, Any] +# the outputs of `create` or `undo` of a step +ProvidedOperationContext: TypeAlias = dict[str, Any] + + +class StepStatus(StrAutoEnum): + # could not find a status for the step (key not in Redis) + UNKNOWN = auto() + + # in progress statuses + SCHEDULED = auto() + CREATED = auto() + RUNNING = auto() + + # final statuses + SUCCESS = auto() + FAILED = auto() + CANCELLED = auto() + + +class OperationErrorType(StrAutoEnum): + FRAMEWORK_ISSUE = auto() + STEP_ISSUE = auto() diff --git a/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/services/generic_scheduler/_operation.py b/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/services/generic_scheduler/_operation.py new file mode 100644 index 000000000000..7f9ecf448b5f --- /dev/null +++ b/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/services/generic_scheduler/_operation.py @@ -0,0 +1,373 @@ +from abc import ABC, abstractmethod +from datetime import timedelta +from typing import Annotated, ClassVar, Final, TypeAlias, TypedDict + +from fastapi import FastAPI +from pydantic import Field, NonNegativeInt, TypeAdapter, validate_call +from servicelib.deferred_tasks import DeferredContext + +from ._errors import ( + OperationAlreadyRegisteredError, + OperationNotFoundError, + StepNotFoundInoperationError, +) +from ._models import ( + OperationName, + ProvidedOperationContext, + RequiredOperationContext, + StepGroupName, + StepName, +) + +_DEFAULT_STEP_RETRIES: Final[NonNegativeInt] = 0 +_DEFAULT_STEP_TIMEOUT: Final[timedelta] = timedelta(seconds=5) +_DEFAULT_WAIT_FOR_MANUAL_INTERVENTION: Final[bool] = False + + +class BaseStep(ABC): + @classmethod + def get_step_name(cls) -> StepName: + return cls.__name__ + + ### CREATE + + @classmethod + @abstractmethod + async def create( + cls, app: FastAPI, required_context: RequiredOperationContext + ) -> ProvidedOperationContext | None: + """ + [mandatory] handler to be implemented with the code resposible for achieving a goal + NOTE: Ensure this is successful if: + - `create` is called multiple times and does not cause duplicate resources + """ + + @classmethod + def get_create_requires_context_keys(cls) -> 
set[str]: + """ + [optional] keys that must be present in the OperationContext when CREATE is called + """ + return set() + + @classmethod + def get_create_provides_context_keys(cls) -> set[str]: + """ + [optional] keys that will be added to the OperationContext when CREATE is successful + """ + return set() + + @classmethod + async def get_create_retries(cls, context: DeferredContext) -> int: + """ + [optional] amount of retires in case of creation + HINT: you can use `get_operation_context_proxy()`, `get_step_group_proxy(context)` + and `get_step_store_proxy(context)` to implement custom retry strategy + """ + assert context # nosec + return _DEFAULT_STEP_RETRIES + + @classmethod + async def get_create_wait_between_attempts( + cls, context: DeferredContext + ) -> timedelta: + """ + [optional] wait time between retires case of creation + HINT: you can use `get_operation_context_proxy()`, `get_step_group_proxy(context)` + and `get_step_store_proxy(context)` to implement custom retry strategy + """ + assert context # nosec + return _DEFAULT_STEP_TIMEOUT + + @classmethod + def wait_for_manual_intervention(cls) -> bool: + """ + [optional] if True scheduler will block waiting for manual intervention form a user + """ + return _DEFAULT_WAIT_FOR_MANUAL_INTERVENTION + + ### UNDO + + @classmethod + async def undo( + cls, app: FastAPI, required_context: RequiredOperationContext + ) -> ProvidedOperationContext | None: + """ + [optional] handler responsible for cleanup of resources created above. + NOTE: Ensure this is successful if: + - `create` is not executed + - `create` is executed partially + - `undo` is called multiple times + """ + _ = required_context + _ = app + return {} + + @classmethod + def get_undo_requires_context_keys(cls) -> set[str]: + """ + [optional] keys that must be present in the OperationContext when UNDO is called + """ + return set() + + @classmethod + def get_undo_provides_context_keys(cls) -> set[str]: + """ + [optional] keys that will be added to the OperationContext when UNDO is successful + """ + return set() + + @classmethod + async def get_undo_retries(cls, context: DeferredContext) -> int: + """ + [optional] amount of retires in case of failure + HINT: you can use `get_operation_context_proxy()`, `get_step_group_proxy(context)` + and `get_step_store_proxy(context)` to implement custom retry strategy + """ + assert context # nosec + return _DEFAULT_STEP_RETRIES + + @classmethod + async def get_undo_wait_between_attempts( + cls, context: DeferredContext + ) -> timedelta: + """ + [optional] timeout between retires in case of failure + HINT: you can use `get_operation_context_proxy()`, `get_step_group_proxy(context)` + and `get_step_store_proxy(context)` to implement custom retry strategy + """ + assert context # nosec + return _DEFAULT_STEP_TIMEOUT + + +StepsSubGroup: TypeAlias = Annotated[tuple[type[BaseStep], ...], Field(min_length=1)] + + +class BaseStepGroup(ABC): + def __init__(self, *, repeat_steps: bool, wait_before_repeat: timedelta) -> None: + """ + if repeat_steps is True, the steps in this group will be repeated forever + """ + self.repeat_steps = repeat_steps + self.wait_before_repeat = wait_before_repeat + + @abstractmethod + def __len__(self) -> int: + """number of steps in this group""" + + @abstractmethod + def __repr__(self) -> str: + """text representation of this step group""" + + @abstractmethod + def get_step_group_name(self, *, index: NonNegativeInt) -> StepGroupName: + """returns the name of this step group""" + + @abstractmethod + def 
get_step_subgroup_to_run(self) -> StepsSubGroup: + """returns subgroups of steps to run""" + + +_DEFAULT_REPEAT_STEPS: Final[bool] = False +_DEFAULT_WAIT_BEFORE_REPEAT: Final[timedelta] = timedelta(seconds=5) + + +class SingleStepGroup(BaseStepGroup): + def __init__( + self, + step: type[BaseStep], + *, + repeat_steps: bool = _DEFAULT_REPEAT_STEPS, + wait_before_repeat: timedelta = _DEFAULT_WAIT_BEFORE_REPEAT, + ) -> None: + self._step: type[BaseStep] = step + super().__init__( + repeat_steps=repeat_steps, wait_before_repeat=wait_before_repeat + ) + + def __len__(self) -> int: + return 1 + + def __repr__(self) -> str: + return f"{self.__class__.__name__}({self._step.get_step_name()})" + + def get_step_group_name(self, *, index: NonNegativeInt) -> StepGroupName: + return f"{index}S{'R' if self.repeat_steps else ''}" + + def get_step_subgroup_to_run(self) -> StepsSubGroup: + return TypeAdapter(StepsSubGroup).validate_python((self._step,)) + + +_MIN_PARALLEL_STEPS: Final[int] = 2 + + +class ParallelStepGroup(BaseStepGroup): + def __init__( + self, + *steps: type[BaseStep], + repeat_steps: bool = _DEFAULT_REPEAT_STEPS, + wait_before_repeat: timedelta = _DEFAULT_WAIT_BEFORE_REPEAT, + ) -> None: + self._steps: list[type[BaseStep]] = list(steps) + super().__init__( + repeat_steps=repeat_steps, wait_before_repeat=wait_before_repeat + ) + + def __len__(self) -> int: + return len(self._steps) + + def __repr__(self) -> str: + return f"{self.__class__.__name__}({', '.join(step.get_step_name() for step in self._steps)})" + + @property + def steps(self) -> list[type[BaseStep]]: + return self._steps + + def get_step_group_name(self, *, index: NonNegativeInt) -> StepGroupName: + return f"{index}P{'R' if self.repeat_steps else ''}" + + def get_step_subgroup_to_run(self) -> StepsSubGroup: + return TypeAdapter(StepsSubGroup).validate_python(tuple(self._steps)) + + +Operation: TypeAlias = Annotated[list[BaseStepGroup], Field(min_length=1)] + + +def _has_abstract_methods(cls: type[object]) -> bool: + return bool(getattr(cls, "__abstractmethods__", set())) + + +@validate_call(config={"arbitrary_types_allowed": True}) +def _validate_operation(operation: Operation) -> dict[StepName, type[BaseStep]]: + detected_steps_names: dict[StepName, type[BaseStep]] = {} + create_provided_keys: set[str] = set() + undo_provided_keys: set[str] = set() + + for k, step_group in enumerate(operation): + if ( + isinstance(step_group, ParallelStepGroup) + and len(step_group.steps) < _MIN_PARALLEL_STEPS + ): + msg = ( + f"{ParallelStepGroup.__name__} needs at least {_MIN_PARALLEL_STEPS} " + f"steps. TIP: use {SingleStepGroup.__name__} instead." + ) + raise ValueError(msg) + + if k < len(operation) - 1 and step_group.repeat_steps is True: + msg = f"Only the last step group can have repeat_steps=True. 
Error at index {k=}" + raise ValueError(msg) + + for step in step_group.get_step_subgroup_to_run(): + step_name = step.get_step_name() + + if _has_abstract_methods(step): + msg = f"Step {step_name=} has abstract methods and cannot be registered" + raise ValueError(msg) + + if step_name in detected_steps_names: + msg = f"Step {step_name=} is already used in this operation {detected_steps_names=}" + raise ValueError(msg) + + detected_steps_names[step_name] = step + + for key in step.get_create_provides_context_keys(): + if key in create_provided_keys: + msg = ( + f"Step {step_name=} provides already provided {key=} in " + f"{step.get_create_provides_context_keys.__name__}()" + ) + raise ValueError(msg) + create_provided_keys.add(key) + for key in step.get_undo_provides_context_keys(): + if key in undo_provided_keys: + msg = ( + f"Step {step_name=} provides already provided {key=} in " + f"{step.get_undo_provides_context_keys.__name__}()" + ) + raise ValueError(msg) + undo_provided_keys.add(key) + + if ( + step_group.repeat_steps is True + and k == len(operation) - 1 + and any( + step.wait_for_manual_intervention() + for step in step_group.get_step_subgroup_to_run() + ) + ): + msg = ( + "Step groups with repeat_steps=True cannot have steps that require " + "manual intervention. This would lead to a deadlock." + ) + raise ValueError(msg) + + return detected_steps_names + + +def get_operation_provided_context_keys(operation: Operation) -> set[str]: + provided_keys: set[str] = set() + + for step_group in operation: + for step in step_group.get_step_subgroup_to_run(): + provided_keys.update(step.get_create_provides_context_keys()) + provided_keys.update(step.get_undo_provides_context_keys()) + + return provided_keys + + +class _UpdateScheduleDataDict(TypedDict): + operation: Operation + steps: dict[StepName, type[BaseStep]] + + +class OperationRegistry: + _OPERATIONS: ClassVar[dict[OperationName, _UpdateScheduleDataDict]] = {} + + @classmethod + def register(cls, operation_name: OperationName, operation: Operation) -> None: + steps = _validate_operation(operation) + + if operation_name in cls._OPERATIONS: + raise OperationAlreadyRegisteredError(operation_name=operation_name) + + cls._OPERATIONS[operation_name] = {"operation": operation, "steps": steps} + + @classmethod + def get_operation(cls, operation_name: OperationName) -> Operation: + if operation_name not in cls._OPERATIONS: + raise OperationNotFoundError( + operation_name=operation_name, + registered_operations=list(cls._OPERATIONS.keys()), + ) + + return cls._OPERATIONS[operation_name]["operation"] + + @classmethod + def get_step( + cls, operation_name: OperationName, step_name: StepName + ) -> type[BaseStep]: + if operation_name not in cls._OPERATIONS: + raise OperationNotFoundError( + operation_name=operation_name, + registered_operations=list(cls._OPERATIONS.keys()), + ) + + steps_names = set(cls._OPERATIONS[operation_name]["steps"].keys()) + if step_name not in steps_names: + raise StepNotFoundInoperationError( + step_name=step_name, + operation_name=operation_name, + steps_names=steps_names, + ) + + return cls._OPERATIONS[operation_name]["steps"][step_name] + + @classmethod + def unregister(cls, operation_name: OperationName) -> None: + if operation_name not in cls._OPERATIONS: + raise OperationNotFoundError( + operation_name=operation_name, + registered_operations=list(cls._OPERATIONS.keys()), + ) + + del cls._OPERATIONS[operation_name] diff --git 
a/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/services/generic_scheduler/_store.py b/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/services/generic_scheduler/_store.py new file mode 100644 index 000000000000..f0392f5a4017 --- /dev/null +++ b/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/services/generic_scheduler/_store.py @@ -0,0 +1,436 @@ +from typing import Any, Final, Literal, NotRequired, TypedDict, overload + +import redis.asyncio as aioredis +from common_library.json_serialization import json_dumps, json_loads +from pydantic import NonNegativeInt +from servicelib.deferred_tasks import TaskUID +from servicelib.fastapi.app_state import SingletonInAppStateMixin +from servicelib.redis._client import RedisClientSDK +from servicelib.redis._utils import handle_redis_returns_union_types +from settings_library.redis import RedisDatabase, RedisSettings + +from ._errors import NoDataFoundError +from ._lifecycle_protocol import SupportsLifecycle +from ._models import ( + OperationErrorType, + OperationName, + ProvidedOperationContext, + RequiredOperationContext, + ScheduleId, + StepGroupName, + StepName, + StepStatus, +) + +_SCHEDULE_NAMESPACE: Final[str] = "SCH" +_STEPS_KEY: Final[str] = "STEPS" +_GROUPS_KEY: Final[str] = "GROUPS" +_OPERATION_CONTEXT_KEY: Final[str] = "OP_CTX" + + +def _get_is_creating_str(*, is_creating: bool) -> str: + return "C" if is_creating else "U" + + +def _get_scheduler_data_hash_key(*, schedule_id: ScheduleId) -> str: + # SCHEDULE_NAMESPACE:SCHEDULE_ID + # - SCHEDULE_NAMESPACE: namespace prefix + # - SCHEDULE_ID: the unique scheudle_id assigned + # Example: + # - SCH:00000000-0000-0000-0000-000000000000 + return f"{_SCHEDULE_NAMESPACE}:{schedule_id}" + + +def _get_step_hash_key( + *, + schedule_id: ScheduleId, + operation_name: OperationName, + group_name: StepGroupName, + step_name: StepName, + is_creating: bool, +) -> str: + # SCHEDULE_NAMESPACE:SCHEDULE_ID:STEPS:OPERATION_NAME:GROUP_SHORT_NAME:STEP_NAME:IS_CREATING + # - SCHEDULE_NAMESPACE: namespace prefix + # - SCHEDULE_ID: the unique scheudle_id assigned + # - CONSTANT: the constant "STEPS" + # - OPERATION_NAME form the vairble's name during registration + # - GROUP_SHORT_NAME + # -> "{index}(S|P)[R]": S=single or P=parallel and optinally, "R" if steps should be repeated forever + # - IS_CREATING: "C" (create) or "U" (undo) + # - STEP_NAME form it's class + # Example: + # - SCH:00000000-0000-0000-0000-000000000000:STEPS:START_SERVICE:0S:C:BS1 + is_creating_str = _get_is_creating_str(is_creating=is_creating) + return f"{_SCHEDULE_NAMESPACE}:{schedule_id}:{_STEPS_KEY}:{operation_name}:{group_name}:{is_creating_str}:{step_name}" + + +def _get_group_hash_key( + *, + schedule_id: ScheduleId, + operation_name: OperationName, + group_name: StepGroupName, + is_creating: bool, +) -> str: + # SCHEDULE_NAMESPACE:SCHEDULE_ID:GROUPS:OPERATION_NAME:GROUP_SHORT_NAME:IS_CREATING + # - SCHEDULE_NAMESPACE: namespace prefix + # - SCHEDULE_ID: the unique scheudle_id assigned + # - CONSTANT: the constant "GROUPS" + # - OPERATION_NAME form the vairble's name during registration + # - GROUP_SHORT_NAME + # -> "{index}(S|P)[R]": S=single or P=parallel and optinally, "R" if steps should be repeated forever + # - IS_CREATING: "C" (create) or "U" (undo) + # Example: + # - SCH:00000000-0000-0000-0000-000000000000:GROUPS:START_SERVICE:0S:C + is_creating_str = _get_is_creating_str(is_creating=is_creating) + return 
f"{_SCHEDULE_NAMESPACE}:{schedule_id}:{_GROUPS_KEY}:{operation_name}:{group_name}:{is_creating_str}" + + +def _get_operation_context_hash_key( + *, schedule_id: ScheduleId, operation_name: OperationName +) -> str: + # SCHEDULE_NAMESPACE:SCHEDULE_ID:STEPS:OPERATION_NAME + # - SCHEDULE_NAMESPACE: namespace prefix + # - SCHEDULE_ID: the unique scheudle_id assigned + # - CONSTANT: the constant "OP_CTX" + # - OPERATION_NAME form the vairble's name during registration + # Example: + # - SCH:00000000-0000-0000-0000-000000000000:OP_CTX:START_SERVICE + return ( + f"{_SCHEDULE_NAMESPACE}:{schedule_id}:{_OPERATION_CONTEXT_KEY}:{operation_name}" + ) + + +class Store(SingletonInAppStateMixin, SupportsLifecycle): + """ + Interface to Redis, shuld not use directly but use the + proxies defined below. + """ + + app_state_name: str = "generic_scheduler_store" + + def __init__(self, redis_settings: RedisSettings) -> None: + self.redis_settings = redis_settings + + self._client: RedisClientSDK | None = None + + async def setup(self) -> None: + self._client = RedisClientSDK( + self.redis_settings.build_redis_dsn(RedisDatabase.DYNAMIC_SERVICES), + client_name=__name__, + ) + await self._client.setup() + + async def shutdown(self) -> None: + if self._client: + await self._client.shutdown() + + @property + def redis(self) -> aioredis.Redis: + assert self._client # nosec + return self._client.redis + + # HASH + + async def set_keys_in_hash(self, hash_key: str, updates: dict[str, Any]) -> None: + """saves multiple key-value pairs in a hash""" + await handle_redis_returns_union_types( + self.redis.hset( + hash_key, mapping={k: json_dumps(v) for k, v in updates.items()} + ) + ) + + async def set_key_in_hash(self, hash_key: str, key: str, value: Any) -> None: + """saves a single key-value pair in a hash""" + await self.set_keys_in_hash(hash_key, {key: value}) + + async def get_key_from_hash(self, hash_key: str, *keys: str) -> tuple[Any, ...]: + """retrieves one or more keys from a hash""" + result: list[str | None] = await handle_redis_returns_union_types( + self.redis.hmget(hash_key, list(keys)) + ) + return tuple(json_loads(x) if x else None for x in result) + + async def delete_key_from_hash(self, hash_key: str, *hash_keys: str) -> None: + """removes keys form a redis hash""" + await handle_redis_returns_union_types(self.redis.hdel(hash_key, *hash_keys)) + + async def increase_key_in_hash_and_get( + self, hash_key: str, key: str + ) -> NonNegativeInt: + """increasea a key in a hash by 1 and returns the new value""" + return await handle_redis_returns_union_types( + self.redis.hincrby(hash_key, key, amount=1) + ) + + async def decrease_key_in_hash_and_get( + self, hash_key: str, key: str + ) -> NonNegativeInt: + """decrease a key in a hash by 1 and returns the new value""" + return await handle_redis_returns_union_types( + self.redis.hincrby(hash_key, key, amount=-1) + ) + + # GENERIC + + async def delete(self, *keys: str) -> None: + """removes keys from redis""" + await handle_redis_returns_union_types(self.redis.delete(*keys)) + + +class _UpdateScheduleDataDict(TypedDict): + operation_name: NotRequired[OperationName] + group_index: NotRequired[NonNegativeInt] + is_creating: NotRequired[bool] + operation_error_type: NotRequired[OperationErrorType] + operation_error_message: NotRequired[str] + + +_DeleteScheduleDataKeys = Literal[ + "operation_name", + "group_index", + "is_creating", + "operation_error_type", + "operation_error_message", +] + + +class ScheduleDataStoreProxy: + def __init__(self, *, store: Store, 
schedule_id: ScheduleId) -> None: + self._store = store + self._schedule_id = schedule_id + + def _get_hash_key(self) -> str: + return _get_scheduler_data_hash_key(schedule_id=self._schedule_id) + + @overload + async def read(self, key: Literal["operation_name"]) -> OperationName: ... + @overload + async def read(self, key: Literal["group_index"]) -> NonNegativeInt: ... + @overload + async def read(self, key: Literal["is_creating"]) -> bool: ... + @overload + async def read( + self, key: Literal["operation_error_type"] + ) -> OperationErrorType: ... + @overload + async def read(self, key: Literal["operation_error_message"]) -> str: ... + async def read(self, key: str) -> Any: + """raises NoDataFoundError if the key is not present in the hash""" + hash_key = self._get_hash_key() + (result,) = await self._store.get_key_from_hash(hash_key, key) + if result is None: + raise NoDataFoundError(key=key, hash_key=hash_key) + return result + + @overload + async def create_or_update( + self, key: Literal["operation_name"], value: OperationName + ) -> None: ... + @overload + async def create_or_update( + self, key: Literal["group_index"], value: NonNegativeInt + ) -> None: ... + @overload + async def create_or_update( + self, key: Literal["is_creating"], *, value: bool + ) -> None: ... + @overload + async def create_or_update( + self, key: Literal["operation_error_type"], value: OperationErrorType + ) -> None: ... + @overload + async def create_or_update( + self, key: Literal["operation_error_message"], value: str + ) -> None: ... + async def create_or_update(self, key: str, value: Any) -> None: + await self._store.set_key_in_hash(self._get_hash_key(), key, value) + + async def create_or_update_multiple(self, updates: _UpdateScheduleDataDict) -> None: + await self._store.set_keys_in_hash(self._get_hash_key(), updates=updates) # type: ignore[arg-type] + + async def delete_keys(self, *keys: _DeleteScheduleDataKeys) -> None: + await self._store.delete_key_from_hash(self._get_hash_key(), *keys) + + +class StepGroupProxy: + def __init__( + self, + *, + store: Store, + schedule_id: ScheduleId, + operation_name: OperationName, + step_group_name: StepGroupName, + is_creating: bool, + ) -> None: + self._store = store + self.schedule_id = schedule_id + self.operation_name = operation_name + self.step_group_name = step_group_name + self.is_creating = is_creating + + def _get_hash_key(self) -> str: + return _get_group_hash_key( + schedule_id=self.schedule_id, + operation_name=self.operation_name, + group_name=self.step_group_name, + is_creating=self.is_creating, + ) + + async def increment_and_get_done_steps_count(self) -> NonNegativeInt: + return await self._store.increase_key_in_hash_and_get( + self._get_hash_key(), "done_steps" + ) + + async def decrement_and_get_done_steps_count(self) -> NonNegativeInt: + return await self._store.decrease_key_in_hash_and_get( + self._get_hash_key(), "done_steps" + ) + + async def delete(self) -> None: + await self._store.delete(self._get_hash_key()) + + +class _StepDict(TypedDict): + deferred_created: NotRequired[bool] + status: NotRequired[StepStatus] + deferred_task_uid: NotRequired[TaskUID] + error_traceback: NotRequired[str] + requires_manual_intervention: NotRequired[bool] + + +DeleteStepKeys = Literal[ + "deferred_created", + "status", + "deferred_task_uid", + "error_traceback", + "requires_manual_intervention", +] + + +class StepStoreProxy: + def __init__( + self, + *, + store: Store, + schedule_id: ScheduleId, + operation_name: OperationName, + step_group_name: 
StepGroupName, + step_name: StepName, + is_creating: bool, + ) -> None: + self._store = store + self.schedule_id = schedule_id + self.operation_name = operation_name + self.step_group_name = step_group_name + self.step_name = step_name + self.is_creating = is_creating + + def _get_hash_key(self) -> str: + return _get_step_hash_key( + schedule_id=self.schedule_id, + operation_name=self.operation_name, + group_name=self.step_group_name, + step_name=self.step_name, + is_creating=self.is_creating, + ) + + @overload + async def read(self, key: Literal["status"]) -> StepStatus: ... + @overload + async def read(self, key: Literal["deferred_task_uid"]) -> TaskUID: ... + @overload + async def read(self, key: Literal["error_traceback"]) -> str: ... + @overload + async def read(self, key: Literal["requires_manual_intervention"]) -> bool: ... + @overload + async def read(self, key: Literal["deferred_created"]) -> bool: ... + async def read(self, key: str) -> Any: + """raises NoDataFoundError if the key is not present in the hash""" + hash_key = self._get_hash_key() + (result,) = await self._store.get_key_from_hash(hash_key, key) + if result is None: + raise NoDataFoundError(schedule_id=self.schedule_id, hash_key=hash_key) + return result + + @overload + async def create_or_update( + self, key: Literal["status"], value: StepStatus + ) -> None: ... + @overload + async def create_or_update( + self, key: Literal["deferred_task_uid"], value: TaskUID + ) -> None: ... + @overload + async def create_or_update( + self, key: Literal["error_traceback"], value: str + ) -> None: ... + @overload + async def create_or_update( + self, key: Literal["requires_manual_intervention"], *, value: bool + ) -> None: ... + @overload + async def create_or_update( + self, key: Literal["deferred_created"], *, value: bool + ) -> None: ... 
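Commentary on the proxy above (not part of the changeset): StepStoreProxy is the narrow interface DeferredRunner uses to persist a step's state in Redis, keyed by the hash layout described earlier in _store.py. A hedged sketch of typical usage; `store` and the identifiers are hypothetical:

```python
# Sketch only, not part of the changeset: driving a step's lifecycle through
# StepStoreProxy. `store` and all identifiers below are hypothetical.
from simcore_service_dynamic_scheduler.services.generic_scheduler._models import (
    StepStatus,
)
from simcore_service_dynamic_scheduler.services.generic_scheduler._store import (
    StepStoreProxy,
    Store,
)


async def _example(store: Store) -> None:
    proxy = StepStoreProxy(
        store=store,  # a Store instance already set up via its lifespan
        schedule_id="00000000-0000-0000-0000-000000000000",
        operation_name="start_service",
        step_group_name="0S",
        step_name="CreateNetwork",
        is_creating=True,
    )
    # writes into the Redis hash
    #   SCH:00000000-0000-0000-0000-000000000000:STEPS:start_service:0S:C:CreateNetwork
    await proxy.create_or_update("status", StepStatus.RUNNING)
    status = await proxy.read("status")  # JSON-decoded; compares equal to StepStatus.RUNNING
```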
+ async def create_or_update(self, key: str, value: Any) -> None: + await self._store.set_key_in_hash(self._get_hash_key(), key, value) + + async def create_or_update_multiple(self, updates: _StepDict) -> None: + await self._store.set_keys_in_hash(self._get_hash_key(), updates=updates) # type: ignore[arg-type] + + async def delete_keys(self, *keys: DeleteStepKeys) -> None: + await self._store.delete_key_from_hash(self._get_hash_key(), *keys) + + async def delete(self) -> None: + await self._store.delete(self._get_hash_key()) + + +class OperationContextProxy: + def __init__( + self, + *, + store: Store, + schedule_id: ScheduleId, + operation_name: OperationName, + ) -> None: + self._store = store + self.schedule_id = schedule_id + self.operation_name = operation_name + + def _get_hash_key(self) -> str: + return _get_operation_context_hash_key( + schedule_id=self.schedule_id, operation_name=self.operation_name + ) + + async def create_or_update(self, updates: ProvidedOperationContext | None) -> None: + if not updates: + return + + await self._store.set_keys_in_hash(self._get_hash_key(), updates) + + async def read(self, *keys: str) -> RequiredOperationContext: + if len(keys) == 0: + return {} + + hash_key = self._get_hash_key() + result = await self._store.get_key_from_hash(hash_key, *keys) + return dict(zip(keys, result, strict=True)) + + async def delete(self) -> None: + await self._store.delete(self._get_hash_key()) + + +class OperationRemovalProxy: + def __init__(self, *, store: Store, schedule_id: ScheduleId) -> None: + self._store = store + self._schedule_id = schedule_id + + async def delete(self) -> None: + found_keys = [ + x + async for x in self._store.redis.scan_iter( + match=f"{_get_scheduler_data_hash_key(schedule_id=self._schedule_id)}*" + ) + ] + if found_keys: + await self._store.delete(*found_keys) diff --git a/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/services/status_monitor/_deferred_get_status.py b/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/services/status_monitor/_deferred_get_status.py index 84653c76b8c6..f788bc890dd8 100644 --- a/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/services/status_monitor/_deferred_get_status.py +++ b/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/services/status_monitor/_deferred_get_status.py @@ -12,7 +12,7 @@ from servicelib.deferred_tasks import BaseDeferredHandler, TaskUID from servicelib.deferred_tasks._base_deferred_handler import DeferredContext -from .. import scheduler_interface, service_tracker +from .. 
import common_interface, service_tracker from ..notifier import notify_service_status_change _logger = logging.getLogger(__name__) @@ -47,7 +47,7 @@ async def run( node_id: NodeID = context["node_id"] service_status: NodeGet | RunningDynamicServiceDetails | NodeGetIdle = ( - await scheduler_interface.get_service_status(app, node_id=node_id) + await common_interface.get_service_status(app, node_id=node_id) ) _logger.debug( "Service status type=%s, %s", type(service_status), service_status diff --git a/services/dynamic-scheduler/tests/conftest.py b/services/dynamic-scheduler/tests/conftest.py index 5543ad0665da..6a313ec2b4d5 100644 --- a/services/dynamic-scheduler/tests/conftest.py +++ b/services/dynamic-scheduler/tests/conftest.py @@ -25,6 +25,7 @@ "pytest_simcore.cli_runner", "pytest_simcore.docker_compose", "pytest_simcore.docker_swarm", + "pytest_simcore.docker", "pytest_simcore.environment_configs", "pytest_simcore.faker_projects_data", "pytest_simcore.faker_users_data", @@ -120,6 +121,11 @@ def disable_status_monitor_lifespan(mocker: MockerFixture) -> None: mocker.patch(f"{_EVENTS_MODULE}.status_monitor_lifespan") +@pytest.fixture +def disable_generic_scheduler_lifespan(mocker: MockerFixture) -> None: + mocker.patch(f"{_EVENTS_MODULE}.generic_scheduler_lifespan") + + @pytest.fixture def disable_postgres_lifespan( mocker: MockerFixture, monkeypatch: pytest.MonkeyPatch diff --git a/services/dynamic-scheduler/tests/unit/api_rest/test_api_rest__health.py b/services/dynamic-scheduler/tests/unit/api_rest/test_api_rest__health.py index 42bc7396c9c6..99ba36f16706 100644 --- a/services/dynamic-scheduler/tests/unit/api_rest/test_api_rest__health.py +++ b/services/dynamic-scheduler/tests/unit/api_rest/test_api_rest__health.py @@ -58,6 +58,7 @@ def mock_docker_api_proxy(mocker: MockerFixture, docker_api_proxy_ok: bool) -> N @pytest.fixture def app_environment( + disable_generic_scheduler_lifespan: None, mock_docker_api_proxy: None, mock_rabbitmq_clients: None, mock_redis_client: None, diff --git a/services/dynamic-scheduler/tests/unit/api_rest/test_api_rest__meta.py b/services/dynamic-scheduler/tests/unit/api_rest/test_api_rest__meta.py index 2fdb1de6afe9..72692ac97b83 100644 --- a/services/dynamic-scheduler/tests/unit/api_rest/test_api_rest__meta.py +++ b/services/dynamic-scheduler/tests/unit/api_rest/test_api_rest__meta.py @@ -6,7 +6,7 @@ from simcore_service_dynamic_scheduler.models.schemas.meta import Meta -async def test_meta(client: AsyncClient): +async def test_meta(disable_generic_scheduler_lifespan: None, client: AsyncClient): response = await client.get(f"/{API_VTAG}/meta") assert response.status_code == status.HTTP_200_OK assert Meta.model_validate_json(response.text) diff --git a/services/dynamic-scheduler/tests/unit/api_rest/test_api_rest__ops.py b/services/dynamic-scheduler/tests/unit/api_rest/test_api_rest__ops.py index 985cc86a4a35..90726fea767c 100644 --- a/services/dynamic-scheduler/tests/unit/api_rest/test_api_rest__ops.py +++ b/services/dynamic-scheduler/tests/unit/api_rest/test_api_rest__ops.py @@ -39,7 +39,11 @@ def mock_director_v2_service( [], ], ) -async def test_running_services(mock_director_v2_service: None, client: AsyncClient): +async def test_running_services( + mock_director_v2_service: None, + disable_generic_scheduler_lifespan: None, + client: AsyncClient, +): response = await client.get(f"/{API_VTAG}/ops/running-services") assert response.status_code == status.HTTP_200_OK assert isinstance( diff --git 
a/services/dynamic-scheduler/tests/unit/services/generic_scheduler/conftest.py b/services/dynamic-scheduler/tests/unit/services/generic_scheduler/conftest.py new file mode 100644 index 000000000000..edc7dc2ee7fb --- /dev/null +++ b/services/dynamic-scheduler/tests/unit/services/generic_scheduler/conftest.py @@ -0,0 +1,22 @@ +from collections.abc import Callable, Iterable + +import pytest +from simcore_service_dynamic_scheduler.services.generic_scheduler import ( + Operation, + OperationName, + OperationRegistry, +) + + +@pytest.fixture +def register_operation() -> Iterable[Callable[[OperationName, Operation], None]]: + to_unregister: list[OperationName] = [] + + def _(operation_name: OperationName, operation: Operation) -> None: + OperationRegistry.register(operation_name, operation) + to_unregister.append(operation_name) + + yield _ + + for operation_name in to_unregister: + OperationRegistry.unregister(operation_name) diff --git a/services/dynamic-scheduler/tests/unit/services/generic_scheduler/test__core.py b/services/dynamic-scheduler/tests/unit/services/generic_scheduler/test__core.py new file mode 100644 index 000000000000..de649a1089ed --- /dev/null +++ b/services/dynamic-scheduler/tests/unit/services/generic_scheduler/test__core.py @@ -0,0 +1,1685 @@ +# pylint:disable=redefined-outer-name +# pylint:disable=too-many-arguments +# pylint:disable=unused-argument + +import asyncio +import logging +import re +from collections.abc import AsyncIterable, Awaitable, Callable, Iterable +from contextlib import AsyncExitStack +from datetime import timedelta +from secrets import choice +from typing import Any, Final + +import pytest +from asgi_lifespan import LifespanManager +from fastapi import FastAPI +from pydantic import NonNegativeInt, TypeAdapter +from pytest_simcore.helpers.typing_env import EnvVarsDict +from servicelib.utils import limited_gather +from settings_library.rabbit import RabbitSettings +from settings_library.redis import RedisSettings +from simcore_service_dynamic_scheduler.core.application import create_app +from simcore_service_dynamic_scheduler.services.generic_scheduler import ( + BaseStep, + Operation, + OperationName, + ParallelStepGroup, + ProvidedOperationContext, + RequiredOperationContext, + ScheduleId, + SingleStepGroup, + cancel_operation, + restart_operation_step_stuck_during_undo, + restart_operation_step_stuck_in_manual_intervention_during_create, + start_operation, +) +from simcore_service_dynamic_scheduler.services.generic_scheduler._core import Core +from simcore_service_dynamic_scheduler.services.generic_scheduler._errors import ( + CannotCancelWhileWaitingForManualInterventionError, + InitialOperationContextKeyNotAllowedError, + OperationContextValueIsNoneError, + ProvidedOperationContextKeysAreMissingError, + StepNameNotInCurrentGroupError, + StepNotInErrorStateError, + StepNotWaitingForManualInterventionError, +) +from simcore_service_dynamic_scheduler.services.generic_scheduler._models import ( + OperationContext, +) +from simcore_service_dynamic_scheduler.services.generic_scheduler._store import ( + Store, +) +from tenacity import ( + AsyncRetrying, + retry_if_exception_type, + stop_after_delay, + wait_fixed, +) +from utils import ( + CREATED, + UNDONE, + BaseExpectedStepOrder, + CreateRandom, + CreateSequence, + UndoRandom, + UndoSequence, + ensure_expected_order, +) + +pytest_simcore_core_services_selection = [ + "rabbit", + "redis", +] +pytest_simcore_ops_services_selection = [ + "redis-commander", +] + + +_RETRY_PARAMS: Final[dict[str, Any]] = { + 
"wait": wait_fixed(0.1), + "stop": stop_after_delay(5), + "retry": retry_if_exception_type(AssertionError), +} + + +_PARALLEL_APP_CREATION: Final[NonNegativeInt] = 5 +_PARALLEL_RESTARTS: Final[NonNegativeInt] = 5 + + +@pytest.fixture +def app_environment( + disable_postgres_lifespan: None, + disable_service_tracker_lifespan: None, + disable_notifier_lifespan: None, + disable_status_monitor_lifespan: None, + app_environment: EnvVarsDict, + rabbit_service: RabbitSettings, + redis_service: RedisSettings, + remove_redis_data: None, +) -> EnvVarsDict: + return app_environment + + +@pytest.fixture +async def get_app( + app_environment: EnvVarsDict, +) -> AsyncIterable[Callable[[], Awaitable[FastAPI]]]: + exit_stack = AsyncExitStack() + + started_apps: list[FastAPI] = [] + + async def _() -> FastAPI: + app = create_app() + started_apps.append(app) + + await exit_stack.enter_async_context(LifespanManager(app)) + return app + + yield _ + + await exit_stack.aclose() + + +@pytest.fixture +async def selected_app( + get_app: Callable[[], Awaitable[FastAPI]], app_count: NonNegativeInt +) -> FastAPI: + # initialize a bunch of apps and randomly select one + # this will make sure that there is competition events catching possible issues + apps: list[FastAPI] = await limited_gather( + *[get_app() for _ in range(app_count)], limit=_PARALLEL_APP_CREATION + ) + return choice(apps) + + +@pytest.fixture +def operation_name() -> OperationName: + return "test_op" + + +_STEPS_CALL_ORDER: list[tuple[str, str]] = [] + + +@pytest.fixture +def steps_call_order() -> Iterable[list[tuple[str, str]]]: + _STEPS_CALL_ORDER.clear() + yield _STEPS_CALL_ORDER + _STEPS_CALL_ORDER.clear() + + +class _BS(BaseStep): + @classmethod + async def create( + cls, app: FastAPI, required_context: RequiredOperationContext + ) -> ProvidedOperationContext | None: + _ = app + _ = required_context + _STEPS_CALL_ORDER.append((cls.__name__, CREATED)) + + return { + **required_context, + **{k: _CTX_VALUE for k in cls.get_create_provides_context_keys()}, + } + + @classmethod + async def undo( + cls, app: FastAPI, required_context: RequiredOperationContext + ) -> ProvidedOperationContext | None: + _ = app + _ = required_context + _STEPS_CALL_ORDER.append((cls.__name__, UNDONE)) + + return { + **required_context, + **{k: _CTX_VALUE for k in cls.get_undo_provides_context_keys()}, + } + + +class _UndoBS(_BS): + @classmethod + async def create( + cls, app: FastAPI, required_context: RequiredOperationContext + ) -> ProvidedOperationContext | None: + await super().create(app, required_context) + msg = "always fails only on CREATE" + raise RuntimeError(msg) + + +class _GlobalStepIssueTracker: + has_issue: bool = True + + @classmethod + def set_issue_solved(cls) -> None: + cls.has_issue = False + + +@pytest.fixture +def reset_step_issue_tracker() -> Iterable[None]: + _GlobalStepIssueTracker.has_issue = True + yield + _GlobalStepIssueTracker.has_issue = True + + +class _FailOnCreateAndUndoBS(_BS): + @classmethod + async def create( + cls, app: FastAPI, required_context: RequiredOperationContext + ) -> ProvidedOperationContext | None: + await super().create(app, required_context) + msg = "always fails on CREATE" + raise RuntimeError(msg) + + @classmethod + async def undo( + cls, app: FastAPI, required_context: RequiredOperationContext + ) -> ProvidedOperationContext | None: + await super().undo(app, required_context) + if _GlobalStepIssueTracker.has_issue: + msg = "sometimes fails only on UNDO" + raise RuntimeError(msg) + + +class _SleepsForeverBS(_BS): + 
@classmethod + async def create( + cls, app: FastAPI, required_context: RequiredOperationContext + ) -> ProvidedOperationContext | None: + await super().create(app, required_context) + await asyncio.sleep(1e10) + + +class _WaitManualInerventionBS(_BS): + @classmethod + async def create( + cls, app: FastAPI, required_context: RequiredOperationContext + ) -> ProvidedOperationContext | None: + await super().create(app, required_context) + if _GlobalStepIssueTracker.has_issue: + msg = "sometimes fails only on CREATE" + raise RuntimeError(msg) + + @classmethod + def wait_for_manual_intervention(cls) -> bool: + return True + + +def _get_steps_matching_class( + operation: Operation, *, match: type[BaseStep] +) -> list[type]: + return [ + step + for group in operation + for step in group.get_step_subgroup_to_run() + if issubclass(step, match) + ] + + +def _compose_key( + key_nuber: int | None, *, with_undo: bool, is_creating: bool, is_providing: bool +) -> str: + key_parts = [ + "bs", + "undo" if with_undo else "", + "c" if is_creating else "r", + "prov" if is_providing else "req", + f"{key_nuber}", + ] + return "_".join(key_parts) + + +_CTX_VALUE: Final[str] = "a_value" + + +class _MixingGetKeNumber: + @classmethod + def get_key_number(cls) -> int: + # key number if fetched form the calss name as the last digits or 0 + key_number: int = 0 + match = re.search(r"(\d+)\D*$", cls.__name__) + if match: + key_number = int(match.group(1)) + return key_number + + +class _BaseRequiresProvidesContext(_BS, _MixingGetKeNumber): + @classmethod + def get_create_requires_context_keys(cls) -> set[str]: + return { + _compose_key( + cls.get_key_number(), + with_undo=False, + is_creating=True, + is_providing=False, + ) + } + + @classmethod + def get_create_provides_context_keys(cls) -> set[str]: + return { + _compose_key( + cls.get_key_number(), + with_undo=False, + is_creating=True, + is_providing=True, + ) + } + + +class _BaseRequiresProvidesUndoContext(_UndoBS, _MixingGetKeNumber): + @classmethod + def get_create_requires_context_keys(cls) -> set[str]: + return { + _compose_key( + cls.get_key_number(), + with_undo=True, + is_creating=True, + is_providing=False, + ) + } + + @classmethod + def get_create_provides_context_keys(cls) -> set[str]: + return { + _compose_key( + cls.get_key_number(), + with_undo=True, + is_creating=True, + is_providing=True, + ) + } + + @classmethod + def get_undo_requires_context_keys(cls) -> set[str]: + return { + _compose_key( + cls.get_key_number(), + with_undo=True, + is_creating=False, + is_providing=False, + ) + } + + @classmethod + def get_undo_provides_context_keys(cls) -> set[str]: + return { + _compose_key( + cls.get_key_number(), + with_undo=True, + is_creating=False, + is_providing=True, + ) + } + + +async def _assert_keys_in_store(app: FastAPI, *, expected_keys: set[str]) -> None: + keys = set(await Store.get_from_app_state(app).redis.keys()) + assert keys == expected_keys + + +async def _ensure_keys_in_store(app: FastAPI, *, expected_keys: set[str]) -> None: + async for attempt in AsyncRetrying(**_RETRY_PARAMS): + with attempt: + await _assert_keys_in_store(app, expected_keys=expected_keys) + + +async def _esnure_log_mesage(caplog: pytest.LogCaptureFixture, *, message: str) -> None: + async for attempt in AsyncRetrying(**_RETRY_PARAMS): + with attempt: + await asyncio.sleep(0) # wait for envet to trigger + assert message in caplog.text + + +############## TESTS ############## + + +# Below always succeed (expected) + + +class _S1(_BS): ... + + +class _S2(_BS): ... 
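Commentary (not part of the test module): the trivial step classes defined here are the building blocks for the parametrized cases below. A hedged sketch of how such steps are wired into an operation and scheduled; `app` is assumed to be a fully initialised FastAPI application:

```python
# Sketch only, mirrors what the tests below do through the register_operation fixture.
from fastapi import FastAPI

from simcore_service_dynamic_scheduler.services.generic_scheduler import (
    Operation,
    OperationRegistry,
    SingleStepGroup,
    start_operation,
)


async def _example(app: FastAPI) -> None:
    operation: Operation = [
        SingleStepGroup(_S1),
        SingleStepGroup(_S2),
    ]
    OperationRegistry.register("example_op", operation)

    schedule_id = await start_operation(app, "example_op", {})
    # the scheduler now runs _S1.create() and then _S2.create(); if a step fails,
    # the steps already completed are undone in reverse order
    print(schedule_id)
```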
+ + +class _S3(_BS): ... + + +class _S4(_BS): ... + + +class _S5(_BS): ... + + +class _S6(_BS): ... + + +class _S7(_BS): ... + + +class _S8(_BS): ... + + +class _S9(_BS): ... + + +class _S10(_BS): ... + + +# Below fail on create (expected) + + +class _RS1(_UndoBS): ... + + +class _RS2(_UndoBS): ... + + +class _RS3(_UndoBS): ... + + +class _RS4(_UndoBS): ... + + +class _RS5(_UndoBS): ... + + +class _RS6(_UndoBS): ... + + +class _RS7(_UndoBS): ... + + +class _RS8(_UndoBS): ... + + +class _RS9(_UndoBS): ... + + +class _RS10(_UndoBS): ... + + +# Below fail both on create and undo (unexpected) + + +class _FCR1(_FailOnCreateAndUndoBS): ... + + +class _FCR2(_FailOnCreateAndUndoBS): ... + + +class _FCR3(_FailOnCreateAndUndoBS): ... + + +# Below will sleep forever + + +class _SF1(_SleepsForeverBS): ... + + +class _SF2(_SleepsForeverBS): ... + + +# Below will wait for manual intervention after it fails on create + + +class _WMI1(_WaitManualInerventionBS): ... + + +class _WMI2(_WaitManualInerventionBS): ... + + +class _WMI3(_WaitManualInerventionBS): ... + + +# Below steps which require and provide context keys + + +class RPCtxS1(_BaseRequiresProvidesContext): ... + + +class RPCtxS2(_BaseRequiresProvidesContext): ... + + +class RPCtxR1(_BaseRequiresProvidesUndoContext): ... + + +class RPCtxR2(_BaseRequiresProvidesUndoContext): ... + + +@pytest.mark.parametrize("app_count", [10]) +@pytest.mark.parametrize( + "operation, expected_order", + [ + pytest.param( + [ + SingleStepGroup(_S1), + ], + [ + CreateSequence(_S1), + ], + id="s1", + ), + pytest.param( + [ + ParallelStepGroup(_S1, _S2), + ], + [ + CreateRandom(_S1, _S2), + ], + id="p2", + ), + pytest.param( + [ + ParallelStepGroup(_S1, _S2, _S3, _S4, _S5, _S6, _S7, _S8, _S9, _S10), + ], + [ + CreateRandom(_S1, _S2, _S3, _S4, _S5, _S6, _S7, _S8, _S9, _S10), + ], + id="p10", + ), + pytest.param( + [ + SingleStepGroup(_S1), + SingleStepGroup(_S2), + SingleStepGroup(_S3), + ParallelStepGroup(_S4, _S5, _S6, _S7, _S8, _S9), + SingleStepGroup(_S10), + ], + [ + CreateSequence(_S1, _S2, _S3), + CreateRandom(_S4, _S5, _S6, _S7, _S8, _S9), + CreateSequence(_S10), + ], + id="s1-s1-s1-p6-s1", + ), + pytest.param( + [ + SingleStepGroup(_RS1), + ], + [ + CreateSequence(_RS1), + UndoSequence(_RS1), + ], + id="s1(1r)", + ), + pytest.param( + [ + ParallelStepGroup(_RS1, _S1, _S2, _S3, _S4, _S5, _S6), + ], + [ + CreateRandom(_S1, _S2, _S3, _S4, _S5, _S6, _RS1), + UndoRandom(_S1, _S2, _S3, _S4, _S5, _S6, _RS1), + ], + id="p7(1r)", + ), + pytest.param( + [ + SingleStepGroup(_S1), + ParallelStepGroup(_S2, _S3, _S4, _S5, _S6), + SingleStepGroup(_RS1), + SingleStepGroup(_S7), # will not execute + ParallelStepGroup(_S8, _S9), # will not execute + ], + [ + CreateSequence(_S1), + CreateRandom(_S2, _S3, _S4, _S5, _S6), + CreateSequence(_RS1), + UndoSequence(_RS1), + UndoRandom(_S2, _S3, _S4, _S5, _S6), + UndoSequence(_S1), + ], + id="s1-p5-s1(1r)-s1-p2", + ), + pytest.param( + [ + SingleStepGroup(_S1), + ParallelStepGroup(_RS1, _S2, _S3, _S4, _S5, _S6), + SingleStepGroup(_S7), # will not execute + ParallelStepGroup(_S8, _S9), # will not execute + ], + [ + CreateSequence(_S1), + CreateRandom(_S2, _S3, _S4, _S5, _S6, _RS1), + UndoRandom(_S2, _S3, _S4, _S5, _S6, _RS1), + UndoSequence(_S1), + ], + id="s1-p6(1r)-s1-p2", + ), + pytest.param( + [ + ParallelStepGroup( + _S1, + _S2, + _S3, + _S4, + _S5, + _S6, + _S7, + _S8, + _S9, + _S10, + _RS1, + _RS2, + _RS3, + _RS4, + _RS5, + _RS6, + _RS7, + _RS8, + _RS9, + _RS10, + ), + ], + [ + CreateRandom( + _S1, + _S2, + _S3, + _S4, + _S5, + _S6, 
+ _S7, + _S8, + _S9, + _S10, + _RS1, + _RS2, + _RS3, + _RS4, + _RS5, + _RS6, + _RS7, + _RS8, + _RS9, + _RS10, + ), + UndoRandom( + _S1, + _S2, + _S3, + _S4, + _S5, + _S6, + _S7, + _S8, + _S9, + _S10, + _RS1, + _RS2, + _RS3, + _RS4, + _RS5, + _RS6, + _RS7, + _RS8, + _RS9, + _RS10, + ), + ], + id="p20(10r)", + ), + ], +) +async def test_create_undo_order( + preserve_caplog_for_async_logging: None, + steps_call_order: list[tuple[str, str]], + selected_app: FastAPI, + register_operation: Callable[[OperationName, Operation], None], + operation: Operation, + operation_name: OperationName, + expected_order: list[BaseExpectedStepOrder], +): + register_operation(operation_name, operation) + + schedule_id = await start_operation(selected_app, operation_name, {}) + assert TypeAdapter(ScheduleId).validate_python(schedule_id) + + await ensure_expected_order(steps_call_order, expected_order) + + await _ensure_keys_in_store(selected_app, expected_keys=set()) + + +@pytest.mark.parametrize("app_count", [10]) +@pytest.mark.parametrize( + "operation, expected_order, expected_keys", + [ + pytest.param( + [ + SingleStepGroup(_FCR1), + ], + [ + CreateSequence(_FCR1), + UndoSequence(_FCR1), + ], + { + "SCH:{schedule_id}", + "SCH:{schedule_id}:GROUPS:test_op:0S:C", + "SCH:{schedule_id}:GROUPS:test_op:0S:U", + "SCH:{schedule_id}:STEPS:test_op:0S:C:_FCR1", + "SCH:{schedule_id}:STEPS:test_op:0S:U:_FCR1", + }, + id="s1(1rf)", + ), + pytest.param( + [ + SingleStepGroup(_S1), + SingleStepGroup(_FCR1), + ], + [ + CreateSequence(_S1, _FCR1), + UndoSequence(_FCR1), + ], + { + "SCH:{schedule_id}", + "SCH:{schedule_id}:GROUPS:test_op:0S:C", + "SCH:{schedule_id}:GROUPS:test_op:1S:C", + "SCH:{schedule_id}:GROUPS:test_op:1S:U", + "SCH:{schedule_id}:STEPS:test_op:0S:C:_S1", + "SCH:{schedule_id}:STEPS:test_op:1S:C:_FCR1", + "SCH:{schedule_id}:STEPS:test_op:1S:U:_FCR1", + }, + id="s2(1rf)", + ), + pytest.param( + [ + SingleStepGroup(_S1), + ParallelStepGroup(_FCR1, _S2, _S3), + ], + [ + CreateSequence(_S1), + CreateRandom(_S2, _S3, _FCR1), + UndoRandom(_S2, _S3, _FCR1), + ], + { + "SCH:{schedule_id}", + "SCH:{schedule_id}:GROUPS:test_op:0S:C", + "SCH:{schedule_id}:GROUPS:test_op:1P:C", + "SCH:{schedule_id}:GROUPS:test_op:1P:U", + "SCH:{schedule_id}:STEPS:test_op:0S:C:_S1", + "SCH:{schedule_id}:STEPS:test_op:1P:C:_FCR1", + "SCH:{schedule_id}:STEPS:test_op:1P:C:_S2", + "SCH:{schedule_id}:STEPS:test_op:1P:C:_S3", + "SCH:{schedule_id}:STEPS:test_op:1P:U:_FCR1", + "SCH:{schedule_id}:STEPS:test_op:1P:U:_S2", + "SCH:{schedule_id}:STEPS:test_op:1P:U:_S3", + }, + id="s1p3(1rf)", + ), + pytest.param( + [ + SingleStepGroup(_S1), + ParallelStepGroup(_FCR1, _FCR2, _S2, _S3), + ], + [ + CreateSequence(_S1), + CreateRandom(_S2, _S3, _FCR1, _FCR2), + UndoRandom(_S2, _S3, _FCR2, _FCR1), + ], + { + "SCH:{schedule_id}", + "SCH:{schedule_id}:GROUPS:test_op:0S:C", + "SCH:{schedule_id}:GROUPS:test_op:1P:C", + "SCH:{schedule_id}:GROUPS:test_op:1P:U", + "SCH:{schedule_id}:STEPS:test_op:0S:C:_S1", + "SCH:{schedule_id}:STEPS:test_op:1P:C:_FCR1", + "SCH:{schedule_id}:STEPS:test_op:1P:C:_FCR2", + "SCH:{schedule_id}:STEPS:test_op:1P:C:_S2", + "SCH:{schedule_id}:STEPS:test_op:1P:C:_S3", + "SCH:{schedule_id}:STEPS:test_op:1P:U:_FCR1", + "SCH:{schedule_id}:STEPS:test_op:1P:U:_FCR2", + "SCH:{schedule_id}:STEPS:test_op:1P:U:_S2", + "SCH:{schedule_id}:STEPS:test_op:1P:U:_S3", + }, + id="s1p4(2rf)", + ), + ], +) +async def test_fails_during_undo_is_in_error_state( + preserve_caplog_for_async_logging: None, + steps_call_order: list[tuple[str, str]], + selected_app: 
FastAPI, + register_operation: Callable[[OperationName, Operation], None], + operation: Operation, + operation_name: OperationName, + expected_order: list[BaseExpectedStepOrder], + expected_keys: set[str], +): + register_operation(operation_name, operation) + + schedule_id = await start_operation(selected_app, operation_name, {}) + assert TypeAdapter(ScheduleId).validate_python(schedule_id) + + await ensure_expected_order(steps_call_order, expected_order) + + formatted_expected_keys = {k.format(schedule_id=schedule_id) for k in expected_keys} + await _ensure_keys_in_store(selected_app, expected_keys=formatted_expected_keys) + + +@pytest.mark.parametrize("cancel_count", [1, 10]) +@pytest.mark.parametrize("app_count", [10]) +@pytest.mark.parametrize( + "operation, expected_before_cancel_order, expected_order", + [ + pytest.param( + [ + SingleStepGroup(_S1), + ParallelStepGroup(_S2, _S3, _S4), + SingleStepGroup(_SF1), + ], + [ + CreateSequence(_S1), + CreateRandom(_S2, _S3, _S4), + CreateSequence(_SF1), + ], + [ + CreateSequence(_S1), + CreateRandom(_S2, _S3, _S4), + CreateSequence(_SF1), + UndoSequence(_SF1), + UndoRandom(_S2, _S3, _S4), + UndoSequence(_S1), + ], + id="s1p3s1(1s)", + ), + pytest.param( + [ + SingleStepGroup(_S1), + ParallelStepGroup(_S2, _S3, _S4, _SF1, _SF2), + ], + [ + CreateSequence(_S1), + CreateRandom(_SF1, _SF2, _S2, _S3, _S4), + ], + [ + CreateSequence(_S1), + CreateRandom(_S2, _S3, _S4, _SF1, _SF2), + UndoRandom(_S2, _S3, _S4, _SF2, _SF1), + UndoSequence(_S1), + ], + id="s1p4(1s)", + ), + ], +) +async def test_cancelled_finishes_nicely( + preserve_caplog_for_async_logging: None, + steps_call_order: list[tuple[str, str]], + selected_app: FastAPI, + register_operation: Callable[[OperationName, Operation], None], + operation: Operation, + operation_name: OperationName, + expected_before_cancel_order: list[BaseExpectedStepOrder], + expected_order: list[BaseExpectedStepOrder], + cancel_count: NonNegativeInt, +): + register_operation(operation_name, operation) + + schedule_id = await start_operation(selected_app, operation_name, {}) + assert TypeAdapter(ScheduleId).validate_python(schedule_id) + + await ensure_expected_order(steps_call_order, expected_before_cancel_order) + + # cancel in parallel multiple times (worst case) + await asyncio.gather( + *[cancel_operation(selected_app, schedule_id) for _ in range(cancel_count)] + ) + + await ensure_expected_order(steps_call_order, expected_order) + + await _ensure_keys_in_store(selected_app, expected_keys=set()) + + +_FAST_REPEAT_INTERVAL: Final[timedelta] = timedelta(seconds=0.1) +_REPAT_COUNT: Final[NonNegativeInt] = 10 + + +@pytest.mark.parametrize("app_count", [10]) +@pytest.mark.parametrize( + "operation, expected_before_cancel_order, expected_order", + [ + pytest.param( + [ + SingleStepGroup( + _S1, repeat_steps=True, wait_before_repeat=_FAST_REPEAT_INTERVAL + ), + ], + [CreateSequence(_S1) for _ in range(_REPAT_COUNT)], + [ + *[CreateSequence(_S1) for _ in range(_REPAT_COUNT)], + UndoSequence(_S1), + ], + id="s1(r)", + ), + pytest.param( + [ + ParallelStepGroup( + _S1, + _S2, + repeat_steps=True, + wait_before_repeat=_FAST_REPEAT_INTERVAL, + ), + ], + [CreateRandom(_S1, _S2) for _ in range(_REPAT_COUNT)], + [ + *[CreateRandom(_S1, _S2) for _ in range(_REPAT_COUNT)], + UndoRandom(_S1, _S2), + ], + id="p2(r)", + ), + pytest.param( + [ + SingleStepGroup( + _RS1, repeat_steps=True, wait_before_repeat=_FAST_REPEAT_INTERVAL + ), + ], + [CreateSequence(_RS1) for _ in range(_REPAT_COUNT)], + [ + *[CreateSequence(_RS1) for _ in 
range(_REPAT_COUNT)], + UndoSequence(_RS1), + ], + id="s1(rf)", + ), + pytest.param( + [ + ParallelStepGroup( + _RS1, + _RS2, + repeat_steps=True, + wait_before_repeat=_FAST_REPEAT_INTERVAL, + ), + ], + [CreateRandom(_RS1, _RS2) for _ in range(_REPAT_COUNT)], + [ + *[CreateRandom(_RS1, _RS2) for _ in range(_REPAT_COUNT)], + UndoRandom(_RS1, _RS2), + ], + id="p2(rf)", + ), + ], +) +async def test_repeating_step( + preserve_caplog_for_async_logging: None, + steps_call_order: list[tuple[str, str]], + selected_app: FastAPI, + register_operation: Callable[[OperationName, Operation], None], + operation: Operation, + operation_name: OperationName, + expected_before_cancel_order: list[BaseExpectedStepOrder], + expected_order: list[BaseExpectedStepOrder], +): + register_operation(operation_name, operation) + + schedule_id = await start_operation(selected_app, operation_name, {}) + assert TypeAdapter(ScheduleId).validate_python(schedule_id) + + await ensure_expected_order( + steps_call_order, expected_before_cancel_order, use_only_first_entries=True + ) + + # cancelling stops the loop and causes undo to run + await cancel_operation(selected_app, schedule_id) + + await ensure_expected_order( + steps_call_order, expected_order, use_only_last_entries=True + ) + + await _ensure_keys_in_store(selected_app, expected_keys=set()) + + +@pytest.mark.parametrize("app_count", [10]) +@pytest.mark.parametrize( + "operation, expected_order, expected_keys, after_restart_expected_order", + [ + pytest.param( + [ + SingleStepGroup(_S1), + ParallelStepGroup(_S2, _S3, _S4), + SingleStepGroup(_WMI1), + # below are not included when waiting for manual intervention + ParallelStepGroup(_S5, _S6), + SingleStepGroup(_S7), + ], + [ + CreateSequence(_S1), + CreateRandom(_S2, _S3, _S4), + CreateSequence(_WMI1), + ], + { + "SCH:{schedule_id}", + "SCH:{schedule_id}:GROUPS:test_op:0S:C", + "SCH:{schedule_id}:GROUPS:test_op:1P:C", + "SCH:{schedule_id}:GROUPS:test_op:2S:C", + "SCH:{schedule_id}:STEPS:test_op:0S:C:_S1", + "SCH:{schedule_id}:STEPS:test_op:1P:C:_S2", + "SCH:{schedule_id}:STEPS:test_op:1P:C:_S3", + "SCH:{schedule_id}:STEPS:test_op:1P:C:_S4", + "SCH:{schedule_id}:STEPS:test_op:2S:C:_WMI1", + }, + [ + CreateSequence(_S1), + CreateRandom(_S2, _S3, _S4), + CreateSequence(_WMI1), + CreateSequence(_WMI1), # retried step + CreateRandom(_S5, _S6), # it is completed now + CreateSequence(_S7), # it is completed now + ], + id="s1-p3-s1(1mi)", + ), + pytest.param( + [ + SingleStepGroup(_S1), + ParallelStepGroup(_S2, _S3, _S4), + ParallelStepGroup(_WMI1, _WMI2, _WMI3, _S5, _S6, _S7), + # below are not included when waiting for manual intervention + SingleStepGroup(_S8), + ParallelStepGroup(_S9, _S10), + ], + [ + CreateSequence(_S1), + CreateRandom(_S2, _S3, _S4), + CreateRandom(_WMI1, _WMI2, _WMI3, _S5, _S6, _S7), + ], + { + "SCH:{schedule_id}", + "SCH:{schedule_id}:GROUPS:test_op:0S:C", + "SCH:{schedule_id}:GROUPS:test_op:1P:C", + "SCH:{schedule_id}:GROUPS:test_op:2P:C", + "SCH:{schedule_id}:STEPS:test_op:0S:C:_S1", + "SCH:{schedule_id}:STEPS:test_op:1P:C:_S2", + "SCH:{schedule_id}:STEPS:test_op:1P:C:_S3", + "SCH:{schedule_id}:STEPS:test_op:1P:C:_S4", + "SCH:{schedule_id}:STEPS:test_op:2P:C:_S5", + "SCH:{schedule_id}:STEPS:test_op:2P:C:_S6", + "SCH:{schedule_id}:STEPS:test_op:2P:C:_S7", + "SCH:{schedule_id}:STEPS:test_op:2P:C:_WMI1", + "SCH:{schedule_id}:STEPS:test_op:2P:C:_WMI2", + "SCH:{schedule_id}:STEPS:test_op:2P:C:_WMI3", + }, + [ + CreateSequence(_S1), + CreateRandom(_S2, _S3, _S4), + CreateRandom(_WMI1, _WMI2, _WMI3, _S5, 
_S6, _S7), + CreateRandom(_WMI1, _WMI2, _WMI3), # retried steps + CreateSequence(_S8), # it is completed now + CreateRandom(_S9, _S10), # it is completed now + ], + id="s1-p3-p6(3mi)", + ), + ], +) +async def test_wait_for_manual_intervention( + reset_step_issue_tracker: None, + preserve_caplog_for_async_logging: None, + steps_call_order: list[tuple[str, str]], + selected_app: FastAPI, + register_operation: Callable[[OperationName, Operation], None], + operation: Operation, + operation_name: OperationName, + expected_order: list[BaseExpectedStepOrder], + expected_keys: set[str], + after_restart_expected_order: list[BaseExpectedStepOrder], +): + register_operation(operation_name, operation) + + schedule_id = await start_operation(selected_app, operation_name, {}) + assert TypeAdapter(ScheduleId).validate_python(schedule_id) + + formatted_expected_keys = {k.format(schedule_id=schedule_id) for k in expected_keys} + + await ensure_expected_order(steps_call_order, expected_order) + + await _ensure_keys_in_store(selected_app, expected_keys=formatted_expected_keys) + + # even if cancelled, state of waiting for manual intervention remains the same + with pytest.raises(CannotCancelWhileWaitingForManualInterventionError): + await cancel_operation(selected_app, schedule_id) + + await _ensure_keys_in_store(selected_app, expected_keys=formatted_expected_keys) + + # set step to no longer raise and restart the failed steps + steps_to_restart = _get_steps_matching_class( + operation, match=_WaitManualInerventionBS + ) + _GlobalStepIssueTracker.set_issue_solved() + await limited_gather( + *( + restart_operation_step_stuck_in_manual_intervention_during_create( + selected_app, schedule_id, step.get_step_name() + ) + for step in steps_to_restart + ), + limit=_PARALLEL_RESTARTS, + ) + # should finish schedule operation + await ensure_expected_order(steps_call_order, after_restart_expected_order) + await _ensure_keys_in_store(selected_app, expected_keys=set()) + + +@pytest.mark.parametrize("app_count", [10]) +@pytest.mark.parametrize( + "operation, expected_order, expected_keys, after_restart_expected_order", + [ + pytest.param( + [ + SingleStepGroup(_S1), + ParallelStepGroup(_S2, _S3, _S4), + SingleStepGroup(_FCR1), + # below are not included in any expected order + ParallelStepGroup(_S5, _S6), + SingleStepGroup(_S7), + ], + [ + CreateSequence(_S1), + CreateRandom(_S2, _S3, _S4), + CreateSequence(_FCR1), + UndoSequence(_FCR1), + ], + { + "SCH:{schedule_id}", + "SCH:{schedule_id}:GROUPS:test_op:0S:C", + "SCH:{schedule_id}:GROUPS:test_op:1P:C", + "SCH:{schedule_id}:GROUPS:test_op:2S:C", + "SCH:{schedule_id}:GROUPS:test_op:2S:U", + "SCH:{schedule_id}:STEPS:test_op:0S:C:_S1", + "SCH:{schedule_id}:STEPS:test_op:1P:C:_S2", + "SCH:{schedule_id}:STEPS:test_op:1P:C:_S3", + "SCH:{schedule_id}:STEPS:test_op:1P:C:_S4", + "SCH:{schedule_id}:STEPS:test_op:2S:C:_FCR1", + "SCH:{schedule_id}:STEPS:test_op:2S:U:_FCR1", + }, + [ + CreateSequence(_S1), + CreateRandom(_S2, _S3, _S4), + CreateSequence(_FCR1), + UndoSequence(_FCR1), + UndoSequence(_FCR1), # this one is retried + UndoRandom(_S2, _S3, _S4), + UndoSequence(_S1), + ], + id="s1-p3-s1(1r)", + ), + pytest.param( + [ + SingleStepGroup(_S1), + ParallelStepGroup(_S2, _S3, _S4), + ParallelStepGroup(_FCR1, _FCR2, _FCR3, _S5, _S6, _S7), + # below are not included in any expected order + SingleStepGroup(_S8), + ParallelStepGroup(_S9, _S10), + ], + [ + CreateSequence(_S1), + CreateRandom(_S2, _S3, _S4), + CreateRandom(_FCR1, _FCR2, _FCR3, _S5, _S6, _S7), + UndoRandom(_FCR1, _FCR2, 
_FCR3, _S5, _S6, _S7), + ], + { + "SCH:{schedule_id}", + "SCH:{schedule_id}:GROUPS:test_op:0S:C", + "SCH:{schedule_id}:GROUPS:test_op:1P:C", + "SCH:{schedule_id}:GROUPS:test_op:2P:C", + "SCH:{schedule_id}:GROUPS:test_op:2P:U", + "SCH:{schedule_id}:STEPS:test_op:0S:C:_S1", + "SCH:{schedule_id}:STEPS:test_op:1P:C:_S2", + "SCH:{schedule_id}:STEPS:test_op:1P:C:_S3", + "SCH:{schedule_id}:STEPS:test_op:1P:C:_S4", + "SCH:{schedule_id}:STEPS:test_op:2P:C:_S5", + "SCH:{schedule_id}:STEPS:test_op:2P:C:_S6", + "SCH:{schedule_id}:STEPS:test_op:2P:C:_S7", + "SCH:{schedule_id}:STEPS:test_op:2P:C:_FCR1", + "SCH:{schedule_id}:STEPS:test_op:2P:C:_FCR2", + "SCH:{schedule_id}:STEPS:test_op:2P:C:_FCR3", + "SCH:{schedule_id}:STEPS:test_op:2P:U:_S5", + "SCH:{schedule_id}:STEPS:test_op:2P:U:_S6", + "SCH:{schedule_id}:STEPS:test_op:2P:U:_S7", + "SCH:{schedule_id}:STEPS:test_op:2P:U:_FCR1", + "SCH:{schedule_id}:STEPS:test_op:2P:U:_FCR2", + "SCH:{schedule_id}:STEPS:test_op:2P:U:_FCR3", + }, + [ + CreateSequence(_S1), + CreateRandom(_S2, _S3, _S4), + CreateRandom(_FCR1, _FCR2, _FCR3, _S5, _S6, _S7), + UndoRandom(_FCR1, _FCR2, _FCR3, _S5, _S6, _S7), + UndoRandom(_FCR1, _FCR2, _FCR3), # retried steps + UndoRandom(_S2, _S3, _S4), + UndoSequence(_S1), + ], + id="s1-p3-p6(3r)", + ), + ], +) +async def test_restart_undo_operation_step_in_error( + reset_step_issue_tracker: None, + preserve_caplog_for_async_logging: None, + steps_call_order: list[tuple[str, str]], + selected_app: FastAPI, + register_operation: Callable[[OperationName, Operation], None], + operation: Operation, + operation_name: OperationName, + expected_order: list[BaseExpectedStepOrder], + expected_keys: set[str], + after_restart_expected_order: list[BaseExpectedStepOrder], +): + register_operation(operation_name, operation) + + schedule_id = await start_operation(selected_app, operation_name, {}) + assert TypeAdapter(ScheduleId).validate_python(schedule_id) + + formatted_expected_keys = {k.format(schedule_id=schedule_id) for k in expected_keys} + + await ensure_expected_order(steps_call_order, expected_order) + await _ensure_keys_in_store(selected_app, expected_keys=formatted_expected_keys) + + # give some time for the deferred runner to store the errors + # avoids flakiness + await asyncio.sleep(0.1) + + # set step to no longer raise and restart the failed steps + steps_to_restart = _get_steps_matching_class( + operation, match=_FailOnCreateAndUndoBS + ) + _GlobalStepIssueTracker.set_issue_solved() + await limited_gather( + *( + restart_operation_step_stuck_during_undo( + selected_app, schedule_id, step.get_step_name() + ) + for step in steps_to_restart + ), + limit=_PARALLEL_RESTARTS, + ) + # should finish schedule operation + await ensure_expected_order(steps_call_order, after_restart_expected_order) + await _ensure_keys_in_store(selected_app, expected_keys=set()) + + +@pytest.mark.parametrize("app_count", [10]) +@pytest.mark.parametrize("in_manual_intervention", [True, False]) +async def test_errors_with_restart_operation_step_in_error( + preserve_caplog_for_async_logging: None, + steps_call_order: list[tuple[str, str]], + selected_app: FastAPI, + register_operation: Callable[[OperationName, Operation], None], + operation_name: OperationName, + in_manual_intervention: bool, +): + operation: Operation = [ + SingleStepGroup(_S1), + ParallelStepGroup(_S2, _S3, _S4), + ParallelStepGroup(_SF1, _FCR1), # sleeps here forever + ] + register_operation(operation_name, operation) + + schedule_id = await start_operation(selected_app, operation_name, {}) + assert 
TypeAdapter(ScheduleId).validate_python(schedule_id) + + await ensure_expected_order( + steps_call_order, + [ + CreateSequence(_S1), + CreateRandom(_S2, _S3, _S4), + CreateRandom(_SF1, _FCR1), + ], + ) + # give some time for the deferred runner to store the errors + await asyncio.sleep(0.1) + + with pytest.raises(StepNameNotInCurrentGroupError): + await Core.get_from_app_state( + selected_app + ).restart_operation_step_stuck_in_error( + schedule_id, + _S5.get_step_name(), + in_manual_intervention=in_manual_intervention, + ) + + with pytest.raises(StepNotInErrorStateError): + await Core.get_from_app_state( + selected_app + ).restart_operation_step_stuck_in_error( + schedule_id, + _SF1.get_step_name(), + in_manual_intervention=in_manual_intervention, + ) + + if not in_manual_intervention: + # force restart of step as it would be in manual intervention + # this is not allowed + with pytest.raises(StepNotWaitingForManualInterventionError): + await Core.get_from_app_state( + selected_app + ).restart_operation_step_stuck_in_error( + schedule_id, + _FCR1.get_step_name(), + in_manual_intervention=True, + ) + + +@pytest.mark.parametrize("app_count", [10]) +@pytest.mark.parametrize( + "operation, initial_context, expected_order", + [ + pytest.param( + [ + SingleStepGroup(RPCtxS1), + ], + { + "bs__c_req_1": _CTX_VALUE, # required by create + }, + [ + CreateSequence(RPCtxS1), + ], + id="s1", + ), + pytest.param( + [ + ParallelStepGroup(RPCtxS1, RPCtxS2), + ], + { + "bs__c_req_1": _CTX_VALUE, # required by create + "bs__c_req_2": _CTX_VALUE, # required by create + }, + [ + CreateRandom(RPCtxS1, RPCtxS2), + ], + id="p2", + ), + pytest.param( + [ + SingleStepGroup(RPCtxR1), + ], + { + "bs_undo_c_req_1": _CTX_VALUE, # required by create + "bs_undo_r_req_1": _CTX_VALUE, # not created automatically since crete fails + }, + [ + CreateSequence(RPCtxR1), + UndoSequence(RPCtxR1), + ], + id="s1(1r)", + ), + pytest.param( + [ + ParallelStepGroup(RPCtxR1, RPCtxR2), + ], + { + "bs_undo_c_req_1": _CTX_VALUE, # required by create + "bs_undo_c_req_2": _CTX_VALUE, # required by create + "bs_undo_r_req_1": _CTX_VALUE, # not created automatically since crete fails + "bs_undo_r_req_2": _CTX_VALUE, # not created automatically since crete fails + }, + [ + CreateRandom(RPCtxR1, RPCtxR2), + UndoRandom(RPCtxR1, RPCtxR2), + ], + id="p2(2r)", + ), + ], +) +async def test_operation_context_usage( + preserve_caplog_for_async_logging: None, + caplog: pytest.LogCaptureFixture, + steps_call_order: list[tuple[str, str]], + selected_app: FastAPI, + register_operation: Callable[[OperationName, Operation], None], + operation: Operation, + operation_name: OperationName, + initial_context: OperationContext, + expected_order: list[BaseExpectedStepOrder], +): + caplog.at_level(logging.DEBUG) + caplog.clear() + + register_operation(operation_name, operation) + + schedule_id = await start_operation(selected_app, operation_name, initial_context) + assert TypeAdapter(ScheduleId).validate_python(schedule_id) + + # NOTE: might fail because it raised ProvidedOperationContextKeysAreMissingError check logs + await ensure_expected_order(steps_call_order, expected_order) + + await _ensure_keys_in_store(selected_app, expected_keys=set()) + + assert f"{OperationContextValueIsNoneError.__name__}" not in caplog.text + assert f"{ProvidedOperationContextKeysAreMissingError.__name__}" not in caplog.text + + +@pytest.mark.parametrize("app_count", [10]) +@pytest.mark.parametrize( + "operation, initial_context", + [ + pytest.param( + [ + 
SingleStepGroup(RPCtxS1), + ], + { + "bs__c_prov_1": _CTX_VALUE, # already provied by step creates issue + }, + id="s1", + ), + pytest.param( + [ + SingleStepGroup(RPCtxR1), + ], + { + "bs_undo_c_prov_1": _CTX_VALUE, # already provied by step creates issue + }, + id="s1", + ), + pytest.param( + [ + SingleStepGroup(RPCtxR1), + ], + { + "bs_undo_r_prov_1": _CTX_VALUE, # already provied by step creates issue + }, + id="s1", + ), + ], +) +async def test_operation_initial_context_using_key_provided_by_step( + preserve_caplog_for_async_logging: None, + selected_app: FastAPI, + register_operation: Callable[[OperationName, Operation], None], + operation: Operation, + operation_name: OperationName, + initial_context: OperationContext, +): + register_operation(operation_name, operation) + + with pytest.raises(InitialOperationContextKeyNotAllowedError): + await start_operation(selected_app, operation_name, initial_context) + + await _ensure_keys_in_store(selected_app, expected_keys=set()) + + +@pytest.mark.parametrize("app_count", [10]) +@pytest.mark.parametrize( + "operation, initial_context, expected_order", + [ + pytest.param( + [ + SingleStepGroup(RPCtxS1), + ], + { + # `bs__c_req_1` is missing + }, + [ + UndoSequence(RPCtxS1), + ], + id="missing_context_key", + ), + pytest.param( + [ + SingleStepGroup(RPCtxS1), + ], + { + "bs__c_req_1": None, + }, + [ + UndoSequence(RPCtxS1), + ], + id="context_key_is_none", + ), + ], +) +async def test_step_does_not_receive_context_key_or_is_none( + preserve_caplog_for_async_logging: None, + caplog: pytest.LogCaptureFixture, + steps_call_order: list[tuple[str, str]], + selected_app: FastAPI, + register_operation: Callable[[OperationName, Operation], None], + operation: Operation, + operation_name: OperationName, + initial_context: OperationContext, + expected_order: list[BaseExpectedStepOrder], +): + caplog.at_level(logging.DEBUG) + caplog.clear() + + register_operation(operation_name, operation) + + schedule_id = await start_operation(selected_app, operation_name, initial_context) + assert TypeAdapter(ScheduleId).validate_python(schedule_id) + + await _esnure_log_mesage(caplog, message=OperationContextValueIsNoneError.__name__) + + await ensure_expected_order(steps_call_order, expected_order) + + await _ensure_keys_in_store(selected_app, expected_keys=set()) + + +class _BadImplementedStep(BaseStep): + @classmethod + def _get_provided_context( + cls, required_context: RequiredOperationContext + ) -> ProvidedOperationContext: + print("GOT", required_context) + return_values = {} + to_return = required_context["to_return"] + if to_return["add_to_return"]: + return_values.update(to_return["keys"]) + + return return_values + + # CREATE + + @classmethod + def get_create_requires_context_keys(cls) -> set[str]: + return {"to_return", "trigger_undo"} + + @classmethod + def get_create_provides_context_keys(cls) -> set[str]: + return {"a_key"} + + @classmethod + async def create( + cls, app: FastAPI, required_context: RequiredOperationContext + ) -> ProvidedOperationContext | None: + print("INJECTED_CONTEXT_C", required_context) + _ = app + _STEPS_CALL_ORDER.append((cls.__name__, CREATED)) + + if required_context.get("trigger_undo"): + msg = "triggering undo" + raise RuntimeError(msg) + + return cls._get_provided_context(required_context) + + # UNDO + + @classmethod + def get_undo_requires_context_keys(cls) -> set[str]: + return {"to_return", "trigger_undo"} + + @classmethod + def get_undo_provides_context_keys(cls) -> set[str]: + return {"a_key"} + + @classmethod + 
async def undo( + cls, app: FastAPI, required_context: RequiredOperationContext + ) -> ProvidedOperationContext | None: + print("INJECTED_CONTEXT_R", required_context) + _ = app + _STEPS_CALL_ORDER.append((cls.__name__, UNDONE)) + + return cls._get_provided_context(required_context) + + +@pytest.mark.parametrize("app_count", [10]) +@pytest.mark.parametrize( + "operation, initial_context, expected_error_str, expected_order, expected_keys", + [ + pytest.param( + [ + SingleStepGroup(_BadImplementedStep), + ], + { + "trigger_undo": False, + "to_return": { + "add_to_return": True, + "keys": {"a_key": None}, + }, + }, + f"{OperationContextValueIsNoneError.__name__}: Values of context cannot be None: {{'a_key'", + [ + CreateSequence(_BadImplementedStep), + UndoSequence(_BadImplementedStep), + ], + { + "SCH:{schedule_id}", + "SCH:{schedule_id}:GROUPS:test_op:0S:C", + "SCH:{schedule_id}:GROUPS:test_op:0S:U", + "SCH:{schedule_id}:OP_CTX:test_op", + "SCH:{schedule_id}:STEPS:test_op:0S:C:_BadImplementedStep", + "SCH:{schedule_id}:STEPS:test_op:0S:U:_BadImplementedStep", + }, + id="create-returns-key-set-to-None", + ), + pytest.param( + [ + SingleStepGroup(_BadImplementedStep), + ], + { + "trigger_undo": False, + "to_return": { + "add_to_return": False, + }, + }, + f"{ProvidedOperationContextKeysAreMissingError.__name__}: Provided context {{}} is missing keys {{'a_key'", + [ + CreateSequence(_BadImplementedStep), + UndoSequence(_BadImplementedStep), + ], + { + "SCH:{schedule_id}", + "SCH:{schedule_id}:GROUPS:test_op:0S:C", + "SCH:{schedule_id}:GROUPS:test_op:0S:U", + "SCH:{schedule_id}:OP_CTX:test_op", + "SCH:{schedule_id}:STEPS:test_op:0S:C:_BadImplementedStep", + "SCH:{schedule_id}:STEPS:test_op:0S:U:_BadImplementedStep", + }, + id="create-does-not-set-the-key-to-return", + ), + pytest.param( + [ + SingleStepGroup(_BadImplementedStep), + ], + { + "trigger_undo": True, + "to_return": { + "add_to_return": True, + "keys": {"a_key": None}, + }, + }, + f"{OperationContextValueIsNoneError.__name__}: Values of context cannot be None: {{'a_key'", + [ + CreateSequence(_BadImplementedStep), + UndoSequence(_BadImplementedStep), + ], + { + "SCH:{schedule_id}", + "SCH:{schedule_id}:GROUPS:test_op:0S:C", + "SCH:{schedule_id}:GROUPS:test_op:0S:U", + "SCH:{schedule_id}:OP_CTX:test_op", + "SCH:{schedule_id}:STEPS:test_op:0S:C:_BadImplementedStep", + "SCH:{schedule_id}:STEPS:test_op:0S:U:_BadImplementedStep", + }, + id="undo-returns-key-set-to-None", + ), + pytest.param( + [ + SingleStepGroup(_BadImplementedStep), + ], + { + "trigger_undo": True, + "to_return": { + "add_to_return": False, + }, + }, + f"{ProvidedOperationContextKeysAreMissingError.__name__}: Provided context {{}} is missing keys {{'a_key'", + [ + CreateSequence(_BadImplementedStep), + UndoSequence(_BadImplementedStep), + ], + { + "SCH:{schedule_id}", + "SCH:{schedule_id}:GROUPS:test_op:0S:C", + "SCH:{schedule_id}:GROUPS:test_op:0S:U", + "SCH:{schedule_id}:OP_CTX:test_op", + "SCH:{schedule_id}:STEPS:test_op:0S:C:_BadImplementedStep", + "SCH:{schedule_id}:STEPS:test_op:0S:U:_BadImplementedStep", + }, + id="undo-does-not-set-the-key-to-return", + ), + ], +) +async def test_step_does_not_provide_declared_key_or_is_none( + preserve_caplog_for_async_logging: None, + caplog: pytest.LogCaptureFixture, + steps_call_order: list[tuple[str, str]], + selected_app: FastAPI, + register_operation: Callable[[OperationName, Operation], None], + operation: Operation, + operation_name: OperationName, + initial_context: OperationContext, + expected_error_str: str, + 
expected_order: list[BaseExpectedStepOrder], + expected_keys: set[str], +): + caplog.at_level(logging.DEBUG) + caplog.clear() + + register_operation(operation_name, operation) + + schedule_id = await start_operation(selected_app, operation_name, initial_context) + assert TypeAdapter(ScheduleId).validate_python(schedule_id) + + await _esnure_log_mesage(caplog, message=expected_error_str) + + await ensure_expected_order(steps_call_order, expected_order) + + formatted_expected_keys = {k.format(schedule_id=schedule_id) for k in expected_keys} + await _ensure_keys_in_store(selected_app, expected_keys=formatted_expected_keys) diff --git a/services/dynamic-scheduler/tests/unit/services/generic_scheduler/test__deferred_runner.py b/services/dynamic-scheduler/tests/unit/services/generic_scheduler/test__deferred_runner.py new file mode 100644 index 000000000000..af48fc6b1989 --- /dev/null +++ b/services/dynamic-scheduler/tests/unit/services/generic_scheduler/test__deferred_runner.py @@ -0,0 +1,316 @@ +# pylint:disable=protected-access +# pylint:disable=redefined-outer-name +# pylint:disable=unused-argument + + +import asyncio +from collections.abc import AsyncIterable +from enum import Enum +from typing import ClassVar +from unittest.mock import AsyncMock + +import pytest +from fastapi import FastAPI +from pydantic import NonNegativeInt +from pytest_mock import MockerFixture +from pytest_simcore.helpers.typing_env import EnvVarsDict +from settings_library.rabbit import RabbitSettings +from settings_library.redis import RedisSettings +from simcore_service_dynamic_scheduler.services.generic_scheduler._deferred_runner import ( + DeferredRunner, +) +from simcore_service_dynamic_scheduler.services.generic_scheduler._errors import ( + NoDataFoundError, +) +from simcore_service_dynamic_scheduler.services.generic_scheduler._models import ( + OperationName, + ProvidedOperationContext, + RequiredOperationContext, + ScheduleId, + StepStatus, +) +from simcore_service_dynamic_scheduler.services.generic_scheduler._operation import ( + BaseStep, + BaseStepGroup, + Operation, + OperationRegistry, + SingleStepGroup, +) +from simcore_service_dynamic_scheduler.services.generic_scheduler._store import ( + ScheduleDataStoreProxy, + StepStoreProxy, + Store, +) +from tenacity import ( + AsyncRetrying, + retry_if_exception_type, + stop_after_delay, + wait_fixed, +) + +pytest_simcore_core_services_selection = [ + "rabbit", + "redis", +] +pytest_simcore_ops_services_selection = [ + "redis-commander", +] + + +@pytest.fixture +def app_environment( + disable_postgres_lifespan: None, + disable_service_tracker_lifespan: None, + disable_notifier_lifespan: None, + disable_status_monitor_lifespan: None, + app_environment: EnvVarsDict, + rabbit_service: RabbitSettings, + redis_service: RedisSettings, + remove_redis_data: None, +) -> EnvVarsDict: + return app_environment + + +@pytest.fixture +def store(app: FastAPI) -> Store: + return Store.get_from_app_state(app) + + +@pytest.fixture +def schedule_id() -> ScheduleId: + return "a-schedule-id" + + +@pytest.fixture +async def operation_name() -> OperationName: + return "an-operation" + + +@pytest.fixture +async def registed_operation( + operation_name: OperationName, operation: Operation +) -> AsyncIterable[None]: + OperationRegistry.register(operation_name, operation) + yield + OperationRegistry.unregister(operation_name) + + +@pytest.fixture +def mock_enqueue_event(mocker: MockerFixture) -> AsyncMock: + mock = AsyncMock() + mocker.patch( + 
"simcore_service_dynamic_scheduler.services.generic_scheduler._deferred_runner.enqueue_schedule_event", + mock, + ) + return mock + + +async def _assert_finshed_with_status( + step_proxy: StepStoreProxy, expected_status: StepStatus +) -> None: + async for attempt in AsyncRetrying( + wait=wait_fixed(0.1), + stop=stop_after_delay(10), + reraise=True, + retry=retry_if_exception_type((AssertionError, NoDataFoundError)), + ): + with attempt: + assert await step_proxy.read("status") == expected_status + + +class _StepResultStore: + _STORE: ClassVar[dict[str, str]] = {} + + @classmethod + def set_result(cls, key: str, value: str) -> None: + cls._STORE[key] = value + + @classmethod + def get_result(cls, key: str) -> str: + return cls._STORE[key] + + @classmethod + def clear(cls) -> None: + cls._STORE.clear() + + +class _StepFinisheWithSuccess(BaseStep): + @classmethod + async def create( + cls, app: FastAPI, required_context: RequiredOperationContext + ) -> ProvidedOperationContext | None: + _ = app + _ = required_context + _StepResultStore.set_result(cls.__name__, "created") + return {} + + @classmethod + async def undo( + cls, app: FastAPI, required_context: RequiredOperationContext + ) -> ProvidedOperationContext | None: + _ = app + _ = required_context + _StepResultStore.set_result(cls.__name__, "destroyed") + return {} + + +class _StepFinisheError(BaseStep): + @classmethod + async def create( + cls, app: FastAPI, required_context: RequiredOperationContext + ) -> ProvidedOperationContext | None: + _ = app + _ = required_context + _StepResultStore.set_result(cls.__name__, "created") + msg = "I failed creating" + raise RuntimeError(msg) + + @classmethod + async def undo( + cls, app: FastAPI, required_context: RequiredOperationContext + ) -> ProvidedOperationContext | None: + _ = app + _ = required_context + _StepResultStore.set_result(cls.__name__, "destroyed") + msg = "I failed destorying" + raise RuntimeError(msg) + + +class _StepLongRunningToCancel(BaseStep): + @classmethod + async def create( + cls, app: FastAPI, required_context: RequiredOperationContext + ) -> ProvidedOperationContext | None: + _ = app + _ = required_context + _StepResultStore.set_result(cls.__name__, "created") + await asyncio.sleep(10000) + return {} + + @classmethod + async def undo( + cls, app: FastAPI, required_context: RequiredOperationContext + ) -> ProvidedOperationContext | None: + _ = app + _ = required_context + _StepResultStore.set_result(cls.__name__, "destroyed") + await asyncio.sleep(10000) + return {} + + +class _Action(str, Enum): + DO_NOTHING = "NOTHING" + CANCEL = "CANCEL" + + +def _get_step_group( + operation_name: OperationName, group_index: NonNegativeInt +) -> BaseStepGroup: + assert operation_name in OperationRegistry._OPERATIONS # noqa: SLF001 + + operation = OperationRegistry._OPERATIONS[operation_name][ # noqa: SLF001 + "operation" + ] + operations_count = len(operation) + assert group_index < operations_count + + return operation[group_index] + + +@pytest.mark.parametrize( + "operation, expected_step_status, action, expected_steps_count", + [ + ( + [ + SingleStepGroup(_StepFinisheWithSuccess), + ], + StepStatus.SUCCESS, + _Action.DO_NOTHING, + 1, + ), + ( + [ + SingleStepGroup(_StepFinisheError), + ], + StepStatus.FAILED, + _Action.DO_NOTHING, + 1, + ), + ( + [ + SingleStepGroup(_StepLongRunningToCancel), + ], + StepStatus.CANCELLED, + _Action.CANCEL, + 1, + ), + ], +) +@pytest.mark.parametrize("is_creating", [True, False]) +async def test_something( + mock_enqueue_event: AsyncMock, + 
registed_operation: None, + app: FastAPI, + store: Store, + schedule_id: ScheduleId, + operation_name: OperationName, + expected_step_status: StepStatus, + is_creating: bool, + action: _Action, + expected_steps_count: NonNegativeInt, +) -> None: + + # setup + schedule_data_proxy = ScheduleDataStoreProxy(store=store, schedule_id=schedule_id) + await schedule_data_proxy.create_or_update_multiple( + {"operation_name": operation_name, "group_index": 0, "is_creating": is_creating} + ) + + step_group = _get_step_group(operation_name, 0) + + step_group_name = step_group.get_step_group_name(index=0) + + steps = step_group.get_step_subgroup_to_run() + assert len(steps) == 1 + step = steps[0] + + step_name = step.get_step_name() + + step_proxy = StepStoreProxy( + store=store, + schedule_id=schedule_id, + operation_name=operation_name, + step_group_name=step_group_name, + step_name=step_name, + is_creating=is_creating, + ) + + ### tests starts here + + await DeferredRunner.start( + schedule_id=schedule_id, + operation_name=operation_name, + step_group_name=step_group_name, + step_name=step_name, + is_creating=is_creating, + expected_steps_count=expected_steps_count, + ) + + if action == _Action.CANCEL: + await asyncio.sleep(0.2) # give it some time to start + + task_uid = await step_proxy.read("deferred_task_uid") + await DeferredRunner.cancel(task_uid) + + await _assert_finshed_with_status(step_proxy, expected_step_status) + + assert _StepResultStore.get_result(step.__name__) == ( + "created" if is_creating else "destroyed" + ) + + if expected_step_status == StepStatus.FAILED: + error_traceback = await step_proxy.read("error_traceback") + assert "I failed" in error_traceback + + # ensure called once with arguments + assert mock_enqueue_event.call_args_list == [((app, schedule_id),)] diff --git a/services/dynamic-scheduler/tests/unit/services/generic_scheduler/test__event_scheduler.py b/services/dynamic-scheduler/tests/unit/services/generic_scheduler/test__event_scheduler.py new file mode 100644 index 000000000000..33aa2d906204 --- /dev/null +++ b/services/dynamic-scheduler/tests/unit/services/generic_scheduler/test__event_scheduler.py @@ -0,0 +1,136 @@ +# pylint:disable=redefined-outer-name +# pylint:disable=unused-argument + +import asyncio +from collections.abc import Awaitable, Callable +from unittest.mock import Mock, call +from uuid import uuid4 + +import pytest +from fastapi import FastAPI +from pydantic import TypeAdapter +from pytest_mock import MockerFixture +from pytest_simcore.helpers.typing_env import EnvVarsDict +from settings_library.rabbit import RabbitSettings +from simcore_service_dynamic_scheduler.services.generic_scheduler._event_scheduler import ( + EventScheduler, +) +from simcore_service_dynamic_scheduler.services.generic_scheduler._models import ( + ScheduleId, +) +from tenacity import ( + AsyncRetrying, + retry_if_exception_type, + stop_after_delay, + wait_fixed, +) + +pytest_simcore_core_services_selection = [ + "rabbit", +] + + +@pytest.fixture +def disable_other_generic_scheduler_modules(mocker: MockerFixture) -> None: + # these also use redis + generic_scheduler_module = ( + "simcore_service_dynamic_scheduler.services.generic_scheduler" + ) + mocker.patch(f"{generic_scheduler_module}._lifespan.Core", autospec=True) + mocker.patch(f"{generic_scheduler_module}._lifespan.Store", autospec=True) + + +@pytest.fixture +def app_environment( + disable_other_generic_scheduler_modules: None, + disable_redis_lifespan: None, + disable_postgres_lifespan: None, + 
disable_service_tracker_lifespan: None, + disable_deferred_manager_lifespan: None, + disable_notifier_lifespan: None, + disable_status_monitor_lifespan: None, + app_environment: EnvVarsDict, + rabbit_service: RabbitSettings, +) -> EnvVarsDict: + return app_environment + + +@pytest.fixture +def event_scheduler(app: FastAPI) -> EventScheduler: + return app.state.generic_scheduler_event_scheduler + + +@pytest.fixture +def get_mock_safe_on_schedule_event( + mocker: MockerFixture, +) -> Callable[[Callable[[ScheduleId], Awaitable[None]]], Mock]: + + def _(side_effect: Callable[[ScheduleId], Awaitable[None]]) -> Mock: + another_mock = Mock() + + async def _mock(schedule_id: ScheduleId) -> None: + await side_effect(schedule_id) + another_mock(schedule_id) + + core_mock = Mock() + core_mock.safe_on_schedule_event = _mock + mocker.patch( + "simcore_service_dynamic_scheduler.services.generic_scheduler._event_scheduler.Core.get_from_app_state", + return_value=core_mock, + ) + return another_mock + + return _ + + +async def _side_effect_nothing(schedule_id: ScheduleId) -> None: + pass + + +async def _side_effect_raise_error(schedule_id: ScheduleId) -> None: + msg = "always failing here as requesed" + raise RuntimeError(msg) + + +async def test_event_scheduling( + get_mock_safe_on_schedule_event: Callable[ + [Callable[[ScheduleId], Awaitable[None]]], Mock + ], + event_scheduler: EventScheduler, +) -> None: + mock = get_mock_safe_on_schedule_event(_side_effect_nothing) + + schedule_id = TypeAdapter(ScheduleId).validate_python(f"{uuid4()}") + await event_scheduler.enqueue_schedule_event(schedule_id) + + async for attempt in AsyncRetrying( + wait=wait_fixed(0.1), + stop=stop_after_delay(5), + retry=retry_if_exception_type(AssertionError), + ): + with attempt: + await asyncio.sleep(0) # wait for envet to trigger + assert mock.call_args_list == [call(schedule_id)] + + +async def test_event_scheduling_raises_error( + get_mock_safe_on_schedule_event: Callable[ + [Callable[[ScheduleId], Awaitable[None]]], Mock + ], + event_scheduler: EventScheduler, + caplog: pytest.LogCaptureFixture, +) -> None: + caplog.clear() + get_mock_safe_on_schedule_event(_side_effect_raise_error) + + schedule_id = TypeAdapter(ScheduleId).validate_python(f"{uuid4()}") + await event_scheduler.enqueue_schedule_event(schedule_id) + + async for attempt in AsyncRetrying( + wait=wait_fixed(0.1), + stop=stop_after_delay(5), + retry=retry_if_exception_type(AssertionError), + ): + with attempt: + await asyncio.sleep(0) # wait for envet to trigger + assert "Unexpected error. 
Aborting message retry" in caplog.text diff --git a/services/dynamic-scheduler/tests/unit/services/generic_scheduler/test__operation.py b/services/dynamic-scheduler/tests/unit/services/generic_scheduler/test__operation.py new file mode 100644 index 000000000000..e6b199c2d15c --- /dev/null +++ b/services/dynamic-scheduler/tests/unit/services/generic_scheduler/test__operation.py @@ -0,0 +1,206 @@ +# pylint: disable=protected-access + +import pytest +from fastapi import FastAPI +from simcore_service_dynamic_scheduler.services.generic_scheduler._errors import ( + OperationAlreadyRegisteredError, + OperationNotFoundError, + StepNotFoundInoperationError, +) +from simcore_service_dynamic_scheduler.services.generic_scheduler._models import ( + ProvidedOperationContext, + RequiredOperationContext, +) +from simcore_service_dynamic_scheduler.services.generic_scheduler._operation import ( + BaseStep, + Operation, + OperationRegistry, + ParallelStepGroup, + SingleStepGroup, + _validate_operation, +) + + +class BaseBS(BaseStep): + @classmethod + async def create( + cls, app: FastAPI, required_context: RequiredOperationContext + ) -> ProvidedOperationContext | None: + _ = app + _ = required_context + + +class BS1(BaseBS): ... + + +class BS2(BaseBS): ... + + +class BS3(BaseBS): ... + + +class MI1(BaseBS): + @classmethod + def wait_for_manual_intervention(cls) -> bool: + return True + + +class WrongBS1C(BaseBS): + @classmethod + def get_create_provides_context_keys(cls) -> set[str]: + return {"create_key"} + + +class WrongBS2C(BaseBS): + @classmethod + def get_create_provides_context_keys(cls) -> set[str]: + return {"create_key"} + + +class WrongBS1R(BaseBS): + @classmethod + def get_undo_provides_context_keys(cls) -> set[str]: + return {"undo_key"} + + +class WrongBS2R(BaseBS): + @classmethod + def get_undo_provides_context_keys(cls) -> set[str]: + return {"undo_key"} + + +@pytest.mark.parametrize( + "operation", + [ + [ + SingleStepGroup(BS1), + ParallelStepGroup(BS2, BS3), + ], + [ + SingleStepGroup(BS1), + ], + [ + SingleStepGroup(BS1), + SingleStepGroup(BS2), + ], + [ + SingleStepGroup(WrongBS1C), + SingleStepGroup(WrongBS1R), + ], + [ + ParallelStepGroup(WrongBS2C, WrongBS2R), + ], + [ + SingleStepGroup(BS2), + ParallelStepGroup(BS1, BS3, repeat_steps=True), + ], + [ + ParallelStepGroup(BS1, BS3), + SingleStepGroup(BS2, repeat_steps=True), + ], + [ + SingleStepGroup(BS1, repeat_steps=True), + ], + [ + ParallelStepGroup(BS1, BS3, repeat_steps=True), + ], + ], +) +def test_validate_operation_passes(operation: Operation): + _validate_operation(operation) + + +@pytest.mark.parametrize( + "operation, match", + [ + ([], "List should have at least 1 item after validation"), + ( + [ + SingleStepGroup(BS1, repeat_steps=True), + SingleStepGroup(BS2), + ], + "Only the last step group can have repeat_steps=True", + ), + ( + [ + SingleStepGroup(BS1), + SingleStepGroup(BS1), + ], + f"step_name='{BS1.__name__}' is already used in this operation", + ), + ( + [ + ParallelStepGroup(BS2, BS2), + ], + f"step_name='{BS2.__name__}' is already used in this operation", + ), + ( + [ + ParallelStepGroup(BS1), + ], + f"{ParallelStepGroup.__name__} needs at least 2 steps", + ), + ( + [SingleStepGroup(WrongBS1C), SingleStepGroup(WrongBS2C)], + f"already provided key='create_key' in {BaseStep.get_create_provides_context_keys.__name__}", + ), + ( + [ParallelStepGroup(WrongBS1C, WrongBS2C)], + f"already provided key='create_key' in {BaseStep.get_create_provides_context_keys.__name__}", + ), + ( + [SingleStepGroup(WrongBS1R), 
SingleStepGroup(WrongBS2R)], + f"already provided key='undo_key' in {BaseStep.get_undo_provides_context_keys.__name__}", + ), + ( + [ParallelStepGroup(WrongBS1R, WrongBS2R)], + f"already provided key='undo_key' in {BaseStep.get_undo_provides_context_keys.__name__}", + ), + ( + [SingleStepGroup(MI1, repeat_steps=True)], + "cannot have steps that require manual intervention", + ), + ( + [ + ParallelStepGroup(MI1, BS1, BS2, repeat_steps=True), + ], + "cannot have steps that require manual intervention", + ), + ], +) +def test_validate_operations_fails(operation: Operation, match: str): + with pytest.raises(ValueError, match=match): + _validate_operation(operation) + + +def test_operation_registry_workflow(): + operation: Operation = [SingleStepGroup(BS1)] + OperationRegistry.register("op1", operation) + assert len(OperationRegistry._OPERATIONS) == 1 # noqa: SLF001 + + assert OperationRegistry.get_operation("op1") == operation + + assert OperationRegistry.get_step("op1", "BS1") == BS1 + + OperationRegistry.unregister("op1") + assert len(OperationRegistry._OPERATIONS) == 0 # noqa: SLF001 + + +def test_operation_registry_raises_errors(): + operation: Operation = [SingleStepGroup(BS1)] + OperationRegistry.register("op1", operation) + + with pytest.raises(OperationAlreadyRegisteredError): + OperationRegistry.register("op1", operation) + + with pytest.raises(OperationNotFoundError): + OperationRegistry.get_operation("non_existing") + + with pytest.raises(OperationNotFoundError): + OperationRegistry.unregister("non_existing") + + with pytest.raises(OperationNotFoundError): + OperationRegistry.get_step("non_existing", "BS1") + + with pytest.raises(StepNotFoundInoperationError): + OperationRegistry.get_step("op1", "non_existing") diff --git a/services/dynamic-scheduler/tests/unit/services/generic_scheduler/test__store.py b/services/dynamic-scheduler/tests/unit/services/generic_scheduler/test__store.py new file mode 100644 index 000000000000..fb0c08bf1b4a --- /dev/null +++ b/services/dynamic-scheduler/tests/unit/services/generic_scheduler/test__store.py @@ -0,0 +1,383 @@ +# pylint: disable=protected-access +# pylint: disable=redefined-outer-name +# pylint: disable=unused-argument + +from collections.abc import AsyncIterable +from typing import Any + +import pytest +from faker import Faker +from servicelib.deferred_tasks import TaskUID +from servicelib.redis._utils import handle_redis_returns_union_types +from settings_library.redis import RedisSettings +from simcore_service_dynamic_scheduler.services.generic_scheduler._models import ( + OperationErrorType, + ProvidedOperationContext, + ScheduleId, + StepStatus, +) +from simcore_service_dynamic_scheduler.services.generic_scheduler._store import ( + OperationContextProxy, + OperationRemovalProxy, + ScheduleDataStoreProxy, + StepGroupProxy, + StepStoreProxy, + Store, +) + + +@pytest.fixture +async def store(use_in_memory_redis: RedisSettings) -> AsyncIterable[Store]: + store = Store(use_in_memory_redis) + await store.setup() + yield store + await store.redis.client().flushall() + await store.shutdown() + + +async def _assert_keys(store: Store, expected_keys: set[str]) -> None: + keys = set(await store.redis.keys()) + assert keys == expected_keys + + +async def _assert_keys_in_hash( + store: Store, hash_key: str, expected_keys: set[str] +) -> None: + keys = set(await handle_redis_returns_union_types(store.redis.hkeys(hash_key))) + assert keys == expected_keys + + +async def test_store_workflow(store: Store): + # save single value + await 
store.set_key_in_hash("hash1", "key1", "value1") + await _assert_keys(store, {"hash1"}) + await _assert_keys_in_hash(store, "hash1", {"key1"}) + assert await store.get_key_from_hash("hash1", "key1") == ("value1",) + assert await store.get_key_from_hash("hash1", "key1", "key1") == ( + "value1", + "value1", + ) + assert await store.get_key_from_hash("hash1", "missing1", "missing2") == ( + None, + None, + ) + + # remove last key in hash + await store.delete_key_from_hash("hash1", "key1") + await _assert_keys(store, set()) + await _assert_keys_in_hash(store, "hash1", set()) + assert await store.get_key_from_hash("hash1", "key1") == (None,) + + # save multiple values + await store.set_keys_in_hash("hash2", {"key1": "value1", "key2": 2, "key3": True}) + await _assert_keys(store, {"hash2"}) + await _assert_keys_in_hash(store, "hash2", {"key1", "key2", "key3"}) + assert await store.get_key_from_hash("hash2", "key1", "key2", "key3") == ( + "value1", + 2, + True, + ) + + # delete a few keys form hash + await store.delete_key_from_hash( + "hash2", "key1", "key3", "missing1", "missing2", "missing3" + ) + await _assert_keys(store, {"hash2"}) + await _assert_keys_in_hash(store, "hash2", {"key2"}) + assert await store.get_key_from_hash("hash2", "key1", "key2", "key3") == ( + None, + 2, + None, + ) + + # increase a key in the hahs + assert await store.increase_key_in_hash_and_get("hash2", "key4") == 1 + assert await store.increase_key_in_hash_and_get("hash2", "key4") == 2 + assert await store.increase_key_in_hash_and_get("hash2", "key4") == 3 + assert await store.increase_key_in_hash_and_get("hash2", "key4") == 4 + + # remove hash completely + await store.delete("hash2") + await _assert_keys(store, set()) + await _assert_keys_in_hash(store, "hash2", set()) + assert await store.get_key_from_hash("hash2", "key1", "key2", "key3") == ( + None, + None, + None, + ) + + +@pytest.mark.parametrize( + "value", + [ + 1, + "a", + 3.14, + True, + None, + {"dict": "with_data"}, + [1, 2, 3], + ], +) +async def test_store_supporse_multiple_python_base_types(store: Store, value: Any): + # values are stored and recovered in their original type + await store.set_key_in_hash("hash1", "key1", value) + assert (await store.get_key_from_hash("hash1", "key1")) == (value,) + + +@pytest.fixture +def schedule_id(faker: Faker) -> ScheduleId: + return faker.uuid4() + + +async def test_schedule_data_store_proxy(store: Store, schedule_id: ScheduleId): + proxy = ScheduleDataStoreProxy(store=store, schedule_id=schedule_id) + hash_key = f"SCH:{schedule_id}" + + # set + await proxy.create_or_update("operation_name", "op1") + await proxy.create_or_update("group_index", 1) + await proxy.create_or_update("is_creating", value=True) + await _assert_keys(store, {hash_key}) + await _assert_keys_in_hash( + store, hash_key, {"operation_name", "group_index", "is_creating"} + ) + + # get + assert await proxy.read("operation_name") == "op1" + assert await proxy.read("group_index") == 1 + assert await proxy.read("is_creating") is True + + # remove + await proxy.delete_keys("operation_name", "is_creating", "group_index") + await _assert_keys(store, set()) + await _assert_keys_in_hash(store, hash_key, set()) + + # set multiple + await proxy.create_or_update_multiple( + { + "group_index": 2, + "is_creating": False, + "operation_error_type": OperationErrorType.STEP_ISSUE, + "operation_error_message": "mock_error_message", + } + ) + await _assert_keys(store, {hash_key}) + await _assert_keys_in_hash( + store, + hash_key, + { + "group_index", + "is_creating", 
+ "operation_error_type", + "operation_error_message", + }, + ) + + # remove all keys an even missing ones + await proxy.delete_keys( + "operation_name", + "is_creating", + "group_index", + "operation_error_type", + "operation_error_message", + ) + await _assert_keys(store, set()) + await _assert_keys_in_hash(store, hash_key, set()) + + +@pytest.mark.parametrize("is_creating", [True, False]) +@pytest.mark.parametrize("use_remove", [True, False]) +async def test_steps_store_proxy( + store: Store, schedule_id: ScheduleId, is_creating: bool, use_remove: bool +): + proxy = StepStoreProxy( + store=store, + schedule_id=schedule_id, + operation_name="op1", + step_group_name="sg1", + step_name="step", + is_creating=is_creating, + ) + is_creating_str = "C" if is_creating else "U" + hash_key = f"SCH:{schedule_id}:STEPS:op1:sg1:{is_creating_str}:step" + + # set + await proxy.create_or_update("status", StepStatus.RUNNING) + await _assert_keys(store, {hash_key}) + await _assert_keys_in_hash(store, hash_key, {"status"}) + + # get + assert await proxy.read("status") == StepStatus.RUNNING + + # remove + await proxy.delete_keys("status") + await _assert_keys(store, set()) + await _assert_keys_in_hash(store, hash_key, set()) + + # set multiple + await proxy.create_or_update_multiple( + { + "status": StepStatus.SUCCESS, + "deferred_task_uid": TaskUID("mytask"), + "error_traceback": "mock_traceback", + "requires_manual_intervention": True, + "deferred_created": True, + } + ) + await _assert_keys(store, {hash_key}) + await _assert_keys_in_hash( + store, + hash_key, + { + "status", + "deferred_task_uid", + "error_traceback", + "requires_manual_intervention", + "deferred_created", + }, + ) + + # remove all keys an even missing ones + if use_remove: + await proxy.delete() + else: + await proxy.delete_keys( + "status", + "deferred_task_uid", + "error_traceback", + "requires_manual_intervention", + "deferred_created", + ) + await _assert_keys(store, set()) + await _assert_keys_in_hash(store, hash_key, set()) + + +@pytest.mark.parametrize("is_creating", [True, False]) +async def test_step_group_proxy( + store: Store, + schedule_id: ScheduleId, + is_creating: bool, +): + step_group_proxy = StepGroupProxy( + store=store, + schedule_id=schedule_id, + operation_name="op1", + step_group_name="sg1", + is_creating=is_creating, + ) + + async def _get_steps_count() -> int | None: + (response,) = await store.get_key_from_hash( + step_group_proxy._get_hash_key(), "done_steps" # noqa: SLF001 + ) + return response + + assert await _get_steps_count() is None + + for _ in range(10): + await step_group_proxy.increment_and_get_done_steps_count() + assert await _get_steps_count() == 1 + await step_group_proxy.decrement_and_get_done_steps_count() + assert await _get_steps_count() == 0 + + await step_group_proxy.delete() + assert await _get_steps_count() is None + + +@pytest.mark.parametrize( + "provided_context", + [ + {}, + { + "k1": "v1", + "k2": 2, + "k3": True, + "k4": None, + "k5": 3.14, + "k6": {"a": "b"}, + "k7": [1, 2, 3], + }, + ], +) +async def test_operation_context_proxy( + store: Store, schedule_id: ScheduleId, provided_context: ProvidedOperationContext +): + proxy = OperationContextProxy( + store=store, schedule_id=schedule_id, operation_name="op1" + ) + hash_key = f"SCH:{schedule_id}:OP_CTX:op1" + + await _assert_keys(store, set()) + await _assert_keys_in_hash(store, hash_key, set()) + + await proxy.create_or_update(provided_context) + + await _assert_keys(store, set() if len(provided_context) == 0 else {hash_key}) + 
await _assert_keys_in_hash(store, hash_key, set(provided_context.keys())) + + assert await proxy.read(*provided_context.keys()) == provided_context + + +async def test_operation_removal_proxy(store: Store, schedule_id: ScheduleId): + await _assert_keys(store, set()) + + proxy = ScheduleDataStoreProxy(store=store, schedule_id=schedule_id) + await proxy.create_or_update_multiple( + { + "group_index": 1, + "is_creating": True, + "operation_error_type": OperationErrorType.STEP_ISSUE, + "operation_error_message": "mock_error_message", + "operation_name": "op1", + } + ) + + proxy = StepStoreProxy( + store=store, + schedule_id=schedule_id, + operation_name="op1", + step_group_name="sg1", + step_name="step", + is_creating=True, + ) + await proxy.create_or_update_multiple( + { + "deferred_created": True, + "status": StepStatus.SUCCESS, + "deferred_task_uid": TaskUID("mytask"), + "requires_manual_intervention": True, + "error_traceback": "mock_traceback", + } + ) + + proxy = StepGroupProxy( + store=store, + schedule_id=schedule_id, + operation_name="op1", + step_group_name="sg1", + is_creating=True, + ) + await proxy.increment_and_get_done_steps_count() + + proxy = OperationContextProxy( + store=store, schedule_id=schedule_id, operation_name="op1" + ) + await proxy.create_or_update({"k1": "v1", "k2": 2}) + + await _assert_keys( + store, + { + f"SCH:{schedule_id}", + f"SCH:{schedule_id}:GROUPS:op1:sg1:C", + f"SCH:{schedule_id}:OP_CTX:op1", + f"SCH:{schedule_id}:STEPS:op1:sg1:C:step", + }, + ) + + proxy = OperationRemovalProxy(store=store, schedule_id=schedule_id) + await proxy.delete() + await _assert_keys(store, set()) + + # try to call when empty as well + await proxy.delete() diff --git a/services/dynamic-scheduler/tests/unit/services/generic_scheduler/test_generic_scheduler.py b/services/dynamic-scheduler/tests/unit/services/generic_scheduler/test_generic_scheduler.py new file mode 100644 index 000000000000..44084f36a912 --- /dev/null +++ b/services/dynamic-scheduler/tests/unit/services/generic_scheduler/test_generic_scheduler.py @@ -0,0 +1,338 @@ +# pylint:disable=redefined-outer-name +# pylint:disable=unused-argument + +import asyncio +import secrets +from collections.abc import AsyncIterable, AsyncIterator, Callable, Iterable +from contextlib import ( + AbstractAsyncContextManager, + asynccontextmanager, +) +from datetime import timedelta +from enum import Enum +from multiprocessing import Process, Queue +from typing import Any, Final + +import pytest +from asgi_lifespan import LifespanManager +from common_library.async_tools import cancel_wait_task +from fastapi import FastAPI +from pydantic import NonNegativeFloat, NonNegativeInt +from pytest_simcore.helpers.paused_container import pause_rabbit, pause_redis +from pytest_simcore.helpers.typing_env import EnvVarsDict +from servicelib.deferred_tasks import DeferredContext +from servicelib.rabbitmq import RabbitMQClient +from servicelib.redis._client import RedisClientSDK +from settings_library.rabbit import RabbitSettings +from settings_library.redis import RedisDatabase, RedisSettings +from simcore_service_dynamic_scheduler.core.application import create_app +from simcore_service_dynamic_scheduler.services.generic_scheduler import ( + BaseStep, + Operation, + OperationName, + ParallelStepGroup, + ProvidedOperationContext, + RequiredOperationContext, + SingleStepGroup, + start_operation, +) +from utils import ( + BaseExpectedStepOrder, + CreateRandom, + CreateSequence, + ensure_expected_order, +) + +pytest_simcore_core_services_selection = [ + 
"rabbit", + "redis", +] +pytest_simcore_ops_services_selection = [ + "redis-commander", +] + + +_OPERATION_MIN_RUNTIME: Final[timedelta] = timedelta(seconds=2) +_OPERATION_STEPS_COUNT: Final[NonNegativeInt] = 10 +_STEP_SLEEP_DURATION: Final[timedelta] = _OPERATION_MIN_RUNTIME / _OPERATION_STEPS_COUNT + + +def _get_random_interruption_duration() -> NonNegativeFloat: + random_duration = secrets.SystemRandom().uniform( + 0.1, _OPERATION_MIN_RUNTIME.total_seconds() + ) + print(f"⏳ Waiting {random_duration:.1f} seconds before interrupting...") + return random_duration + + +@pytest.fixture +def app_environment( + disable_postgres_lifespan: None, + disable_service_tracker_lifespan: None, + disable_notifier_lifespan: None, + disable_status_monitor_lifespan: None, + app_environment: EnvVarsDict, + rabbit_service: RabbitSettings, + redis_service: RedisSettings, + remove_redis_data: None, +) -> EnvVarsDict: + return app_environment + + +@pytest.fixture +async def rabbit_client( + create_rabbitmq_client: Callable[[str], RabbitMQClient], +) -> RabbitMQClient: + return create_rabbitmq_client("pinger") + + +@pytest.fixture +async def redis_client_sdk( + redis_service: RedisSettings, +) -> RedisClientSDK: + return RedisClientSDK( + redis_service.build_redis_dsn(RedisDatabase.DYNAMIC_SERVICES), + client_name="test-client", + ) + + +class _AsyncMultiprocessingQueue: + def __init__(self) -> None: + self._queue = Queue() + + async def get(self) -> Any: + loop = asyncio.get_running_loop() + return await loop.run_in_executor(None, self._queue.get) + + async def put(self, item: Any) -> None: + loop = asyncio.get_running_loop() + await loop.run_in_executor(None, self._queue.put, item) + + +@pytest.fixture +async def multiprocessing_queue() -> _AsyncMultiprocessingQueue: + return _AsyncMultiprocessingQueue() + + +class _QueuePoller: + def __init__(self, queue: _AsyncMultiprocessingQueue) -> None: + self._events: list[tuple[str, str]] = [] + self.queue = queue + + @property + def events(self) -> list[tuple[str, str]]: + return self._events + + async def poll_worker(self) -> None: + while True: + event = await self.queue.get() + self._events.append(event) + if event is None: + break + + async def reset(self) -> None: + self._events.clear() + + +@pytest.fixture +async def queue_poller( + multiprocessing_queue: _AsyncMultiprocessingQueue, +) -> AsyncIterable[_QueuePoller]: + poller = _QueuePoller(multiprocessing_queue) + task = asyncio.create_task(poller.poll_worker(), name="queue-poller") + + yield poller + + await multiprocessing_queue.put(None) # unblock queue if needed + await cancel_wait_task(task) + + +@asynccontextmanager +async def _get_app( + multiprocessing_queue: _AsyncMultiprocessingQueue, +) -> AsyncIterator[FastAPI]: + app = create_app() + app.state.multiprocessing_queue = multiprocessing_queue + async with LifespanManager(app): + yield app + + +class _ProcessManager: + def __init__(self, multiprocessing_queue: _AsyncMultiprocessingQueue) -> None: + self.multiprocessing_queue = multiprocessing_queue + self.process: Process | None = None + + async def _async_worker(self, operation_name: OperationName) -> None: + async with _get_app(self.multiprocessing_queue) as app: + await start_operation(app, operation_name, {}) + while True: # noqa: ASYNC110 + await asyncio.sleep(1) + + def _worker(self, operation_name: OperationName) -> None: + asyncio.run(self._async_worker(operation_name)) + + def start(self, operation_name: OperationName) -> None: + if self.process: + msg = "Process already started" + raise 
RuntimeError(msg) + + self.process = Process(target=self._worker, args=(operation_name,), daemon=True) + self.process.start() + + def kill(self) -> None: + if self.process is None: + return + + self.process.terminate() + self.process.join() + self.process = None + + +@pytest.fixture +def process_manager( + multiprocessing_queue: _AsyncMultiprocessingQueue, +) -> Iterable[_ProcessManager]: + process_manager = _ProcessManager(multiprocessing_queue) + + yield process_manager + process_manager.kill() + + +class _InterruptionType(str, Enum): + REDIS = "redis" + RABBIT = "rabbit" + DYNAMIC_SCHEDULER = "dynamic-scheduler" + + +_CREATED: Final[str] = "create" +_UNDONE: Final[str] = "undo" + +_CTX_VALUE: Final[str] = "a_value" + + +class _BS(BaseStep): + @classmethod + async def get_create_retries(cls, context: DeferredContext) -> int: + _ = context + return 10 + + @classmethod + async def get_create_wait_between_attempts( + cls, context: DeferredContext + ) -> timedelta: + _ = context + return _STEP_SLEEP_DURATION + + @classmethod + async def create( + cls, app: FastAPI, required_context: RequiredOperationContext + ) -> ProvidedOperationContext | None: + multiprocessing_queue: _AsyncMultiprocessingQueue = ( + app.state.multiprocessing_queue + ) + await multiprocessing_queue.put((cls.__name__, _CREATED)) + + return { + **required_context, + **{k: _CTX_VALUE for k in cls.get_create_provides_context_keys()}, + } + + @classmethod + async def undo( + cls, app: FastAPI, required_context: RequiredOperationContext + ) -> ProvidedOperationContext | None: + multiprocessing_queue: _AsyncMultiprocessingQueue = ( + app.state.multiprocessing_queue + ) + await multiprocessing_queue.put((cls.__name__, _UNDONE)) + + return { + **required_context, + **{k: _CTX_VALUE for k in cls.get_undo_provides_context_keys()}, + } + + +class _BS1(_BS): ... + + +class _BS2(_BS): ... + + +class _BS3(_BS): ... 
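# NOTE: in the test below, _BS1.._BS3 are minimal steps whose create()/undo() push a
# (step name, action) event to app.state.multiprocessing_queue and echo back the context
# keys they declare to provide; the parent test process collects those events with
# _QueuePoller and asserts their order via ensure_expected_order, while _ProcessManager
# runs the scheduler app in a child process so it can be killed and restarted to
# simulate a scheduler crash.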
+ + +@pytest.mark.parametrize( + "operation, expected_order", + [ + pytest.param( + [ + SingleStepGroup(_BS1), + ], + [ + CreateSequence(_BS1), + ], + id="s1", + ), + pytest.param( + [ + ParallelStepGroup(_BS1, _BS2, _BS3), + ], + [ + CreateRandom(_BS1, _BS2, _BS3), + ], + id="p3", + ), + ], +) +@pytest.mark.parametrize("interruption_type", list(_InterruptionType)) +async def test_can_recover_from_interruption( + app_environment: EnvVarsDict, + interruption_type: _InterruptionType, + rabbit_client: RabbitMQClient, + redis_client_sdk: RedisClientSDK, + register_operation: Callable[[OperationName, Operation], None], + paused_container: Callable[[str], AbstractAsyncContextManager[None]], + operation: Operation, + queue_poller: _QueuePoller, + process_manager: _ProcessManager, + expected_order: list[BaseExpectedStepOrder], +) -> None: + operation_name: OperationName = "test_op" + register_operation(operation_name, operation) + process_manager.start(operation_name) + + match interruption_type: + case _InterruptionType.REDIS: + print(f"[{interruption_type}]: will pause ⚙️") + async with pause_redis(paused_container, redis_client_sdk): + print(f"[{interruption_type}]: paused ⏸️") + + await asyncio.sleep(_get_random_interruption_duration()) + print(f"[{interruption_type}]: unpaused ⏯️") + case _InterruptionType.RABBIT: + print(f"[{interruption_type}]: will pause ⚙️") + async with pause_rabbit(paused_container, rabbit_client): + print(f"[{interruption_type}]: paused ⏸️") + + await asyncio.sleep(_get_random_interruption_duration()) + print(f"[{interruption_type}]: unpaused ⏯️") + case _InterruptionType.DYNAMIC_SCHEDULER: + print(f"[{interruption_type}]: will pause ⚙️") + process_manager.kill() + print(f"[{interruption_type}]: paused ⏸️") + + await asyncio.sleep(_get_random_interruption_duration()) + process_manager.start(operation_name) + print(f"[{interruption_type}]: unpaused ⏯️") + case _: + msg = f"Unhandled interruption_type={interruption_type}" + raise RuntimeError(msg) + + await ensure_expected_order(queue_poller.events, expected_order) + + +# TODO: add a test that replaces a running operation with a new one; make sure nothing bad happens and that the old +# running operation manages to reach the end + + +# The only way to do it is by cancelling the existing operation and waiting for it to finish before running something new.
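The ensure_expected_order call above relies on the generic_scheduler test utilities introduced in the next file (utils.py). A minimal usage sketch, assuming get_step_name() resolves to the step class name (which is what the queue events above record via cls.__name__):

    detected_calls = [
        ("_BS1", "create"),
        ("_BS2", "create"),
        ("_BS2", "undo"),
        ("_BS1", "undo"),
    ]
    await ensure_expected_order(
        detected_calls,
        [
            CreateSequence(_BS1, _BS2),  # created strictly in this order
            UndoSequence(_BS2, _BS1),  # undone strictly in reverse order
        ],
    )

CreateRandom and UndoRandom work the same way but accept any ordering within the group, which is what the parallel "p3" case above exercises.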
diff --git a/services/dynamic-scheduler/tests/unit/services/generic_scheduler/utils.py b/services/dynamic-scheduler/tests/unit/services/generic_scheduler/utils.py new file mode 100644 index 000000000000..075f301688ac --- /dev/null +++ b/services/dynamic-scheduler/tests/unit/services/generic_scheduler/utils.py @@ -0,0 +1,127 @@ +import asyncio +from copy import deepcopy +from typing import Any, Final + +from simcore_service_dynamic_scheduler.services.generic_scheduler import BaseStep +from tenacity import ( + AsyncRetrying, + retry_if_exception_type, + stop_after_delay, + wait_fixed, +) + +_RETRY_PARAMS: Final[dict[str, Any]] = { + "wait": wait_fixed(0.1), + "stop": stop_after_delay(5), + "retry": retry_if_exception_type(AssertionError), +} + +CREATED: Final[str] = "create" +UNDONE: Final[str] = "undo" + + +class BaseExpectedStepOrder: + def __init__(self, *steps: type[BaseStep]) -> None: + self.steps = steps + + def __len__(self) -> int: + return len(self.steps) + + def __repr__(self) -> str: + return f"{self.__class__.__name__}({', '.join(step.get_step_name() for step in self.steps)})" + + +class CreateSequence(BaseExpectedStepOrder): + """steps appear in a sequence as CREATE""" + + +class CreateRandom(BaseExpectedStepOrder): + """steps appear in any given order as CREATE""" + + +class UndoSequence(BaseExpectedStepOrder): + """steps appear in a sequence as UNDO""" + + +class UndoRandom(BaseExpectedStepOrder): + """steps appear in any given order as UNDO""" + + +def _assert_order_sequence( + remaining_call_order: list[tuple[str, str]], + steps: tuple[type[BaseStep], ...], + *, + expected: str, +) -> None: + for step in steps: + step_name, actual = remaining_call_order.pop(0) + assert step_name == step.get_step_name() + assert actual == expected + + +def _assert_order_random( + remaining_call_order: list[tuple[str, str]], + steps: tuple[type[BaseStep], ...], + *, + expected: str, +) -> None: + steps_names = {step.get_step_name() for step in steps} + for _ in steps: + step_name, actual = remaining_call_order.pop(0) + assert step_name in steps_names + assert actual == expected + steps_names.remove(step_name) + + +def _assert_expected_order( + steps_call_order: list[tuple[str, str]], + expected_order: list[BaseExpectedStepOrder], + *, + use_only_first_entries: bool, + use_only_last_entries: bool, +) -> None: + assert not (use_only_first_entries and use_only_last_entries) + + expected_order_length = sum(len(x) for x in expected_order) + + # the operations below are destructive; work on a copy + call_order = deepcopy(steps_call_order) + + if use_only_first_entries: + call_order = call_order[:expected_order_length] + if use_only_last_entries: + call_order = call_order[-expected_order_length:] + + assert len(call_order) == expected_order_length + + for group in expected_order: + if isinstance(group, CreateSequence): + _assert_order_sequence(call_order, group.steps, expected=CREATED) + elif isinstance(group, CreateRandom): + _assert_order_random(call_order, group.steps, expected=CREATED) + elif isinstance(group, UndoSequence): + _assert_order_sequence(call_order, group.steps, expected=UNDONE) + elif isinstance(group, UndoRandom): + _assert_order_random(call_order, group.steps, expected=UNDONE) + else: + msg = f"Unknown {group=}" + raise NotImplementedError(msg) + assert not call_order, f"Leftovers {call_order=}" + + +async def ensure_expected_order( + detected_calls: list[tuple[str, str]], + expected_order: list[BaseExpectedStepOrder], + *, + use_only_first_entries: bool = False, + use_only_last_entries: bool 
= False, +) -> None: + async for attempt in AsyncRetrying(**_RETRY_PARAMS): + with attempt: + await asyncio.sleep(0) # wait for the event to trigger + _assert_expected_order( + detected_calls, + expected_order, + use_only_first_entries=use_only_first_entries, + use_only_last_entries=use_only_last_entries, + ) diff --git a/services/dynamic-scheduler/tests/unit/service_tracker/test__api.py b/services/dynamic-scheduler/tests/unit/services/service_tracker/test__api.py similarity index 99% rename from services/dynamic-scheduler/tests/unit/service_tracker/test__api.py rename to services/dynamic-scheduler/tests/unit/services/service_tracker/test__api.py index b1041b1a8963..aa8b91bce65a 100644 --- a/services/dynamic-scheduler/tests/unit/service_tracker/test__api.py +++ b/services/dynamic-scheduler/tests/unit/services/service_tracker/test__api.py @@ -52,6 +52,7 @@ @pytest.fixture def app_environment( + disable_generic_scheduler_lifespan: None, disable_postgres_lifespan: None, disable_rabbitmq_lifespan: None, disable_deferred_manager_lifespan: None, diff --git a/services/dynamic-scheduler/tests/unit/service_tracker/test__models.py b/services/dynamic-scheduler/tests/unit/services/service_tracker/test__models.py similarity index 100% rename from services/dynamic-scheduler/tests/unit/service_tracker/test__models.py rename to services/dynamic-scheduler/tests/unit/services/service_tracker/test__models.py diff --git a/services/dynamic-scheduler/tests/unit/service_tracker/test__tracker.py b/services/dynamic-scheduler/tests/unit/services/service_tracker/test__tracker.py similarity index 98% rename from services/dynamic-scheduler/tests/unit/service_tracker/test__tracker.py rename to services/dynamic-scheduler/tests/unit/services/service_tracker/test__tracker.py index 818a724c77d0..76089824e799 100644 --- a/services/dynamic-scheduler/tests/unit/service_tracker/test__tracker.py +++ b/services/dynamic-scheduler/tests/unit/services/service_tracker/test__tracker.py @@ -35,6 +35,7 @@ def disable_monitor_task(mocker: MockerFixture) -> None: @pytest.fixture def app_environment( + disable_generic_scheduler_lifespan: None, disable_postgres_lifespan: None, disable_monitor_task: None, disable_rabbitmq_lifespan: None, diff --git a/services/dynamic-scheduler/tests/unit/status_monitor/test_services_status_monitor__monitor.py b/services/dynamic-scheduler/tests/unit/services/status_monitor/test__monitor.py similarity index 100% rename from services/dynamic-scheduler/tests/unit/status_monitor/test_services_status_monitor__monitor.py rename to services/dynamic-scheduler/tests/unit/services/status_monitor/test__monitor.py diff --git a/services/dynamic-scheduler/tests/unit/test_services_catalog.py b/services/dynamic-scheduler/tests/unit/services/test_catalog.py similarity index 98% rename from services/dynamic-scheduler/tests/unit/test_services_catalog.py rename to services/dynamic-scheduler/tests/unit/services/test_catalog.py index c54222fdab45..865ca16005cd 100644 --- a/services/dynamic-scheduler/tests/unit/test_services_catalog.py +++ b/services/dynamic-scheduler/tests/unit/services/test_catalog.py @@ -21,6 +21,7 @@ @pytest.fixture def app_environment( + disable_generic_scheduler_lifespan: None, disable_postgres_lifespan: None, disable_redis_lifespan: None, disable_rabbitmq_lifespan: None, diff --git a/services/dynamic-scheduler/tests/unit/test_services_director_v0.py b/services/dynamic-scheduler/tests/unit/services/test_director_v0.py similarity index 98% rename from 
services/dynamic-scheduler/tests/unit/test_services_director_v0.py rename to services/dynamic-scheduler/tests/unit/services/test_director_v0.py index a24f2b7a5eda..f6d5f0841a12 100644 --- a/services/dynamic-scheduler/tests/unit/test_services_director_v0.py +++ b/services/dynamic-scheduler/tests/unit/services/test_director_v0.py @@ -20,6 +20,7 @@ @pytest.fixture def app_environment( + disable_generic_scheduler_lifespan: None, disable_postgres_lifespan: None, disable_redis_lifespan: None, disable_rabbitmq_lifespan: None, diff --git a/services/dynamic-scheduler/tests/unit/test_services_rabbitmq.py b/services/dynamic-scheduler/tests/unit/services/test_rabbitmq.py similarity index 96% rename from services/dynamic-scheduler/tests/unit/test_services_rabbitmq.py rename to services/dynamic-scheduler/tests/unit/services/test_rabbitmq.py index bdc5fe73fa31..3c4ba1670c05 100644 --- a/services/dynamic-scheduler/tests/unit/test_services_rabbitmq.py +++ b/services/dynamic-scheduler/tests/unit/services/test_rabbitmq.py @@ -20,6 +20,7 @@ @pytest.fixture def app_environment( + disable_generic_scheduler_lifespan: None, disable_postgres_lifespan: None, disable_redis_lifespan: None, disable_service_tracker_lifespan: None, diff --git a/services/dynamic-scheduler/tests/unit/test_services_redis.py b/services/dynamic-scheduler/tests/unit/services/test_redis.py similarity index 95% rename from services/dynamic-scheduler/tests/unit/test_services_redis.py rename to services/dynamic-scheduler/tests/unit/services/test_redis.py index 54a8ad29cc75..1c3b3c9a40c6 100644 --- a/services/dynamic-scheduler/tests/unit/test_services_redis.py +++ b/services/dynamic-scheduler/tests/unit/services/test_redis.py @@ -15,6 +15,7 @@ @pytest.fixture def app_environment( + disable_generic_scheduler_lifespan: None, disable_postgres_lifespan: None, disable_rabbitmq_lifespan: None, disable_deferred_manager_lifespan: None, diff --git a/services/dynamic-scheduler/tests/unit/test_repository_postgres_networks.py b/services/dynamic-scheduler/tests/unit/test_repository_postgres_networks.py index 9ed34d603d42..f5995492be6a 100644 --- a/services/dynamic-scheduler/tests/unit/test_repository_postgres_networks.py +++ b/services/dynamic-scheduler/tests/unit/test_repository_postgres_networks.py @@ -1,5 +1,6 @@ # pylint:disable=contextmanager-generator-missing-cleanup # pylint:disable=redefined-outer-name +# pylint:disable=too-many-arguments # pylint:disable=unused-argument from collections.abc import AsyncIterator @@ -42,6 +43,7 @@ @pytest.fixture def app_environment( app_environment: EnvVarsDict, + disable_generic_scheduler_lifespan: None, postgres_db: sa.engine.Engine, postgres_host_config: PostgresTestConfig, disable_rabbitmq_lifespan: None,