From 3bc2dfab6f78d591b66d00833fa02c2aac39ad5c Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 23 Oct 2025 08:20:47 +0200 Subject: [PATCH 1/5] removed log_catch + refactor --- .../computational_sidecar/docker_utils.py | 74 ++++++++++--------- 1 file changed, 39 insertions(+), 35 deletions(-) diff --git a/services/dask-sidecar/src/simcore_service_dask_sidecar/computational_sidecar/docker_utils.py b/services/dask-sidecar/src/simcore_service_dask_sidecar/computational_sidecar/docker_utils.py index b4b7aad93366..9d06f0613f2b 100644 --- a/services/dask-sidecar/src/simcore_service_dask_sidecar/computational_sidecar/docker_utils.py +++ b/services/dask-sidecar/src/simcore_service_dask_sidecar/computational_sidecar/docker_utils.py @@ -272,6 +272,11 @@ async def _parse_container_docker_logs( s3_settings: S3Settings | None, progress_bar: ProgressBarData, ) -> None: + """ + + Raises: + TimeoutError: raised when no logs are received for longer than _AIODOCKER_LOGS_TIMEOUT_S + """ with log_context( logger, logging.DEBUG, "started monitoring of >=1.0 service - using docker logs" ): @@ -348,41 +353,40 @@ async def _monitor_container_logs( # noqa: PLR0913 # pylint: disable=too-many-a are retrieved using the usual cli 'docker logs CONTAINERID' """ - with log_catch(logger, reraise=False): - container_info = await container.show() - container_name = container_info.get("Name", "undefined") - with log_context( - logger, - logging.INFO, - f"parse logs of {service_key}:{service_version} - {container.id}-{container_name}", - ): - if integration_version > LEGACY_INTEGRATION_VERSION: - await _parse_container_docker_logs( - container=container, - progress_regexp=progress_regexp, - service_key=service_key, - service_version=service_version, - container_name=container_name, - task_publishers=task_publishers, - log_file_url=log_file_url, - log_publishing_cb=log_publishing_cb, - s3_settings=s3_settings, - progress_bar=progress_bar, - ) - else: - await _parse_container_log_file( - container=container, - progress_regexp=progress_regexp, - service_key=service_key, - service_version=service_version, - container_name=container_name, - task_publishers=task_publishers, - task_volumes=task_volumes, - log_file_url=log_file_url, - log_publishing_cb=log_publishing_cb, - s3_settings=s3_settings, - progress_bar=progress_bar, - ) + container_info = await container.show() + container_name = container_info.get("Name", "undefined") + with log_context( + logger, + logging.INFO, + f"parse logs of {service_key}:{service_version} - {container.id}-{container_name}", + ): + if integration_version > LEGACY_INTEGRATION_VERSION: + await _parse_container_docker_logs( + container=container, + progress_regexp=progress_regexp, + service_key=service_key, + service_version=service_version, + container_name=container_name, + task_publishers=task_publishers, + log_file_url=log_file_url, + log_publishing_cb=log_publishing_cb, + s3_settings=s3_settings, + progress_bar=progress_bar, + ) + else: + await _parse_container_log_file( + container=container, + progress_regexp=progress_regexp, + service_key=service_key, + service_version=service_version, + container_name=container_name, + task_publishers=task_publishers, + task_volumes=task_volumes, + log_file_url=log_file_url, + log_publishing_cb=log_publishing_cb, + s3_settings=s3_settings, + progress_bar=progress_bar, + ) @contextlib.asynccontextmanager From d140027d41c00593abd7424727d0671911866ec9 Mon Sep 17 00:00:00 2001 From: sanderegg 
<35365065+sanderegg@users.noreply.github.com> Date: Thu, 23 Oct 2025 08:39:57 +0200 Subject: [PATCH 2/5] created new error --- .../container_tasks/errors.py | 8 + .../computational_sidecar/docker_utils.py | 165 ++++++++++-------- 2 files changed, 102 insertions(+), 71 deletions(-) diff --git a/packages/dask-task-models-library/src/dask_task_models_library/container_tasks/errors.py b/packages/dask-task-models-library/src/dask_task_models_library/container_tasks/errors.py index 1d4a48bc535e..ccb472b3f6e4 100644 --- a/packages/dask-task-models-library/src/dask_task_models_library/container_tasks/errors.py +++ b/packages/dask-task-models-library/src/dask_task_models_library/container_tasks/errors.py @@ -25,3 +25,11 @@ class ServiceInputsUseFileToKeyMapButReceivesZipDataError( "The service {service_key}:{service_version} {input} uses a file-to-key {file_to_key_map} map but receives zip data instead. " "TIP: either pass a single file or zip file and remove the file-to-key map parameter." ) + + +class ServiceTimeoutLoggingError(ServiceRuntimeError): + msg_template = ( + "The service {service_key}:{service_version}" + " running in container {container_id} was detected as hanging and forcefully terminated by the platform. " + "This happened because it exceeded the maximum allowed time of {timeout_timedelta} without producing any logs." + ) diff --git a/services/dask-sidecar/src/simcore_service_dask_sidecar/computational_sidecar/docker_utils.py b/services/dask-sidecar/src/simcore_service_dask_sidecar/computational_sidecar/docker_utils.py index 9d06f0613f2b..801257b6a15c 100644 --- a/services/dask-sidecar/src/simcore_service_dask_sidecar/computational_sidecar/docker_utils.py +++ b/services/dask-sidecar/src/simcore_service_dask_sidecar/computational_sidecar/docker_utils.py @@ -1,5 +1,6 @@ import asyncio import contextlib +import datetime import logging import re import socket @@ -22,6 +23,7 @@ from aiodocker.containers import DockerContainer from aiodocker.volumes import DockerVolume from dask_task_models_library.container_tasks.docker import DockerBasicAuth +from dask_task_models_library.container_tasks.errors import ServiceTimeoutLoggingError from dask_task_models_library.container_tasks.protocol import ( ContainerCommands, ContainerEnvsDict, @@ -56,7 +58,7 @@ ) from .task_shared_volume import TaskSharedVolumes -logger = logging.getLogger(__name__) +_logger = logging.getLogger(__name__) LogPublishingCB = Callable[[LogMessageStr, LogLevelInt], Coroutine[Any, Any, None]] @@ -102,7 +104,7 @@ async def create_container_config( NanoCPUs=nano_cpus_limit, ), ) - logger.debug("Container configuration: \n%s", pformat(config.model_dump())) + _logger.debug("Container configuration: \n%s", pformat(config.model_dump())) return config @@ -113,7 +115,7 @@ async def managed_container( container = None try: with log_context( - logger, logging.DEBUG, msg=f"managing container {name} for {config.image}" + _logger, logging.DEBUG, msg=f"managing container {name} for {config.image}" ): container = await docker_client.containers.create( config.model_dump(by_alias=True), name=name @@ -121,7 +123,7 @@ async def managed_container( yield container except asyncio.CancelledError: if container: - logger.warning( + _logger.warning( "Cancelling run of container %s, for %s", container.id, config.image ) raise @@ -129,14 +131,14 @@ async def managed_container( try: if container: with log_context( - logger, + _logger, logging.DEBUG, msg=f"Removing container {name}:{container.id} for {config.image}", ): await 
container.delete(remove=True, v=True, force=True) - logger.info("Completed run of %s", config.image) + _logger.info("Completed run of %s", config.image) except DockerError: - logger.exception( + _logger.exception( "Unknown error with docker client when removing container '%s'", container or name, ) @@ -166,7 +168,7 @@ def _guess_progress_value(progress_match: re.Match[str]) -> float: async def _try_parse_progress( line: str, *, progress_regexp: re.Pattern[str] ) -> float | None: - with log_catch(logger, reraise=False): + with log_catch(_logger, reraise=False): # pattern might be like "timestamp log" log = line.strip("\n") splitted_log = log.split(" ", maxsplit=1) @@ -215,14 +217,14 @@ async def _parse_container_log_file( # noqa: PLR0913 # pylint: disable=too-many ) -> None: log_file = task_volumes.logs_folder / LEGACY_SERVICE_LOG_FILE_NAME with log_context( - logger, + _logger, logging.DEBUG, "started monitoring of pre-1.0 service - using log file in /logs folder", ): - async with aiofiles.open(log_file, mode="rt") as file_pointer: + async with aiofiles.open(log_file) as file_pointer: while (await container.show())["State"]["Running"]: if line := await file_pointer.readline(): - logger.info( + _logger.info( "[%s]: %s", f"{service_key}:{service_version} - {container.id}{container_name}", line, @@ -236,7 +238,7 @@ async def _parse_container_log_file( # noqa: PLR0913 # pylint: disable=too-many # finish reading the logs if possible async for line in file_pointer: - logger.info( + _logger.info( "[%s]: %s", f"{service_key}:{service_version} - {container.id}{container_name}", line, @@ -275,62 +277,76 @@ async def _parse_container_docker_logs( """ Raises: - TimeoutError: raised when no logs are received for longer than _AIODOCKER_LOGS_TIMEOUT_S + ServiceTimeoutLoggingError: raised when no logs are received for longer than _AIODOCKER_LOGS_TIMEOUT_S """ - with log_context( - logger, logging.DEBUG, "started monitoring of >=1.0 service - using docker logs" - ): - assert isinstance(container.docker.connector, aiohttp.UnixConnector) # nosec - async with Docker( - session=aiohttp.ClientSession( - connector=aiohttp.UnixConnector(container.docker.connector.path), - timeout=aiohttp.ClientTimeout(total=_AIODOCKER_LOGS_TIMEOUT_S), - ) - ) as docker_client_for_logs: - # NOTE: this is a workaround for aiodocker not being able to get the container - # logs when the container is not running - container_for_long_running_logs = ( - await docker_client_for_logs.containers.get(container.id) - ) - # NOTE: this is a workaround for aiodocker not being able to get the container - # logs when the container is not running - await container.show() - await container_for_long_running_logs.show() - async with aiofiles.tempfile.TemporaryDirectory() as tmp_dir: - log_file_path = ( - Path(tmp_dir) - / f"{service_key.split(sep='/')[-1]}_{service_version}.logs" + try: + with log_context( + _logger, + logging.DEBUG, + "started monitoring of >=1.0 service - using docker logs", + ): + assert isinstance( + container.docker.connector, aiohttp.UnixConnector + ) # nosec + async with Docker( + session=aiohttp.ClientSession( + connector=aiohttp.UnixConnector(container.docker.connector.path), + timeout=aiohttp.ClientTimeout(total=_AIODOCKER_LOGS_TIMEOUT_S), ) - log_file_path.parent.mkdir(parents=True, exist_ok=True) - async with aiofiles.open(log_file_path, mode="wb+") as log_fp: - async for log_line in cast( - AsyncGenerator[str, None], - container_for_long_running_logs.log( - stdout=True, - stderr=True, - follow=True, - timestamps=True, - 
), - ): - log_msg_without_timestamp = log_line.split(" ", maxsplit=1)[1] - logger.info( - "[%s]: %s", - f"{service_key}:{service_version} - {container_for_long_running_logs.id}{container_name}", - log_msg_without_timestamp, - ) - await log_fp.write(log_line.encode("utf-8")) - # NOTE: here we remove the timestamp, only needed for the file - await _parse_and_publish_logs( - log_msg_without_timestamp, - task_publishers=task_publishers, - progress_regexp=progress_regexp, - progress_bar=progress_bar, - ) - - # copy the log file to the log_file_url - await push_file_to_remote( - log_file_path, log_file_url, log_publishing_cb, s3_settings + ) as docker_client_for_logs: + # NOTE: this is a workaround for aiodocker not being able to get the container + # logs when the container is not running + container_for_long_running_logs = ( + await docker_client_for_logs.containers.get(container.id) ) + # NOTE: this is a workaround for aiodocker not being able to get the container + # logs when the container is not running + await container.show() + await container_for_long_running_logs.show() + async with aiofiles.tempfile.TemporaryDirectory() as tmp_dir: + log_file_path = ( + Path(tmp_dir) + / f"{service_key.split(sep='/')[-1]}_{service_version}.logs" + ) + log_file_path.parent.mkdir(parents=True, exist_ok=True) + async with aiofiles.open(log_file_path, mode="wb+") as log_fp: + async for log_line in cast( + AsyncGenerator[str, None], + container_for_long_running_logs.log( + stdout=True, + stderr=True, + follow=True, + timestamps=True, + ), + ): + log_msg_without_timestamp = log_line.split(" ", maxsplit=1)[ + 1 + ] + _logger.info( + "[%s]: %s", + f"{service_key}:{service_version} - {container_for_long_running_logs.id}{container_name}", + log_msg_without_timestamp, + ) + await log_fp.write(log_line.encode("utf-8")) + # NOTE: here we remove the timestamp, only needed for the file + await _parse_and_publish_logs( + log_msg_without_timestamp, + task_publishers=task_publishers, + progress_regexp=progress_regexp, + progress_bar=progress_bar, + ) + + # copy the log file to the log_file_url + await push_file_to_remote( + log_file_path, log_file_url, log_publishing_cb, s3_settings + ) + except TimeoutError as e: + raise ServiceTimeoutLoggingError( + service_key=service_key, + service_version=service_version, + container_id=container.id, + timeout_timedelta=datetime.timedelta(seconds=_AIODOCKER_LOGS_TIMEOUT_S), + ) from e async def _monitor_container_logs( # noqa: PLR0913 # pylint: disable=too-many-arguments @@ -351,12 +367,15 @@ async def _monitor_container_logs( # noqa: PLR0913 # pylint: disable=too-many-a that must be available in task_volumes.log / log.dat Services above are not creating a file and use the usual docker logging. 
These logs are retrieved using the usual cli 'docker logs CONTAINERID'
+
+    Raises: ServiceTimeoutLoggingError if no logs are received for longer than _AIODOCKER_LOGS_TIMEOUT_S,
+        or any other error raised while parsing the container logs
     """
 
     container_info = await container.show()
     container_name = container_info.get("Name", "undefined")
     with log_context(
-        logger,
+        _logger,
         logging.INFO,
         f"parse logs of {service_key}:{service_version} - {container.id}-{container_name}",
     ):
@@ -403,6 +422,10 @@ async def managed_monitor_container_log_task(  # noqa: PLR0913 # pylint: disable
     s3_settings: S3Settings | None,
     progress_bar: ProgressBarData,
 ) -> AsyncIterator[Awaitable[None]]:
+    """
+    Raises:
+        ServiceTimeoutLoggingError -- raised when no logs are received for longer than _AIODOCKER_LOGS_TIMEOUT_S
+    """
     monitoring_task = None
     try:
         if integration_version == LEGACY_INTEGRATION_VERSION:
@@ -432,7 +455,7 @@ async def managed_monitor_container_log_task(  # noqa: PLR0913 # pylint: disable
         await monitoring_task
     finally:
         if monitoring_task:
-            with log_context(logger, logging.DEBUG, "cancel logs monitoring task"):
+            with log_context(_logger, logging.DEBUG, "cancel logs monitoring task"):
                 monitoring_task.cancel()
                 with contextlib.suppress(asyncio.CancelledError):
                     await monitoring_task
@@ -479,7 +502,7 @@ async def get_image_labels(
     # NOTE: old services did not have the integration-version label
     # image labels are set to None when empty
     if image_labels := image_cfg["Config"].get("Labels"):
-        logger.debug("found following image labels:\n%s", pformat(image_labels))
+        _logger.debug("found following image labels:\n%s", pformat(image_labels))
         data = from_labels(
             image_labels, prefix_key=OSPARC_LABEL_PREFIXES[0], trim_key_head=False
         )
@@ -490,20 +513,20 @@ async def get_image_labels(
 async def get_computational_shared_data_mount_point(docker_client: Docker) -> Path:
     app_settings = ApplicationSettings.create_from_envs()
     try:
-        logger.debug(
+        _logger.debug(
             "getting computational shared data mount point for %s",
             app_settings.SIDECAR_COMP_SERVICES_SHARED_VOLUME_NAME,
         )
         volume_attributes = await DockerVolume(
             docker_client, app_settings.SIDECAR_COMP_SERVICES_SHARED_VOLUME_NAME
         ).show()
-        logger.debug(
+        _logger.debug(
             "found following volume attributes: %s", pformat(volume_attributes)
         )
 
         return Path(volume_attributes["Mountpoint"])
     except DockerError:
-        logger.exception(
+        _logger.exception(
             "Error while retrieving docker volume %s, returnining default %s instead",
             app_settings.SIDECAR_COMP_SERVICES_SHARED_VOLUME_NAME,
             app_settings.SIDECAR_COMP_SERVICES_SHARED_FOLDER,

From 0412dd322e40b6751f1019fcbba175e0cfdf989f Mon Sep 17 00:00:00 2001
From: sanderegg <35365065+sanderegg@users.noreply.github.com>
Date: Thu, 23 Oct 2025 08:55:34 +0200
Subject: [PATCH 3/5] ongoing

---
 .../computational_sidecar/docker_utils.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/services/dask-sidecar/src/simcore_service_dask_sidecar/computational_sidecar/docker_utils.py b/services/dask-sidecar/src/simcore_service_dask_sidecar/computational_sidecar/docker_utils.py
index 801257b6a15c..bea49c7de485 100644
--- a/services/dask-sidecar/src/simcore_service_dask_sidecar/computational_sidecar/docker_utils.py
+++ b/services/dask-sidecar/src/simcore_service_dask_sidecar/computational_sidecar/docker_utils.py
@@ -422,9 +422,11 @@ async def managed_monitor_container_log_task(  # noqa: PLR0913 # pylint: disable
     s3_settings: S3Settings | None,
     progress_bar: ProgressBarData,
 ) -> AsyncIterator[Awaitable[None]]:
+    # TODO: implement the generic error!!
""" Raises: ServiceTimeoutLoggingError -- raised when no logs are received for longer than _AIODOCKER_LOGS_TIMEOUT_S + DaskSidecarLoggerError -- raised for any other issue """ monitoring_task = None try: From 19134390f05df9b65c5ede448b3a340c10f15bba Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 24 Oct 2025 17:58:54 +0200 Subject: [PATCH 4/5] raise on service timeout --- .../container_tasks/errors.py | 32 ++++++++++--------- .../computational_sidecar/docker_utils.py | 1 - .../unit/test_computational_sidecar_tasks.py | 14 +++----- 3 files changed, 21 insertions(+), 26 deletions(-) diff --git a/packages/dask-task-models-library/src/dask_task_models_library/container_tasks/errors.py b/packages/dask-task-models-library/src/dask_task_models_library/container_tasks/errors.py index ccb472b3f6e4..a9087cc0bb95 100644 --- a/packages/dask-task-models-library/src/dask_task_models_library/container_tasks/errors.py +++ b/packages/dask-task-models-library/src/dask_task_models_library/container_tasks/errors.py @@ -3,14 +3,14 @@ from common_library.errors_classes import OsparcErrorMixin -class TaskValueError(OsparcErrorMixin, ValueError): ... - - -class TaskCancelledError(OsparcErrorMixin, RuntimeError): - msg_template = "The task was cancelled" +class ContainerTaskError(OsparcErrorMixin, RuntimeError): + msg_template = ( + "The service {service_key}:{service_version}" + " running in container {container_id} encountered an unexpected error: {error_message}." + ) -class ServiceRuntimeError(OsparcErrorMixin, RuntimeError): +class ServiceRuntimeError(ContainerTaskError): msg_template = ( "The service {service_key}:{service_version}" " running in container {container_id} failed with code" @@ -18,18 +18,20 @@ class ServiceRuntimeError(OsparcErrorMixin, RuntimeError): ) -class ServiceInputsUseFileToKeyMapButReceivesZipDataError( - OsparcErrorMixin, RuntimeError -): +class ServiceTimeoutLoggingError(ContainerTaskError): msg_template = ( - "The service {service_key}:{service_version} {input} uses a file-to-key {file_to_key_map} map but receives zip data instead. " - "TIP: either pass a single file or zip file and remove the file-to-key map parameter." + "The service {service_key}:{service_version}" + " running in container {container_id} was detected as hanging and forcefully terminated by the platform. " + "This happened because it exceeded the maximum allowed time of {timeout_timedelta} without producing any logs." ) -class ServiceTimeoutLoggingError(ServiceRuntimeError): +class TaskCancelledError(ContainerTaskError): + msg_template = "The task was cancelled" + + +class ServiceInputsUseFileToKeyMapButReceivesZipDataError(ContainerTaskError): msg_template = ( - "The service {service_key}:{service_version}" - " running in container {container_id} was detected as hanging and forcefully terminated by the platform. " - "This happened because it exceeded the maximum allowed time of {timeout_timedelta} without producing any logs." + "The service {service_key}:{service_version} {input} uses a file-to-key {file_to_key_map} map but receives zip data instead. " + "TIP: either pass a single file or zip file and remove the file-to-key map parameter." 
     )
diff --git a/services/dask-sidecar/src/simcore_service_dask_sidecar/computational_sidecar/docker_utils.py b/services/dask-sidecar/src/simcore_service_dask_sidecar/computational_sidecar/docker_utils.py
index bea49c7de485..3fbfebdcab81 100644
--- a/services/dask-sidecar/src/simcore_service_dask_sidecar/computational_sidecar/docker_utils.py
+++ b/services/dask-sidecar/src/simcore_service_dask_sidecar/computational_sidecar/docker_utils.py
@@ -422,7 +422,6 @@ async def managed_monitor_container_log_task(  # noqa: PLR0913 # pylint: disable
     s3_settings: S3Settings | None,
     progress_bar: ProgressBarData,
 ) -> AsyncIterator[Awaitable[None]]:
-    # TODO: implement the generic error!!
     """
     Raises:
         ServiceTimeoutLoggingError -- raised when no logs are received for longer than _AIODOCKER_LOGS_TIMEOUT_S
diff --git a/services/dask-sidecar/tests/unit/test_computational_sidecar_tasks.py b/services/dask-sidecar/tests/unit/test_computational_sidecar_tasks.py
index 94503ba4fd79..22a0539f713d 100644
--- a/services/dask-sidecar/tests/unit/test_computational_sidecar_tasks.py
+++ b/services/dask-sidecar/tests/unit/test_computational_sidecar_tasks.py
@@ -26,6 +26,7 @@
 from dask_task_models_library.container_tasks.errors import (
     ServiceInputsUseFileToKeyMapButReceivesZipDataError,
     ServiceRuntimeError,
+    ServiceTimeoutLoggingError,
 )
 from dask_task_models_library.container_tasks.events import TaskProgressEvent
 from dask_task_models_library.container_tasks.io import (
@@ -857,7 +858,6 @@ def test_running_service_with_incorrect_zip_data_that_uses_a_file_to_key_map_rai
     "integration_version, boot_mode", [("1.0.0", BootMode.CPU)], indirect=True
 )
 def test_delayed_logging_with_small_timeout_raises_exception(
-    caplog: pytest.LogCaptureFixture,
     app_environment: EnvVarsDict,
     dask_subsystem_mock: dict[str, mock.Mock],
     sidecar_task: Callable[..., ServiceExampleParam],
@@ -881,17 +881,11 @@ def test_delayed_logging_with_small_timeout_raises_exception(
     )
 
     # Execute the task and expect a timeout exception in the logs
-    with caplog.at_level(logging.ERROR, logger="simcore_service_dask_sidecar"):
+    with pytest.raises(ServiceTimeoutLoggingError):
         run_computational_sidecar(**waiting_task.sidecar_params())
-    assert len(caplog.records) == 1
-    record = caplog.records[0]
-    assert record.exc_info
-    assert isinstance(record.exc_info[1], TimeoutError)
-    caplog.clear()
+
     mocker.patch(
         "simcore_service_dask_sidecar.computational_sidecar.docker_utils._AIODOCKER_LOGS_TIMEOUT_S",
         10,  # larger timeout to avoid issues
     )
-    with caplog.at_level(logging.ERROR, logger="simcore_service_dask_sidecar"):
-        run_computational_sidecar(**waiting_task.sidecar_params())
-    assert len(caplog.records) == 0
+    run_computational_sidecar(**waiting_task.sidecar_params())

From fe881f50af28a43102c142526318dd81e2a8fe70 Mon Sep 17 00:00:00 2001
From: sanderegg <35365065+sanderegg@users.noreply.github.com>
Date: Fri, 24 Oct 2025 18:14:06 +0200
Subject: [PATCH 5/5] ongoing

---
 .../computational_sidecar/docker_utils.py     |  8 +++++
 .../unit/test_computational_sidecar_tasks.py  | 32 +++++++++++++++++++
 2 files changed, 40 insertions(+)

diff --git a/services/dask-sidecar/src/simcore_service_dask_sidecar/computational_sidecar/docker_utils.py b/services/dask-sidecar/src/simcore_service_dask_sidecar/computational_sidecar/docker_utils.py
index 3fbfebdcab81..a18f9a1a4c0d 100644
--- a/services/dask-sidecar/src/simcore_service_dask_sidecar/computational_sidecar/docker_utils.py
+++ b/services/dask-sidecar/src/simcore_service_dask_sidecar/computational_sidecar/docker_utils.py
@@ -454,6 +454,14 @@ async def managed_monitor_container_log_task(  # noqa: PLR0913 # pylint: disable
     yield monitoring_task
     # wait for task to complete, so we get the complete log
     await monitoring_task
+    except Exception:
+        _logger.exception(
+            "Error while monitoring logs of container %s for service %s:%s",
+            container.id,
+            service_key,
+            service_version,
+        )
+        raise
     finally:
         if monitoring_task:
             with log_context(_logger, logging.DEBUG, "cancel logs monitoring task"):
diff --git a/services/dask-sidecar/tests/unit/test_computational_sidecar_tasks.py b/services/dask-sidecar/tests/unit/test_computational_sidecar_tasks.py
index 22a0539f713d..d068daf6660c 100644
--- a/services/dask-sidecar/tests/unit/test_computational_sidecar_tasks.py
+++ b/services/dask-sidecar/tests/unit/test_computational_sidecar_tasks.py
@@ -889,3 +889,35 @@ def test_delayed_logging_with_small_timeout_raises_exception(
         10,  # larger timeout to avoid issues
     )
     run_computational_sidecar(**waiting_task.sidecar_params())
+
+
+@pytest.mark.parametrize(
+    "integration_version, boot_mode", [("1.0.0", BootMode.CPU)], indirect=True
+)
+def test_run_sidecar_with_managed_monitor_container_log_task_raising(
+    app_environment: EnvVarsDict,
+    dask_subsystem_mock: dict[str, mock.Mock],
+    sidecar_task: Callable[..., ServiceExampleParam],
+    mocked_get_image_labels: mock.Mock,
+    mocker: MockerFixture,
+):
+    """https://github.com/aio-libs/aiodocker/issues/901"""
+    # Make the log-monitoring task manager fail with a generic error
+
+    mocker.patch(
+        "simcore_service_dask_sidecar.computational_sidecar.docker_utils.managed_monitor_container_log_task",
+        side_effect=RuntimeError("Simulated log monitoring failure"),
+    )
+
+    # Configure the task to sleep first and then generate logs
+    waiting_task = sidecar_task(
+        command=[
+            "/bin/bash",
+            "-c",
+            'echo "Starting task"; sleep 5; echo "After sleep"',
+        ]
+    )
+
+    # Execute the task; the simulated monitoring failure should surface as a generic error
+    run_computational_sidecar(**waiting_task.sidecar_params())
+    pytest.fail("TODO: check that the generic error is raised")
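
Note on PATCH 1/5: the deleted `with log_catch(logger, reraise=False):` wrapper is what used to swallow monitoring failures. The sketch below illustrates the assumed behavior of servicelib's log_catch helper (log the exception, then optionally re-raise); the helper's real implementation may differ, and all names here are stand-ins. Once the wrapper is gone, an exception raised while parsing container logs propagates to the caller, which is what lets the later patches surface ServiceTimeoutLoggingError instead of merely logging a TimeoutError.

# sketch.py -- assumed semantics of log_catch, NOT the servicelib implementation
import contextlib
import logging

@contextlib.contextmanager
def log_catch(logger: logging.Logger, *, reraise: bool = True):
    try:
        yield
    except Exception:
        logger.exception("Unhandled exception:")
        if reraise:
            raise  # with reraise=False the error stops here, invisible to callers

logging.basicConfig(level=logging.INFO)
_logger = logging.getLogger("demo")

with log_catch(_logger, reraise=False):
    raise RuntimeError("monitoring failed")  # logged, then swallowed

print("still running: reraise=False hides the failure from the caller")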
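The core of PATCH 2/5 is translating a transport-level TimeoutError, produced by the `aiohttp.ClientTimeout(total=_AIODOCKER_LOGS_TIMEOUT_S)` set on the dedicated docker-logs session, into the domain-specific ServiceTimeoutLoggingError. A minimal, self-contained sketch of that pattern follows. The stream, the timeout constant, the service coordinates, and the error class are all stand-ins (the real class derives from OsparcErrorMixin, whose msg_template formatting is assumed here), and asyncio.timeout plays the role of the aiohttp client timeout.

# sketch.py -- requires Python >= 3.11 for asyncio.timeout; stand-in names throughout
import asyncio
import datetime

_LOGS_TIMEOUT_S = 1.0  # stand-in for _AIODOCKER_LOGS_TIMEOUT_S


class ServiceTimeoutLoggingError(RuntimeError):
    # assumes OsparcErrorMixin formats msg_template with the constructor kwargs
    msg_template = (
        "The service {service_key}:{service_version} running in container "
        "{container_id} exceeded {timeout_timedelta} without producing any logs."
    )

    def __init__(self, **ctx) -> None:
        super().__init__(self.msg_template.format(**ctx))


async def _hanging_log_stream():
    yield "2025-10-23T08:20:47Z starting computation\n"
    await asyncio.sleep(3600)  # the container goes silent: no more log lines


async def monitor_logs() -> None:
    stream = _hanging_log_stream()
    try:
        while True:
            # bound the wait for each log line; the real code bounds the session
            async with asyncio.timeout(_LOGS_TIMEOUT_S):
                line = await anext(stream)
            print(line, end="")
    except StopAsyncIteration:
        return  # the container finished normally
    except TimeoutError as e:
        # the translation introduced by PATCH 2: transport timeout -> domain error
        raise ServiceTimeoutLoggingError(
            service_key="simcore/services/comp/sleeper",  # hypothetical values
            service_version="1.0.0",
            container_id="deadbeef",
            timeout_timedelta=datetime.timedelta(seconds=_LOGS_TIMEOUT_S),
        ) from e


if __name__ == "__main__":
    asyncio.run(monitor_logs())  # raises ServiceTimeoutLoggingError after ~1 s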