From f949ccd95acf7c8363efecda929f7655cd693136 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Wed, 24 Sep 2025 13:08:06 +0200 Subject: [PATCH 1/3] add a real error message when time out is effective --- .../modules/comp_scheduler/_scheduler_dask.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_dask.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_dask.py index 867b59625cbf..56c436a7be36 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_dask.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_dask.py @@ -406,6 +406,14 @@ async def _handle_computational_retrieval_error( elapsed_time > self.settings.COMPUTATIONAL_BACKEND_MAX_WAITING_FOR_RETRIEVING_RESULTS ): + _logger.error( + **create_troubleshooting_log_kwargs( + f"Task {task.job_id} failed because results could not be retrieved after {elapsed_time}", + error=result, + error_context=log_error_context, + tip="Please try again later or contact support if the problem persists.", + ) + ) return RunningState.FAILED, SimcorePlatformStatus.BAD, task_errors, True # state is kept as STARTED so it will be retried return RunningState.STARTED, SimcorePlatformStatus.BAD, task_errors, False From 8bf5dfa5b4f1dce8b2ee90056f9a89e083c33e3b Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 25 Sep 2025 08:56:13 +0200 Subject: [PATCH 2/3] done --- .env-devel | 1 + .../core/settings.py | 8 ++++++++ .../data/docker-compose.yml | 2 ++ .../utils/clusters.py | 1 + services/clusters-keeper/tests/unit/conftest.py | 1 + services/dask-sidecar/docker/boot.sh | 13 +++++++++++++ services/docker-compose.yml | 1 + 7 files changed, 27 insertions(+) diff --git a/.env-devel b/.env-devel index 0e27dfde6f1a..cd60f2e9d365 100644 --- a/.env-devel +++ b/.env-devel @@ -56,6 +56,7 @@ CLUSTERS_KEEPER_COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_AUTH='{"type":"tls","tls_c CLUSTERS_KEEPER_COMPUTATIONAL_BACKEND_DOCKER_IMAGE_TAG=master-github-latest CLUSTERS_KEEPER_DASK_NPROCS=1 CLUSTERS_KEEPER_DASK_NTHREADS=0 +CLUSTERS_KEEPER_DASK_NTHREADS_MULTIPLIER=1 CLUSTERS_KEEPER_DASK_WORKER_SATURATION=inf CLUSTERS_KEEPER_EC2_ACCESS=null CLUSTERS_KEEPER_SSM_ACCESS=null diff --git a/services/clusters-keeper/src/simcore_service_clusters_keeper/core/settings.py b/services/clusters-keeper/src/simcore_service_clusters_keeper/core/settings.py index d7f47dcf16e5..463ac51189e8 100644 --- a/services/clusters-keeper/src/simcore_service_clusters_keeper/core/settings.py +++ b/services/clusters-keeper/src/simcore_service_clusters_keeper/core/settings.py @@ -441,6 +441,14 @@ class ApplicationSettings(BaseCustomSettings, MixinLoggingSettings): ), ] + CLUSTERS_KEEPER_DASK_NTHREADS_MULTIPLIER: Annotated[ + PositiveInt, + Field( + description="multiplier for the default number of threads per process in the dask-sidecars, (see description in dask-sidecar)", + le=10, + ), + ] = 1 + CLUSTERS_KEEPER_DASK_WORKER_SATURATION: Annotated[ NonNegativeFloat | Literal["inf"], Field( diff --git a/services/clusters-keeper/src/simcore_service_clusters_keeper/data/docker-compose.yml b/services/clusters-keeper/src/simcore_service_clusters_keeper/data/docker-compose.yml index 1a9bc6b65526..d643eb910694 100644 --- a/services/clusters-keeper/src/simcore_service_clusters_keeper/data/docker-compose.yml +++ b/services/clusters-keeper/src/simcore_service_clusters_keeper/data/docker-compose.yml @@ -32,6 +32,7 @@ services: - "node.role==manager" resources: limits: + # TODO: should be tuned based on machines where this is deployed memory: 2048M secrets: - source: dask_tls_ca @@ -58,6 +59,7 @@ services: DASK_LOG_FORMAT_LOCAL_DEV_ENABLED: 1 DASK_NPROCS: ${DASK_NPROCS} DASK_NTHREADS: ${DASK_NTHREADS} + DASK_NTHREADS_MULTIPLIER: ${DASK_NTHREADS_MULTIPLIER} DASK_SCHEDULER_URL: tls://dask-scheduler:8786 DASK_SIDECAR_NON_USABLE_RAM: 0 DASK_SIDECAR_NUM_NON_USABLE_CPUS: 0 diff --git a/services/clusters-keeper/src/simcore_service_clusters_keeper/utils/clusters.py b/services/clusters-keeper/src/simcore_service_clusters_keeper/utils/clusters.py index 8cbd057fce32..653cb6f0ecdc 100644 --- a/services/clusters-keeper/src/simcore_service_clusters_keeper/utils/clusters.py +++ b/services/clusters-keeper/src/simcore_service_clusters_keeper/utils/clusters.py @@ -91,6 +91,7 @@ def _convert_to_env_dict(entries: dict[str, Any]) -> str: f"CLUSTERS_KEEPER_EC2_SECRET_ACCESS_KEY={app_settings.CLUSTERS_KEEPER_EC2_ACCESS.EC2_SECRET_ACCESS_KEY}", f"DASK_NPROCS={app_settings.CLUSTERS_KEEPER_DASK_NPROCS}", f"DASK_NTHREADS={app_settings.CLUSTERS_KEEPER_DASK_NTHREADS}", + f"DASK_NTHREADS_MULTIPLIER={app_settings.CLUSTERS_KEEPER_DASK_NTHREADS_MULTIPLIER}", f"DASK_TLS_CA_FILE={_HOST_TLS_CA_FILE_PATH}", f"DASK_TLS_CERT={_HOST_TLS_CERT_FILE_PATH}", f"DASK_TLS_KEY={_HOST_TLS_KEY_FILE_PATH}", diff --git a/services/clusters-keeper/tests/unit/conftest.py b/services/clusters-keeper/tests/unit/conftest.py index b464bd38da48..f6c4283f77f9 100644 --- a/services/clusters-keeper/tests/unit/conftest.py +++ b/services/clusters-keeper/tests/unit/conftest.py @@ -132,6 +132,7 @@ def app_environment( "CLUSTERS_KEEPER_EC2_INSTANCES_PREFIX": faker.pystr(), "CLUSTERS_KEEPER_DASK_NPROCS": f"{faker.pyint()}", "CLUSTERS_KEEPER_DASK_NTHREADS": f"{faker.pyint(min_value=0)}", + "CLUSTERS_KEEPER_DASK_NTHREADS_MULTIPLIER": f"{faker.pyint(min_value=1, max_value=10)}", "CLUSTERS_KEEPER_DASK_WORKER_SATURATION": f"{faker.pyfloat(min_value=0.1)}", "CLUSTERS_KEEPER_COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_AUTH": "{}", "PRIMARY_EC2_INSTANCES_KEY_NAME": faker.pystr(), diff --git a/services/dask-sidecar/docker/boot.sh b/services/dask-sidecar/docker/boot.sh index 56ca3b6310e5..5a01d008c936 100755 --- a/services/dask-sidecar/docker/boot.sh +++ b/services/dask-sidecar/docker/boot.sh @@ -167,6 +167,19 @@ else DASK_NTHREADS=${DASK_NTHREADS:="$num_cpus"} DASK_MEMORY_LIMIT=${DASK_MEMORY_LIMIT:="$ram"} DASK_WORKER_NAME=${DASK_WORKER_NAME:="dask-sidecar_$(hostname)_$(date +'%Y-%m-%d_%T')_$$"} + # If DASK_NTHREADS_MULTIPLIER is defined, multiply DASK_NTHREADS (round to nearest int, min 1) + if [ -n "${DASK_NTHREADS_MULTIPLIER:-}" ]; then + # check DASK_NTHREADS_MULTIPLIER is a number + if awk -v m="$DASK_NTHREADS_MULTIPLIER" 'BEGIN{ if (m+0==m) exit 0; else exit 1 }'; then + # multiply and round to nearest int, min 1 + new_nthreads=$(awk -v n="$DASK_NTHREADS" -v m="$DASK_NTHREADS_MULTIPLIER" 'BEGIN{ r=n*m; if(r<1) r=1; printf("%d", (r==int(r)?int(r):int(r+0.5))) }') + DASK_NTHREADS="$new_nthreads" + print_info "DASK_NTHREADS multiplied by ${DASK_NTHREADS_MULTIPLIER} -> ${DASK_NTHREADS}" + else + print_info "DASK_NTHREADS_MULTIPLIER is not numeric: ${DASK_NTHREADS_MULTIPLIER}" + fi + fi + # # 'daemonic processes are not allowed to have children' arises when running the sidecar.cli # because multi-processing library is used by the sidecar and the nanny does not like it diff --git a/services/docker-compose.yml b/services/docker-compose.yml index 2e46e7ccf8b3..d8830b48c522 100644 --- a/services/docker-compose.yml +++ b/services/docker-compose.yml @@ -228,6 +228,7 @@ services: CLUSTERS_KEEPER_COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_AUTH: ${CLUSTERS_KEEPER_COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_AUTH} CLUSTERS_KEEPER_DASK_NPROCS: ${CLUSTERS_KEEPER_DASK_NPROCS} CLUSTERS_KEEPER_DASK_NTHREADS: ${CLUSTERS_KEEPER_DASK_NTHREADS} + CLUSTERS_KEEPER_DASK_NTHREADS_MULTIPLIER: ${CLUSTERS_KEEPER_DASK_NTHREADS_MULTIPLIER} CLUSTERS_KEEPER_DASK_WORKER_SATURATION: ${CLUSTERS_KEEPER_DASK_WORKER_SATURATION} CLUSTERS_KEEPER_MAX_MISSED_HEARTBEATS_BEFORE_CLUSTER_TERMINATION: ${CLUSTERS_KEEPER_MAX_MISSED_HEARTBEATS_BEFORE_CLUSTER_TERMINATION} CLUSTERS_KEEPER_TASK_INTERVAL: ${CLUSTERS_KEEPER_TASK_INTERVAL} From 784bfe06eeff11e4ede109728717ea9a885aaa11 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 25 Sep 2025 10:00:41 +0200 Subject: [PATCH 3/3] revert --- .../simcore_service_clusters_keeper/data/docker-compose.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/services/clusters-keeper/src/simcore_service_clusters_keeper/data/docker-compose.yml b/services/clusters-keeper/src/simcore_service_clusters_keeper/data/docker-compose.yml index d643eb910694..761d3029c454 100644 --- a/services/clusters-keeper/src/simcore_service_clusters_keeper/data/docker-compose.yml +++ b/services/clusters-keeper/src/simcore_service_clusters_keeper/data/docker-compose.yml @@ -32,7 +32,6 @@ services: - "node.role==manager" resources: limits: - # TODO: should be tuned based on machines where this is deployed memory: 2048M secrets: - source: dask_tls_ca @@ -166,7 +165,7 @@ services: resources: limits: memory: 512M - cpus: "0.5" + cpus: "1.0" prometheus: image: prom/prometheus:v2.51.0@sha256:5ccad477d0057e62a7cd1981ffcc43785ac10c5a35522dc207466ff7e7ec845f