Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .env-devel
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ CLUSTERS_KEEPER_COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_AUTH='{"type":"tls","tls_c
CLUSTERS_KEEPER_COMPUTATIONAL_BACKEND_DOCKER_IMAGE_TAG=master-github-latest
CLUSTERS_KEEPER_DASK_NPROCS=1
CLUSTERS_KEEPER_DASK_NTHREADS=0
CLUSTERS_KEEPER_DASK_NTHREADS_MULTIPLIER=1
CLUSTERS_KEEPER_DASK_WORKER_SATURATION=inf
CLUSTERS_KEEPER_EC2_ACCESS=null
CLUSTERS_KEEPER_SSM_ACCESS=null
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -441,6 +441,14 @@ class ApplicationSettings(BaseCustomSettings, MixinLoggingSettings):
),
]

# Multiplier for the default number of threads per process in the dask-sidecars.
# Validation: a positive integer no greater than 10 (PositiveInt combined with le=10); defaults to 1.
CLUSTERS_KEEPER_DASK_NTHREADS_MULTIPLIER: Annotated[
PositiveInt,
Field(
description="multiplier for the default number of threads per process in the dask-sidecars, (see description in dask-sidecar)",
le=10,
),
] = 1

CLUSTERS_KEEPER_DASK_WORKER_SATURATION: Annotated[
NonNegativeFloat | Literal["inf"],
Field(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ services:
DASK_LOG_FORMAT_LOCAL_DEV_ENABLED: 1
DASK_NPROCS: ${DASK_NPROCS}
DASK_NTHREADS: ${DASK_NTHREADS}
DASK_NTHREADS_MULTIPLIER: ${DASK_NTHREADS_MULTIPLIER}
DASK_SCHEDULER_URL: tls://dask-scheduler:8786
DASK_SIDECAR_NON_USABLE_RAM: 0
DASK_SIDECAR_NUM_NON_USABLE_CPUS: 0
Expand Down Expand Up @@ -164,7 +165,7 @@ services:
resources:
limits:
memory: 512M
cpus: "0.5"
cpus: "1.0"

prometheus:
image: prom/prometheus:v2.51.0@sha256:5ccad477d0057e62a7cd1981ffcc43785ac10c5a35522dc207466ff7e7ec845f
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ def _convert_to_env_dict(entries: dict[str, Any]) -> str:
f"CLUSTERS_KEEPER_EC2_SECRET_ACCESS_KEY={app_settings.CLUSTERS_KEEPER_EC2_ACCESS.EC2_SECRET_ACCESS_KEY}",
f"DASK_NPROCS={app_settings.CLUSTERS_KEEPER_DASK_NPROCS}",
f"DASK_NTHREADS={app_settings.CLUSTERS_KEEPER_DASK_NTHREADS}",
f"DASK_NTHREADS_MULTIPLIER={app_settings.CLUSTERS_KEEPER_DASK_NTHREADS_MULTIPLIER}",
f"DASK_TLS_CA_FILE={_HOST_TLS_CA_FILE_PATH}",
f"DASK_TLS_CERT={_HOST_TLS_CERT_FILE_PATH}",
f"DASK_TLS_KEY={_HOST_TLS_KEY_FILE_PATH}",
Expand Down
1 change: 1 addition & 0 deletions services/clusters-keeper/tests/unit/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ def app_environment(
"CLUSTERS_KEEPER_EC2_INSTANCES_PREFIX": faker.pystr(),
"CLUSTERS_KEEPER_DASK_NPROCS": f"{faker.pyint()}",
"CLUSTERS_KEEPER_DASK_NTHREADS": f"{faker.pyint(min_value=0)}",
"CLUSTERS_KEEPER_DASK_NTHREADS_MULTIPLIER": f"{faker.pyint(min_value=1, max_value=10)}",
"CLUSTERS_KEEPER_DASK_WORKER_SATURATION": f"{faker.pyfloat(min_value=0.1)}",
"CLUSTERS_KEEPER_COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_AUTH": "{}",
"PRIMARY_EC2_INSTANCES_KEY_NAME": faker.pystr(),
Expand Down
13 changes: 13 additions & 0 deletions services/dask-sidecar/docker/boot.sh
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,19 @@ else
DASK_NTHREADS=${DASK_NTHREADS:="$num_cpus"}
DASK_MEMORY_LIMIT=${DASK_MEMORY_LIMIT:="$ram"}
DASK_WORKER_NAME=${DASK_WORKER_NAME:="dask-sidecar_$(hostname)_$(date +'%Y-%m-%d_%T')_$$"}
# Optional scaling of the worker thread count: when DASK_NTHREADS_MULTIPLIER is set
# and numeric, DASK_NTHREADS becomes DASK_NTHREADS * DASK_NTHREADS_MULTIPLIER,
# rounded to the nearest integer and never lower than 1.
if [ -n "${DASK_NTHREADS_MULTIPLIER:-}" ]; then
  # The comparison only holds when awk can interpret the value as a number;
  # anything else is reported and DASK_NTHREADS is left untouched.
  if ! awk -v m="$DASK_NTHREADS_MULTIPLIER" 'BEGIN { exit (m + 0 == m) ? 0 : 1 }'; then
    print_info "DASK_NTHREADS_MULTIPLIER is not numeric: ${DASK_NTHREADS_MULTIPLIER}"
  else
    new_nthreads=$(
      awk -v n="$DASK_NTHREADS" -v m="$DASK_NTHREADS_MULTIPLIER" '
        BEGIN {
          r = n * m
          # clamp to the minimum of one thread
          if (r < 1) r = 1
          # fractional results are rounded half-up
          if (r != int(r)) r = int(r + 0.5)
          printf("%d", r)
        }'
    )
    DASK_NTHREADS="$new_nthreads"
    print_info "DASK_NTHREADS multiplied by ${DASK_NTHREADS_MULTIPLIER} -> ${DASK_NTHREADS}"
  fi
fi

#
# 'daemonic processes are not allowed to have children' arises when running the sidecar.cli
# because multi-processing library is used by the sidecar and the nanny does not like it
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -406,6 +406,14 @@ async def _handle_computational_retrieval_error(
elapsed_time
> self.settings.COMPUTATIONAL_BACKEND_MAX_WAITING_FOR_RETRIEVING_RESULTS
):
_logger.error(
**create_troubleshooting_log_kwargs(
f"Task {task.job_id} failed because results could not be retrieved after {elapsed_time}",
error=result,
error_context=log_error_context,
tip="Please try again later or contact support if the problem persists.",
)
)
return RunningState.FAILED, SimcorePlatformStatus.BAD, task_errors, True
# state is kept as STARTED so it will be retried
return RunningState.STARTED, SimcorePlatformStatus.BAD, task_errors, False
Expand Down
1 change: 1 addition & 0 deletions services/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,7 @@ services:
CLUSTERS_KEEPER_COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_AUTH: ${CLUSTERS_KEEPER_COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_AUTH}
CLUSTERS_KEEPER_DASK_NPROCS: ${CLUSTERS_KEEPER_DASK_NPROCS}
CLUSTERS_KEEPER_DASK_NTHREADS: ${CLUSTERS_KEEPER_DASK_NTHREADS}
CLUSTERS_KEEPER_DASK_NTHREADS_MULTIPLIER: ${CLUSTERS_KEEPER_DASK_NTHREADS_MULTIPLIER}
CLUSTERS_KEEPER_DASK_WORKER_SATURATION: ${CLUSTERS_KEEPER_DASK_WORKER_SATURATION}
CLUSTERS_KEEPER_MAX_MISSED_HEARTBEATS_BEFORE_CLUSTER_TERMINATION: ${CLUSTERS_KEEPER_MAX_MISSED_HEARTBEATS_BEFORE_CLUSTER_TERMINATION}
CLUSTERS_KEEPER_TASK_INTERVAL: ${CLUSTERS_KEEPER_TASK_INTERVAL}
Expand Down
Loading