Skip to content

Commit 41ff2d3

Browse files
authored
🏗️ ✨ Enhancement/connect director to dask (⚠️ devops) (ITISFoundation#2418)
1 parent 67b000b commit 41ff2d3

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

53 files changed

+2228
-913
lines changed

.env-devel

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@ DYNAMIC_SIDECAR_IMAGE=${DOCKER_REGISTRY:-itisfoundation}/dynamic-sidecar:${DOCKE
2323
DIRECTOR_REGISTRY_CACHING_TTL=900
2424
DIRECTOR_REGISTRY_CACHING=True
2525

26+
DIRECTOR_V2_DEV_FEATURES_ENABLED=0
27+
2628
DYNAMIC_SIDECAR_IMAGE=${DOCKER_REGISTRY:-itisfoundation}/dynamic-sidecar:${DOCKER_IMAGE_TAG:-latest}
2729

2830
POSTGRES_DB=simcoredb

.github/workflows/ci-testing-deploy.yml

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ env:
3232

3333
jobs:
3434
unit-test-api:
35+
timeout-minutes: 14 # if this timeout gets too small, then split the tests
3536
name: "[unit] api"
3637
runs-on: ${{ matrix.os }}
3738
strategy:
@@ -75,6 +76,7 @@ jobs:
7576
run: ./ci/github/unit-testing/api.bash test
7677

7778
unit-test-api-server:
79+
timeout-minutes: 14 # if this timeout gets too small, then split the tests
7880
name: "[unit] api-server"
7981
runs-on: ${{ matrix.os }}
8082
strategy:
@@ -131,6 +133,7 @@ jobs:
131133
path: codeclimate.${{ github.job }}_coverage.json
132134

133135
unit-test-catalog:
136+
timeout-minutes: 14 # if this timeout gets too small, then split the tests
134137
name: "[unit] catalog"
135138
runs-on: ${{ matrix.os }}
136139
strategy:
@@ -193,6 +196,7 @@ jobs:
193196
path: codeclimate.${{ github.job }}_coverage.json
194197

195198
unit-test-datcore-adapter:
199+
timeout-minutes: 14 # if this timeout gets too small, then split the tests
196200
name: "[unit] datcore-adapter"
197201
runs-on: ${{ matrix.os }}
198202
strategy:
@@ -253,6 +257,7 @@ jobs:
253257
path: codeclimate.${{ github.job }}_coverage.json
254258

255259
unit-test-director:
260+
timeout-minutes: 14 # if this timeout gets too small, then split the tests
256261
name: "[unit] director"
257262
runs-on: ${{ matrix.os }}
258263
strategy:
@@ -310,6 +315,7 @@ jobs:
310315
path: codeclimate.${{ github.job }}_coverage.json
311316

312317
unit-test-director-v2:
318+
timeout-minutes: 14 # if this timeout gets too small, then split the tests
313319
name: "[unit] director-v2"
314320
runs-on: ${{ matrix.os }}
315321
strategy:
@@ -370,6 +376,7 @@ jobs:
370376
path: codeclimate.${{ github.job }}_coverage.json
371377

372378
unit-test-sidecar:
379+
timeout-minutes: 14 # if this timeout gets too small, then split the tests
373380
name: "[unit] sidecar"
374381
runs-on: ${{ matrix.os }}
375382
strategy:
@@ -426,6 +433,7 @@ jobs:
426433
path: codeclimate.${{ github.job }}_coverage.json
427434

428435
unit-test-dask-sidecar:
436+
timeout-minutes: 14 # if this timeout gets too small, then split the tests
429437
name: "[unit] dask-sidecar"
430438
runs-on: ${{ matrix.os }}
431439
strategy:
@@ -482,6 +490,7 @@ jobs:
482490
path: codeclimate.${{ github.job }}_coverage.json
483491

484492
unit-test-dynamic-sidecar:
493+
timeout-minutes: 14 # if this timeout gets too small, then split the tests
485494
name: "[unit] dynamic-sidecar"
486495
runs-on: ${{ matrix.os }}
487496
strategy:
@@ -540,6 +549,7 @@ jobs:
540549
path: codeclimate.${{ github.job }}_coverage.json
541550

542551
unit-test-frontend:
552+
timeout-minutes: 14 # if this timeout gets too small, then split the tests
543553
name: "[unit] frontend"
544554
runs-on: ${{ matrix.os }}
545555
strategy:
@@ -577,6 +587,7 @@ jobs:
577587
# flags: unittests #optional
578588

579589
unit-test-python-linting:
590+
timeout-minutes: 14 # if this timeout gets too small, then split the tests
580591
name: "[unit] python-linting"
581592
runs-on: ${{ matrix.os }}
582593
strategy:
@@ -620,6 +631,7 @@ jobs:
620631
run: ./ci/github/unit-testing/python-linting.bash test
621632

622633
unit-test-postgres-database:
634+
timeout-minutes: 14 # if this timeout gets too small, then split the tests
623635
name: "[unit] postgres-database"
624636
runs-on: ${{ matrix.os }}
625637
strategy:
@@ -677,6 +689,7 @@ jobs:
677689
path: codeclimate.${{ github.job }}_coverage.json
678690

679691
unit-test-service-integration:
692+
timeout-minutes: 14 # if this timeout gets too small, then split the tests
680693
name: "[unit] service-integration"
681694
runs-on: ${{ matrix.os }}
682695
strategy:
@@ -733,6 +746,7 @@ jobs:
733746
path: codeclimate.${{ github.job }}_coverage.json
734747

735748
unit-test-service-library:
749+
timeout-minutes: 14 # if this timeout gets too small, then split the tests
736750
name: "[unit] service-library"
737751
runs-on: ${{ matrix.os }}
738752
strategy:
@@ -789,6 +803,7 @@ jobs:
789803
path: codeclimate.${{ github.job }}_coverage.json
790804

791805
unit-test-settings-library:
806+
timeout-minutes: 14 # if this timeout gets too small, then split the tests
792807
name: "[unit] settings-library"
793808
runs-on: ${{ matrix.os }}
794809
strategy:
@@ -845,6 +860,7 @@ jobs:
845860
path: codeclimate.${{ github.job }}_coverage.json
846861

847862
unit-test-models-library:
863+
timeout-minutes: 14 # if this timeout gets too small, then split the tests
848864
name: "[unit] models-library"
849865
runs-on: ${{ matrix.os }}
850866
strategy:
@@ -901,6 +917,7 @@ jobs:
901917
path: codeclimate.${{ github.job }}_coverage.json
902918

903919
unit-test-simcore-sdk:
920+
timeout-minutes: 14 # if this timeout gets too small, then split the tests
904921
name: "[unit] simcore-sdk"
905922
runs-on: ${{ matrix.os }}
906923
strategy:
@@ -957,6 +974,7 @@ jobs:
957974
path: codeclimate.${{ github.job }}_coverage.json
958975

959976
unit-test-storage:
977+
timeout-minutes: 14 # if this timeout gets too small, then split the tests
960978
name: "[unit] storage"
961979
runs-on: ${{ matrix.os }}
962980
strategy:
@@ -1013,6 +1031,7 @@ jobs:
10131031
path: codeclimate.${{ github.job }}_coverage.json
10141032

10151033
unit-test-webserver-isolated:
1034+
timeout-minutes: 14 # if this timeout gets too small, then split the tests
10161035
name: "[unit] webserver isolated"
10171036
runs-on: ${{ matrix.os }}
10181037
strategy:

Makefile

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -206,8 +206,9 @@ printf "$$rows" 'Postgres DB' 'http://$(get_my_ip).nip.io:18080/?pgsql=postgres&
206206
printf "$$rows" Portainer 'http://$(get_my_ip).nip.io:9000' admin adminadmin;\
207207
printf "$$rows" Redis 'http://$(get_my_ip).nip.io:18081';\
208208
printf "$$rows" 'Docker Registry' $${REGISTRY_URL} $${REGISTRY_USER} $${REGISTRY_PW};\
209-
echo "⚠️ if a DNS is not used (as displayed above), the interactive services started via dynamic-sidecar"
210-
echo "⚠️ will not be shown. The frontend accesses them via the uuid.services.YOUR_IP.nip.io:9081"
209+
printf "$$rows" "Dask Dashboard" "http://$(if $(IS_WSL2),$(get_my_ip),127.0.0.1).nip.io:8787";
210+
printf "\n%s\n" "⚠️ if a DNS is not used (as displayed above), the interactive services started via dynamic-sidecar";\
211+
echo "⚠️ will not be shown. The frontend accesses them via the uuid.services.YOUR_IP.nip.io:9081";
211212
endef
212213

213214
show-endpoints:

packages/pytest-simcore/src/pytest_simcore/docker_registry.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,9 @@ def docker_registry(keep_docker_up: bool) -> str:
3939
name="pytest_registry",
4040
environment=["REGISTRY_STORAGE_DELETE_ENABLED=true"],
4141
restart_policy={"Name": "always"},
42+
volumes={
43+
"pytest_registry_data": {"bind": "/var/lib/registry", "mode": "rw"}
44+
},
4245
detach=True,
4346
)
4447

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# pylint:disable=unused-variable
2+
# pylint:disable=unused-argument
3+
# pylint:disable=redefined-outer-name
4+
5+
from typing import Any, Dict, Iterator
6+
7+
import pytest
8+
from distributed import Client
9+
10+
from .helpers.utils_docker import get_service_published_port
11+
12+
13+
@pytest.fixture(scope="function")
async def dask_scheduler_service(simcore_services, monkeypatch) -> Dict[str, Any]:
    """Provide the host and published API port of the dask-scheduler service.

    The dask scheduler exposes a dashboard UI and, on a secondary port, its
    API; the API is looked up here via target port 8786. Requesting the
    ``simcore_services`` fixture already ensures that the dask-scheduler is
    up and running.

    As a side effect ``DASK_SCHEDULER_PORT`` is overridden (through
    ``monkeypatch``) with the published port.

    Returns:
        A dict with the keys ``"host"`` and ``"port"``.
    """
    scheduler_port = get_service_published_port(
        "dask-scheduler", target_ports=[8786]
    )
    # point the environment at the published port instead of the target one
    monkeypatch.setenv("DASK_SCHEDULER_PORT", str(scheduler_port))
    return dict(host="127.0.0.1", port=scheduler_port)
23+
24+
25+
@pytest.fixture(scope="function")
def dask_client(dask_scheduler_service: Dict[str, Any]) -> Iterator[Client]:
    """Yield a ``distributed.Client`` built from the scheduler's host and port.

    The address is ``"<host>:<port>"`` as taken from ``dask_scheduler_service``;
    the client is closed once the fixture is torn down.
    """
    host = dask_scheduler_service["host"]
    port = dask_scheduler_service["port"]
    scheduler_client = Client(f"{host}:{port}")
    yield scheduler_client
    scheduler_client.close()
33+
34+
35+
@pytest.fixture(scope="function")
def dask_sidecar_service(dask_client: Client) -> None:
    """Wait through ``dask_client`` for one worker, with a timeout of 30."""
    worker_wait_options = {"n_workers": 1, "timeout": 30}
    dask_client.wait_for_workers(**worker_wait_options)

packages/pytest-simcore/src/pytest_simcore/simcore_services.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,15 @@
1515

1616
log = logging.getLogger(__name__)
1717

18-
SERVICES_TO_SKIP = ["sidecar", "postgres", "redis", "rabbit"]
19-
SERVICE_HEALTHCHECK_ENTRYPOINT = {"director-v2": "/"}
18+
SERVICES_TO_SKIP = ["dask-sidecar", "sidecar", "postgres", "redis", "rabbit"]
19+
SERVICE_PUBLISHED_PORT = {}
20+
SERVICE_HEALTHCHECK_ENTRYPOINT = {
21+
"director-v2": "/",
22+
"dask-scheduler": "/health",
23+
}
24+
AIOHTTP_BASED_SERVICE_PORT: int = 8080
25+
FASTAPI_BASED_SERVICE_PORT: int = 8000
26+
DASK_SCHEDULER_SERVICE_PORT: int = 8787
2027

2128

2229
@pytest.fixture(scope="module")
@@ -28,9 +35,9 @@ def services_endpoint(
2835
stack_name = testing_environ_vars["SWARM_STACK_NAME"]
2936
for service in core_services_selection:
3037
assert f"{stack_name}_{service}" in docker_stack["services"]
31-
if not service in SERVICES_TO_SKIP:
38+
if service not in SERVICES_TO_SKIP:
3239
endpoint = URL(
33-
f"http://127.0.0.1:{get_service_published_port(service, [8080, 8000])}"
40+
f"http://127.0.0.1:{get_service_published_port(service, [AIOHTTP_BASED_SERVICE_PORT, FASTAPI_BASED_SERVICE_PORT, DASK_SCHEDULER_SERVICE_PORT])}"
3441
)
3542
services_endpoint[service] = endpoint
3643
return services_endpoint
@@ -42,7 +49,7 @@ async def simcore_services(services_endpoint: Dict[str, URL], monkeypatch) -> No
4249
# waits for all services to be responsive
4350
wait_tasks = [
4451
wait_till_service_responsive(
45-
f"{endpoint}{SERVICE_HEALTHCHECK_ENTRYPOINT.get(service, '/v0/')}"
52+
URL(f"{endpoint}{SERVICE_HEALTHCHECK_ENTRYPOINT.get(service, '/v0/')}")
4653
)
4754
for service, endpoint in services_endpoint.items()
4855
]

services/dask-sidecar/docker/boot.sh

Lines changed: 34 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@ if [ "${SC_BUILD_TARGET}" = "development" ]; then
2525
pip list | sed 's/^/ /'
2626
fi
2727

28-
2928
# RUNNING application ----------------------------------------
3029
#
3130
# - If DASK_START_AS_SCHEDULER is set, then it boots as scheduler otherwise as worker
@@ -52,24 +51,49 @@ else
5251
DASK_WORKER_VERSION=$(dask-worker --version)
5352
DASK_SCHEDULER_ADDRESS="tcp://${DASK_SCHEDULER_HOST}:8786"
5453

55-
54+
#
55+
# by default a dask worker will use as many threads as there are CPUs on the machine regardless of what limit the
56+
# docker container has set (e.g. if the docker container is limited to 4 CPUs out of 10, dask will still use 10 threads by default)
57+
# so for now we lock the number of threads to 1, so that only 1 job is done by 1 sidecar, thus --nthreads 1.
58+
59+
#
60+
# 'daemonic processes are not allowed to have children' arises when running the sidecar.cli
61+
# because multi-processing library is used by the sidecar and the nanny does not like it
62+
# setting --no-nanny fixes this: see https://github.com/dask/distributed/issues/2142
63+
num_gpus=$(python -c "from simcore_service_sidecar.utils import num_available_gpus; print(num_available_gpus());")
64+
resources="CPU=1"
65+
if [ "$num_gpus" -gt 0 ]; then
66+
resources="$resources,GPU=$num_gpus"
67+
fi
68+
if [ ${TARGET_MPI_NODE_CPU_COUNT+x} ]; then
69+
if [ $(nproc) -eq ${TARGET_MPI_NODE_CPU_COUNT} ]; then
70+
resources="$resources,MPI=1"
71+
fi
72+
fi
5673
echo "$INFO" "Starting as a ${DASK_WORKER_VERSION} -> ${DASK_SCHEDULER_ADDRESS} ..."
74+
echo "$INFO" "Worker resources set as: $resources"
5775
if [ "${SC_BOOT_MODE}" = "debug-ptvsd" ]; then
5876

5977
exec watchmedo auto-restart --recursive --pattern="*.py" -- \
6078
dask-worker "${DASK_SCHEDULER_ADDRESS}" \
61-
--local-directory /tmp/dask-sidecar \
62-
--preload simcore_service_dask_sidecar.tasks \
63-
--reconnect \
64-
--dashboard-address 8787
79+
--local-directory /tmp/dask-sidecar \
80+
--preload simcore_service_dask_sidecar.tasks \
81+
--reconnect \
82+
--no-nanny \
83+
--nthreads 1 \
84+
--dashboard-address 8787 \
85+
--resources "$resources"
6586

6687
else
6788

6889
exec dask-worker "${DASK_SCHEDULER_ADDRESS}" \
69-
--local-directory /tmp/dask-sidecar \
70-
--preload simcore_service_dask_sidecar.tasks \
71-
--reconnect \
72-
--dashboard-address 8787
90+
--local-directory /tmp/dask-sidecar \
91+
--preload simcore_service_dask_sidecar.tasks \
92+
--reconnect \
93+
--no-nanny \
94+
--nthreads 1 \
95+
--dashboard-address 8787 \
96+
--resources "$resources"
7397

7498
fi
7599
fi

services/dask-sidecar/tests/unit/conftest.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
pytest_plugins = [
1111
"pytest_simcore.repository_paths",
1212
"pytest_simcore.environment_configs",
13+
"pytest_simcore.docker_compose",
14+
"pytest_simcore.tmp_path_extra",
1315
]
1416

1517

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
from typing import Any, Dict
2+
3+
4+
def test_sidecar_service_is_deployed_in_global_mode(
    simcore_docker_compose: Dict[str, Any]
):
    """Check that the compose spec declares deploy mode "global" for dask-sidecar."""
    services = simcore_docker_compose["services"]
    deploy_mode = services["dask-sidecar"]["deploy"]["mode"]
    assert deploy_mode == "global"

services/director-v2/requirements/_base.in

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
#
44
# NOTE: ALL version constraints MUST be commented
55
--constraint ../../../requirements/constraints.txt
6+
# we need EXACTLY the same dask libraries in the client/scheduler/workers
7+
--requirement ../../../services/dask-sidecar/requirements/_dask-distributed.txt
68

79
--requirement ../../../packages/models-library/requirements/_base.in
810
--requirement ../../../packages/settings-library/requirements/_base.in

0 commit comments

Comments
 (0)