Skip to content

Commit 5293319

Browse files
authored
🐛Dynamic-sidecar stops testing dependencies after 30 seconds instead of relying on docker healthcheck (#8686)
1 parent b66d48a commit 5293319

File tree

10 files changed

+78
-56
lines changed

10 files changed

+78
-56
lines changed

services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_api/_core.py

Lines changed: 11 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@
4747
ServiceState.RUNNING,
4848
}
4949

50-
log = logging.getLogger(__name__)
50+
_logger = logging.getLogger(__name__)
5151

5252

5353
async def get_swarm_network(simcore_services_network_name: DockerNetworkName) -> dict:
@@ -122,14 +122,10 @@ async def create_service_and_get_id(
122122
}
123123
kwargs["registry"] = registry_settings.resolved_registry_url
124124

125-
logging.debug("Creating service with\n%s", json_dumps(kwargs, indent=1))
125+
# convert to a list if networks instead of target: id/name
126+
_logger.debug("Creating service with\n%s", json_dumps(kwargs, indent=1))
126127
service_start_result = await client.services.create(**kwargs)
127-
128-
log.debug(
129-
"Started service %s with\n%s",
130-
service_start_result,
131-
json_dumps(kwargs, indent=1),
132-
)
128+
_logger.debug("Started service %s", service_start_result)
133129

134130
if "ID" not in service_start_result:
135131
msg = f"Error while starting service: {service_start_result!s}"
@@ -284,10 +280,7 @@ async def are_sidecar_and_proxy_services_present(
284280
swarm_stack_name=swarm_stack_name,
285281
return_only_sidecars=False,
286282
)
287-
if len(stack_services) != _NUM_SIDECAR_STACK_SERVICES:
288-
return False
289-
290-
return True
283+
return len(stack_services) == _NUM_SIDECAR_STACK_SERVICES
291284

292285

293286
async def _list_docker_services(
@@ -352,7 +345,7 @@ async def remove_dynamic_sidecar_network(network_name: str) -> bool:
352345
"Docker takes some time to establish that the network has no more "
353346
"containers attached to it."
354347
)
355-
log.warning(message)
348+
_logger.warning(message)
356349
return False
357350

358351

@@ -386,7 +379,7 @@ async def _get_id_from_name(client, network_name: str) -> str:
386379

387380
async with docker_client() as client:
388381
existing_networks_names = {x["Name"] for x in await client.networks.list()}
389-
log.debug("existing_networks_names=%s", existing_networks_names)
382+
_logger.debug("existing_networks_names=%s", existing_networks_names)
390383

391384
# create networks if missing
392385
for network in networks:
@@ -409,7 +402,7 @@ async def _get_id_from_name(client, network_name: str) -> str:
409402
# multiple calls to this function can be processed in parallel
410403
# this will cause creation to fail, it is OK to assume it already
411404
# exist an raise an error (see below)
412-
log.info(
405+
_logger.info(
413406
"Network %s might already exist, skipping creation", network
414407
)
415408

@@ -456,7 +449,7 @@ async def try_to_remove_network(network_name: str) -> None:
456449
try:
457450
await network.delete()
458451
except aiodocker.exceptions.DockerError:
459-
log.warning("Could not remove network %s", network_name)
452+
_logger.warning("Could not remove network %s", network_name)
460453

461454

462455
async def _update_service_spec(
@@ -523,7 +516,7 @@ async def update_scheduler_data_label(scheduler_data: SchedulerData) -> None:
523516
)
524517
except GenericDockerError as e:
525518
if e.original_exception.status == status.HTTP_404_NOT_FOUND:
526-
log.info(
519+
_logger.info(
527520
"Skipped labels update for service '%s' which could not be found.",
528521
scheduler_data.service_name,
529522
)
@@ -540,4 +533,4 @@ async def constrain_service_to_node(
540533
}
541534
},
542535
)
543-
log.info("Constraining service %s to node %s", service_name, docker_node_id)
536+
_logger.info("Constraining service %s to node %s", service_name, docker_node_id)

services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_service_specs/proxy.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,10 @@ def get_dynamic_proxy_spec(
127127
cpu_limit=float(CPU_10_PERCENT) / 1e9,
128128
).to_simcore_runtime_docker_labels(),
129129
"name": scheduler_data.proxy_service_name,
130-
"networks": [swarm_network_id, dynamic_sidecar_network_id],
130+
"networks": [ # NOTE: this is deprecated in docker v1.44 and is replaced by task_template/Networks
131+
swarm_network_id,
132+
dynamic_sidecar_network_id,
133+
],
131134
"task_template": {
132135
"ContainerSpec": {
133136
"Env": {},
@@ -153,6 +156,10 @@ def get_dynamic_proxy_spec(
153156
],
154157
"Mounts": mounts,
155158
},
159+
"Networks": [
160+
{"Target": swarm_network_id},
161+
{"Target": dynamic_sidecar_network_id},
162+
],
156163
"Placement": {
157164
"Constraints": [
158165
"node.platform.os == linux",

services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_service_specs/sidecar.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -504,7 +504,7 @@ async def get_dynamic_sidecar_spec( # pylint:disable=too-many-arguments# noqa:
504504
"endpoint_spec": {"Ports": ports} if ports else {},
505505
"labels": service_labels,
506506
"name": scheduler_data.service_name,
507-
"networks": [
507+
"networks": [ # NOTE: this is deprecated in docker v1.44 and is replaced by task_template/Networks
508508
{"Target": swarm_network_id},
509509
*get_prometheus_monitoring_networks(
510510
dynamic_services_scheduler_settings.DYNAMIC_SIDECAR_PROMETHEUS_MONITORING_NETWORKS,
@@ -549,6 +549,13 @@ async def get_dynamic_sidecar_spec( # pylint:disable=too-many-arguments# noqa:
549549
else None
550550
),
551551
},
552+
"Networks": [
553+
{"Target": swarm_network_id},
554+
*get_prometheus_monitoring_networks(
555+
dynamic_services_scheduler_settings.DYNAMIC_SIDECAR_PROMETHEUS_MONITORING_NETWORKS,
556+
scheduler_data.callbacks_mapping,
557+
),
558+
],
552559
"Placement": {"Constraints": placement_constraints},
553560
"RestartPolicy": DOCKER_CONTAINER_SPEC_RESTART_POLICY_DEFAULTS,
554561
# this will get overwritten

services/director-v2/tests/unit/with_dbs/test_modules_dynamic_sidecar_docker_service_specs.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -393,6 +393,7 @@ def expected_dynamic_sidecar_spec(
393393
},
394394
],
395395
},
396+
"Networks": [{"Target": "mocked_swarm_network_id"}],
396397
"Placement": {
397398
"Constraints": [
398399
f"node.labels.{DOCKER_TASK_EC2_INSTANCE_TYPE_PLACEMENT_CONSTRAINT_KEY}=={hardware_info.aws_ec2_instances[0]}",
@@ -568,7 +569,7 @@ async def test_merge_dynamic_sidecar_specs_with_user_specific_specs(
568569
expected_dynamic_sidecar_spec_dict = AioDockerServiceSpec.model_validate(
569570
expected_dynamic_sidecar_spec
570571
).model_dump()
571-
# ensure some entries are sorted the same to prevent flakyness
572+
# ensure some entries are sorted the same to prevent flakiness
572573
for sorted_dict in [dynamic_sidecar_spec_dict, expected_dynamic_sidecar_spec_dict]:
573574
for key in ["DY_SIDECAR_STATE_EXCLUDE", "DY_SIDECAR_STATE_PATHS"]:
574575
# this is a json of a list

services/director/src/simcore_service_director/producer.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,9 @@ async def _create_docker_service_params(
231231
"name": service_name,
232232
"task_template": {
233233
"ContainerSpec": container_spec,
234+
"Networks": (
235+
[{"Target": internal_network_id}] if internal_network_id else []
236+
),
234237
"Placement": {"Constraints": ([])},
235238
"RestartPolicy": {
236239
"Condition": "on-failure",
@@ -277,7 +280,9 @@ async def _create_docker_service_params(
277280
f"traefik.http.routers.{service_name}.middlewares": f"{app_settings.DIRECTOR_SWARM_STACK_NAME}_gzip@swarm",
278281
}
279282
| app_settings.DIRECTOR_SERVICES_CUSTOM_LABELS,
280-
"networks": [internal_network_id] if internal_network_id else [],
283+
"networks": (
284+
[internal_network_id] if internal_network_id else []
285+
), # NOTE: this is deprecated in docker v1.44 and is replaced by task_template/Networks
281286
}
282287
if app_settings.DIRECTOR_SERVICES_CUSTOM_PLACEMENT_CONSTRAINTS:
283288
_logger.debug(
@@ -427,6 +432,7 @@ async def _create_docker_service_params(
427432
swarm_network_id = swarm_network["Id"]
428433
swarm_network_name = swarm_network["Name"]
429434
docker_params["networks"].append(swarm_network_id)
435+
docker_params["task_template"]["Networks"].append({"Target": swarm_network_id})
430436
docker_params["labels"]["traefik.swarm.network"] = swarm_network_name
431437

432438
# set labels for CPU and Memory limits

services/dynamic-sidecar/Dockerfile

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ ENV PATH="${VIRTUAL_ENV}/bin:$PATH"
100100
# volumes between itself and the spawned containers
101101
ENV DYNAMIC_SIDECAR_DY_VOLUMES_MOUNT_DIR="/dy-volumes"
102102

103-
# create direcotry to persist SharedStore data accessiable
103+
# create directory to persist SharedStore data accessible
104104
# between dynamic-sidecar reboots
105105
ENV DYNAMIC_SIDECAR_SHARED_STORE_DIR="${DYNAMIC_SIDECAR_DY_VOLUMES_MOUNT_DIR}/shared-store"
106106
RUN mkdir -p "${DYNAMIC_SIDECAR_SHARED_STORE_DIR}" && \
@@ -186,14 +186,11 @@ COPY --chown=scu:scu --from=prod-only-deps ${DYNAMIC_SIDECAR_DY_VOLUMES_MOUNT_D
186186
COPY --chown=scu:scu services/dynamic-sidecar/docker services/dynamic-sidecar/docker
187187
RUN chmod +x services/dynamic-sidecar/docker/*.sh
188188

189-
# NOTE: the start period of 3 minutes is to allow the dynamic-sidecar
190-
# enough time to connect to the externald dependencies. some times the docker
191-
# networks take time to get created
192-
# https://docs.docker.com/reference/dockerfile/#healthcheck
189+
# NOTE: sometimes docker network may take some time to resolve DNS, thus the longer start-period
193190
HEALTHCHECK \
194191
--interval=10s \
195192
--timeout=5s \
196-
--start-period=180s \
193+
--start-period=64s \
197194
--start-interval=1s \
198195
--retries=5 \
199196
CMD ["python3", "services/dynamic-sidecar/docker/healthcheck.py", "http://localhost:8000/health"]
@@ -206,7 +203,7 @@ CMD ["/bin/sh", "services/dynamic-sidecar/docker/boot.sh"]
206203

207204
# --------------------------Development stage -------------------
208205
# Source code accessible in host but runs in container
209-
# Runs as myu with same gid/uid as host
206+
# Runs as my with same gid/uid as host
210207
# Placed at the end to speed-up the build if images targeting production
211208
#
212209
# + /devel WORKDIR

services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/service_liveness.py

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,14 @@
55
from typing import Final
66

77
from common_library.errors_classes import OsparcErrorMixin
8-
from tenacity import AsyncRetrying, RetryCallState, TryAgain
8+
from tenacity import AsyncRetrying, RetryCallState, TryAgain, stop_never
99
from tenacity.stop import stop_after_delay
1010
from tenacity.wait import wait_fixed
1111

1212
_logger = logging.getLogger(__name__)
1313

1414

1515
_DEFAULT_CHECK_INTERVAL: Final[timedelta] = timedelta(seconds=1)
16-
_DEFAULT_TIMEOUT_INTERVAL: Final[timedelta] = timedelta(seconds=30)
1716

1817

1918
class CouldNotReachServiceError(OsparcErrorMixin, Exception):
@@ -46,12 +45,16 @@ async def _attempt_to_wait_for_handler(
4645
service_name: str,
4746
endpoint: str,
4847
check_interval: timedelta,
49-
timeout: timedelta,
48+
max_delay: timedelta | None,
5049
**kwargs,
5150
) -> None:
5251
async for attempt in AsyncRetrying(
5352
wait=wait_fixed(check_interval),
54-
stop=stop_after_delay(timeout.total_seconds()),
53+
stop=(
54+
stop_never
55+
if max_delay is None
56+
else stop_after_delay(max_delay.total_seconds())
57+
),
5558
before_sleep=_before_sleep_log(_logger, service_name, endpoint),
5659
reraise=True,
5760
):
@@ -66,7 +69,7 @@ async def wait_for_service_liveness(
6669
service_name: str,
6770
endpoint: str,
6871
check_interval: timedelta | None = None,
69-
timeout: timedelta | None = None,
72+
max_delay: timedelta | None = None,
7073
**kwargs,
7174
) -> None:
7275
"""waits for async_handler to return ``True`` or ``None`` instead of
@@ -79,17 +82,15 @@ async def wait_for_service_liveness(
7982
8083
Keyword Arguments:
8184
check_interval -- interval at which service check is ran (default: {_DEFAULT_CHECK_INTERVAL})
82-
timeout -- stops trying to contact service and raises ``CouldNotReachServiceError``
83-
(default: {_DEFAULT_TIMEOUT_INTERVAL})
85+
max_delay -- stops trying to contact service after max_delay and raises ``CouldNotReachServiceError``
86+
(default: None - no timeout)
8487
8588
Raises:
8689
CouldNotReachServiceError: if it was not able to contact the service in time
8790
"""
8891

8992
if check_interval is None:
9093
check_interval = _DEFAULT_CHECK_INTERVAL
91-
if timeout is None:
92-
timeout = _DEFAULT_TIMEOUT_INTERVAL
9394

9495
try:
9596
start = time.time()
@@ -99,7 +100,7 @@ async def wait_for_service_liveness(
99100
service_name=service_name,
100101
endpoint=endpoint,
101102
check_interval=check_interval,
102-
timeout=timeout,
103+
max_delay=max_delay,
103104
**kwargs,
104105
)
105106
elapsed_ms = (time.time() - start) * 1000

services/dynamic-sidecar/tests/unit/test_core_external_dependencies.py

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,16 +16,36 @@
1616
from simcore_service_dynamic_sidecar.core.external_dependencies import (
1717
CouldNotReachExternalDependenciesError,
1818
)
19+
from simcore_service_dynamic_sidecar.modules.service_liveness import (
20+
wait_for_service_liveness as original_wait_for_service_liveness,
21+
)
1922

2023
_LONG_STARTUP_SHUTDOWN_TIMEOUT: Final[NonNegativeFloat] = 60
2124

2225

2326
@pytest.fixture
2427
def mock_liveness_timeout(mocker: MockerFixture) -> None:
25-
mocker.patch(
26-
"simcore_service_dynamic_sidecar.modules.service_liveness._DEFAULT_TIMEOUT_INTERVAL",
27-
new=timedelta(seconds=0.1),
28-
)
28+
# Wrap wait_for_service_liveness to inject a short timeout
29+
30+
async def _wait_with_timeout(*args, **kwargs):
31+
# Force a short timeout for all liveness checks
32+
kwargs["max_delay"] = timedelta(seconds=0.5)
33+
kwargs["check_interval"] = timedelta(seconds=0.1)
34+
return await original_wait_for_service_liveness( # pylint: disable=missing-kwoa
35+
*args, **kwargs
36+
)
37+
38+
# Patch in all modules that import it
39+
for module in [
40+
"simcore_service_dynamic_sidecar.modules.database",
41+
"simcore_service_dynamic_sidecar.core.storage",
42+
"simcore_service_dynamic_sidecar.core.rabbitmq",
43+
"simcore_service_dynamic_sidecar.core.registry",
44+
]:
45+
mocker.patch(
46+
f"{module}.wait_for_service_liveness",
47+
side_effect=_wait_with_timeout,
48+
)
2949

3050

3151
@pytest.fixture

services/dynamic-sidecar/tests/unit/test_core_stroage.py renamed to services/dynamic-sidecar/tests/unit/test_core_storage.py

Lines changed: 1 addition & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33

44
import multiprocessing
55
from collections.abc import AsyncIterable
6-
from datetime import timedelta
76
from typing import Annotated, Final
87
from unittest.mock import Mock
98

@@ -13,7 +12,6 @@
1312
from fastapi import Depends, FastAPI, HTTPException, status
1413
from fastapi.security import HTTPBasic, HTTPBasicCredentials
1514
from pydantic import TypeAdapter
16-
from pytest_mock import MockerFixture
1715
from settings_library.node_ports import StorageAuthSettings
1816
from simcore_service_dynamic_sidecar.core.storage import (
1917
_get_url,
@@ -105,17 +103,9 @@ def _run_server(app):
105103
process.kill()
106104

107105

108-
@pytest.fixture
109-
def mock_liveness_timeout(mocker: MockerFixture) -> None:
110-
mocker.patch(
111-
"simcore_service_dynamic_sidecar.modules.service_liveness._DEFAULT_TIMEOUT_INTERVAL",
112-
new=timedelta(seconds=2),
113-
)
114-
115-
116106
@pytest.fixture
117107
def mock_dynamic_sidecar_app(
118-
mock_liveness_timeout: None, storage_auth_settings: StorageAuthSettings
108+
storage_auth_settings: StorageAuthSettings,
119109
) -> Mock:
120110
mock = Mock()
121111
mock.state.settings.NODE_PORTS_STORAGE_AUTH = storage_auth_settings

services/dynamic-sidecar/tests/unit/test_service_liveness.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ async def _ok_handler() -> bool | None:
3131
service_name="test_service",
3232
endpoint="http://fake.endpoint_string",
3333
check_interval=check_interval,
34-
timeout=timeout,
34+
max_delay=timeout,
3535
)
3636

3737

@@ -52,6 +52,6 @@ async def _failing_handler() -> bool:
5252
service_name="test_service",
5353
endpoint="http://fake.endpoint_string",
5454
check_interval=check_interval,
55-
timeout=timeout,
55+
max_delay=timeout,
5656
)
5757
assert "Could not contact service" in f"{exc_info.value}"

0 commit comments

Comments
 (0)