Skip to content

Commit cd6c4a4

Browse files
authored
🐛Fix save state for non-responsive services (ITISFoundation#2883)
* Does not raise if service failed to start task * adds flaky tester
1 parent b969b8d commit cd6c4a4

File tree

5 files changed

+31
-21
lines changed

5 files changed

+31
-21
lines changed

services/director-v2/requirements/_test.in

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -11,33 +11,36 @@
1111

1212

1313
# testing
14-
asgi_lifespan
14+
flaky
1515
pytest
1616
pytest-aiohttp
1717
pytest-cov
18+
pytest-docker
19+
pytest-icdiff
1820
pytest-mock
1921
pytest-runner
20-
pytest-docker
2122
pytest-xdist
22-
pytest-icdiff
23-
async-asgi-testclient
24-
minio
25-
aioboto3
2623

2724
# fixtures
25+
asgi_lifespan
2826
Faker
2927

28+
# helpers
29+
aioboto3
30+
async-asgi-testclient
31+
minio
32+
33+
3034
# migration due to pytest_simcore.postgres_service2
31-
alembic
3235
aio_pika
3336
aioredis
37+
alembic
3438
bokeh
3539
dask-gateway-server[local]
3640
docker
3741
respx
3842

3943
# tools
40-
pylint
41-
coveralls
4244
codecov
43-
ptvsd
45+
coveralls
46+
pylint

services/director-v2/requirements/_test.txt

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
#
55
# pip-compile --output-file=requirements/_test.txt --strip-extras requirements/_test.in
66
#
7-
aio_pika==6.8.0
7+
aio-pika==6.8.0
88
# via
99
# -c requirements/_base.txt
1010
# -r requirements/_test.in
@@ -129,6 +129,8 @@ execnet==1.9.0
129129
# via pytest-xdist
130130
faker==9.8.3
131131
# via -r requirements/_test.in
132+
flaky==3.7.0
133+
# via -r requirements/_test.in
132134
frozenlist==1.3.0
133135
# via
134136
# -c requirements/_base.txt
@@ -235,8 +237,6 @@ psycopg2-binary==2.9.2
235237
# via
236238
# -c requirements/_base.txt
237239
# sqlalchemy
238-
ptvsd==4.3.2
239-
# via -r requirements/_test.in
240240
py==1.11.0
241241
# via
242242
# pytest

services/director-v2/requirements/_tools.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
#
55
# pip-compile --output-file=requirements/_tools.txt --strip-extras requirements/_tools.in
66
#
7-
backports.entry-points-selectable==1.1.1
7+
backports-entry-points-selectable==1.1.1
88
# via virtualenv
99
black==21.12b0
1010
# via -r requirements/../../../requirements/devenv.txt

services/director-v2/tests/unit/test_modules_dask_client.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -641,6 +641,7 @@ def fake_remote_fct(
641641
)
642642

643643

644+
@pytest.mark.flaky
644645
async def test_failed_task_returns_exceptions(
645646
dask_client: DaskClient,
646647
user_id: UserID,

services/director/src/simcore_service_director/producer.py

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1025,7 +1025,11 @@ async def _save_service_state(service_host_name: str, session: aiohttp.ClientSes
10251025
response.raise_for_status()
10261026

10271027
except ClientResponseError as err:
1028-
if err.status in (HTTPStatus.METHOD_NOT_ALLOWED, HTTPStatus.NOT_FOUND):
1028+
if err.status in (
1029+
HTTPStatus.METHOD_NOT_ALLOWED,
1030+
HTTPStatus.NOT_FOUND,
1031+
HTTPStatus.NOT_IMPLEMENTED,
1032+
):
10291033
# NOTE: Legacy Override. Some old services do not have a state entrypoint defined
10301034
# therefore we assume there is nothing to be saved and do not raise exception
10311035
# Responses found so far:
@@ -1038,7 +1042,7 @@ async def _save_service_state(service_host_name: str, session: aiohttp.ClientSes
10381042
err,
10391043
)
10401044
else:
1041-
# re-reaise
1045+
# upss ... could service had troubles saving, reraise
10421046
raise
10431047
else:
10441048
log.info(
@@ -1090,7 +1094,7 @@ async def stop_service(app: web.Application, node_uuid: str, save_state: bool) -
10901094
else "",
10911095
)
10921096

1093-
# If state save is enforced, it will fail if not completed
1097+
# If state save is enforced
10941098
if save_state:
10951099
log.debug("saving state of service %s...", service_host_name)
10961100
try:
@@ -1101,13 +1105,15 @@ async def stop_service(app: web.Application, node_uuid: str, save_state: bool) -
11011105
raise ServiceStateSaveError(
11021106
node_uuid,
11031107
reason=f"service {service_host_name} rejected to save state, "
1104-
f"responded {err.message} (status {err.status})",
1108+
f"responded {err.message} (status {err.status})."
1109+
"Aborting stop service to prevent data loss.",
11051110
) from err
11061111

11071112
except ClientError as err:
1108-
raise ServiceStateSaveError(
1109-
node_uuid, reason=f"service {service_host_name} unreachable [{err}]"
1110-
) from err
1113+
log.warning(
1114+
"Could not save state because {service_host_name} is unreachable [{err}]."
1115+
"Resuming stop_service."
1116+
)
11111117

11121118
# remove the services
11131119
try:

0 commit comments

Comments
 (0)