Skip to content

Commit a2c1b11

Browse files
GitHKAndrei Neagu
andauthored
🐛dynamic-sidecar fixes (ITISFoundation#2990)
* fix issues with service update * fix description * fix pylint * catching failed failed block stopping monitor * fix pylint * using same encoding pattern * pylint * capture correct error * using explicit status * update comment * migrating to pydantic * fix codestyle * bumping down attempts to 3 Co-authored-by: Andrei Neagu <[email protected]>
1 parent 227d72e commit a2c1b11

File tree

5 files changed

+93
-37
lines changed

5 files changed

+93
-37
lines changed

services/director-v2/src/simcore_service_director_v2/models/schemas/dynamic_services/scheduler.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -375,6 +375,13 @@ def from_service_inspect(
375375
labels = service_inspect["Spec"]["Labels"]
376376
return cls.parse_raw(labels[DYNAMIC_SIDECAR_SCHEDULER_DATA_LABEL])
377377

378+
def as_label_data(self) -> str:
379+
# compose_spec needs to be json encoded before encoding it to json
380+
# and storing it in the label
381+
return self.copy(
382+
update={"compose_spec": json.dumps(self.compose_spec)}, deep=True
383+
).json()
384+
378385
class Config:
379386
extra = Extra.allow
380387
allow_population_by_field_name = True

services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_api.py

Lines changed: 56 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -7,16 +7,20 @@
77
import time
88
import traceback
99
from contextlib import asynccontextmanager
10-
from copy import deepcopy
1110
from typing import Any, AsyncIterator, Dict, List, Mapping, Optional, Set, Tuple
1211

1312
import aiodocker
1413
from aiodocker.utils import clean_filters, clean_map
14+
from fastapi import status
1515
from models_library.projects import ProjectID
1616
from models_library.projects_nodes_io import NodeID
1717
from models_library.users import UserID
1818
from packaging import version
1919
from servicelib.utils import logged_gather
20+
from tenacity._asyncio import AsyncRetrying
21+
from tenacity.retry import retry_if_exception_type
22+
from tenacity.stop import stop_after_attempt
23+
from tenacity.wait import wait_exponential
2024

2125
from ...core.settings import DynamicSidecarSettings
2226
from ...models.schemas.constants import (
@@ -79,6 +83,10 @@ async def _custom_volumes_get(self, id): # pylint: disable=redefined-builtin
7983
_monkey_patch_aiodocker()
8084

8185

86+
class _RetryError(Exception):
87+
pass
88+
89+
8290
@asynccontextmanager
8391
async def docker_client() -> AsyncIterator[aiodocker.docker.Docker]:
8492
client = None
@@ -519,36 +527,50 @@ async def update_scheduler_data_label(scheduler_data: SchedulerData) -> None:
519527
# NOTE: builtin `DockerServices.update` function is very limited.
520528
# Using the same pattern but updating labels
521529

522-
try:
523-
# fetch information from service name
524-
service_inspect = await client.services.inspect(scheduler_data.service_name)
525-
service_version = service_inspect["Version"]["Index"]
526-
service_id = service_inspect["ID"]
527-
spec = service_inspect["Spec"]
528-
529-
# compose_spec needs to be json encoded
530-
# before encoding it to json and storing it
531-
# in the label
532-
scheduler_data_copy = deepcopy(scheduler_data)
533-
scheduler_data_copy.compose_spec = json.dumps(
534-
scheduler_data_copy.compose_spec
535-
)
536-
label_data = scheduler_data_copy.json()
537-
spec["Labels"][DYNAMIC_SIDECAR_SCHEDULER_DATA_LABEL] = label_data
538-
539-
await client._query_json( # pylint: disable=protected-access
540-
f"services/{service_id}/update",
541-
method="POST",
542-
data=json.dumps(clean_map(spec)),
543-
params={"version": service_version},
544-
)
545-
except aiodocker.exceptions.DockerError as e:
546-
if not (
547-
e.status == 404
548-
and e.message == f"service {scheduler_data.service_name} not found"
549-
):
550-
raise e
551-
log.debug(
552-
"Skip update for service '%s' which could not be found",
553-
scheduler_data.service_name,
554-
)
530+
# The docker service update API is async, so `update out of sequence` error
531+
# might get raised. This is caused by the `service_version` being out of sync
532+
# with what is currently stored in the docker daemon.
533+
async for attempt in AsyncRetrying(
534+
# waits 1, 4, 8 seconds between retries and gives up
535+
stop=stop_after_attempt(3),
536+
wait=wait_exponential(),
537+
retry=retry_if_exception_type(_RetryError),
538+
reraise=True,
539+
):
540+
with attempt:
541+
try:
542+
# fetch information from service name
543+
service_inspect = await client.services.inspect(
544+
scheduler_data.service_name
545+
)
546+
service_version = service_inspect["Version"]["Index"]
547+
service_id = service_inspect["ID"]
548+
spec = service_inspect["Spec"]
549+
550+
spec["Labels"][
551+
DYNAMIC_SIDECAR_SCHEDULER_DATA_LABEL
552+
] = scheduler_data.as_label_data()
553+
554+
await client._query_json( # pylint: disable=protected-access
555+
f"services/{service_id}/update",
556+
method="POST",
557+
data=json.dumps(clean_map(spec)),
558+
params={"version": service_version},
559+
)
560+
except aiodocker.exceptions.DockerError as e:
561+
if not (
562+
e.status == status.HTTP_404_NOT_FOUND
563+
and e.message
564+
== f"service {scheduler_data.service_name} not found"
565+
):
566+
raise e
567+
if (
568+
e.status == status.HTTP_500_INTERNAL_SERVER_ERROR
569+
and e.message
570+
== "rpc error: code = Unknown desc = update out of sequence"
571+
):
572+
raise _RetryError() from e
573+
log.debug(
574+
"Skip update for service '%s' which could not be found",
575+
scheduler_data.service_name,
576+
)

services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_service_specs/sidecar.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,7 @@ def get_dynamic_sidecar_spec(
192192
# the following are used for scheduling
193193
"uuid": f"{scheduler_data.node_uuid}", # also needed for removal when project is closed
194194
"swarm_stack_name": dynamic_sidecar_settings.SWARM_STACK_NAME, # required for listing services with uuid
195-
DYNAMIC_SIDECAR_SCHEDULER_DATA_LABEL: scheduler_data.json(),
195+
DYNAMIC_SIDECAR_SCHEDULER_DATA_LABEL: scheduler_data.as_label_data(),
196196
},
197197
"name": scheduler_data.service_name,
198198
"networks": [swarm_network_id, dynamic_sidecar_network_id],

services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/scheduler/task.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,11 @@
4040
update_scheduler_data_label,
4141
)
4242
from ..docker_states import ServiceState, extract_containers_minimim_statuses
43-
from ..errors import DynamicSidecarError, DynamicSidecarNotFoundError
43+
from ..errors import (
44+
DynamicSidecarError,
45+
DynamicSidecarNotFoundError,
46+
GenericDockerError,
47+
)
4448
from .events import REGISTERED_EVENTS
4549

4650
logger = logging.getLogger(__name__)
@@ -368,7 +372,12 @@ async def observing_single_service(service_name: str) -> None:
368372
finally:
369373
# when done, always unlock the resource
370374
if scheduler_data_copy != scheduler_data:
371-
await update_scheduler_data_label(scheduler_data)
375+
try:
376+
await update_scheduler_data_label(scheduler_data)
377+
except GenericDockerError as e:
378+
logger.warning(
379+
"Skipped labels update, please check:\n %s", f"{e}"
380+
)
372381
await lock_with_scheduler_data.resource_lock.unlock_resource()
373382

374383
service_name: Optional[str]
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
import json
2+
from copy import deepcopy
3+
4+
from simcore_service_director_v2.models.schemas.dynamic_services import SchedulerData
5+
6+
7+
def test_regression_as_label_data(scheduler_data: SchedulerData) -> None:
8+
# old tested implementation
9+
scheduler_data_copy = deepcopy(scheduler_data)
10+
scheduler_data_copy.compose_spec = json.dumps(scheduler_data_copy.compose_spec)
11+
json_encoded = scheduler_data_copy.json()
12+
13+
# using pydantic's internals
14+
label_data = scheduler_data.as_label_data()
15+
16+
parsed_json_encoded = SchedulerData.parse_raw(json_encoded)
17+
parsed_label_data = SchedulerData.parse_raw(label_data)
18+
assert parsed_json_encoded == parsed_label_data

0 commit comments

Comments
 (0)