Skip to content

Commit 31dd926

Browse files
GitHKAndrei Neagu
andauthored
♻️🐛 director-v2 sidecar tracking fixes (⚠️ devops) (ITISFoundation#3272)
Co-authored-by: Andrei Neagu <[email protected]>
1 parent bce5e99 commit 31dd926

25 files changed

+582
-507
lines changed

services/director-v2/src/simcore_service_director_v2/cli.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,12 @@
1818
from .core.application import create_base_app
1919
from .core.settings import AppSettings
2020
from .meta import PROJECT_NAME
21-
from .models.schemas.dynamic_services import DynamicSidecarNames
21+
from .models.schemas.dynamic_services import DynamicSidecarNamesHelper
2222
from .modules import db, director_v0, dynamic_sidecar
2323
from .modules.db.repositories.projects import ProjectsRepository
2424
from .modules.director_v0 import DirectorV0Client
2525
from .modules.dynamic_sidecar import api_client
26-
from .modules.dynamic_sidecar.scheduler._utils import fetch_repo_outside_of_request
26+
from .modules.dynamic_sidecar.scheduler._utils import get_repository
2727
from .modules.projects_networks import requires_dynamic_sidecar
2828

2929
DEFAULT_NODE_SAVE_RETRY: Final[int] = 3
@@ -52,7 +52,7 @@ async def _initialized_app() -> AsyncIterator[FastAPI]:
5252
def _get_dynamic_sidecar_endpoint(
5353
settings: AppSettings, node_id: NodeIDStr
5454
) -> AnyHttpUrl:
55-
dynamic_sidecar_names = DynamicSidecarNames.make(UUID(node_id))
55+
dynamic_sidecar_names = DynamicSidecarNamesHelper.make(UUID(node_id))
5656
hostname = dynamic_sidecar_names.service_name_dynamic_sidecar
5757
port = settings.DYNAMIC_SERVICES.DYNAMIC_SIDECAR.DYNAMIC_SIDECAR_PORT
5858
return parse_obj_as(AnyHttpUrl, f"http://{hostname}:{port}") # NOSONAR
@@ -80,7 +80,7 @@ async def _save_node_state(
8080

8181
async def _async_project_save_state(project_id: ProjectID, retry_save: int) -> None:
8282
async with _initialized_app() as app:
83-
projects_repository: ProjectsRepository = fetch_repo_outside_of_request(
83+
projects_repository: ProjectsRepository = get_repository(
8484
app, ProjectsRepository
8585
)
8686
project_at_db = await projects_repository.get_project(project_id)

services/director-v2/src/simcore_service_director_v2/core/settings.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -305,7 +305,12 @@ class DynamicSidecarSettings(BaseCustomSettings):
305305
False,
306306
description=(
307307
"Limits concurrent service saves for a docker node. Guarantees "
308-
"that no more than X services use a resource together."
308+
"that no more than X services use a resource together. "
309+
"NOTE: A node can end up with all the services from a single study. "
310+
"When the study is closed/opened all the services will try to "
311+
"upload/download their data. This causes a lot of disk "
312+
"and network stress (especially for low power nodes like in AWS). "
313+
"Some nodes collapse under load or behave unexpectedly."
309314
),
310315
)
311316
DYNAMIC_SIDECAR_DOCKER_NODE_CONCURRENT_RESOURCE_SLOTS: PositiveInt = Field(
@@ -449,6 +454,14 @@ class AppSettings(BaseCustomSettings, MixinLoggingSettings):
449454
)
450455
DIRECTOR_V2_DEV_FEATURES_ENABLED: bool = False
451456

457+
DIRECTOR_V2_DEV_FEATURE_R_CLONE_MOUNTS_ENABLED: bool = Field(
458+
False,
459+
description=(
460+
"Under development feature. If enabled state "
461+
"is saved using rclone docker volumes."
462+
),
463+
)
464+
452465
# for passing self-signed certificate to spawned services
453466
# TODO: fix these variables once the timeout-minutes: 30 is able to start dynamic services
454467
DIRECTOR_V2_SELF_SIGNED_SSL_SECRET_ID: str = ""

services/director-v2/src/simcore_service_director_v2/models/schemas/dynamic_services/scheduler.py

Lines changed: 65 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from typing import Any, Mapping, Optional
55
from uuid import UUID, uuid4
66

7+
from models_library.basic_types import PortInt
78
from models_library.projects_nodes_io import NodeID
89
from models_library.service_settings_labels import (
910
DynamicSidecarServiceLabels,
@@ -12,15 +13,7 @@
1213
)
1314
from models_library.services import RunID
1415
from models_library.services_resources import ServiceResourcesDict
15-
from pydantic import (
16-
AnyHttpUrl,
17-
BaseModel,
18-
Extra,
19-
Field,
20-
PositiveInt,
21-
constr,
22-
parse_obj_as,
23-
)
16+
from pydantic import AnyHttpUrl, BaseModel, Extra, Field, constr, parse_obj_as
2417
from servicelib.error_codes import ErrorCodeStr
2518

2619
from ..constants import (
@@ -39,6 +32,7 @@
3932
DockerId = constr(max_length=25, regex=r"[A-Za-z0-9]{25}")
4033
ServiceId = DockerId
4134
NetworkId = DockerId
35+
ServiceName = constr(strip_whitespace=True, min_length=2)
4236

4337
logger = logging.getLogger()
4438

@@ -143,25 +137,11 @@ def mark_removed(self) -> None:
143137

144138

145139
class DynamicSidecar(BaseModel):
146-
run_id: RunID = Field(
147-
default_factory=uuid4,
148-
description=(
149-
"Used to discriminate between dynamic-sidecar docker resources "
150-
"generated during different runs. Sometimes artifacts remain in the"
151-
"system after an error. This helps avoiding collisions."
152-
"For now used by anonymous volumes involved in data sharing"
153-
),
154-
)
155-
156140
status: Status = Field(
157141
Status.create_as_initially_ok(),
158142
description="status of the service sidecar also with additional information",
159143
)
160144

161-
hostname: str = Field(..., description="docker hostname for this service")
162-
163-
port: PositiveInt = Field(8000, description="dynamic-sidecar port")
164-
165145
is_available: bool = Field(
166146
False,
167147
scription=(
@@ -224,6 +204,18 @@ def compose_spec_submitted(self) -> bool:
224204
),
225205
)
226206

207+
wait_for_manual_intervention_after_error: bool = Field(
208+
False,
209+
description=(
210+
"Marks the sidecar as untouchable since there was an error and "
211+
"important data might be lost. awaits for manual intervention."
212+
),
213+
)
214+
were_state_and_outputs_saved: bool = Field(
215+
False,
216+
description="set True if the dy-sidecar saves the state and uploads the outputs",
217+
)
218+
227219
# below had already been validated and
228220
# used only to start the proxy
229221
dynamic_sidecar_id: Optional[ServiceId] = Field(
@@ -239,34 +231,19 @@ def compose_spec_submitted(self) -> bool:
239231
None, description="used for starting the proxy"
240232
)
241233

242-
@property
243-
def can_save_state(self) -> bool:
244-
"""
245-
Keeps track of the current state of the application, if it was starte successfully
246-
the state of the service can be saved when stopping the service
247-
"""
248-
# TODO: implement when adding save status hooks
249-
return False
250-
251-
# consider adding containers for healthchecks but this is more difficult and it depends on each service
252-
253-
@property
254-
def endpoint(self) -> AnyHttpUrl:
255-
"""endpoint where all the services are exposed"""
256-
return parse_obj_as(
257-
AnyHttpUrl, f"http://{self.hostname}:{self.port}" # NOSONAR
258-
)
234+
docker_node_id: Optional[str] = Field(
235+
None,
236+
description=(
237+
"contains node id of the docker node where all services "
238+
"and created containers are started"
239+
),
240+
)
259241

260-
@property
261-
def are_containers_ready(self) -> bool:
262-
"""returns: True if all containers are in running state"""
263-
return all(
264-
docker_container_inspect.status == DockerStatus.RUNNING
265-
for docker_container_inspect in self.containers_inspect
266-
)
242+
class Config:
243+
validate_assignment = True
267244

268245

269-
class DynamicSidecarNames(BaseModel):
246+
class DynamicSidecarNamesHelper(BaseModel):
270247
"""
271248
Service naming schema:
272249
NOTE: name is max 63 characters
@@ -304,7 +281,7 @@ class DynamicSidecarNames(BaseModel):
304281
)
305282

306283
@classmethod
307-
def make(cls, node_uuid: UUID) -> "DynamicSidecarNames":
284+
def make(cls, node_uuid: UUID) -> "DynamicSidecarNamesHelper":
308285
return cls(
309286
service_name_dynamic_sidecar=assemble_service_name(
310287
DYNAMIC_SIDECAR_SERVICE_PREFIX, node_uuid
@@ -318,10 +295,28 @@ def make(cls, node_uuid: UUID) -> "DynamicSidecarNames":
318295

319296

320297
class SchedulerData(CommonServiceDetails, DynamicSidecarServiceLabels):
321-
service_name: constr(strip_whitespace=True, min_length=2) = Field(
298+
service_name: ServiceName = Field(
322299
...,
323300
description="Name of the current dynamic-sidecar being observed",
324301
)
302+
run_id: RunID = Field(
303+
default_factory=uuid4,
304+
description=(
305+
"Uniquely identify the dynamic sidecar session (a.k.a. 2 "
306+
"subsequent exact same services will have a different run_id)"
307+
),
308+
)
309+
hostname: str = Field(
310+
..., description="dy-sidecar's service hostname (provided by docker-swarm)"
311+
)
312+
port: PortInt = Field(8000, description="dynamic-sidecar port")
313+
314+
@property
315+
def endpoint(self) -> AnyHttpUrl:
316+
"""endpoint where all the services are exposed"""
317+
return parse_obj_as(
318+
AnyHttpUrl, f"http://{self.hostname}:{self.port}" # NOSONAR
319+
)
325320

326321
dynamic_sidecar: DynamicSidecar = Field(
327322
...,
@@ -340,7 +335,7 @@ class SchedulerData(CommonServiceDetails, DynamicSidecarServiceLabels):
340335
description="required for Traefik to correctly route requests to the spawned container",
341336
)
342337

343-
service_port: PositiveInt = Field(
338+
service_port: PortInt = Field(
344339
TEMPORARY_PORT_NUMBER,
345340
description=(
346341
"port where the service is exposed defined by the service; "
@@ -353,37 +348,31 @@ class SchedulerData(CommonServiceDetails, DynamicSidecarServiceLabels):
353348
..., description="service resources used to enforce limits"
354349
)
355350

356-
request_dns: Optional[str] = Field(
357-
None, description="used when configuring the CORS options on the proxy"
358-
)
359-
request_scheme: Optional[str] = Field(
360-
None, description="used when configuring the CORS options on the proxy"
361-
)
362-
proxy_service_name: Optional[str] = Field(
363-
None, description="service name given to the proxy"
351+
request_dns: str = Field(
352+
..., description="used when configuring the CORS options on the proxy"
364353
)
365-
docker_node_id: Optional[str] = Field(
366-
None,
367-
description=(
368-
"contains node id of the docker node where all services "
369-
"and created containers are started"
370-
),
354+
request_scheme: str = Field(
355+
..., description="used when configuring the CORS options on the proxy"
371356
)
357+
proxy_service_name: str = Field(None, description="service name given to the proxy")
372358

373359
@classmethod
374360
def from_http_request(
375361
# pylint: disable=too-many-arguments
376362
cls,
377363
service: "DynamicServiceCreate",
378364
simcore_service_labels: SimcoreServiceLabels,
379-
port: Optional[int],
380-
request_dns: Optional[str] = None,
381-
request_scheme: Optional[str] = None,
365+
port: PortInt,
366+
request_dns: str,
367+
request_scheme: str,
368+
run_id: Optional[UUID] = None,
382369
) -> "SchedulerData":
383-
dynamic_sidecar_names = DynamicSidecarNames.make(service.node_uuid)
370+
names_helper = DynamicSidecarNamesHelper.make(service.node_uuid)
384371

385372
obj_dict = dict(
386-
service_name=dynamic_sidecar_names.service_name_dynamic_sidecar,
373+
service_name=names_helper.service_name_dynamic_sidecar,
374+
hostname=names_helper.service_name_dynamic_sidecar,
375+
port=port,
387376
node_uuid=service.node_uuid,
388377
project_id=service.project_id,
389378
user_id=service.user_id,
@@ -394,16 +383,15 @@ def from_http_request(
394383
compose_spec=json.dumps(simcore_service_labels.compose_spec),
395384
container_http_entry=simcore_service_labels.container_http_entry,
396385
restart_policy=simcore_service_labels.restart_policy,
397-
dynamic_sidecar_network_name=dynamic_sidecar_names.dynamic_sidecar_network_name,
398-
simcore_traefik_zone=dynamic_sidecar_names.simcore_traefik_zone,
386+
dynamic_sidecar_network_name=names_helper.dynamic_sidecar_network_name,
387+
simcore_traefik_zone=names_helper.simcore_traefik_zone,
399388
request_dns=request_dns,
400389
request_scheme=request_scheme,
401-
proxy_service_name=dynamic_sidecar_names.proxy_service_name,
402-
dynamic_sidecar=dict(
403-
hostname=dynamic_sidecar_names.service_name_dynamic_sidecar,
404-
port=port,
405-
),
390+
proxy_service_name=names_helper.proxy_service_name,
391+
dynamic_sidecar={},
406392
)
393+
if run_id:
394+
obj_dict["run_id"] = run_id
407395
return cls.parse_obj(obj_dict)
408396

409397
@classmethod

services/director-v2/src/simcore_service_director_v2/modules/catalog.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,6 @@
1414

1515
logger = logging.getLogger(__name__)
1616

17-
_MINUTE = 60
18-
1917

2018
def setup(app: FastAPI, settings: CatalogSettings) -> None:
2119
if not settings:

services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/api_client/_public.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -402,7 +402,7 @@ async def get_dynamic_sidecar_service_health(
402402
app: FastAPI, scheduler_data: SchedulerData
403403
) -> None:
404404
api_client = get_dynamic_sidecar_client(app)
405-
service_endpoint = scheduler_data.dynamic_sidecar.endpoint
405+
service_endpoint = scheduler_data.endpoint
406406

407407
# update service health
408408
is_healthy = await api_client.is_healthy(service_endpoint)

services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_api/_core.py

Lines changed: 24 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -254,19 +254,27 @@ def _make_pending() -> tuple[ServiceState, str]:
254254
return service_state, message
255255

256256

257-
async def is_dynamic_sidecar_stack_missing(
257+
async def _get_dynamic_sidecar_stack_services(
258258
node_uuid: NodeID, dynamic_sidecar_settings: DynamicSidecarSettings
259-
) -> bool:
260-
"""Check if the proxy and the dynamic-sidecar are absent"""
259+
) -> list[Mapping]:
261260
filters = {
262261
"label": [
263262
f"swarm_stack_name={dynamic_sidecar_settings.SWARM_STACK_NAME}",
264263
f"uuid={node_uuid}",
265264
]
266265
}
267266
async with docker_client() as client:
268-
stack_services = await client.services.list(filters=filters)
269-
return len(stack_services) == 0
267+
return await client.services.list(filters=filters)
268+
269+
270+
async def is_dynamic_sidecar_stack_missing(
271+
node_uuid: NodeID, dynamic_sidecar_settings: DynamicSidecarSettings
272+
) -> bool:
273+
"""Check if the proxy and the dynamic-sidecar are absent"""
274+
stack_services = await _get_dynamic_sidecar_stack_services(
275+
node_uuid, dynamic_sidecar_settings
276+
)
277+
return len(stack_services) == 0
270278

271279

272280
async def are_all_services_present(
@@ -275,20 +283,13 @@ async def are_all_services_present(
275283
"""
276284
The dynamic-sidecar stack always expects to have 2 running services
277285
"""
278-
async with docker_client() as client:
279-
stack_services = await client.services.list(
280-
filters={
281-
"label": [
282-
f"swarm_stack_name={dynamic_sidecar_settings.SWARM_STACK_NAME}",
283-
f"uuid={node_uuid}",
284-
]
285-
}
286-
)
287-
if len(stack_services) != 2:
288-
log.warning("Expected 2 services found %s", stack_services)
289-
return False
286+
stack_services = await _get_dynamic_sidecar_stack_services(
287+
node_uuid, dynamic_sidecar_settings
288+
)
289+
if len(stack_services) != 2:
290+
return False
290291

291-
return True
292+
return True
292293

293294

294295
async def remove_dynamic_sidecar_stack(
@@ -437,8 +438,12 @@ def _count_containers(item: dict[str, Any]) -> int:
437438
async def try_to_remove_network(network_name: str) -> None:
438439
async with docker_client() as client:
439440
network = await client.networks.get(network_name)
441+
442+
# if a project network for the current project has no more
443+
# containers attached to it (because the last service which
444+
# was using it was removed), also removed the network
440445
try:
441-
return await network.delete()
446+
await network.delete()
442447
except aiodocker.exceptions.DockerError:
443448
log.warning("Could not remove network %s", network_name)
444449

0 commit comments

Comments
 (0)