Skip to content

Commit a5f5c08

Browse files
author
Andrei Neagu
committed
fixed volume removal
1 parent 9a71d52 commit a5f5c08

File tree

6 files changed

+61
-16
lines changed

6 files changed

+61
-16
lines changed
Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,25 @@
1+
import logging
2+
13
from fastapi import FastAPI
24
from models_library.projects_nodes_io import NodeID
5+
from servicelib.logging_utils import log_context
36
from servicelib.rabbitmq import RPCRouter
47
from simcore_service_agent.services.volumes_manager import get_volumes_manager
58

9+
_logger = logging.getLogger(__name__)
10+
611
router = RPCRouter()
712

813

914
@router.expose()
1015
async def remove_volumes_without_backup_for_service(
1116
app: FastAPI, *, node_id: NodeID
1217
) -> None:
13-
await get_volumes_manager(app).remove_service_volumes(node_id)
18+
with log_context(_logger, logging.INFO, f"removing volumes for service: {node_id}"):
19+
await get_volumes_manager(app).remove_service_volumes(node_id)
1420

1521

1622
@router.expose()
1723
async def backup_and_remove_volumes_for_all_services(app: FastAPI) -> None:
18-
await get_volumes_manager(app).remove_all_volumes()
24+
with log_context(_logger, logging.INFO, "removing all service volumes from node"):
25+
await get_volumes_manager(app).remove_all_volumes()

services/agent/src/simcore_service_agent/core/api/rpc/routes.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,14 @@ def setup_rpc_api_routes(app: FastAPI) -> None:
1515
async def startup() -> None:
1616
rpc_server = get_rabbitmq_rpc_server(app)
1717
settings: ApplicationSettings = app.state.settings
18-
namespace = RPCNamespace.from_entries(
18+
rpc_namespace = RPCNamespace.from_entries(
1919
{
2020
"service": "agent",
2121
"docker_node_id": settings.AGENT_DOCKER_NODE_ID,
2222
"swarm_stack_name": settings.AGENT_VOLUMES_CLEANUP_TARGET_SWARM_STACK_NAME,
2323
}
2424
)
2525
for router in ROUTERS:
26-
await rpc_server.register_router(router, namespace, app)
26+
await rpc_server.register_router(router, rpc_namespace, app)
2727

2828
app.add_event_handler("startup", startup)

services/agent/src/simcore_service_agent/services/volumes_manager.py

Lines changed: 43 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,13 @@
1010
from pydantic import NonNegativeFloat
1111
from servicelib.background_task import start_periodic_task, stop_periodic_task
1212
from servicelib.logging_utils import log_context
13+
from tenacity import (
14+
AsyncRetrying,
15+
TryAgain,
16+
before_sleep_log,
17+
stop_after_delay,
18+
wait_fixed,
19+
)
1320

1421
from ..core.settings import ApplicationSettings
1522
from .docker_utils import get_unused_dynamc_sidecar_volumes, remove_volume
@@ -96,17 +103,46 @@ async def _periodic_volmue_cleanup_task(self) -> None:
96103
for volume in volumes_to_remove:
97104
await self._remove_volume_safe(volume_name=volume, requires_backup=True)
98105

106+
async def _wait_for_service_volumes_to_become_unused(
107+
self, node_id: NodeID
108+
) -> set[str]:
109+
# NOTE: it usually takes a few seconds for volumes to become unused
110+
# if agent does not wait for this operation to finish
111+
# they will be removed and backed up by the background task
112+
# causing unncecessary data transfer to S3
113+
async for attempt in AsyncRetrying(
114+
reraise=True,
115+
stop=stop_after_delay(60),
116+
wait=wait_fixed(1),
117+
before_sleep=before_sleep_log(_logger, logging.DEBUG),
118+
):
119+
with attempt:
120+
current_unused_volumes = await get_unused_dynamc_sidecar_volumes(
121+
self.docker
122+
)
123+
124+
service_volumes = {
125+
v for v in current_unused_volumes if f"{node_id}" in v
126+
}
127+
_logger.debug(
128+
"service %s found volumes to remove: %s", node_id, service_volumes
129+
)
130+
if len(service_volumes) == 0:
131+
raise TryAgain
132+
133+
return current_unused_volumes
134+
99135
async def remove_service_volumes(self, node_id: NodeID) -> None:
100136
# bookkept volumes might not be up to date
101-
current_unused_volumes = await get_unused_dynamc_sidecar_volumes(self.docker)
102-
103-
service_volumes: set[str] = set()
104-
for volume in current_unused_volumes:
105-
if f"{node_id}" in volume:
106-
service_volumes.add(volume)
137+
service_volumes = await self._wait_for_service_volumes_to_become_unused(node_id)
138+
_logger.debug(
139+
"will remove volumes for %s from service_volumes=%s",
140+
node_id,
141+
service_volumes,
142+
)
107143

108144
for volume_name in service_volumes:
109-
# these volumes have already been saved to S3 by the sidecar, no longer requires a backup
145+
# volumes already saved to S3 by the sidecar and no longer require backup
110146
await self._remove_volume_safe(
111147
volume_name=volume_name, requires_backup=False
112148
)

services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/scheduler/_core/_events_utils.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
from servicelib.fastapi.long_running_tasks.server import TaskProgress
2828
from servicelib.logging_utils import log_context
2929
from servicelib.rabbitmq import RabbitMQClient
30+
from servicelib.rabbitmq._client_rpc import RabbitMQRPCClient
3031
from servicelib.utils import limited_gather, logged_gather
3132
from simcore_postgres_database.models.comp_tasks import NodeClass
3233
from tenacity import RetryError, TryAgain
@@ -230,9 +231,9 @@ async def service_remove_sidecar_proxy_docker_networks_and_volumes(
230231
message="removing volumes", percent=ProgressPercent(0.3)
231232
)
232233
with log_context(_logger, logging.DEBUG, f"removing volumes '{node_uuid}'"):
233-
rabbitmq_client: RabbitMQClient = app.state.rabbitmq_client
234+
rabbit_rpc_client: RabbitMQRPCClient = app.state.rabbitmq_rpc_client
234235
await remove_volumes_from_node(
235-
rabbitmq_client=rabbitmq_client,
236+
rabbit_rpc_client=rabbit_rpc_client,
236237
docker_node_id=scheduler_data.dynamic_sidecar.docker_node_id,
237238
swarm_stack_name=swarm_stack_name,
238239
node_id=scheduler_data.node_uuid,

services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/volumes_removal.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
from models_library.projects_nodes_io import NodeID
2-
from servicelib.rabbitmq import RabbitMQClient
2+
from servicelib.rabbitmq._client_rpc import RabbitMQRPCClient
33
from servicelib.rabbitmq.rpc_interfaces.agent.volumes import (
44
remove_volumes_without_backup_for_service,
55
)
66

77

88
async def remove_volumes_from_node(
9-
rabbitmq_client: RabbitMQClient,
9+
rabbit_rpc_client: RabbitMQRPCClient,
1010
docker_node_id: str,
1111
swarm_stack_name: str,
1212
*,
@@ -15,7 +15,7 @@ async def remove_volumes_from_node(
1515
"""removes all service volumes form the node where it was running"""
1616

1717
await remove_volumes_without_backup_for_service(
18-
rabbitmq_client,
18+
rabbit_rpc_client,
1919
docker_node_id=docker_node_id,
2020
swarm_stack_name=swarm_stack_name,
2121
node_id=node_id,

services/docker-compose.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1010,6 +1010,7 @@ services:
10101010
RABBIT_PASSWORD: ${RABBIT_PASSWORD}
10111011
RABBIT_PORT: ${RABBIT_PORT}
10121012
RABBIT_USER: ${RABBIT_USER}
1013+
RABBIT_SECURE: ${RABBIT_SECURE}
10131014

10141015
dask-sidecar:
10151016
image: ${DOCKER_REGISTRY:-itisfoundation}/dask-sidecar:${DOCKER_IMAGE_TAG:-latest}

0 commit comments

Comments
 (0)