Commit d47c01f ("ongoing")
1 parent: 3c2c76b
File tree: 3 files changed (+197, -104 lines)


packages/service-library/src/servicelib/redis/_semaphore_decorator.py

Lines changed: 173 additions & 88 deletions
@@ -3,7 +3,8 @@
 import functools
 import logging
 import socket
-from collections.abc import Callable, Coroutine
+from collections.abc import AsyncIterator, Callable, Coroutine
+from contextlib import asynccontextmanager
 from typing import Any, ParamSpec, TypeVar
 
 from common_library.async_tools import cancel_wait_task
@@ -27,6 +28,102 @@
 R = TypeVar("R")
 
 
+@asynccontextmanager
+async def _managed_semaphore_execution(
+    semaphore: DistributedSemaphore,
+    semaphore_key: str,
+    ttl: datetime.timedelta,
+    execution_context: str,
+) -> AsyncIterator:
+    """Common semaphore management logic with auto-renewal."""
+    # Acquire the semaphore first
+    if not await semaphore.acquire():
+        raise SemaphoreAcquisitionError(name=semaphore_key, capacity=semaphore.capacity)
+
+    try:
+        # Use TaskGroup for proper exception propagation
+        async with asyncio.TaskGroup() as tg:
+            started_event = asyncio.Event()
+
+            # Create auto-renewal task
+            @periodic(interval=ttl / 3, raise_on_error=True)
+            async def _periodic_renewer() -> None:
+                await semaphore.reacquire()
+                if not started_event.is_set():
+                    started_event.set()
+
+            # Start the renewal task
+            renewal_task = tg.create_task(
+                _periodic_renewer(),
+                name=f"semaphore/autorenewal_{semaphore_key}_{semaphore.instance_id}",
+            )
+
+            # Wait for first renewal to complete (ensures task is running)
+            await started_event.wait()
+
+            # Yield control back to caller
+            yield
+
+            # Cancel renewal task when execution is done
+            await cancel_wait_task(renewal_task, max_delay=None)
+
+    except BaseExceptionGroup as eg:
+        # Re-raise the first exception in the group
+        raise eg.exceptions[0] from eg
+
+    finally:
+        # Always attempt to release the semaphore
+        try:
+            await semaphore.release()
+        except Exception as exc:
+            _logger.exception(
+                **create_troubleshootting_log_kwargs(
+                    "Unexpected error while releasing semaphore",
+                    error=exc,
+                    error_context={
+                        "semaphore_key": semaphore_key,
+                        "client_name": semaphore.redis_client.client_name,
+                        "hostname": socket.gethostname(),
+                        "execution_context": execution_context,
+                    },
+                    tip="This might happen if the semaphore was lost before releasing it. "
+                    "Look for synchronous code that prevents refreshing the semaphore or asyncio loop overload.",
+                )
+            )
+
+
+def _create_semaphore(
+    redis_client: RedisClientSDK | Callable[..., RedisClientSDK],
+    args: tuple[Any, ...],
+    *,
+    key: str | Callable[..., str],
+    capacity: int | Callable[..., int],
+    ttl: datetime.timedelta,
+    blocking: bool,
+    blocking_timeout: datetime.timedelta | None,
+    kwargs: dict[str, Any],
+) -> tuple[DistributedSemaphore, str]:
+    """Create and configure a distributed semaphore from callable or static parameters."""
+    semaphore_key = key(*args, **kwargs) if callable(key) else key
+    semaphore_capacity = capacity(*args, **kwargs) if callable(capacity) else capacity
+    client = redis_client(*args, **kwargs) if callable(redis_client) else redis_client
+
+    assert isinstance(semaphore_key, str)  # nosec
+    assert isinstance(semaphore_capacity, int)  # nosec
+    assert isinstance(client, RedisClientSDK)  # nosec
+
+    semaphore = DistributedSemaphore(
+        redis_client=client,
+        key=semaphore_key,
+        capacity=semaphore_capacity,
+        ttl=ttl,
+        blocking=blocking,
+        blocking_timeout=blocking_timeout,
+    )
+
+    return semaphore, semaphore_key
+
+
 def with_limited_concurrency(
     redis_client: RedisClientSDK | Callable[..., RedisClientSDK],
     *,
@@ -75,101 +172,89 @@ def _decorator(
     ) -> Callable[P, Coroutine[Any, Any, R]]:
         @functools.wraps(coro)
         async def _wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
-            # Resolve callable parameters
-            semaphore_key = key(*args, **kwargs) if callable(key) else key
-            semaphore_capacity = (
-                capacity(*args, **kwargs) if callable(capacity) else capacity
-            )
-            client = (
-                redis_client(*args, **kwargs)
-                if callable(redis_client)
-                else redis_client
+            semaphore, semaphore_key = _create_semaphore(
+                redis_client,
+                args,
+                key=key,
+                capacity=capacity,
+                ttl=ttl,
+                blocking=blocking,
+                blocking_timeout=blocking_timeout,
+                kwargs=kwargs,
             )
 
-            assert isinstance(semaphore_key, str)  # nosec
-            assert isinstance(semaphore_capacity, int)  # nosec
-            assert isinstance(client, RedisClientSDK)  # nosec
+            async with _managed_semaphore_execution(
+                semaphore, semaphore_key, ttl, f"coroutine_{coro.__name__}"
+            ):
+                return await coro(*args, **kwargs)
+
+        return _wrapper
+
+    return _decorator
 
-            # Create the semaphore (without auto-renewal)
-            semaphore = DistributedSemaphore(
-                redis_client=client,
-                key=semaphore_key,
-                capacity=semaphore_capacity,
+
+def with_limited_concurrency_cm(
+    redis_client: RedisClientSDK | Callable[..., RedisClientSDK],
+    *,
+    key: str | Callable[..., str],
+    capacity: int | Callable[..., int],
+    ttl: datetime.timedelta = DEFAULT_SEMAPHORE_TTL,
+    blocking: bool = True,
+    blocking_timeout: datetime.timedelta | None = DEFAULT_SOCKET_TIMEOUT,
+) -> Callable[[Callable[P, AsyncIterator[R]]], Callable[P, AsyncIterator[R]]]:
+    """
+    Decorator to limit concurrent execution of async context managers using a distributed semaphore.
+
+    This decorator ensures that only a specified number of instances of the decorated
+    async context manager can be active concurrently across multiple processes/instances
+    using Redis as the coordination backend.
+
+    Args:
+        redis_client: Redis client for coordination (can be callable)
+        key: Unique identifier for the semaphore (can be callable)
+        capacity: Maximum number of concurrent executions (can be callable)
+        ttl: Time-to-live for semaphore entries (default: 5 minutes)
+        blocking: Whether to block when semaphore is full (default: True)
+        blocking_timeout: Maximum time to wait when blocking (default: socket timeout)
+
+    Example:
+        @asynccontextmanager
+        @with_limited_concurrency_cm(
+            redis_client,
+            key="cluster:my-cluster",
+            capacity=5,
+            blocking=True,
+            blocking_timeout=None
+        )
+        async def get_cluster_client():
+            async with pool.acquire() as client:
+                yield client
+
+    Raises:
+        SemaphoreAcquisitionError: If semaphore cannot be acquired and blocking=True
+    """
+
+    def _decorator(
+        cm_func: Callable[P, AsyncIterator[R]],
+    ) -> Callable[P, AsyncIterator[R]]:
+        @functools.wraps(cm_func)
+        async def _wrapper(*args: P.args, **kwargs: P.kwargs) -> AsyncIterator[R]:
+            semaphore, semaphore_key = _create_semaphore(
+                redis_client,
+                args,
+                key=key,
+                capacity=capacity,
                 ttl=ttl,
                 blocking=blocking,
                 blocking_timeout=blocking_timeout,
+                kwargs=kwargs,
             )
 
-            # Acquire the semaphore first
-            if not await semaphore.acquire():
-                raise SemaphoreAcquisitionError(
-                    name=semaphore_key, capacity=semaphore_capacity
-                )
-
-            try:
-                # Use TaskGroup for proper exception propagation (similar to exclusive decorator)
-                async with asyncio.TaskGroup() as tg:
-                    started_event = asyncio.Event()
-
-                    # Create auto-renewal task
-                    @periodic(interval=ttl / 3, raise_on_error=True)
-                    async def _periodic_renewer() -> None:
-                        await semaphore.reacquire()
-                        if not started_event.is_set():
-                            started_event.set()
-
-                    # Start the renewal task
-                    renewal_task = tg.create_task(
-                        _periodic_renewer(),
-                        name=f"semaphore/autorenewal_{semaphore_key}_{semaphore.instance_id}",
-                    )
-
-                    # Wait for first renewal to complete (ensures task is running)
-                    await started_event.wait()
-
-                    # Run the user work
-                    work_task = tg.create_task(
-                        coro(*args, **kwargs),
-                        name=f"semaphore/work_{coro.__module__}.{coro.__name__}",
-                    )
-                    result = await work_task
-
-                    # Cancel renewal task (work is done)
-                    # NOTE: if we do not explicitely await the task inside the context manager
-                    # it sometimes hangs forever (Python issue?)
-                    await cancel_wait_task(renewal_task, max_delay=None)
-
-                    return result
-
-            except BaseExceptionGroup as eg:
-                # Handle exceptions similar to exclusive decorator
-                # If renewal fails, the TaskGroup will propagate the exception
-                # and cancel the work task automatically
-
-                # Re-raise the first exception in the group
-                raise eg.exceptions[0] from eg
-
-            finally:
-                # Always attempt to release the semaphore, regardless of Python state
-                # The Redis-side state is the source of truth, not the Python _acquired flag
-                try:
-                    await semaphore.release()
-                except Exception as exc:
-                    # Log any other release errors but don't re-raise
-                    _logger.exception(
-                        **create_troubleshootting_log_kwargs(
-                            "Unexpected error while releasing semaphore",
-                            error=exc,
-                            error_context={
-                                "semaphore_key": semaphore_key,
-                                "client_name": client.client_name,
-                                "hostname": socket.gethostname(),
-                                "coroutine": coro.__name__,
-                            },
-                            tip="This might happen if the semaphore was lost before releasing it. "
-                            "Look for synchronous code that prevents refreshing the semaphore or asyncio loop overload.",
-                        )
-                    )
+            async with _managed_semaphore_execution(
+                semaphore, semaphore_key, ttl, f"context_manager_{cm_func.__name__}"
+            ):
+                async for value in cm_func(*args, **kwargs):
+                    yield value
 
         return _wrapper
 
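Taken together, the refactor splits the old inline wrapper logic into `_create_semaphore` (parameter resolution) and `_managed_semaphore_execution` (acquire, auto-renew every `ttl / 3`, release), shared by both decorators. A minimal usage sketch of the two public decorators follows; the client setup and the decorated functions (`heavy_job`, `shared_resource`) are hypothetical and not part of this commit:

    # Hypothetical usage sketch of the decorators defined in this file.
    from contextlib import asynccontextmanager

    from servicelib.redis._client import RedisClientSDK
    from servicelib.redis._semaphore_decorator import (
        with_limited_concurrency,
        with_limited_concurrency_cm,
    )

    redis_client: RedisClientSDK = ...  # obtained from the application, setup omitted

    # At most 3 concurrent executions across all processes sharing this Redis
    @with_limited_concurrency(redis_client, key="jobs:heavy", capacity=3)
    async def heavy_job(job_id: str) -> None:
        ...

    # At most 5 concurrently active context managers cluster-wide; note that
    # @asynccontextmanager goes on top, as in the docstring example above
    @asynccontextmanager
    @with_limited_concurrency_cm(redis_client, key="pool:shared", capacity=5)
    async def shared_resource():
        yield "resource"

While the decorated body runs, the helper keeps the semaphore alive by calling `semaphore.reacquire()` every `ttl / 3`; if renewal fails, the `TaskGroup` propagates the error and cancels the work, and the `finally` block still attempts to release the semaphore.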

services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_dask.py

Lines changed: 24 additions & 0 deletions
@@ -24,6 +24,10 @@
 from servicelib.common_headers import UNDEFINED_DEFAULT_SIMCORE_USER_AGENT_VALUE
 from servicelib.logging_errors import create_troubleshootting_log_kwargs
 from servicelib.logging_utils import log_catch, log_context
+from servicelib.redis._client import RedisClientSDK
+from servicelib.redis._semaphore_decorator import (
+    with_limited_concurrency_cm,
+)
 from servicelib.utils import limited_as_completed, limited_gather
 
 from ...core.errors import (
@@ -53,11 +57,13 @@
 from ..db.repositories.comp_tasks import CompTasksRepository
 from ._constants import (
     MAX_CONCURRENT_PIPELINE_SCHEDULING,
+    MODULE_NAME_WORKER,
 )
 from ._models import TaskStateTracker
 from ._scheduler_base import BaseCompScheduler
 from ._utils import (
     WAITING_FOR_START_STATES,
+    get_redis_lock_key,
 )
 
 _logger = logging.getLogger(__name__)
@@ -67,7 +73,25 @@
 _TASK_RETRIEVAL_ERROR_CONTEXT_TIME_KEY: Final[str] = "check_time"
 
 
+def _get_redis_client_from_scheduler(scheduler: "DaskScheduler") -> RedisClientSDK:
+    return scheduler.redis_client
+
+
+def _unique_key_builder(_app, user_id: UserID, run_metadata: RunMetadataDict) -> str:
+    return f"user_id_{user_id}-wallet_id_{run_metadata.get('wallet_id')}"
+
+
 @asynccontextmanager
+@with_limited_concurrency_cm(
+    _get_redis_client_from_scheduler,
+    key=get_redis_lock_key(
+        MODULE_NAME_WORKER,
+        unique_lock_key_builder=_unique_key_builder,
+    ),
+    capacity=1,
+    blocking=True,
+    blocking_timeout=None,
+)
 async def _cluster_dask_client(
     user_id: UserID,
     scheduler: "DaskScheduler",

services/director-v2/src/simcore_service_director_v2/modules/dask_clients_pool.py

Lines changed: 0 additions & 16 deletions
@@ -10,12 +10,6 @@
 from models_library.clusters import BaseCluster, ClusterTypeInModel
 from pydantic import AnyUrl
 from servicelib.logging_utils import log_context
-from servicelib.redis._semaphore_decorator import with_limited_concurrency
-from settings_library.redis import RedisDatabase
-from simcore_service_director_v2.modules.comp_scheduler._utils import (
-    get_redis_lock_key,
-)
-from simcore_service_director_v2.modules.redis import get_redis_client_manager
 
 from ..core.errors import (
     ComputationalBackendNotConnectedError,
@@ -120,16 +114,6 @@ async def acquire(
         `release_client_ref` to release the client reference when done.
         """
 
-        @with_limited_concurrency(
-            get_redis_client_manager(self.app).client(RedisDatabase.LOCKS),
-            key=get_redis_lock_key(
-                "dask-clients-pool",
-                unique_lock_key_builder=lambda: f"{cluster.name}-{cluster.endpoint}",
-            ),
-            capacity=20,
-            blocking=True,
-            blocking_timeout=None,
-        )
         async def _concurently_safe_acquire_client() -> DaskClient:
            async with self._client_acquisition_lock:
                 with log_context(
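With the distributed semaphore removed here, client acquisition in the pool is serialized only by the in-process lock visible in the surrounding context lines. A trimmed-down sketch of that remaining pattern (the class and method bodies are illustrative; only `_client_acquisition_lock` comes from the diff):

    import asyncio

    class DaskClientsPoolSketch:
        # The asyncio.Lock serializes client creation within one event loop,
        # but does not coordinate across processes.
        def __init__(self) -> None:
            self._client_acquisition_lock = asyncio.Lock()

        async def _acquire_client(self):
            async with self._client_acquisition_lock:
                ...  # create or reuse the DaskClient here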
