From 415cb0a022be1eca186b9ee7f0e65d8de1c003b1 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 22 Sep 2025 08:14:03 +0200 Subject: [PATCH 01/68] ongoing fair queuing semaphore --- .../src/servicelib/redis/fair_semaphore.py | 404 +++++++++++++++++ .../redis/fair_semaphore_decorator.py | 272 ++++++++++++ .../redis/lua/acquire_fair_semaphore_v2.lua | 32 ++ .../redis/lua/cleanup_fair_semaphore_v2.lua | 66 +++ .../redis/lua/count_fair_semaphore_v2.lua | 17 + .../redis/lua/register_semaphore_holder.lua | 31 ++ .../redis/lua/release_fair_semaphore_v2.lua | 48 ++ .../redis/lua/renew_fair_semaphore_v2.lua | 40 ++ .../servicelib/redis/pure_brpop_semaphore.py | 417 ++++++++++++++++++ 9 files changed, 1327 insertions(+) create mode 100644 packages/service-library/src/servicelib/redis/fair_semaphore.py create mode 100644 packages/service-library/src/servicelib/redis/fair_semaphore_decorator.py create mode 100644 packages/service-library/src/servicelib/redis/lua/acquire_fair_semaphore_v2.lua create mode 100644 packages/service-library/src/servicelib/redis/lua/cleanup_fair_semaphore_v2.lua create mode 100644 packages/service-library/src/servicelib/redis/lua/count_fair_semaphore_v2.lua create mode 100644 packages/service-library/src/servicelib/redis/lua/register_semaphore_holder.lua create mode 100644 packages/service-library/src/servicelib/redis/lua/release_fair_semaphore_v2.lua create mode 100644 packages/service-library/src/servicelib/redis/lua/renew_fair_semaphore_v2.lua create mode 100644 packages/service-library/src/servicelib/redis/pure_brpop_semaphore.py diff --git a/packages/service-library/src/servicelib/redis/fair_semaphore.py b/packages/service-library/src/servicelib/redis/fair_semaphore.py new file mode 100644 index 000000000000..f70c699e9fcc --- /dev/null +++ b/packages/service-library/src/servicelib/redis/fair_semaphore.py @@ -0,0 +1,404 @@ +"""Fair distributed semaphore using token pool with crash recovery.""" + +import asyncio +import datetime +import logging +import uuid +from typing import ClassVar + +from pydantic import BaseModel, Field, PositiveInt, computed_field, field_validator +from redis.commands.core import AsyncScript + +from ._client import RedisClientSDK +from ._constants import ( + DEFAULT_SEMAPHORE_TTL, + DEFAULT_SOCKET_TIMEOUT, + SEMAPHORE_HOLDER_KEY_PREFIX, + SEMAPHORE_KEY_PREFIX, +) +from ._errors import ( + SemaphoreAcquisitionError, + SemaphoreLostError, + SemaphoreNotAcquiredError, +) +from ._semaphore_lua import ( + ACQUIRE_FAIR_SEMAPHORE_V2_SCRIPT, + CLEANUP_FAIR_SEMAPHORE_V2_SCRIPT, + COUNT_FAIR_SEMAPHORE_V2_SCRIPT, + RELEASE_FAIR_SEMAPHORE_V2_SCRIPT, + RENEW_FAIR_SEMAPHORE_V2_SCRIPT, + SCRIPT_OK_EXIT_CODE, +) + +_logger = logging.getLogger(__name__) + + +class FairSemaphore(BaseModel): + """ + A fair distributed semaphore using Redis token pool with BRPOP. + + Features: + - True FIFO fairness via BRPOP blocking operations + - Crash recovery through TTL-based cleanup + - No Python-side retry logic needed + - Automatic token pool management + """ + + capacity: PositiveInt = Field(description="Maximum number of concurrent holders") + key: str = Field(description="Unique semaphore identifier") + ttl: datetime.timedelta = Field( + default=DEFAULT_SEMAPHORE_TTL, + description="How long a holder can keep the semaphore", + ) + timeout: datetime.timedelta = Field( + default=DEFAULT_SOCKET_TIMEOUT, + description="How long to block waiting for semaphore (0 = infinite)", + ) + cleanup_interval: datetime.timedelta = Field( + default=datetime.timedelta(seconds=30), + description="How often to run cleanup to recover crashed client tokens", + ) + enable_auto_cleanup: bool = Field( + default=True, description="Whether to automatically run background cleanup" + ) + + # Internal state + instance_id: str = Field( + default_factory=lambda: str(uuid.uuid4())[:8], + description="Unique identifier for this semaphore instance", + ) + _acquired: bool = Field(default=False, exclude=True) + _token: str | None = Field(default=None, exclude=True) + _redis_client: RedisClientSDK | None = Field(default=None, exclude=True) + _cleanup_task: asyncio.Task | None = Field(default=None, exclude=True) + + # Class-level script storage + _acquire_script: ClassVar[AsyncScript | None] = None + _release_script: ClassVar[AsyncScript | None] = None + _cleanup_script: ClassVar[AsyncScript | None] = None + _renew_script: ClassVar[AsyncScript | None] = None + _count_script: ClassVar[AsyncScript | None] = None + + @computed_field + @property + def tokens_key(self) -> str: + """Redis key for the token pool LIST.""" + return f"{SEMAPHORE_KEY_PREFIX}{self.key}:tokens" + + @computed_field + @property + def holders_key(self) -> str: + """Redis key for the holders SET.""" + return f"{SEMAPHORE_KEY_PREFIX}{self.key}:holders" + + @computed_field + @property + def holder_key(self) -> str: + """Redis key for this instance's holder TTL key.""" + return f"{SEMAPHORE_HOLDER_KEY_PREFIX}{self.key}:{self.instance_id}" + + @computed_field + @property + def holder_prefix(self) -> str: + """Prefix for holder keys (used in cleanup).""" + return f"{SEMAPHORE_HOLDER_KEY_PREFIX}{self.key}:" + + @field_validator("ttl", "timeout", "cleanup_interval") + @classmethod + def validate_positive_timedelta(cls, v: datetime.timedelta) -> datetime.timedelta: + if v.total_seconds() <= 0: + raise ValueError("Timedelta must be positive") + return v + + def model_post_init(self, __context) -> None: + """Initialize Redis client.""" + if self._redis_client is None: + self._redis_client = RedisClientSDK() + + async def _load_scripts(self) -> None: + """Load Lua scripts into Redis.""" + if self.__class__._acquire_script is None: + redis = await self._redis_client.get_redis_client() + + self.__class__._acquire_script = redis.register_script( + ACQUIRE_FAIR_SEMAPHORE_V2_SCRIPT + ) + self.__class__._release_script = redis.register_script( + RELEASE_FAIR_SEMAPHORE_V2_SCRIPT + ) + self.__class__._cleanup_script = redis.register_script( + CLEANUP_FAIR_SEMAPHORE_V2_SCRIPT + ) + self.__class__._renew_script = redis.register_script( + RENEW_FAIR_SEMAPHORE_V2_SCRIPT + ) + self.__class__._count_script = redis.register_script( + COUNT_FAIR_SEMAPHORE_V2_SCRIPT + ) + + async def _start_cleanup_task(self) -> None: + """Start the background cleanup task if enabled.""" + if self.enable_auto_cleanup and ( + self._cleanup_task is None or self._cleanup_task.done() + ): + self._cleanup_task = asyncio.create_task(self._cleanup_loop()) + + async def _cleanup_loop(self) -> None: + """Background task to periodically clean up crashed client tokens.""" + try: + while True: + await asyncio.sleep(self.cleanup_interval.total_seconds()) + try: + await self._recover_crashed_tokens() + except Exception as e: + _logger.warning(f"Cleanup failed for semaphore {self.key}: {e}") + except asyncio.CancelledError: + _logger.debug(f"Cleanup task cancelled for semaphore {self.key}") + + async def _recover_crashed_tokens(self) -> dict: + """Recover tokens from crashed clients.""" + await self._load_scripts() + + result = await self.__class__._cleanup_script( + keys=[self.tokens_key, self.holders_key, self.holder_prefix], + args=[self.capacity], + ) + + recovered_tokens, current_holders, available_tokens, total_cleaned = result + + cleanup_stats = { + "recovered_tokens": recovered_tokens, + "current_holders": current_holders, + "available_tokens": available_tokens, + "total_cleaned": total_cleaned, + } + + if recovered_tokens > 0 or total_cleaned > 0: + _logger.info( + f"Semaphore cleanup for '{self.key}': " + f"recovered {recovered_tokens} tokens, " + f"cleaned {total_cleaned} crashed holders, " + f"current state: {current_holders} holders, {available_tokens} available" + ) + + return cleanup_stats + + async def acquire(self) -> bool: + """ + Acquire the semaphore using blocking Redis operation. + + This method blocks until a semaphore slot becomes available or timeout. + Uses Redis BRPOP for true FIFO fairness with no starvation possible. + + Returns: + True if acquired successfully + + Raises: + SemaphoreAcquisitionError: If acquisition fails or times out + """ + await self._load_scripts() + + if self.enable_auto_cleanup: + await self._start_cleanup_task() + + if self._acquired: + raise SemaphoreAcquisitionError( + "Semaphore already acquired by this instance" + ) + + ttl_seconds = max(1, int(self.ttl.total_seconds())) + timeout_seconds = int(self.timeout.total_seconds()) + + _logger.debug( + f"Attempting to acquire fair semaphore '{self.key}' " + f"(timeout: {timeout_seconds}s, ttl: {ttl_seconds}s)" + ) + + try: + result = await self.__class__._acquire_script( + keys=[self.tokens_key, self.holders_key, self.holder_key], + args=[self.instance_id, self.capacity, ttl_seconds, timeout_seconds], + ) + + exit_code, status, token, current_count = result + + _logger.debug( + f"Fair semaphore acquisition result for '{self.key}'", + extra={ + "instance_id": self.instance_id, + "exit_code": exit_code, + "status": status, + "token": token, + "current_count": current_count, + }, + ) + + if exit_code == SCRIPT_OK_EXIT_CODE: # Success + self._acquired = True + self._token = token + _logger.info( + f"Acquired fair semaphore '{self.key}' with token '{token}'" + ) + return True + # Timeout or error + raise SemaphoreAcquisitionError(f"Failed to acquire semaphore: {status}") + + except Exception as e: + _logger.error(f"Error acquiring semaphore '{self.key}': {e}") + raise SemaphoreAcquisitionError(f"Redis error during acquisition: {e}") + + async def release(self) -> bool: + """ + Release the semaphore and return token to pool. + + This automatically makes the semaphore available to waiting clients. + The token is returned to the pool, unblocking any BRPOP waiters. + + Returns: + True if released successfully + + Raises: + SemaphoreNotAcquiredError: If semaphore not held by this instance + """ + await self._load_scripts() + + if not self._acquired: + raise SemaphoreNotAcquiredError("Semaphore not acquired by this instance") + + try: + result = await self.__class__._release_script( + keys=[self.tokens_key, self.holders_key, self.holder_key], + args=[self.instance_id], + ) + + exit_code, status, current_count = result + + _logger.debug( + f"Fair semaphore release result for '{self.key}'", + extra={ + "instance_id": self.instance_id, + "exit_code": exit_code, + "status": status, + "current_count": current_count, + }, + ) + + if exit_code == SCRIPT_OK_EXIT_CODE: # Success + self._acquired = False + _logger.info( + f"Released fair semaphore '{self.key}' with token '{self._token}'" + ) + self._token = None + return True + # Error + self._acquired = False # Mark as not acquired even on error + raise SemaphoreNotAcquiredError(f"Failed to release semaphore: {status}") + + except Exception as e: + _logger.error(f"Error releasing semaphore '{self.key}': {e}") + self._acquired = False # Mark as not acquired on error + raise SemaphoreNotAcquiredError(f"Redis error during release: {e}") + + async def renew(self) -> bool: + """ + Renew the semaphore TTL. + + Returns: + True if renewed successfully + + Raises: + SemaphoreLostError: If semaphore was lost (expired or not held) + """ + await self._load_scripts() + + if not self._acquired: + raise SemaphoreNotAcquiredError("Semaphore not acquired by this instance") + + ttl_seconds = max(1, int(self.ttl.total_seconds())) + + try: + result = await self.__class__._renew_script( + keys=[self.holders_key, self.holder_key], + args=[self.instance_id, ttl_seconds], + ) + + exit_code, status, current_count = result + + if exit_code == SCRIPT_OK_EXIT_CODE: + _logger.debug(f"Renewed semaphore '{self.key}' TTL") + return True + self._acquired = False + raise SemaphoreLostError(f"Semaphore was lost: {status}") + + except Exception as e: + _logger.error(f"Error renewing semaphore '{self.key}': {e}") + # Don't mark as not acquired on network errors + raise SemaphoreLostError(f"Redis error during renewal: {e}") + + async def count(self) -> dict: + """ + Get semaphore usage statistics. + + Returns: + Dictionary with current_holders, available_tokens, capacity + """ + await self._load_scripts() + + result = await self.__class__._count_script( + keys=[self.holders_key, self.tokens_key], args=[self.capacity] + ) + + current_holders, available_tokens, capacity = result + + return { + "current_holders": current_holders, + "available_tokens": available_tokens, + "capacity": capacity, + "utilization": current_holders / capacity if capacity > 0 else 0.0, + } + + async def health_check(self) -> dict: + """Get comprehensive semaphore health information.""" + count_info = await self.count() + cleanup_stats = await self._recover_crashed_tokens() + + total_accounted = count_info["current_holders"] + count_info["available_tokens"] + + return { + **count_info, + **cleanup_stats, + "total_accounted": total_accounted, + "is_healthy": total_accounted == self.capacity, + "cleanup_enabled": self.enable_auto_cleanup, + "instance_acquired": self._acquired, + } + + async def force_cleanup(self) -> dict: + """Manually trigger cleanup and return recovery statistics.""" + return await self._recover_crashed_tokens() + + async def __aenter__(self): + """Async context manager entry.""" + await self.acquire() + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb): + """Async context manager exit.""" + if self._acquired: + try: + await self.release() + except Exception as e: + _logger.error(f"Error releasing semaphore in __aexit__: {e}") + + # Cancel cleanup task when exiting + if self._cleanup_task and not self._cleanup_task.done(): + self._cleanup_task.cancel() + try: + await self._cleanup_task + except asyncio.CancelledError: + pass + + @property + def acquired(self) -> bool: + """Check if semaphore is currently acquired.""" + return self._acquired diff --git a/packages/service-library/src/servicelib/redis/fair_semaphore_decorator.py b/packages/service-library/src/servicelib/redis/fair_semaphore_decorator.py new file mode 100644 index 000000000000..8202ebf79d56 --- /dev/null +++ b/packages/service-library/src/servicelib/redis/fair_semaphore_decorator.py @@ -0,0 +1,272 @@ +"""Fair semaphore decorator with automatic renewal and crash recovery.""" + +import asyncio +import datetime +import functools +import logging +from collections.abc import Callable, Coroutine +from contextlib import asynccontextmanager +from typing import Any, ParamSpec, TypeVar + +from common_library.logging.logging_errors import create_troubleshooting_log_kwargs + +from ._constants import ( + DEFAULT_EXPECTED_LOCK_OVERALL_TIME, + DEFAULT_SEMAPHORE_TTL, + DEFAULT_SOCKET_TIMEOUT, +) +from ._errors import ( + SemaphoreAcquisitionError, + SemaphoreLostError, + SemaphoreNotAcquiredError, +) +from .fair_semaphore import FairSemaphore + +_logger = logging.getLogger(__name__) + +P = ParamSpec("P") +R = TypeVar("R") + + +@asynccontextmanager +async def _managed_fair_semaphore_execution( + semaphore: FairSemaphore, + semaphore_key: str, + ttl: datetime.timedelta, + execution_context: str, + enable_auto_renewal: bool = True, +): + """Context manager for fair semaphore with auto-renewal.""" + + async def _auto_renewal(): + """Background task to automatically renew semaphore.""" + if not enable_auto_renewal: + return + + renewal_interval = ttl.total_seconds() / 3 # Renew at 1/3 TTL + + while semaphore.acquired: + try: + await asyncio.sleep(renewal_interval) + if semaphore.acquired: # Check again after sleep + await semaphore.renew() + _logger.debug(f"Renewed fair semaphore {semaphore_key}") + except SemaphoreLostError: + _logger.error( + f"Fair semaphore {semaphore_key} was lost during execution" + ) + break + except Exception as e: + _logger.warning(f"Failed to renew fair semaphore {semaphore_key}: {e}") + break + + renewal_task = None + try: + # Acquire the semaphore (blocks until available) + if not await semaphore.acquire(): + raise SemaphoreAcquisitionError( + f"Failed to acquire fair semaphore {semaphore_key}" + ) + + _logger.info(f"Acquired fair semaphore {semaphore_key} for {execution_context}") + + # Start auto-renewal task if enabled + if enable_auto_renewal: + renewal_task = asyncio.create_task(_auto_renewal()) + + yield + + except Exception as e: + _logger.error( + f"Error in fair semaphore-protected execution: {e}", + extra=create_troubleshooting_log_kwargs( + context=execution_context, + semaphore_key=semaphore_key, + ), + ) + raise + finally: + # Cancel renewal task + if renewal_task and not renewal_task.done(): + renewal_task.cancel() + try: + await renewal_task + except asyncio.CancelledError: + pass + + # Release semaphore + if semaphore.acquired: + try: + await semaphore.release() + _logger.info(f"Released fair semaphore {semaphore_key}") + except Exception as e: + _logger.error(f"Failed to release fair semaphore {semaphore_key}: {e}") + + +def fair_semaphore( + *, + key: str, + capacity: int, + ttl: datetime.timedelta = DEFAULT_SEMAPHORE_TTL, + timeout: datetime.timedelta = DEFAULT_SOCKET_TIMEOUT, + expected_execution_time: datetime.timedelta = DEFAULT_EXPECTED_LOCK_OVERALL_TIME, + cleanup_interval: datetime.timedelta = datetime.timedelta(seconds=30), + enable_auto_cleanup: bool = True, + enable_auto_renewal: bool = True, +) -> Callable[ + [Callable[P, Coroutine[Any, Any, R]]], Callable[P, Coroutine[Any, Any, R]] +]: + """ + Decorator that protects async functions with a fair distributed semaphore. + + Uses Redis BRPOP for true FIFO fairness - first requester gets first slot. + No starvation possible, automatic crash recovery. + + Args: + key: Unique semaphore identifier + capacity: Maximum concurrent executions allowed + ttl: How long each holder can keep the semaphore + timeout: How long to wait for semaphore (0 = infinite wait) + expected_execution_time: Expected total execution time (unused, kept for compatibility) + cleanup_interval: How often to run cleanup for crashed clients + enable_auto_cleanup: Whether to run background cleanup + enable_auto_renewal: Whether to automatically renew TTL during execution + + Example: + @fair_semaphore( + key="api_calls", + capacity=10, + ttl=datetime.timedelta(seconds=30), + timeout=datetime.timedelta(seconds=60) + ) + async def call_external_api(): + # This will block fairly until semaphore available + # Maximum 10 concurrent executions + # First-come-first-served ordering guaranteed + pass + """ + + def decorator( + func: Callable[P, Coroutine[Any, Any, R]], + ) -> Callable[P, Coroutine[Any, Any, R]]: + @functools.wraps(func) + async def wrapper(*args: P.args, **kwargs: P.kwargs) -> R: + semaphore = FairSemaphore( + key=key, + capacity=capacity, + ttl=ttl, + timeout=timeout, + cleanup_interval=cleanup_interval, + enable_auto_cleanup=enable_auto_cleanup, + ) + + execution_context = f"{func.__module__}.{func.__qualname__}" + + async with _managed_fair_semaphore_execution( + semaphore=semaphore, + semaphore_key=key, + ttl=ttl, + execution_context=execution_context, + enable_auto_renewal=enable_auto_renewal, + ): + return await func(*args, **kwargs) + + return wrapper + + return decorator + + +class FairSemaphoreContext: + """Async context manager for manual fair semaphore control.""" + + def __init__( + self, + key: str, + capacity: int, + ttl: datetime.timedelta = DEFAULT_SEMAPHORE_TTL, + timeout: datetime.timedelta = DEFAULT_SOCKET_TIMEOUT, + cleanup_interval: datetime.timedelta = datetime.timedelta(seconds=30), + enable_auto_cleanup: bool = True, + enable_auto_renewal: bool = True, + ): + self.semaphore = FairSemaphore( + key=key, + capacity=capacity, + ttl=ttl, + timeout=timeout, + cleanup_interval=cleanup_interval, + enable_auto_cleanup=enable_auto_cleanup, + ) + self.ttl = ttl + self.enable_auto_renewal = enable_auto_renewal + self._renewal_task: Optional[asyncio.Task] = None + + async def __aenter__(self) -> FairSemaphore: + """Acquire semaphore and start auto-renewal.""" + await self.semaphore.acquire() + + # Start auto-renewal if enabled + if self.enable_auto_renewal: + + async def _auto_renewal(): + renewal_interval = self.ttl.total_seconds() / 3 + while self.semaphore.acquired: + try: + await asyncio.sleep(renewal_interval) + if self.semaphore.acquired: + await self.semaphore.renew() + except (SemaphoreLostError, SemaphoreNotAcquiredError): + break + except Exception as e: + _logger.warning(f"Auto-renewal failed: {e}") + + self._renewal_task = asyncio.create_task(_auto_renewal()) + + return self.semaphore + + async def __aexit__(self, exc_type, exc_val, exc_tb): + """Stop renewal and release semaphore.""" + if self._renewal_task and not self._renewal_task.done(): + self._renewal_task.cancel() + try: + await self._renewal_task + except asyncio.CancelledError: + pass + + if self.semaphore.acquired: + await self.semaphore.release() + + +# Convenience function for creating fair semaphore contexts +def fair_semaphore_context( + key: str, + capacity: int, + ttl: datetime.timedelta = DEFAULT_SEMAPHORE_TTL, + timeout: datetime.timedelta = DEFAULT_SOCKET_TIMEOUT, + cleanup_interval: datetime.timedelta = datetime.timedelta(seconds=30), + enable_auto_cleanup: bool = True, + enable_auto_renewal: bool = True, +) -> FairSemaphoreContext: + """ + Create an async context manager for fair semaphore usage. + + Example: + async with fair_semaphore_context( + "my_resource", + capacity=5, + timeout=datetime.timedelta(seconds=30) + ) as sem: + # Protected code here - guaranteed fair access + # sem is the FairSemaphore instance + stats = await sem.count() + print(f"Current holders: {stats['current_holders']}") + """ + return FairSemaphoreContext( + key=key, + capacity=capacity, + ttl=ttl, + timeout=timeout, + cleanup_interval=cleanup_interval, + enable_auto_cleanup=enable_auto_cleanup, + enable_auto_renewal=enable_auto_renewal, + ) diff --git a/packages/service-library/src/servicelib/redis/lua/acquire_fair_semaphore_v2.lua b/packages/service-library/src/servicelib/redis/lua/acquire_fair_semaphore_v2.lua new file mode 100644 index 000000000000..dd4f7ba3e192 --- /dev/null +++ b/packages/service-library/src/servicelib/redis/lua/acquire_fair_semaphore_v2.lua @@ -0,0 +1,32 @@ +-- Fair distributed semaphore using token pool (BRPOP-based) +-- KEYS[1]: tokens_key (LIST of available tokens) +-- KEYS[2]: holders_key (SET of current holder instance IDs) +-- KEYS[3]: holder_key (individual holder TTL key for this instance) +-- ARGV[1]: instance_id +-- ARGV[2]: capacity (max concurrent holders) +-- ARGV[3]: ttl_seconds +-- ARGV[4]: timeout_seconds (for BRPOP) +-- +-- Returns: {exit_code, status, token, current_count} +-- exit_code: 0 if acquired, 255 if timeout/failed +-- status: 'acquired' or 'timeout' + +local holders_key = KEYS[1] +local holder_key = KEYS[2] + +local token = ARGV[1] +local instance_id = ARGV[2] +local ttl_seconds = tonumber(ARGV[3]) + + + +-- Step 1: Register as holder +redis.call('SADD', holders_key, instance_id) +redis.call('SETEX', holder_key, ttl_seconds, token) + +-- Step 2: Set expiry on holders set to prevent infinite growth +redis.call('EXPIRE', holders_key, ttl_seconds * 10) + +local current_count = redis.call('SCARD', holders_key) + +return {0, 'acquired', token, current_count} diff --git a/packages/service-library/src/servicelib/redis/lua/cleanup_fair_semaphore_v2.lua b/packages/service-library/src/servicelib/redis/lua/cleanup_fair_semaphore_v2.lua new file mode 100644 index 000000000000..88da6ca5ca1b --- /dev/null +++ b/packages/service-library/src/servicelib/redis/lua/cleanup_fair_semaphore_v2.lua @@ -0,0 +1,66 @@ +-- Cleanup orphaned tokens from crashed clients +-- KEYS[1]: tokens_key (LIST of available tokens) +-- KEYS[2]: holders_key (SET of current holders) +-- KEYS[3]: holder_prefix (prefix for holder keys, e.g. "semaphores:holders:key:") +-- ARGV[1]: capacity (total semaphore capacity) +-- +-- Returns: {recovered_tokens, current_holders, available_tokens, total_cleaned} +-- This script should be run periodically to recover tokens from crashed clients + +local tokens_key = KEYS[1] +local holders_key = KEYS[2] +local holder_prefix = KEYS[3] + +local capacity = tonumber(ARGV[1]) + +-- Step 1: Get all current holders +local current_holders = redis.call('SMEMBERS', holders_key) +local recovered_tokens = 0 +local cleaned_holders = {} + +-- Step 2: Check each holder to see if their TTL key still exists +for i = 1, #current_holders do + local holder_id = current_holders[i] + local holder_key = holder_prefix .. holder_id + local exists = redis.call('EXISTS', holder_key) + + if exists == 0 then + -- Holder key doesn't exist but holder is in SET + -- This indicates a crashed client - clean up and recover token + redis.call('SREM', holders_key, holder_id) + redis.call('LPUSH', tokens_key, 'token_recovered_' .. holder_id) + recovered_tokens = recovered_tokens + 1 + table.insert(cleaned_holders, holder_id) + end +end + +-- Step 3: Ensure we have the correct total number of tokens +local remaining_holders = redis.call('SCARD', holders_key) +local available_tokens_count = redis.call('LLEN', tokens_key) +local total_tokens = remaining_holders + available_tokens_count + +-- If we're missing tokens (due to crashes or Redis issues), add them back +local missing_tokens = capacity - total_tokens +for i = 1, missing_tokens do + redis.call('LPUSH', tokens_key, 'token_missing_' .. i) + recovered_tokens = recovered_tokens + 1 +end + +-- If we somehow have too many tokens (shouldn't happen), remove extras +local excess_tokens = total_tokens - capacity +for i = 1, excess_tokens do + redis.call('RPOP', tokens_key) +end + +-- Step 4: Refresh expiry on data structures to prevent cleanup +local final_holders = redis.call('SCARD', holders_key) +local final_available = redis.call('LLEN', tokens_key) + +if final_holders > 0 then + redis.call('EXPIRE', holders_key, 3600) -- 1 hour expiry +end +if final_available > 0 then + redis.call('EXPIRE', tokens_key, 3600) -- 1 hour expiry +end + +return {recovered_tokens, final_holders, final_available, #cleaned_holders} diff --git a/packages/service-library/src/servicelib/redis/lua/count_fair_semaphore_v2.lua b/packages/service-library/src/servicelib/redis/lua/count_fair_semaphore_v2.lua new file mode 100644 index 000000000000..fb04f4e4356d --- /dev/null +++ b/packages/service-library/src/servicelib/redis/lua/count_fair_semaphore_v2.lua @@ -0,0 +1,17 @@ +-- Count current semaphore holders (simplified for token pool design) +-- KEYS[1]: holders_key (SET of current holders) +-- KEYS[2]: tokens_key (LIST of available tokens) +-- ARGV[1]: capacity (total semaphore capacity) +-- +-- Returns: {current_holders, available_tokens, total_capacity} + +local holders_key = KEYS[1] +local tokens_key = KEYS[2] + +local capacity = tonumber(ARGV[1]) + +-- Count current holders and available tokens +local current_holders = redis.call('SCARD', holders_key) +local available_tokens = redis.call('LLEN', tokens_key) + +return {current_holders, available_tokens, capacity} diff --git a/packages/service-library/src/servicelib/redis/lua/register_semaphore_holder.lua b/packages/service-library/src/servicelib/redis/lua/register_semaphore_holder.lua new file mode 100644 index 000000000000..7d447d14bdc4 --- /dev/null +++ b/packages/service-library/src/servicelib/redis/lua/register_semaphore_holder.lua @@ -0,0 +1,31 @@ +-- Simple token initialization and management for Python BRPOP +-- KEYS[1]: tokens_key (LIST of available tokens) +-- KEYS[2]: holders_key (SET of current holder instance IDs) +-- KEYS[3]: holder_key (individual holder TTL key for this instance) +-- ARGV[1]: instance_id +-- ARGV[2]: capacity (max concurrent holders) +-- ARGV[3]: ttl_seconds +-- ARGV[4]: token (the token received from BRPOP) +-- +-- Returns: {exit_code, status, current_count} +-- exit_code: 0 if registered successfully + +local tokens_key = KEYS[1] +local holders_key = KEYS[2] + +local capacity = tonumber(ARGV[1]) +local ttl_seconds = tonumber(ARGV[2]) + +-- Step 1: Initialize token pool if needed (first time setup) +local tokens_exist = redis.call('EXISTS', tokens_key) +local holders_exist = redis.call('EXISTS', holders_key) +if tokens_exist == 0 and holders_exist == 0 then + -- Initialize with capacity number of tokens + for i = 1, capacity do + redis.call('LPUSH', tokens_key, 'token_' .. i) + end + -- Set expiry on tokens list to prevent infinite growth + -- redis.call('EXPIRE', tokens_key, ttl_seconds) +end + +return 0 diff --git a/packages/service-library/src/servicelib/redis/lua/release_fair_semaphore_v2.lua b/packages/service-library/src/servicelib/redis/lua/release_fair_semaphore_v2.lua new file mode 100644 index 000000000000..a675568bc391 --- /dev/null +++ b/packages/service-library/src/servicelib/redis/lua/release_fair_semaphore_v2.lua @@ -0,0 +1,48 @@ +-- Release fair semaphore and return token to pool +-- KEYS[1]: tokens_key (LIST of available tokens) +-- KEYS[2]: holders_key (SET of current holders) +-- KEYS[3]: holder_key (individual holder TTL key for this instance) +-- ARGV[1]: instance_id +-- +-- Returns: {exit_code, status, current_count} +-- exit_code: 0 if released, 255 if failed +-- status: 'released', 'not_held', or 'already_expired' + +local tokens_key = KEYS[1] +local holders_key = KEYS[2] +local holder_key = KEYS[3] + +local instance_id = ARGV[1] + +-- Step 1: Check if this instance is currently a holder +local is_holder = redis.call('SISMEMBER', holders_key, instance_id) +if is_holder == 0 then + -- Not in holders set - check if holder key exists + local exists = redis.call('EXISTS', holder_key) + if exists == 1 then + -- Holder key exists but not in set - clean it up + redis.call('DEL', holder_key) + return {255, 'already_expired', redis.call('SCARD', holders_key)} + else + return {255, 'not_held', redis.call('SCARD', holders_key)} + end +end + +-- Step 2: Get the token from holder key before releasing +local token = redis.call('GET', holder_key) +if not token then + -- Fallback token if somehow missing + token = 'token_default' +end + +-- Step 3: Release the semaphore +redis.call('SREM', holders_key, instance_id) +redis.call('DEL', holder_key) + +-- Step 4: Return token to available pool +-- This automatically unblocks any waiting BRPOP calls +redis.call('LPUSH', tokens_key, token) + +local new_count = redis.call('SCARD', holders_key) + +return {0, 'released', new_count} diff --git a/packages/service-library/src/servicelib/redis/lua/renew_fair_semaphore_v2.lua b/packages/service-library/src/servicelib/redis/lua/renew_fair_semaphore_v2.lua new file mode 100644 index 000000000000..3c897ed8d90f --- /dev/null +++ b/packages/service-library/src/servicelib/redis/lua/renew_fair_semaphore_v2.lua @@ -0,0 +1,40 @@ +-- Renew semaphore holder TTL (simplified for token pool design) +-- KEYS[1]: holders_key (SET of current holders) +-- KEYS[2]: holder_key (individual holder TTL key for this instance) +-- ARGV[1]: instance_id +-- ARGV[2]: ttl_seconds +-- +-- Returns: {exit_code, status, current_count} +-- exit_code: 0 if renewed, 255 if failed +-- status: 'renewed', 'not_held', or 'expired' + +local holders_key = KEYS[1] +local holder_key = KEYS[2] + +local instance_id = ARGV[1] +local ttl_seconds = tonumber(ARGV[2]) + +-- Step 1: Check if this instance is currently a holder +local is_holder = redis.call('SISMEMBER', holders_key, instance_id) +if is_holder == 0 then + -- Not in holders set + local current_count = redis.call('SCARD', holders_key) + return {255, 'not_held', current_count} +end + +-- Step 2: Check if holder key exists (to detect if it expired) +local exists = redis.call('EXISTS', holder_key) +if exists == 0 then + -- Holder key expired - remove from set and fail renewal + redis.call('SREM', holders_key, instance_id) + local current_count = redis.call('SCARD', holders_key) + return {255, 'expired', current_count} +end + +-- Step 3: Renew the holder key TTL +local token = redis.call('GET', holder_key) +redis.call('SETEX', holder_key, ttl_seconds, token) + +local current_count = redis.call('SCARD', holders_key) + +return {0, 'renewed', current_count} diff --git a/packages/service-library/src/servicelib/redis/pure_brpop_semaphore.py b/packages/service-library/src/servicelib/redis/pure_brpop_semaphore.py new file mode 100644 index 000000000000..0eb6a781f87e --- /dev/null +++ b/packages/service-library/src/servicelib/redis/pure_brpop_semaphore.py @@ -0,0 +1,417 @@ +"""Pure Python BRPOP-based fair semaphore implementation.""" + +import asyncio +import datetime +import logging +import uuid +from types import TracebackType +from typing import Annotated, ClassVar + +from common_library.basic_types import DEFAULT_FACTORY +from pydantic import ( + BaseModel, + Field, + PositiveInt, + computed_field, + field_validator, +) +from redis.commands.core import AsyncScript + +from ._client import RedisClientSDK +from ._constants import ( + DEFAULT_SEMAPHORE_TTL, + DEFAULT_SOCKET_TIMEOUT, + SEMAPHORE_HOLDER_KEY_PREFIX, + SEMAPHORE_KEY_PREFIX, +) +from ._errors import ( + SemaphoreAcquisitionError, + SemaphoreLostError, + SemaphoreNotAcquiredError, +) +from ._semaphore_lua import ( + CLEANUP_FAIR_SEMAPHORE_V2_SCRIPT, + COUNT_FAIR_SEMAPHORE_V2_SCRIPT, + REGISTER_SEMAPHORE_HOLDER_SCRIPT, + RELEASE_FAIR_SEMAPHORE_V2_SCRIPT, + RENEW_FAIR_SEMAPHORE_V2_SCRIPT, + SCRIPT_OK_EXIT_CODE, +) + +_logger = logging.getLogger(__name__) + + +class PureBRPOPSemaphore(BaseModel): + """ + A pure Python BRPOP-based fair semaphore implementation. + + This approach uses Redis BRPOP directly from Python for true blocking fairness, + with minimal Lua scripts only for registration and cleanup. + + Features: + - True FIFO fairness guaranteed by Redis BRPOP + - Native Redis blocking - no Python-side polling needed + - Crash recovery through TTL-based cleanup + - Maximum simplicity and reliability + """ + + model_config = { + "arbitrary_types_allowed": True, # For RedisClientSDK + } + + # Configuration fields + redis_client: RedisClientSDK + key: Annotated[ + str, Field(min_length=1, description="Unique identifier for the semaphore") + ] + capacity: Annotated[ + PositiveInt, Field(description="Maximum number of concurrent holders") + ] + ttl: datetime.timedelta = DEFAULT_SEMAPHORE_TTL + blocking_timeout: Annotated[ + datetime.timedelta | None, + Field(description="Maximum time to wait when blocking"), + ] = DEFAULT_SOCKET_TIMEOUT + + enable_auto_cleanup: bool = Field( + default=True, + description="Whether to enable automatic cleanup of crashed holders", + ) + + instance_id: Annotated[ + str, + Field( + description="Unique instance identifier", + default_factory=lambda: f"{uuid.uuid4()}", + ), + ] = DEFAULT_FACTORY + + # Class-level script storage + register_script: ClassVar[AsyncScript | None] = None + release_script: ClassVar[AsyncScript | None] = None + renew_script: ClassVar[AsyncScript | None] = None + count_script: ClassVar[AsyncScript | None] = None + cleanup_script: ClassVar[AsyncScript | None] = None + + # Private state + _acquired: bool = False + _token: str | None = None + _cleanup_task: asyncio.Task | None = None + + @classmethod + def _register_scripts(cls, redis_client: RedisClientSDK) -> None: + """Register minimal Lua scripts with Redis.""" + if cls.register_script is None: + cls.register_script = redis_client.redis.register_script( + REGISTER_SEMAPHORE_HOLDER_SCRIPT + ) + cls.release_script = redis_client.redis.register_script( + RELEASE_FAIR_SEMAPHORE_V2_SCRIPT + ) + cls.renew_script = redis_client.redis.register_script( + RENEW_FAIR_SEMAPHORE_V2_SCRIPT + ) + cls.count_script = redis_client.redis.register_script( + COUNT_FAIR_SEMAPHORE_V2_SCRIPT + ) + cls.cleanup_script = redis_client.redis.register_script( + CLEANUP_FAIR_SEMAPHORE_V2_SCRIPT + ) + + def __init__(self, **data) -> None: + super().__init__(**data) + self.__class__._register_scripts(self.redis_client) + + # Start cleanup task if enabled + if self.enable_auto_cleanup: + self._start_cleanup_task() + + def _start_cleanup_task(self) -> None: + """Start background cleanup task for crashed holders.""" + if self._cleanup_task is None or self._cleanup_task.done(): + self._cleanup_task = asyncio.create_task(self._cleanup_loop()) + + async def _cleanup_loop(self) -> None: + """Background task to clean up crashed holders.""" + try: + while True: + await asyncio.sleep(30) # Cleanup every 30 seconds + try: + await self._recover_crashed_tokens() + except Exception as e: + _logger.warning(f"Cleanup failed for semaphore {self.key}: {e}") + except asyncio.CancelledError: + pass + + async def _recover_crashed_tokens(self) -> None: + """Recover tokens from crashed clients.""" + cls = type(self) + assert cls.cleanup_script is not None + + result = await cls.cleanup_script( + keys=[self.tokens_key, self.holders_key, self.holder_prefix], + args=[self.capacity], + client=self.redis_client.redis, + ) + + recovered_tokens, current_holders, available_tokens, total_cleaned = result + + if recovered_tokens > 0 or total_cleaned > 0: + _logger.info( + f"Recovered {recovered_tokens} tokens from {total_cleaned} crashed holders " + f"for semaphore '{self.key}'" + ) + + @computed_field + @property + def tokens_key(self) -> str: + """Redis key for the token pool LIST.""" + return f"{SEMAPHORE_KEY_PREFIX}{self.key}:tokens" + + @computed_field + @property + def holders_key(self) -> str: + """Redis key for the holders SET.""" + return f"{SEMAPHORE_KEY_PREFIX}{self.key}:holders" + + @computed_field + @property + def holder_key(self) -> str: + """Redis key for this instance's holder entry.""" + return f"{SEMAPHORE_HOLDER_KEY_PREFIX}{self.key}:{self.instance_id}" + + @computed_field + @property + def holder_prefix(self) -> str: + """Prefix for holder keys (used in cleanup).""" + return f"{SEMAPHORE_HOLDER_KEY_PREFIX}{self.key}:" + + @field_validator("ttl") + @classmethod + def validate_ttl(cls, v: datetime.timedelta) -> datetime.timedelta: + if v.total_seconds() <= 0: + raise ValueError("TTL must be positive") + return v + + @field_validator("blocking_timeout") + @classmethod + def validate_timeout( + cls, v: datetime.timedelta | None + ) -> datetime.timedelta | None: + if v is not None and v.total_seconds() <= 0: + raise ValueError("Timeout must be positive") + return v + + async def acquire(self) -> bool: + """ + Acquire the semaphore using pure Python BRPOP. + + This is the cleanest possible approach: + 1. Call Redis BRPOP directly from Python (guaranteed FIFO fairness) + 2. Use minimal Lua script only to register as holder + 3. No complex retry logic or notifications needed + + Returns: + True if acquired successfully + + Raises: + SemaphoreAcquisitionError: If acquisition fails or times out + """ + if self._acquired: + raise SemaphoreAcquisitionError(name=self.key, capacity=self.capacity) + + timeout_seconds = ( + int(self.blocking_timeout.total_seconds()) if self.blocking_timeout else 0 + ) + ttl_seconds = int(self.ttl.total_seconds()) + + _logger.debug( + f"Attempting to acquire semaphore '{self.key}' using BRPOP " + f"(timeout: {timeout_seconds}s)" + ) + + try: + # Use Redis BRPOP directly from Python - this is perfectly legal! + # BRPOP blocks until a token is available or timeout occurs + result = await self.redis_client.redis.brpop( + self.tokens_key, timeout=timeout_seconds + ) + + if result is None: + # Timeout occurred + raise SemaphoreAcquisitionError(name=self.key, capacity=self.capacity) + + # result is (key, token) tuple + _, token = result + token = token.decode("utf-8") if isinstance(token, bytes) else token + + # Register as holder using minimal Lua script + cls = type(self) + assert cls.register_script is not None + + register_result = await cls.register_script( + keys=[self.tokens_key, self.holders_key, self.holder_key], + args=[self.instance_id, self.capacity, ttl_seconds, token], + client=self.redis_client.redis, + ) + + exit_code, status, current_count = register_result + + if exit_code == SCRIPT_OK_EXIT_CODE: + self._acquired = True + self._token = token + + _logger.info( + f"Acquired semaphore '{self.key}' with token '{token}' " + f"(instance: {self.instance_id}, count: {current_count})" + ) + return True + else: + # Registration failed - this shouldn't happen but be safe + # Return the token to the pool + await self.redis_client.redis.lpush(self.tokens_key, token) + raise SemaphoreAcquisitionError(name=self.key, capacity=self.capacity) + + except TimeoutError: + raise SemaphoreAcquisitionError(name=self.key, capacity=self.capacity) + except Exception as e: + _logger.error(f"Error acquiring semaphore '{self.key}': {e}") + raise SemaphoreAcquisitionError(name=self.key, capacity=self.capacity) + + async def release(self) -> None: + """ + Release the semaphore and return token to pool. + + Raises: + SemaphoreNotAcquiredError: If semaphore was not acquired by this instance + """ + if not self._acquired: + raise SemaphoreNotAcquiredError(name=self.key) + + try: + # Use existing release script + cls = type(self) + assert cls.release_script is not None + + result = await cls.release_script( + keys=[self.tokens_key, self.holders_key, self.holder_key], + args=[self.instance_id], + client=self.redis_client.redis, + ) + + exit_code, status, current_count = result + + if exit_code == SCRIPT_OK_EXIT_CODE: + released_token = self._token + self._acquired = False + self._token = None + + _logger.info( + f"Released semaphore '{self.key}' with token '{released_token}' " + f"(instance: {self.instance_id}, count: {current_count})" + ) + return + + # Release failed + _logger.error( + f"Failed to release semaphore '{self.key}' - {status} " + f"(instance: {self.instance_id}, count: {current_count})" + ) + # Mark as not acquired anyway to prevent stuck state + self._acquired = False + self._token = None + raise SemaphoreNotAcquiredError(name=self.key) + + except Exception as e: + _logger.error(f"Error releasing semaphore '{self.key}': {e}") + # Mark as not acquired to prevent stuck state + self._acquired = False + self._token = None + raise SemaphoreNotAcquiredError(name=self.key) + + async def renew(self) -> None: + """ + Renew the semaphore TTL. + + Raises: + SemaphoreLostError: If the semaphore was lost or expired + """ + if not self._acquired: + raise SemaphoreNotAcquiredError(name=self.key) + + ttl_seconds = int(self.ttl.total_seconds()) + + try: + cls = type(self) + assert cls.renew_script is not None + + result = await cls.renew_script( + keys=[self.holders_key, self.holder_key], + args=[self.instance_id, ttl_seconds], + client=self.redis_client.redis, + ) + + exit_code, status, current_count = result + + if exit_code == SCRIPT_OK_EXIT_CODE: + _logger.debug(f"Renewed semaphore '{self.key}' TTL") + return + + # Renewal failed - semaphore was lost + _logger.warning( + f"Semaphore '{self.key}' was lost during renewal - {status} " + f"(instance: {self.instance_id})" + ) + self._acquired = False + self._token = None + raise SemaphoreLostError(name=self.key, instance_id=self.instance_id) + + except Exception as e: + _logger.error(f"Error renewing semaphore '{self.key}': {e}") + raise SemaphoreLostError(name=self.key, instance_id=self.instance_id) + + async def get_current_count(self) -> int: + """Get the current number of semaphore holders.""" + cls = type(self) + assert cls.count_script is not None + + result = await cls.count_script( + keys=[self.holders_key, self.tokens_key], + args=[self.capacity], + client=self.redis_client.redis, + ) + + current_holders, available_tokens, capacity = result + return int(current_holders) + + async def get_available_count(self) -> int: + """Get the number of available semaphore slots.""" + current_count = await self.get_current_count() + return max(0, self.capacity - current_count) + + @property + def acquired(self) -> bool: + """Check if semaphore is currently acquired.""" + return self._acquired + + # Context manager support + async def __aenter__(self) -> "PureBRPOPSemaphore": + await self.acquire() + return self + + async def __aexit__( + self, + exc_type: type[BaseException] | None, + exc_val: BaseException | None, + exc_tb: TracebackType | None, + ) -> None: + if self._acquired: + await self.release() + + # Clean up background task + if self._cleanup_task and not self._cleanup_task.done(): + self._cleanup_task.cancel() + try: + await self._cleanup_task + except asyncio.CancelledError: + pass From c65c0c8e9a1010d4ca445bad48091982da45157e Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 22 Sep 2025 08:24:34 +0200 Subject: [PATCH 02/68] copy from branch --- .../src/servicelib/redis/_semaphore.py | 275 ++++++++++-------- .../src/servicelib/redis/_semaphore_lua.py | 12 +- 2 files changed, 158 insertions(+), 129 deletions(-) diff --git a/packages/service-library/src/servicelib/redis/_semaphore.py b/packages/service-library/src/servicelib/redis/_semaphore.py index b62fbc7d238d..1f25ca3972fd 100644 --- a/packages/service-library/src/servicelib/redis/_semaphore.py +++ b/packages/service-library/src/servicelib/redis/_semaphore.py @@ -4,6 +4,7 @@ from types import TracebackType from typing import Annotated, ClassVar +import redis.exceptions from common_library.basic_types import DEFAULT_FACTORY from pydantic import ( BaseModel, @@ -13,15 +14,7 @@ field_validator, ) from redis.commands.core import AsyncScript -from tenacity import ( - RetryError, - before_sleep_log, - retry, - retry_if_not_result, - stop_after_delay, - stop_never, - wait_random_exponential, -) +from servicelib.redis._utils import handle_redis_returns_union_types from ._client import RedisClientSDK from ._constants import ( @@ -36,10 +29,11 @@ SemaphoreNotAcquiredError, ) from ._semaphore_lua import ( - ACQUIRE_SEMAPHORE_SCRIPT, - COUNT_SEMAPHORE_SCRIPT, - RELEASE_SEMAPHORE_SCRIPT, - RENEW_SEMAPHORE_SCRIPT, + ACQUIRE_FAIR_SEMAPHORE_V2_SCRIPT, + COUNT_FAIR_SEMAPHORE_V2_SCRIPT, + REGISTER_FAIR_SEMAPHORE_SCRIPT, + RELEASE_FAIR_SEMAPHORE_V2_SCRIPT, + RENEW_FAIR_SEMAPHORE_V2_SCRIPT, SCRIPT_BAD_EXIT_CODE, SCRIPT_OK_EXIT_CODE, ) @@ -101,6 +95,7 @@ class DistributedSemaphore(BaseModel): ] = DEFAULT_FACTORY # Class and/or Private state attributes (not part of the model) + register_semaphore: ClassVar[AsyncScript | None] = None acquire_script: ClassVar[AsyncScript | None] = None count_script: ClassVar[AsyncScript | None] = None release_script: ClassVar[AsyncScript | None] = None @@ -113,17 +108,20 @@ def _register_scripts(cls, redis_client: RedisClientSDK) -> None: caches the script SHA, so this is efficient. Even if called multiple times, the script is only registered once.""" if cls.acquire_script is None: + cls.register_semaphore = redis_client.redis.register_script( + REGISTER_FAIR_SEMAPHORE_SCRIPT + ) cls.acquire_script = redis_client.redis.register_script( - ACQUIRE_SEMAPHORE_SCRIPT + ACQUIRE_FAIR_SEMAPHORE_V2_SCRIPT ) cls.count_script = redis_client.redis.register_script( - COUNT_SEMAPHORE_SCRIPT + COUNT_FAIR_SEMAPHORE_V2_SCRIPT ) cls.release_script = redis_client.redis.register_script( - RELEASE_SEMAPHORE_SCRIPT + RELEASE_FAIR_SEMAPHORE_V2_SCRIPT ) cls.renew_script = redis_client.redis.register_script( - RENEW_SEMAPHORE_SCRIPT + RENEW_FAIR_SEMAPHORE_V2_SCRIPT ) def __init__(self, **data) -> None: @@ -136,11 +134,29 @@ def semaphore_key(self) -> str: """Redis key for the semaphore sorted set.""" return f"{SEMAPHORE_KEY_PREFIX}{self.key}" + @computed_field # type: ignore[prop-decorator] + @property + def tokens_key(self) -> str: + """Redis key for the token pool LIST.""" + return f"{SEMAPHORE_KEY_PREFIX}{self.key}:tokens" + + @computed_field # type: ignore[prop-decorator] + @property + def holders_key(self) -> str: + """Redis key for the holders SET.""" + return f"{SEMAPHORE_KEY_PREFIX}{self.key}:holders" + @computed_field # type: ignore[prop-decorator] @property def holder_key(self) -> str: """Redis key for this instance's holder entry.""" - return f"{SEMAPHORE_HOLDER_KEY_PREFIX}{self.key}:{self.instance_id}" + return f"{SEMAPHORE_KEY_PREFIX}{self.key}:holders:{self.instance_id}" + + @computed_field + @property + def holder_prefix(self) -> str: + """Prefix for holder keys (used in cleanup).""" + return f"{SEMAPHORE_HOLDER_KEY_PREFIX}{self.key}:" # Additional validation @field_validator("ttl") @@ -172,109 +188,134 @@ async def acquire(self) -> bool: SemaphoreAcquisitionError: If acquisition fails and blocking=True """ - if not self.blocking: - # Non-blocking: try once - return await self._try_acquire() - - # Blocking - @retry( - wait=wait_random_exponential(min=0.1, max=2), - reraise=True, - stop=( - stop_after_delay(self.blocking_timeout.total_seconds()) + ttl_seconds = int(self.ttl.total_seconds()) + blocking_timeout_seconds = 1 + if self.blocking: + blocking_timeout_seconds = ( + int(self.blocking_timeout.total_seconds()) if self.blocking_timeout - else stop_never - ), - retry=retry_if_not_result(lambda acquired: acquired), - before_sleep=before_sleep_log(_logger, logging.DEBUG), - ) - async def _blocking_acquire() -> bool: - return await self._try_acquire() + else 60 + ) + + # Execute the Lua scripts atomically + cls = type(self) + assert cls.register_semaphore is not None # nosec + await cls.register_semaphore( + keys=[self.tokens_key, self.holders_key], + args=[self.capacity, ttl_seconds], + client=self.redis_client.redis, + ) # pylint: disable=not-callable try: - return await _blocking_acquire() - except RetryError as exc: + # this is blocking pop with timeout + tokens_key_token: list[str] = await handle_redis_returns_union_types( + self.redis_client.redis.brpop( + [self.tokens_key], timeout=blocking_timeout_seconds + ) + ) + except redis.exceptions.TimeoutError as e: + _logger.debug( + "Timeout acquiring semaphore '%s' (instance: %s)", + self.key, + self.instance_id, + ) raise SemaphoreAcquisitionError( name=self.key, capacity=self.capacity - ) from exc + ) from e - async def release(self) -> None: - """ - Release the semaphore atomically using Lua script. - - Raises: - SemaphoreNotAcquiredError: If semaphore was not acquired by this instance - """ - ttl_seconds = int(self.ttl.total_seconds()) + assert len(tokens_key_token) == 2 # nosec + assert tokens_key_token[0] == self.tokens_key # nosec + token = tokens_key_token[1] - # Execute the release Lua script atomically - cls = type(self) - assert cls.release_script is not None # nosec - result = await cls.release_script( # pylint: disable=not-callable - keys=( - self.semaphore_key, - self.holder_key, - ), - args=( + assert cls.acquire_script is not None # nosec + result = await cls.acquire_script( # pylint: disable=not-callable + keys=[self.holders_key, self.holder_key], + args=[ + token, self.instance_id, - str(ttl_seconds), - ), + ttl_seconds, + ], client=self.redis_client.redis, ) + # Lua script returns: [exit_code, status, current_count, expired_count] assert isinstance(result, list) # nosec - exit_code, status, current_count, expired_count = result - result = status + exit_code, status, token, current_count = result - if result == "released": - assert exit_code == SCRIPT_OK_EXIT_CODE # nosec + if exit_code == SCRIPT_OK_EXIT_CODE: _logger.debug( - "Released semaphore '%s' (instance: %s, count: %s, expired: %s)", + "Acquired semaphore '%s' with token %s (instance: %s, count: %s)", self.key, + token, self.instance_id, current_count, - expired_count, ) - else: - # Instance wasn't in the semaphore set - this shouldn't happen - # but let's handle it gracefully - assert exit_code == SCRIPT_BAD_EXIT_CODE # nosec - raise SemaphoreNotAcquiredError(name=self.key) + return True - async def _try_acquire(self) -> bool: - ttl_seconds = int(self.ttl.total_seconds()) + if status == "timeout": + if self.blocking: + _logger.debug( + "Timeout acquiring semaphore '%s' (instance: %s, count: %s)", + self.key, + self.instance_id, + current_count, + ) + raise SemaphoreAcquisitionError(name=self.key, capacity=self.capacity) + _logger.debug( + "Timeout acquiring semaphore '%s' (instance: %s, count: %s)", + self.key, + self.instance_id, + current_count, + ) + return False + + _logger.debug( + "Failed to acquire semaphore '%s' - %s (count: %s)", + self.key, + status, + current_count, + ) + raise SemaphoreAcquisitionError(name=self.key, capacity=self.capacity) - # Execute the Lua script atomically + async def release(self) -> None: + """ + Release the semaphore atomically using Lua script. + + Raises: + SemaphoreNotAcquiredError: If semaphore was not acquired by this instance + """ + + # Execute the release Lua script atomically cls = type(self) - assert cls.acquire_script is not None # nosec - result = await cls.acquire_script( # pylint: disable=not-callable - keys=(self.semaphore_key, self.holder_key), - args=(self.instance_id, str(self.capacity), str(ttl_seconds)), + assert cls.release_script is not None # nosec + result = await cls.release_script( # pylint: disable=not-callable + keys=[self.tokens_key, self.holders_key, self.holder_key], + args=[self.instance_id], client=self.redis_client.redis, ) - # Lua script returns: [exit_code, status, current_count, expired_count] assert isinstance(result, list) # nosec - exit_code, status, current_count, expired_count = result - + exit_code, status, current_count = result if exit_code == SCRIPT_OK_EXIT_CODE: + assert status == "released" # nosec _logger.debug( - "Acquired semaphore '%s' (instance: %s, count: %s, expired: %s)", + "Released semaphore '%s' (instance: %s, count: %s)", self.key, self.instance_id, current_count, - expired_count, ) - return True + return - _logger.debug( - "Failed to acquire semaphore '%s' - %s (count: %s, expired: %s)", + # Instance was already expired or not acquired + assert exit_code == SCRIPT_BAD_EXIT_CODE # nosec + _logger.error( + "Failed to release semaphore '%s' - %s (instance: %s, count: %s)", self.key, status, + self.instance_id, current_count, - expired_count, ) - return False + raise SemaphoreNotAcquiredError(name=self.key) async def reacquire(self) -> None: """ @@ -293,72 +334,50 @@ async def reacquire(self) -> None: cls = type(self) assert cls.renew_script is not None # nosec result = await cls.renew_script( # pylint: disable=not-callable - keys=(self.semaphore_key, self.holder_key), - args=( - self.instance_id, - str(ttl_seconds), - ), + keys=[self.holders_key, self.holder_key], + args=[self.instance_id, ttl_seconds], client=self.redis_client.redis, ) assert isinstance(result, list) # nosec - exit_code, status, current_count, expired_count = result + exit_code, status, current_count = result - # Lua script returns: 'renewed' or status message - if status == "renewed": - assert exit_code == SCRIPT_OK_EXIT_CODE # nosec + if exit_code == SCRIPT_OK_EXIT_CODE: + assert status == "renewed" # nosec _logger.debug( - "Renewed semaphore '%s' (instance: %s, count: %s, expired: %s)", + "Renewed semaphore '%s' (instance: %s, count: %s)", self.key, self.instance_id, current_count, - expired_count, ) - else: - assert exit_code == SCRIPT_BAD_EXIT_CODE # nosec - if status == "expired": - _logger.warning( - "Semaphore '%s' holder key expired (instance: %s, count: %s, expired: %s)", - self.key, - self.instance_id, - current_count, - expired_count, - ) - elif status == "not_held": - _logger.warning( - "Semaphore '%s' not held (instance: %s, count: %s, expired: %s)", - self.key, - self.instance_id, - current_count, - expired_count, - ) + return + assert exit_code == SCRIPT_BAD_EXIT_CODE # nosec + + _logger.warning( + "Semaphore '%s' holder key was lost (instance: %s, status: %s, count: %s)", + self.key, + self.instance_id, + status, + current_count, + ) - raise SemaphoreLostError(name=self.key, instance_id=self.instance_id) + raise SemaphoreLostError(name=self.key, instance_id=self.instance_id) async def get_current_count(self) -> int: """Get the current number of semaphore holders""" - ttl_seconds = int(self.ttl.total_seconds()) - # Execute the count Lua script atomically cls = type(self) assert cls.count_script is not None # nosec result = await cls.count_script( # pylint: disable=not-callable - keys=(self.semaphore_key,), - args=(str(ttl_seconds),), + keys=[self.holders_key, self.tokens_key], + args=[self.capacity], client=self.redis_client.redis, ) assert isinstance(result, list) # nosec - current_count, expired_count = result - - if int(expired_count) > 0: - _logger.debug( - "Cleaned up %s expired entries from semaphore '%s'", - expired_count, - self.key, - ) + current_holders, available_tokens, capacity = result - return int(current_count) + return int(current_holders) async def get_available_count(self) -> int: """Get the number of available semaphore slots""" diff --git a/packages/service-library/src/servicelib/redis/_semaphore_lua.py b/packages/service-library/src/servicelib/redis/_semaphore_lua.py index 8bf685b30a86..6d95006dc965 100644 --- a/packages/service-library/src/servicelib/redis/_semaphore_lua.py +++ b/packages/service-library/src/servicelib/redis/_semaphore_lua.py @@ -26,10 +26,20 @@ def _load_script(script_name: str) -> str: return script_file.read_text(encoding="utf-8").strip() +# TODO: old ACQUIRE_SEMAPHORE_SCRIPT: Final[str] = _load_script("acquire_semaphore") RELEASE_SEMAPHORE_SCRIPT: Final[str] = _load_script("release_semaphore") RENEW_SEMAPHORE_SCRIPT: Final[str] = _load_script("renew_semaphore") COUNT_SEMAPHORE_SCRIPT: Final[str] = _load_script("count_semaphore") -SCRIPT_BAD_EXIT_CODE: Final[int] = 255 +# fair semaphore scripts (token pool based) +REGISTER_FAIR_SEMAPHORE_SCRIPT: Final[str] = _load_script("register_semaphore_holder") +ACQUIRE_FAIR_SEMAPHORE_V2_SCRIPT: Final[str] = _load_script("acquire_fair_semaphore_v2") +RELEASE_FAIR_SEMAPHORE_V2_SCRIPT: Final[str] = _load_script("release_fair_semaphore_v2") +CLEANUP_FAIR_SEMAPHORE_V2_SCRIPT: Final[str] = _load_script("cleanup_fair_semaphore_v2") +RENEW_FAIR_SEMAPHORE_V2_SCRIPT: Final[str] = _load_script("renew_fair_semaphore_v2") +COUNT_FAIR_SEMAPHORE_V2_SCRIPT: Final[str] = _load_script("count_fair_semaphore_v2") + + SCRIPT_OK_EXIT_CODE: Final[int] = 0 +SCRIPT_BAD_EXIT_CODE: Final[int] = 255 From c2f6100144b90f2da0e7595dac5602c30dc6f44d Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 22 Sep 2025 08:46:34 +0200 Subject: [PATCH 03/68] implementing fair queuing --- .../src/servicelib/redis/_semaphore.py | 16 ++++---- .../tests/redis/test_semaphore.py | 37 ------------------- 2 files changed, 9 insertions(+), 44 deletions(-) diff --git a/packages/service-library/src/servicelib/redis/_semaphore.py b/packages/service-library/src/servicelib/redis/_semaphore.py index 1f25ca3972fd..e23a79745bbd 100644 --- a/packages/service-library/src/servicelib/redis/_semaphore.py +++ b/packages/service-library/src/servicelib/redis/_semaphore.py @@ -14,7 +14,6 @@ field_validator, ) from redis.commands.core import AsyncScript -from servicelib.redis._utils import handle_redis_returns_union_types from ._client import RedisClientSDK from ._constants import ( @@ -37,6 +36,7 @@ SCRIPT_BAD_EXIT_CODE, SCRIPT_OK_EXIT_CODE, ) +from ._utils import handle_redis_returns_union_types _logger = logging.getLogger(__name__) @@ -150,7 +150,7 @@ def holders_key(self) -> str: @property def holder_key(self) -> str: """Redis key for this instance's holder entry.""" - return f"{SEMAPHORE_KEY_PREFIX}{self.key}:holders:{self.instance_id}" + return f"{SEMAPHORE_HOLDER_KEY_PREFIX}{self.key}:{self.instance_id}" @computed_field @property @@ -200,11 +200,11 @@ async def acquire(self) -> bool: # Execute the Lua scripts atomically cls = type(self) assert cls.register_semaphore is not None # nosec - await cls.register_semaphore( + await cls.register_semaphore( # pylint: disable=not-callable keys=[self.tokens_key, self.holders_key], args=[self.capacity, ttl_seconds], client=self.redis_client.redis, - ) # pylint: disable=not-callable + ) try: # this is blocking pop with timeout @@ -219,9 +219,11 @@ async def acquire(self) -> bool: self.key, self.instance_id, ) - raise SemaphoreAcquisitionError( - name=self.key, capacity=self.capacity - ) from e + if self.blocking: + raise SemaphoreAcquisitionError( + name=self.key, capacity=self.capacity + ) from e + return False assert len(tokens_key_token) == 2 # nosec assert tokens_key_token[0] == self.tokens_key # nosec diff --git a/packages/service-library/tests/redis/test_semaphore.py b/packages/service-library/tests/redis/test_semaphore.py index 14042589b5a0..f8cc2c64e122 100644 --- a/packages/service-library/tests/redis/test_semaphore.py +++ b/packages/service-library/tests/redis/test_semaphore.py @@ -290,43 +290,6 @@ async def _raising_context(): assert await captured_semaphore.get_current_count() == 0 -async def test_semaphore_ttl_cleanup( - redis_client_sdk: RedisClientSDK, - semaphore_name: str, - semaphore_capacity: int, - short_ttl: datetime.timedelta, -): - # Create semaphore with explicit short TTL - semaphore = DistributedSemaphore( - redis_client=redis_client_sdk, - key=semaphore_name, - capacity=semaphore_capacity, - ttl=short_ttl, - ) - - # Manually add an expired entry - expired_instance_id = "expired-instance" - current_time = asyncio.get_event_loop().time() - # Make sure it's definitely expired by using the short TTL - expired_time = current_time - short_ttl.total_seconds() - 1 - - await redis_client_sdk.redis.zadd( - semaphore.semaphore_key, {expired_instance_id: expired_time} - ) - - # Verify the entry was added - initial_count = await redis_client_sdk.redis.zcard(semaphore.semaphore_key) - assert initial_count == 1 - - # Current count should clean up expired entries - count = await semaphore.get_current_count() - assert count == 0 - - # Verify expired entry was removed - remaining = await redis_client_sdk.redis.zcard(semaphore.semaphore_key) - assert remaining == 0 - - async def test_multiple_semaphores_different_keys( redis_client_sdk: RedisClientSDK, faker: Faker, From 550f003a034694f3e661fc27c68e615fd71ca7b8 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 22 Sep 2025 08:54:58 +0200 Subject: [PATCH 04/68] ongoing --- packages/service-library/src/servicelib/redis/_semaphore.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/packages/service-library/src/servicelib/redis/_semaphore.py b/packages/service-library/src/servicelib/redis/_semaphore.py index e23a79745bbd..f7f2bd27403f 100644 --- a/packages/service-library/src/servicelib/redis/_semaphore.py +++ b/packages/service-library/src/servicelib/redis/_semaphore.py @@ -189,12 +189,10 @@ async def acquire(self) -> bool: """ ttl_seconds = int(self.ttl.total_seconds()) - blocking_timeout_seconds = 1 + blocking_timeout_seconds = 0.001 if self.blocking: blocking_timeout_seconds = ( - int(self.blocking_timeout.total_seconds()) - if self.blocking_timeout - else 60 + self.blocking_timeout.total_seconds() if self.blocking_timeout else 0 ) # Execute the Lua scripts atomically From f71135357a82d8f947b1a48158b42e2b3bbbe82b Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 22 Sep 2025 14:07:58 +0200 Subject: [PATCH 05/68] fixing code --- .../src/servicelib/redis/_semaphore.py | 41 +++++++++++-------- .../tests/redis/test_semaphore.py | 10 ++++- 2 files changed, 32 insertions(+), 19 deletions(-) diff --git a/packages/service-library/src/servicelib/redis/_semaphore.py b/packages/service-library/src/servicelib/redis/_semaphore.py index f7f2bd27403f..d625cf9af589 100644 --- a/packages/service-library/src/servicelib/redis/_semaphore.py +++ b/packages/service-library/src/servicelib/redis/_semaphore.py @@ -19,7 +19,6 @@ from ._constants import ( DEFAULT_SEMAPHORE_TTL, DEFAULT_SOCKET_TIMEOUT, - SEMAPHORE_HOLDER_KEY_PREFIX, SEMAPHORE_KEY_PREFIX, ) from ._errors import ( @@ -150,13 +149,13 @@ def holders_key(self) -> str: @property def holder_key(self) -> str: """Redis key for this instance's holder entry.""" - return f"{SEMAPHORE_HOLDER_KEY_PREFIX}{self.key}:{self.instance_id}" + return f"{SEMAPHORE_KEY_PREFIX}{self.key}:holders_:{self.instance_id}" @computed_field @property def holder_prefix(self) -> str: """Prefix for holder keys (used in cleanup).""" - return f"{SEMAPHORE_HOLDER_KEY_PREFIX}{self.key}:" + return f"{SEMAPHORE_KEY_PREFIX}{self.key}:holders_:" # Additional validation @field_validator("ttl") @@ -188,6 +187,14 @@ async def acquire(self) -> bool: SemaphoreAcquisitionError: If acquisition fails and blocking=True """ + if await self.is_acquired(): + _logger.debug( + "Semaphore '%s' already acquired by this instance (instance: %s)", + self.key, + self.instance_id, + ) + return True + ttl_seconds = int(self.ttl.total_seconds()) blocking_timeout_seconds = 0.001 if self.blocking: @@ -363,26 +370,26 @@ async def reacquire(self) -> None: raise SemaphoreLostError(name=self.key, instance_id=self.instance_id) + async def is_acquired(self) -> bool: + """Check if the semaphore is currently acquired by this instance.""" + return ( + await handle_redis_returns_union_types( + self.redis_client.redis.exists(self.holder_key) + ) + == 1 + ) + async def get_current_count(self) -> int: """Get the current number of semaphore holders""" - - cls = type(self) - assert cls.count_script is not None # nosec - result = await cls.count_script( # pylint: disable=not-callable - keys=[self.holders_key, self.tokens_key], - args=[self.capacity], - client=self.redis_client.redis, + return await handle_redis_returns_union_types( + self.redis_client.redis.scard(self.holders_key) ) - assert isinstance(result, list) # nosec - current_holders, available_tokens, capacity = result - - return int(current_holders) - async def get_available_count(self) -> int: """Get the number of available semaphore slots""" - current_count = await self.get_current_count() - return max(0, self.capacity - current_count) + return await handle_redis_returns_union_types( + self.redis_client.redis.llen(self.tokens_key) + ) # Context manager support async def __aenter__(self) -> "DistributedSemaphore": diff --git a/packages/service-library/tests/redis/test_semaphore.py b/packages/service-library/tests/redis/test_semaphore.py index f8cc2c64e122..6ea0aa4926b9 100644 --- a/packages/service-library/tests/redis/test_semaphore.py +++ b/packages/service-library/tests/redis/test_semaphore.py @@ -102,7 +102,10 @@ async def test_semaphore_acquire_release_single( semaphore_capacity: int, ): semaphore = DistributedSemaphore( - redis_client=redis_client_sdk, key=semaphore_name, capacity=semaphore_capacity + redis_client=redis_client_sdk, + key=semaphore_name, + capacity=semaphore_capacity, + ttl=datetime.timedelta(seconds=60), ) # Initially not acquired @@ -216,7 +219,10 @@ async def test_semaphore_blocking_timeout( # First semaphore acquires async with DistributedSemaphore( - redis_client=redis_client_sdk, key=semaphore_name, capacity=capacity + redis_client=redis_client_sdk, + key=semaphore_name, + capacity=capacity, + ttl=datetime.timedelta(seconds=60), ): # Second semaphore should timeout semaphore2 = DistributedSemaphore( From 4feca5b4f60bb8a94193b3df45f4a5767dbf007b Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 22 Sep 2025 14:14:29 +0200 Subject: [PATCH 06/68] remove holder key --- .../src/servicelib/redis/_constants.py | 1 - .../src/servicelib/redis/_semaphore.py | 22 ++++++++++++------- .../tests/redis/test_semaphore.py | 7 ++++-- 3 files changed, 19 insertions(+), 11 deletions(-) diff --git a/packages/service-library/src/servicelib/redis/_constants.py b/packages/service-library/src/servicelib/redis/_constants.py index e34befa1536b..456d86b4d2a6 100644 --- a/packages/service-library/src/servicelib/redis/_constants.py +++ b/packages/service-library/src/servicelib/redis/_constants.py @@ -11,7 +11,6 @@ DEFAULT_SEMAPHORE_TTL: Final[datetime.timedelta] = datetime.timedelta(seconds=10) SEMAPHORE_KEY_PREFIX: Final[str] = "semaphores:" -SEMAPHORE_HOLDER_KEY_PREFIX: Final[str] = "semaphores:holders:" DEFAULT_DECODE_RESPONSES: Final[bool] = True DEFAULT_HEALTH_CHECK_INTERVAL: Final[datetime.timedelta] = datetime.timedelta(seconds=5) diff --git a/packages/service-library/src/servicelib/redis/_semaphore.py b/packages/service-library/src/servicelib/redis/_semaphore.py index d625cf9af589..c130fd199e7c 100644 --- a/packages/service-library/src/servicelib/redis/_semaphore.py +++ b/packages/service-library/src/servicelib/redis/_semaphore.py @@ -176,6 +176,17 @@ def validate_timeout( raise ValueError(msg) return v + async def _initialize_semaphore(self) -> None: + """Initializes the semaphore in Redis if not already done.""" + ttl_seconds = int(self.ttl.total_seconds()) + cls = type(self) + assert cls.register_semaphore is not None # nosec + await cls.register_semaphore( # pylint: disable=not-callable + keys=[self.tokens_key, self.holders_key], + args=[self.capacity, ttl_seconds], + client=self.redis_client.redis, + ) + async def acquire(self) -> bool: """ Acquire the semaphore. @@ -202,14 +213,7 @@ async def acquire(self) -> bool: self.blocking_timeout.total_seconds() if self.blocking_timeout else 0 ) - # Execute the Lua scripts atomically - cls = type(self) - assert cls.register_semaphore is not None # nosec - await cls.register_semaphore( # pylint: disable=not-callable - keys=[self.tokens_key, self.holders_key], - args=[self.capacity, ttl_seconds], - client=self.redis_client.redis, - ) + await self._initialize_semaphore() try: # this is blocking pop with timeout @@ -234,6 +238,7 @@ async def acquire(self) -> bool: assert tokens_key_token[0] == self.tokens_key # nosec token = tokens_key_token[1] + cls = type(self) assert cls.acquire_script is not None # nosec result = await cls.acquire_script( # pylint: disable=not-callable keys=[self.holders_key, self.holder_key], @@ -387,6 +392,7 @@ async def get_current_count(self) -> int: async def get_available_count(self) -> int: """Get the number of available semaphore slots""" + await self._initialize_semaphore() return await handle_redis_returns_union_types( self.redis_client.redis.llen(self.tokens_key) ) diff --git a/packages/service-library/tests/redis/test_semaphore.py b/packages/service-library/tests/redis/test_semaphore.py index 6ea0aa4926b9..3746d06ad755 100644 --- a/packages/service-library/tests/redis/test_semaphore.py +++ b/packages/service-library/tests/redis/test_semaphore.py @@ -14,7 +14,6 @@ from servicelib.redis import RedisClientSDK from servicelib.redis._constants import ( DEFAULT_SEMAPHORE_TTL, - SEMAPHORE_HOLDER_KEY_PREFIX, SEMAPHORE_KEY_PREFIX, ) from servicelib.redis._errors import SemaphoreLostError @@ -60,7 +59,7 @@ async def test_semaphore_initialization( assert semaphore.instance_id is not None assert semaphore.semaphore_key == f"{SEMAPHORE_KEY_PREFIX}{semaphore_name}" assert semaphore.holder_key.startswith( - f"{SEMAPHORE_HOLDER_KEY_PREFIX}{semaphore_name}:" + f"{SEMAPHORE_KEY_PREFIX}{semaphore_name}:holders_:" ) @@ -109,6 +108,9 @@ async def test_semaphore_acquire_release_single( ) # Initially not acquired + assert await semaphore.get_current_count() == 0 + assert await semaphore.get_available_count() == semaphore_capacity + assert await semaphore.is_acquired() is False # Acquire successfully result = await semaphore.acquire() @@ -117,6 +119,7 @@ async def test_semaphore_acquire_release_single( # Check Redis state assert await semaphore.get_current_count() == 1 assert await semaphore.get_available_count() == semaphore_capacity - 1 + assert await semaphore.is_acquired() is True # Acquire again on same instance should return True immediately and keep the same count (reentrant) result = await semaphore.acquire() From 5698788f894dcd3fe933b7dc22a27e619b0e1ec9 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 22 Sep 2025 14:15:41 +0200 Subject: [PATCH 07/68] revert --- packages/service-library/tests/redis/test_semaphore.py | 1 - 1 file changed, 1 deletion(-) diff --git a/packages/service-library/tests/redis/test_semaphore.py b/packages/service-library/tests/redis/test_semaphore.py index 3746d06ad755..070927330b0c 100644 --- a/packages/service-library/tests/redis/test_semaphore.py +++ b/packages/service-library/tests/redis/test_semaphore.py @@ -104,7 +104,6 @@ async def test_semaphore_acquire_release_single( redis_client=redis_client_sdk, key=semaphore_name, capacity=semaphore_capacity, - ttl=datetime.timedelta(seconds=60), ) # Initially not acquired From 409a173321c12606cc7d109b526a4cc4cbfe5cd0 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 22 Sep 2025 15:15:28 +0200 Subject: [PATCH 08/68] fixes handling --- .../src/servicelib/redis/_semaphore.py | 43 ++++++++++++++++--- .../tests/redis/test_semaphore.py | 1 - 2 files changed, 38 insertions(+), 6 deletions(-) diff --git a/packages/service-library/src/servicelib/redis/_semaphore.py b/packages/service-library/src/servicelib/redis/_semaphore.py index c130fd199e7c..70abd6d3ec29 100644 --- a/packages/service-library/src/servicelib/redis/_semaphore.py +++ b/packages/service-library/src/servicelib/redis/_semaphore.py @@ -14,6 +14,12 @@ field_validator, ) from redis.commands.core import AsyncScript +from tenacity import ( + retry, + retry_if_exception_type, + stop_after_delay, + wait_random_exponential, +) from ._client import RedisClientSDK from ._constants import ( @@ -216,13 +222,29 @@ async def acquire(self) -> bool: await self._initialize_semaphore() try: - # this is blocking pop with timeout - tokens_key_token: list[str] = await handle_redis_returns_union_types( - self.redis_client.redis.brpop( - [self.tokens_key], timeout=blocking_timeout_seconds - ) + + @retry( + stop=stop_after_delay(blocking_timeout_seconds), + wait=wait_random_exponential(min=0.1), + retry=retry_if_exception_type(redis.exceptions.TimeoutError), + reraise=True, ) + async def _try_acquire() -> list[str] | None: + # NOTE: brpop returns None on timeout + # NOTE: the timeout here is for the socket, not for the semaphore itself + tokens_key_token: list[str] | None = ( + await handle_redis_returns_union_types( + self.redis_client.redis.brpop( + [self.tokens_key], + timeout=blocking_timeout_seconds, + ) + ) + ) + return tokens_key_token + + tokens_key_token = await _try_acquire() except redis.exceptions.TimeoutError as e: + # when this triggers it is the socket timeout of the client _logger.debug( "Timeout acquiring semaphore '%s' (instance: %s)", self.key, @@ -234,6 +256,17 @@ async def acquire(self) -> bool: ) from e return False + if tokens_key_token is None: + # when this triggers it is the blocking timeout of the brpop call + _logger.debug( + "Timeout acquiring semaphore '%s' (instance: %s)", + self.key, + self.instance_id, + ) + if self.blocking: + raise SemaphoreAcquisitionError(name=self.key, capacity=self.capacity) + return False + assert len(tokens_key_token) == 2 # nosec assert tokens_key_token[0] == self.tokens_key # nosec token = tokens_key_token[1] diff --git a/packages/service-library/tests/redis/test_semaphore.py b/packages/service-library/tests/redis/test_semaphore.py index 070927330b0c..eeae5e34acc9 100644 --- a/packages/service-library/tests/redis/test_semaphore.py +++ b/packages/service-library/tests/redis/test_semaphore.py @@ -224,7 +224,6 @@ async def test_semaphore_blocking_timeout( redis_client=redis_client_sdk, key=semaphore_name, capacity=capacity, - ttl=datetime.timedelta(seconds=60), ): # Second semaphore should timeout semaphore2 = DistributedSemaphore( From 82a658aa64b3be4b6dac4f5e59c8c09566d44d51 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 22 Sep 2025 15:26:50 +0200 Subject: [PATCH 09/68] adjusting --- .../src/servicelib/redis/_semaphore.py | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/packages/service-library/src/servicelib/redis/_semaphore.py b/packages/service-library/src/servicelib/redis/_semaphore.py index 70abd6d3ec29..3aedf7dd06f9 100644 --- a/packages/service-library/src/servicelib/redis/_semaphore.py +++ b/packages/service-library/src/servicelib/redis/_semaphore.py @@ -15,6 +15,7 @@ ) from redis.commands.core import AsyncScript from tenacity import ( + RetryError, retry, retry_if_exception_type, stop_after_delay, @@ -182,7 +183,7 @@ def validate_timeout( raise ValueError(msg) return v - async def _initialize_semaphore(self) -> None: + async def _ensure_semaphore_initialized(self) -> None: """Initializes the semaphore in Redis if not already done.""" ttl_seconds = int(self.ttl.total_seconds()) cls = type(self) @@ -203,6 +204,7 @@ async def acquire(self) -> bool: Raises: SemaphoreAcquisitionError: If acquisition fails and blocking=True """ + await self._ensure_semaphore_initialized() if await self.is_acquired(): _logger.debug( @@ -219,19 +221,19 @@ async def acquire(self) -> bool: self.blocking_timeout.total_seconds() if self.blocking_timeout else 0 ) - await self._initialize_semaphore() - try: @retry( - stop=stop_after_delay(blocking_timeout_seconds), + stop=stop_after_delay( # this is the time after the first attempt + blocking_timeout_seconds + ), wait=wait_random_exponential(min=0.1), retry=retry_if_exception_type(redis.exceptions.TimeoutError), - reraise=True, ) async def _try_acquire() -> list[str] | None: # NOTE: brpop returns None on timeout # NOTE: the timeout here is for the socket, not for the semaphore itself + tokens_key_token: list[str] | None = ( await handle_redis_returns_union_types( self.redis_client.redis.brpop( @@ -243,8 +245,10 @@ async def _try_acquire() -> list[str] | None: return tokens_key_token tokens_key_token = await _try_acquire() - except redis.exceptions.TimeoutError as e: - # when this triggers it is the socket timeout of the client + except RetryError as e: + # NOTE: this can happen with either the blocking timeout or the socket timeout + # but the blocking timeout is anyway hit since tenacity did not retry more + # therefore we can safely assume we could not acquire the semaphore in time _logger.debug( "Timeout acquiring semaphore '%s' (instance: %s)", self.key, @@ -257,6 +261,7 @@ async def _try_acquire() -> list[str] | None: return False if tokens_key_token is None: + # NOTE: when BRPOP returns None it means it timed out properly # when this triggers it is the blocking timeout of the brpop call _logger.debug( "Timeout acquiring semaphore '%s' (instance: %s)", @@ -425,7 +430,7 @@ async def get_current_count(self) -> int: async def get_available_count(self) -> int: """Get the number of available semaphore slots""" - await self._initialize_semaphore() + await self._ensure_semaphore_initialized() return await handle_redis_returns_union_types( self.redis_client.redis.llen(self.tokens_key) ) From 4377bc3c8b43a95eb9273da570e789b7cb8f4c6e Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 22 Sep 2025 16:43:22 +0200 Subject: [PATCH 10/68] handle blocking timeouts --- .../src/servicelib/redis/_semaphore.py | 34 ++++++------------- 1 file changed, 10 insertions(+), 24 deletions(-) diff --git a/packages/service-library/src/servicelib/redis/_semaphore.py b/packages/service-library/src/servicelib/redis/_semaphore.py index 3aedf7dd06f9..5c446c36ee19 100644 --- a/packages/service-library/src/servicelib/redis/_semaphore.py +++ b/packages/service-library/src/servicelib/redis/_semaphore.py @@ -215,30 +215,29 @@ async def acquire(self) -> bool: return True ttl_seconds = int(self.ttl.total_seconds()) - blocking_timeout_seconds = 0.001 - if self.blocking: - blocking_timeout_seconds = ( - self.blocking_timeout.total_seconds() if self.blocking_timeout else 0 - ) try: @retry( stop=stop_after_delay( # this is the time after the first attempt - blocking_timeout_seconds + (self.blocking_timeout or 0) if self.blocking else 0 ), wait=wait_random_exponential(min=0.1), retry=retry_if_exception_type(redis.exceptions.TimeoutError), ) async def _try_acquire() -> list[str] | None: # NOTE: brpop returns None on timeout - # NOTE: the timeout here is for the socket, not for the semaphore itself + # NOTE: redis-py library timeouts when the socket times out which is defined + # elsewhere on the client (e.g. DEFAULT_SOCKET_TIMEOUT) + # we always block forever since tenacity takes care of timing out + # therefore we can distinguish between a proper timeout (returns None) and a socket + # timeout (raises an exception) tokens_key_token: list[str] | None = ( await handle_redis_returns_union_types( self.redis_client.redis.brpop( [self.tokens_key], - timeout=blocking_timeout_seconds, + timeout=None, # NOTE: we always block forever since tenacity takes care of timing out ) ) ) @@ -246,9 +245,7 @@ async def _try_acquire() -> list[str] | None: tokens_key_token = await _try_acquire() except RetryError as e: - # NOTE: this can happen with either the blocking timeout or the socket timeout - # but the blocking timeout is anyway hit since tenacity did not retry more - # therefore we can safely assume we could not acquire the semaphore in time + # NOTE: if we end up here that means we could not acquire the semaphore _logger.debug( "Timeout acquiring semaphore '%s' (instance: %s)", self.key, @@ -260,19 +257,8 @@ async def _try_acquire() -> list[str] | None: ) from e return False - if tokens_key_token is None: - # NOTE: when BRPOP returns None it means it timed out properly - # when this triggers it is the blocking timeout of the brpop call - _logger.debug( - "Timeout acquiring semaphore '%s' (instance: %s)", - self.key, - self.instance_id, - ) - if self.blocking: - raise SemaphoreAcquisitionError(name=self.key, capacity=self.capacity) - return False - - assert len(tokens_key_token) == 2 # nosec + assert tokens_key_token is not None # nosec + assert len(tokens_key_token) == 2 # nosec # noqa: PLR2004 assert tokens_key_token[0] == self.tokens_key # nosec token = tokens_key_token[1] From 2ec306c8ed450822b0e1da1b37d784391d888105 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 22 Sep 2025 16:44:27 +0200 Subject: [PATCH 11/68] remove unused code --- packages/service-library/src/servicelib/redis/_semaphore.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/packages/service-library/src/servicelib/redis/_semaphore.py b/packages/service-library/src/servicelib/redis/_semaphore.py index 5c446c36ee19..0780fdeccfff 100644 --- a/packages/service-library/src/servicelib/redis/_semaphore.py +++ b/packages/service-library/src/servicelib/redis/_semaphore.py @@ -158,12 +158,6 @@ def holder_key(self) -> str: """Redis key for this instance's holder entry.""" return f"{SEMAPHORE_KEY_PREFIX}{self.key}:holders_:{self.instance_id}" - @computed_field - @property - def holder_prefix(self) -> str: - """Prefix for holder keys (used in cleanup).""" - return f"{SEMAPHORE_KEY_PREFIX}{self.key}:holders_:" - # Additional validation @field_validator("ttl") @classmethod From 0d6979151bf9a2d48db9570dca93b2ff3ca258f4 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 22 Sep 2025 16:49:00 +0200 Subject: [PATCH 12/68] clean --- .../src/servicelib/redis/_semaphore.py | 35 +++---------- .../src/servicelib/redis/_semaphore_lua.py | 7 --- .../redis/lua/acquire_fair_semaphore_v2.lua | 3 +- .../redis/lua/acquire_semaphore.lua | 40 --------------- .../redis/lua/count_fair_semaphore_v2.lua | 17 ------- .../servicelib/redis/lua/count_semaphore.lua | 23 --------- .../redis/lua/release_semaphore.lua | 46 ----------------- .../servicelib/redis/lua/renew_semaphore.lua | 49 ------------------- 8 files changed, 7 insertions(+), 213 deletions(-) delete mode 100644 packages/service-library/src/servicelib/redis/lua/acquire_semaphore.lua delete mode 100644 packages/service-library/src/servicelib/redis/lua/count_fair_semaphore_v2.lua delete mode 100644 packages/service-library/src/servicelib/redis/lua/count_semaphore.lua delete mode 100644 packages/service-library/src/servicelib/redis/lua/release_semaphore.lua delete mode 100644 packages/service-library/src/servicelib/redis/lua/renew_semaphore.lua diff --git a/packages/service-library/src/servicelib/redis/_semaphore.py b/packages/service-library/src/servicelib/redis/_semaphore.py index 0780fdeccfff..6fe2a4c7f09e 100644 --- a/packages/service-library/src/servicelib/redis/_semaphore.py +++ b/packages/service-library/src/servicelib/redis/_semaphore.py @@ -272,40 +272,17 @@ async def _try_acquire() -> list[str] | None: assert isinstance(result, list) # nosec exit_code, status, token, current_count = result - if exit_code == SCRIPT_OK_EXIT_CODE: - _logger.debug( - "Acquired semaphore '%s' with token %s (instance: %s, count: %s)", - self.key, - token, - self.instance_id, - current_count, - ) - return True - - if status == "timeout": - if self.blocking: - _logger.debug( - "Timeout acquiring semaphore '%s' (instance: %s, count: %s)", - self.key, - self.instance_id, - current_count, - ) - raise SemaphoreAcquisitionError(name=self.key, capacity=self.capacity) - _logger.debug( - "Timeout acquiring semaphore '%s' (instance: %s, count: %s)", - self.key, - self.instance_id, - current_count, - ) - return False + assert exit_code == SCRIPT_OK_EXIT_CODE # nosec + assert status == "acquired" # nosec _logger.debug( - "Failed to acquire semaphore '%s' - %s (count: %s)", + "Acquired semaphore '%s' with token %s (instance: %s, count: %s)", self.key, - status, + token, + self.instance_id, current_count, ) - raise SemaphoreAcquisitionError(name=self.key, capacity=self.capacity) + return True async def release(self) -> None: """ diff --git a/packages/service-library/src/servicelib/redis/_semaphore_lua.py b/packages/service-library/src/servicelib/redis/_semaphore_lua.py index 6d95006dc965..1ce94c486f90 100644 --- a/packages/service-library/src/servicelib/redis/_semaphore_lua.py +++ b/packages/service-library/src/servicelib/redis/_semaphore_lua.py @@ -26,19 +26,12 @@ def _load_script(script_name: str) -> str: return script_file.read_text(encoding="utf-8").strip() -# TODO: old -ACQUIRE_SEMAPHORE_SCRIPT: Final[str] = _load_script("acquire_semaphore") -RELEASE_SEMAPHORE_SCRIPT: Final[str] = _load_script("release_semaphore") -RENEW_SEMAPHORE_SCRIPT: Final[str] = _load_script("renew_semaphore") -COUNT_SEMAPHORE_SCRIPT: Final[str] = _load_script("count_semaphore") - # fair semaphore scripts (token pool based) REGISTER_FAIR_SEMAPHORE_SCRIPT: Final[str] = _load_script("register_semaphore_holder") ACQUIRE_FAIR_SEMAPHORE_V2_SCRIPT: Final[str] = _load_script("acquire_fair_semaphore_v2") RELEASE_FAIR_SEMAPHORE_V2_SCRIPT: Final[str] = _load_script("release_fair_semaphore_v2") CLEANUP_FAIR_SEMAPHORE_V2_SCRIPT: Final[str] = _load_script("cleanup_fair_semaphore_v2") RENEW_FAIR_SEMAPHORE_V2_SCRIPT: Final[str] = _load_script("renew_fair_semaphore_v2") -COUNT_FAIR_SEMAPHORE_V2_SCRIPT: Final[str] = _load_script("count_fair_semaphore_v2") SCRIPT_OK_EXIT_CODE: Final[int] = 0 diff --git a/packages/service-library/src/servicelib/redis/lua/acquire_fair_semaphore_v2.lua b/packages/service-library/src/servicelib/redis/lua/acquire_fair_semaphore_v2.lua index dd4f7ba3e192..fc30065d3d2b 100644 --- a/packages/service-library/src/servicelib/redis/lua/acquire_fair_semaphore_v2.lua +++ b/packages/service-library/src/servicelib/redis/lua/acquire_fair_semaphore_v2.lua @@ -1,11 +1,10 @@ -- Fair distributed semaphore using token pool (BRPOP-based) -- KEYS[1]: tokens_key (LIST of available tokens) -- KEYS[2]: holders_key (SET of current holder instance IDs) --- KEYS[3]: holder_key (individual holder TTL key for this instance) + -- ARGV[1]: instance_id -- ARGV[2]: capacity (max concurrent holders) -- ARGV[3]: ttl_seconds --- ARGV[4]: timeout_seconds (for BRPOP) -- -- Returns: {exit_code, status, token, current_count} -- exit_code: 0 if acquired, 255 if timeout/failed diff --git a/packages/service-library/src/servicelib/redis/lua/acquire_semaphore.lua b/packages/service-library/src/servicelib/redis/lua/acquire_semaphore.lua deleted file mode 100644 index b73608677909..000000000000 --- a/packages/service-library/src/servicelib/redis/lua/acquire_semaphore.lua +++ /dev/null @@ -1,40 +0,0 @@ --- Atomically acquire a distributed semaphore --- KEYS[1]: semaphore_key (ZSET storing holders with timestamps) --- KEYS[2]: holder_key (individual holder TTL key) --- ARGV[1]: instance_id --- ARGV[2]: capacity (max concurrent holders) --- ARGV[3]: ttl_seconds --- --- Returns: {exit_code, status, current_count, expired_count} --- exit_code: 0 if acquired, 255 if failed --- status: 'acquired' or 'capacity_full' --- current_count: number of holders after operation --- expired_count: number of expired entries cleaned up - -local semaphore_key = KEYS[1] -local holder_key = KEYS[2] -local instance_id = ARGV[1] -local capacity = tonumber(ARGV[2]) -local ttl_seconds = tonumber(ARGV[3]) - --- Get current Redis server time -local time_result = redis.call('TIME') -local current_time = tonumber(time_result[1]) + (tonumber(time_result[2]) / 1000000) - --- Step 1: Clean up expired entries -local expiry_threshold = current_time - ttl_seconds -local expired_count = redis.call('ZREMRANGEBYSCORE', semaphore_key, '-inf', expiry_threshold) - --- Step 2: Check current capacity after cleanup -local current_count = redis.call('ZCARD', semaphore_key) - --- Step 3: Try to acquire if under capacity -if current_count < capacity then - -- Atomically add to semaphore and set holder key - redis.call('ZADD', semaphore_key, current_time, instance_id) - redis.call('SETEX', holder_key, ttl_seconds, '1') - - return {0, 'acquired', current_count + 1, expired_count} -else - return {255, 'capacity_full', current_count, expired_count} -end diff --git a/packages/service-library/src/servicelib/redis/lua/count_fair_semaphore_v2.lua b/packages/service-library/src/servicelib/redis/lua/count_fair_semaphore_v2.lua deleted file mode 100644 index fb04f4e4356d..000000000000 --- a/packages/service-library/src/servicelib/redis/lua/count_fair_semaphore_v2.lua +++ /dev/null @@ -1,17 +0,0 @@ --- Count current semaphore holders (simplified for token pool design) --- KEYS[1]: holders_key (SET of current holders) --- KEYS[2]: tokens_key (LIST of available tokens) --- ARGV[1]: capacity (total semaphore capacity) --- --- Returns: {current_holders, available_tokens, total_capacity} - -local holders_key = KEYS[1] -local tokens_key = KEYS[2] - -local capacity = tonumber(ARGV[1]) - --- Count current holders and available tokens -local current_holders = redis.call('SCARD', holders_key) -local available_tokens = redis.call('LLEN', tokens_key) - -return {current_holders, available_tokens, capacity} diff --git a/packages/service-library/src/servicelib/redis/lua/count_semaphore.lua b/packages/service-library/src/servicelib/redis/lua/count_semaphore.lua deleted file mode 100644 index a7c023bae89a..000000000000 --- a/packages/service-library/src/servicelib/redis/lua/count_semaphore.lua +++ /dev/null @@ -1,23 +0,0 @@ --- Atomically count current semaphore holders (with cleanup) --- KEYS[1]: semaphore_key (ZSET storing holders with timestamps) --- ARGV[1]: ttl_seconds --- --- Returns: {current_count, expired_count} --- current_count: number of active holders after cleanup --- expired_count: number of expired entries cleaned up - -local semaphore_key = KEYS[1] -local ttl_seconds = tonumber(ARGV[1]) - --- Get current Redis server time -local time_result = redis.call('TIME') -local current_time = tonumber(time_result[1]) + (tonumber(time_result[2]) / 1000000) - --- Step 1: Clean up expired entries -local expiry_threshold = current_time - ttl_seconds -local expired_count = redis.call('ZREMRANGEBYSCORE', semaphore_key, '-inf', expiry_threshold) - --- Step 2: Count remaining entries -local current_count = redis.call('ZCARD', semaphore_key) - -return {current_count, expired_count} diff --git a/packages/service-library/src/servicelib/redis/lua/release_semaphore.lua b/packages/service-library/src/servicelib/redis/lua/release_semaphore.lua deleted file mode 100644 index a1411060a99a..000000000000 --- a/packages/service-library/src/servicelib/redis/lua/release_semaphore.lua +++ /dev/null @@ -1,46 +0,0 @@ --- Atomically release a distributed semaphore --- KEYS[1]: semaphore_key (ZSET storing holders with timestamps) --- KEYS[2]: holder_key (individual holder TTL key) --- ARGV[1]: instance_id --- ARGV[2]: ttl_seconds --- --- Returns: {success, status, current_count, expired_count} --- exit_code: 0 if released, 255 if failed --- status: 'released', 'not_held', or 'already_expired' --- current_count: number of holders after operation --- expired_count: number of expired entries cleaned up - -local semaphore_key = KEYS[1] -local holder_key = KEYS[2] -local instance_id = ARGV[1] -local ttl_seconds = tonumber(ARGV[2]) - --- Get current Redis server time -local time_result = redis.call('TIME') -local current_time = tonumber(time_result[1]) + (tonumber(time_result[2]) / 1000000) - --- Step 1: Clean up expired entries -local expiry_threshold = current_time - ttl_seconds -local expired_count = redis.call('ZREMRANGEBYSCORE', semaphore_key, '-inf', expiry_threshold) - --- Step 2: Check if this instance currently holds the semaphore -local score = redis.call('ZSCORE', semaphore_key, instance_id) - -if score == false then - -- Instance doesn't hold the semaphore - local current_count = redis.call('ZCARD', semaphore_key) - return {255, 'not_held', current_count, expired_count} -end - --- Step 3: Remove the semaphore entry and holder key -local removed_from_zset = redis.call('ZREM', semaphore_key, instance_id) -local removed_holder = redis.call('DEL', holder_key) - -local current_count = redis.call('ZCARD', semaphore_key) - -if removed_from_zset == 1 then - return {0, 'released', current_count, expired_count} -else - -- This shouldn't happen since we checked ZSCORE above, but handle it - return {255, 'already_expired', current_count, expired_count} -end diff --git a/packages/service-library/src/servicelib/redis/lua/renew_semaphore.lua b/packages/service-library/src/servicelib/redis/lua/renew_semaphore.lua deleted file mode 100644 index 231d9d421ea0..000000000000 --- a/packages/service-library/src/servicelib/redis/lua/renew_semaphore.lua +++ /dev/null @@ -1,49 +0,0 @@ --- Atomically renew a distributed semaphore holder's TTL --- KEYS[1]: semaphore_key (ZSET storing holders with timestamps) --- KEYS[2]: holder_key (individual holder TTL key) --- ARGV[1]: instance_id --- ARGV[2]: ttl_seconds --- --- Returns: {success, status, current_count, expired_count} --- exit_code: 0 if renewed, 255 if failed --- status: 'renewed', 'not_held', or 'expired' --- current_count: number of holders after operation --- expired_count: number of expired entries cleaned up - -local semaphore_key = KEYS[1] -local holder_key = KEYS[2] -local instance_id = ARGV[1] -local ttl_seconds = tonumber(ARGV[2]) - --- Get current Redis server time -local time_result = redis.call('TIME') -local current_time = tonumber(time_result[1]) + (tonumber(time_result[2]) / 1000000) - --- Step 1: Clean up expired entries -local expiry_threshold = current_time - ttl_seconds -local expired_count = redis.call('ZREMRANGEBYSCORE', semaphore_key, '-inf', expiry_threshold) - --- Step 2: Check if this instance currently holds the semaphore -local score = redis.call('ZSCORE', semaphore_key, instance_id) - -if score == false then - -- Instance doesn't hold the semaphore - local current_count = redis.call('ZCARD', semaphore_key) - return {255, 'not_held', current_count, expired_count} -end - --- Step 3: Check if the holder key still exists (not expired) -local exists = redis.call('EXISTS', holder_key) -if exists == 0 then - -- Holder key expired, remove from semaphore and fail renewal - redis.call('ZREM', semaphore_key, instance_id) - local current_count = redis.call('ZCARD', semaphore_key) - return {255, 'expired', current_count, expired_count + 1} -end - --- Step 4: Renew both the semaphore entry and holder key -redis.call('ZADD', semaphore_key, current_time, instance_id) -redis.call('SETEX', holder_key, ttl_seconds, '1') - -local current_count = redis.call('ZCARD', semaphore_key) -return {0, 'renewed', current_count, expired_count} From 0a8617455a4d41db9b0beeae4a985e292c874400 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 22 Sep 2025 16:49:46 +0200 Subject: [PATCH 13/68] clean --- packages/service-library/src/servicelib/redis/_semaphore.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/packages/service-library/src/servicelib/redis/_semaphore.py b/packages/service-library/src/servicelib/redis/_semaphore.py index 6fe2a4c7f09e..ebdab89c4786 100644 --- a/packages/service-library/src/servicelib/redis/_semaphore.py +++ b/packages/service-library/src/servicelib/redis/_semaphore.py @@ -35,7 +35,6 @@ ) from ._semaphore_lua import ( ACQUIRE_FAIR_SEMAPHORE_V2_SCRIPT, - COUNT_FAIR_SEMAPHORE_V2_SCRIPT, REGISTER_FAIR_SEMAPHORE_SCRIPT, RELEASE_FAIR_SEMAPHORE_V2_SCRIPT, RENEW_FAIR_SEMAPHORE_V2_SCRIPT, @@ -103,7 +102,6 @@ class DistributedSemaphore(BaseModel): # Class and/or Private state attributes (not part of the model) register_semaphore: ClassVar[AsyncScript | None] = None acquire_script: ClassVar[AsyncScript | None] = None - count_script: ClassVar[AsyncScript | None] = None release_script: ClassVar[AsyncScript | None] = None renew_script: ClassVar[AsyncScript | None] = None @@ -120,9 +118,6 @@ def _register_scripts(cls, redis_client: RedisClientSDK) -> None: cls.acquire_script = redis_client.redis.register_script( ACQUIRE_FAIR_SEMAPHORE_V2_SCRIPT ) - cls.count_script = redis_client.redis.register_script( - COUNT_FAIR_SEMAPHORE_V2_SCRIPT - ) cls.release_script = redis_client.redis.register_script( RELEASE_FAIR_SEMAPHORE_V2_SCRIPT ) From 85d7d52bcef64d19ef8ff895367dd3681811836f Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 22 Sep 2025 17:00:54 +0200 Subject: [PATCH 14/68] ensure blocking semaphore works as expected --- .../src/servicelib/redis/_semaphore.py | 12 +++++++++--- .../service-library/tests/redis/test_semaphore.py | 11 +++++++++++ 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/packages/service-library/src/servicelib/redis/_semaphore.py b/packages/service-library/src/servicelib/redis/_semaphore.py index ebdab89c4786..5ed0d3c8c14d 100644 --- a/packages/service-library/src/servicelib/redis/_semaphore.py +++ b/packages/service-library/src/servicelib/redis/_semaphore.py @@ -19,6 +19,7 @@ retry, retry_if_exception_type, stop_after_delay, + stop_never, wait_random_exponential, ) @@ -206,11 +207,16 @@ async def acquire(self) -> bool: ttl_seconds = int(self.ttl.total_seconds()) try: + stop_condition = stop_after_delay(0) # non-blocking by default + if self.blocking: + stop_condition = ( + stop_after_delay(self.blocking_timeout) + if self.blocking_timeout + else stop_never + ) @retry( - stop=stop_after_delay( # this is the time after the first attempt - (self.blocking_timeout or 0) if self.blocking else 0 - ), + stop=stop_condition, wait=wait_random_exponential(min=0.1), retry=retry_if_exception_type(redis.exceptions.TimeoutError), ) diff --git a/packages/service-library/tests/redis/test_semaphore.py b/packages/service-library/tests/redis/test_semaphore.py index eeae5e34acc9..7d8fc43fa367 100644 --- a/packages/service-library/tests/redis/test_semaphore.py +++ b/packages/service-library/tests/redis/test_semaphore.py @@ -239,6 +239,17 @@ async def test_semaphore_blocking_timeout( ): await semaphore2.acquire() + # now try infinite timeout + semaphore3 = DistributedSemaphore( + redis_client=redis_client_sdk, + key=semaphore_name, + capacity=capacity, + blocking_timeout=None, # wait forever + ) + acquire_task = asyncio.create_task(semaphore3.acquire()) + await asyncio.sleep(5) # give some time to start acquiring + assert not acquire_task.done() + async def test_semaphore_blocking_acquire_waits( redis_client_sdk: RedisClientSDK, From 0a0fb4545cd1a2ab64818b285b395a1be2cef6e9 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 22 Sep 2025 17:05:33 +0200 Subject: [PATCH 15/68] maybe --- .../tests/redis/test_semaphore_decorator.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/packages/service-library/tests/redis/test_semaphore_decorator.py b/packages/service-library/tests/redis/test_semaphore_decorator.py index 21d8425caf43..f5d523a3805e 100644 --- a/packages/service-library/tests/redis/test_semaphore_decorator.py +++ b/packages/service-library/tests/redis/test_semaphore_decorator.py @@ -15,9 +15,7 @@ from pytest_mock import MockerFixture from pytest_simcore.helpers.logging_tools import log_context from servicelib.redis import RedisClientSDK -from servicelib.redis._constants import ( - SEMAPHORE_HOLDER_KEY_PREFIX, -) +from servicelib.redis._constants import SEMAPHORE_KEY_PREFIX from servicelib.redis._errors import SemaphoreLostError from servicelib.redis._semaphore import ( DistributedSemaphore, @@ -136,7 +134,7 @@ async def coro_that_should_fail() -> Literal["should not reach here"]: # Find and delete all holder keys for this semaphore holder_keys = await redis_client_sdk.redis.keys( - f"{SEMAPHORE_HOLDER_KEY_PREFIX}{semaphore_name}:*" + f"{SEMAPHORE_KEY_PREFIX}{semaphore_name}:holders_:*" ) assert holder_keys, "Holder keys should exist before deletion" await redis_client_sdk.redis.delete(*holder_keys) @@ -736,7 +734,7 @@ async def use_failing_cm() -> None: # Find and delete all holder keys for this semaphore holder_keys = await redis_client_sdk.redis.keys( - f"{SEMAPHORE_HOLDER_KEY_PREFIX}{semaphore_name}:*" + f"{SEMAPHORE_KEY_PREFIX}{semaphore_name}:holders_:*" ) assert holder_keys, "Holder keys should exist before deletion" await redis_client_sdk.redis.delete(*holder_keys) From 86ac7f5a4ca5857ff72f8fccee478459d4d7663d Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 22 Sep 2025 17:09:55 +0200 Subject: [PATCH 16/68] we are fair? --- .../tests/redis/test_semaphore_decorator.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/packages/service-library/tests/redis/test_semaphore_decorator.py b/packages/service-library/tests/redis/test_semaphore_decorator.py index f5d523a3805e..5481891b7f8a 100644 --- a/packages/service-library/tests/redis/test_semaphore_decorator.py +++ b/packages/service-library/tests/redis/test_semaphore_decorator.py @@ -422,8 +422,19 @@ async def limited_function() -> None: assert "longer than expected" in caplog.messages[-1] -@pytest.mark.skip +@pytest.fixture +def with_slow_redis_socket_timeout( + mock_redis_socket_timeout: None, mocker: MockerFixture +) -> None: + # put back to higher value to allow normal operations + mocker.patch( + "servicelib.redis._client.DEFAULT_SOCKET_TIMEOUT", + datetime.timedelta(seconds=30), + ) + + async def test_semaphore_fair_queuing( + with_slow_redis_socket_timeout: None, redis_client_sdk: RedisClientSDK, semaphore_name: str, ): @@ -436,7 +447,7 @@ async def test_semaphore_fair_queuing( ) async def limited_function(call_id: int): entered_order.append(call_id) - await asyncio.sleep(0.1) + await asyncio.sleep(0.2) return call_id # Launch tasks in a specific order @@ -444,7 +455,7 @@ async def limited_function(call_id: int): tasks = [] for i in range(num_tasks): tasks.append(asyncio.create_task(limited_function(i))) - await asyncio.sleep(0.01) # Small delay to help preserve order + await asyncio.sleep(0.1) # Small delay to help preserve order results = await asyncio.gather(*tasks) # All should complete successfully and in order From 0865b0bb3d57192146fffc821c9d089e68958c46 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 22 Sep 2025 17:26:27 +0200 Subject: [PATCH 17/68] cleanup --- .../redis/lua/acquire_fair_semaphore_v2.lua | 14 +++++++------- .../redis/lua/register_semaphore_holder.lua | 10 ++++------ .../redis/lua/release_fair_semaphore_v2.lua | 1 + 3 files changed, 12 insertions(+), 13 deletions(-) diff --git a/packages/service-library/src/servicelib/redis/lua/acquire_fair_semaphore_v2.lua b/packages/service-library/src/servicelib/redis/lua/acquire_fair_semaphore_v2.lua index fc30065d3d2b..26ea1827b2ec 100644 --- a/packages/service-library/src/servicelib/redis/lua/acquire_fair_semaphore_v2.lua +++ b/packages/service-library/src/servicelib/redis/lua/acquire_fair_semaphore_v2.lua @@ -1,14 +1,14 @@ -- Fair distributed semaphore using token pool (BRPOP-based) --- KEYS[1]: tokens_key (LIST of available tokens) --- KEYS[2]: holders_key (SET of current holder instance IDs) +-- KEYS[1]: holders_key (SET of current holder instance IDs) +-- KEYS[2]: holder_key (individual holder TTL key for this instance) --- ARGV[1]: instance_id --- ARGV[2]: capacity (max concurrent holders) --- ARGV[3]: ttl_seconds +-- ARGV[1]: token (the token received from BRPOP) +-- ARGV[2]: instance_id (the instance trying to acquire the semaphore) +-- ARGV[3]: ttl_seconds (for the holder_key) -- -- Returns: {exit_code, status, token, current_count} --- exit_code: 0 if acquired, 255 if timeout/failed --- status: 'acquired' or 'timeout' +-- exit_code: 0 if acquired +-- status: 'acquired' local holders_key = KEYS[1] local holder_key = KEYS[2] diff --git a/packages/service-library/src/servicelib/redis/lua/register_semaphore_holder.lua b/packages/service-library/src/servicelib/redis/lua/register_semaphore_holder.lua index 7d447d14bdc4..b9ddd7335b9f 100644 --- a/packages/service-library/src/servicelib/redis/lua/register_semaphore_holder.lua +++ b/packages/service-library/src/servicelib/redis/lua/register_semaphore_holder.lua @@ -1,13 +1,11 @@ -- Simple token initialization and management for Python BRPOP -- KEYS[1]: tokens_key (LIST of available tokens) -- KEYS[2]: holders_key (SET of current holder instance IDs) --- KEYS[3]: holder_key (individual holder TTL key for this instance) --- ARGV[1]: instance_id --- ARGV[2]: capacity (max concurrent holders) --- ARGV[3]: ttl_seconds --- ARGV[4]: token (the token received from BRPOP) + +-- ARGV[1]: capacity (max concurrent holders) +-- ARGV[2]: ttl_seconds -- --- Returns: {exit_code, status, current_count} +-- Returns: {exit_code} -- exit_code: 0 if registered successfully local tokens_key = KEYS[1] diff --git a/packages/service-library/src/servicelib/redis/lua/release_fair_semaphore_v2.lua b/packages/service-library/src/servicelib/redis/lua/release_fair_semaphore_v2.lua index a675568bc391..d244810bae42 100644 --- a/packages/service-library/src/servicelib/redis/lua/release_fair_semaphore_v2.lua +++ b/packages/service-library/src/servicelib/redis/lua/release_fair_semaphore_v2.lua @@ -2,6 +2,7 @@ -- KEYS[1]: tokens_key (LIST of available tokens) -- KEYS[2]: holders_key (SET of current holders) -- KEYS[3]: holder_key (individual holder TTL key for this instance) + -- ARGV[1]: instance_id -- -- Returns: {exit_code, status, current_count} From d4142339ee2044d75d23617b54c0c55966f7e6d2 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 22 Sep 2025 17:28:11 +0200 Subject: [PATCH 18/68] cleanup --- .../src/servicelib/redis/fair_semaphore.py | 404 ----------------- .../redis/fair_semaphore_decorator.py | 272 ------------ .../servicelib/redis/pure_brpop_semaphore.py | 417 ------------------ 3 files changed, 1093 deletions(-) delete mode 100644 packages/service-library/src/servicelib/redis/fair_semaphore.py delete mode 100644 packages/service-library/src/servicelib/redis/fair_semaphore_decorator.py delete mode 100644 packages/service-library/src/servicelib/redis/pure_brpop_semaphore.py diff --git a/packages/service-library/src/servicelib/redis/fair_semaphore.py b/packages/service-library/src/servicelib/redis/fair_semaphore.py deleted file mode 100644 index f70c699e9fcc..000000000000 --- a/packages/service-library/src/servicelib/redis/fair_semaphore.py +++ /dev/null @@ -1,404 +0,0 @@ -"""Fair distributed semaphore using token pool with crash recovery.""" - -import asyncio -import datetime -import logging -import uuid -from typing import ClassVar - -from pydantic import BaseModel, Field, PositiveInt, computed_field, field_validator -from redis.commands.core import AsyncScript - -from ._client import RedisClientSDK -from ._constants import ( - DEFAULT_SEMAPHORE_TTL, - DEFAULT_SOCKET_TIMEOUT, - SEMAPHORE_HOLDER_KEY_PREFIX, - SEMAPHORE_KEY_PREFIX, -) -from ._errors import ( - SemaphoreAcquisitionError, - SemaphoreLostError, - SemaphoreNotAcquiredError, -) -from ._semaphore_lua import ( - ACQUIRE_FAIR_SEMAPHORE_V2_SCRIPT, - CLEANUP_FAIR_SEMAPHORE_V2_SCRIPT, - COUNT_FAIR_SEMAPHORE_V2_SCRIPT, - RELEASE_FAIR_SEMAPHORE_V2_SCRIPT, - RENEW_FAIR_SEMAPHORE_V2_SCRIPT, - SCRIPT_OK_EXIT_CODE, -) - -_logger = logging.getLogger(__name__) - - -class FairSemaphore(BaseModel): - """ - A fair distributed semaphore using Redis token pool with BRPOP. - - Features: - - True FIFO fairness via BRPOP blocking operations - - Crash recovery through TTL-based cleanup - - No Python-side retry logic needed - - Automatic token pool management - """ - - capacity: PositiveInt = Field(description="Maximum number of concurrent holders") - key: str = Field(description="Unique semaphore identifier") - ttl: datetime.timedelta = Field( - default=DEFAULT_SEMAPHORE_TTL, - description="How long a holder can keep the semaphore", - ) - timeout: datetime.timedelta = Field( - default=DEFAULT_SOCKET_TIMEOUT, - description="How long to block waiting for semaphore (0 = infinite)", - ) - cleanup_interval: datetime.timedelta = Field( - default=datetime.timedelta(seconds=30), - description="How often to run cleanup to recover crashed client tokens", - ) - enable_auto_cleanup: bool = Field( - default=True, description="Whether to automatically run background cleanup" - ) - - # Internal state - instance_id: str = Field( - default_factory=lambda: str(uuid.uuid4())[:8], - description="Unique identifier for this semaphore instance", - ) - _acquired: bool = Field(default=False, exclude=True) - _token: str | None = Field(default=None, exclude=True) - _redis_client: RedisClientSDK | None = Field(default=None, exclude=True) - _cleanup_task: asyncio.Task | None = Field(default=None, exclude=True) - - # Class-level script storage - _acquire_script: ClassVar[AsyncScript | None] = None - _release_script: ClassVar[AsyncScript | None] = None - _cleanup_script: ClassVar[AsyncScript | None] = None - _renew_script: ClassVar[AsyncScript | None] = None - _count_script: ClassVar[AsyncScript | None] = None - - @computed_field - @property - def tokens_key(self) -> str: - """Redis key for the token pool LIST.""" - return f"{SEMAPHORE_KEY_PREFIX}{self.key}:tokens" - - @computed_field - @property - def holders_key(self) -> str: - """Redis key for the holders SET.""" - return f"{SEMAPHORE_KEY_PREFIX}{self.key}:holders" - - @computed_field - @property - def holder_key(self) -> str: - """Redis key for this instance's holder TTL key.""" - return f"{SEMAPHORE_HOLDER_KEY_PREFIX}{self.key}:{self.instance_id}" - - @computed_field - @property - def holder_prefix(self) -> str: - """Prefix for holder keys (used in cleanup).""" - return f"{SEMAPHORE_HOLDER_KEY_PREFIX}{self.key}:" - - @field_validator("ttl", "timeout", "cleanup_interval") - @classmethod - def validate_positive_timedelta(cls, v: datetime.timedelta) -> datetime.timedelta: - if v.total_seconds() <= 0: - raise ValueError("Timedelta must be positive") - return v - - def model_post_init(self, __context) -> None: - """Initialize Redis client.""" - if self._redis_client is None: - self._redis_client = RedisClientSDK() - - async def _load_scripts(self) -> None: - """Load Lua scripts into Redis.""" - if self.__class__._acquire_script is None: - redis = await self._redis_client.get_redis_client() - - self.__class__._acquire_script = redis.register_script( - ACQUIRE_FAIR_SEMAPHORE_V2_SCRIPT - ) - self.__class__._release_script = redis.register_script( - RELEASE_FAIR_SEMAPHORE_V2_SCRIPT - ) - self.__class__._cleanup_script = redis.register_script( - CLEANUP_FAIR_SEMAPHORE_V2_SCRIPT - ) - self.__class__._renew_script = redis.register_script( - RENEW_FAIR_SEMAPHORE_V2_SCRIPT - ) - self.__class__._count_script = redis.register_script( - COUNT_FAIR_SEMAPHORE_V2_SCRIPT - ) - - async def _start_cleanup_task(self) -> None: - """Start the background cleanup task if enabled.""" - if self.enable_auto_cleanup and ( - self._cleanup_task is None or self._cleanup_task.done() - ): - self._cleanup_task = asyncio.create_task(self._cleanup_loop()) - - async def _cleanup_loop(self) -> None: - """Background task to periodically clean up crashed client tokens.""" - try: - while True: - await asyncio.sleep(self.cleanup_interval.total_seconds()) - try: - await self._recover_crashed_tokens() - except Exception as e: - _logger.warning(f"Cleanup failed for semaphore {self.key}: {e}") - except asyncio.CancelledError: - _logger.debug(f"Cleanup task cancelled for semaphore {self.key}") - - async def _recover_crashed_tokens(self) -> dict: - """Recover tokens from crashed clients.""" - await self._load_scripts() - - result = await self.__class__._cleanup_script( - keys=[self.tokens_key, self.holders_key, self.holder_prefix], - args=[self.capacity], - ) - - recovered_tokens, current_holders, available_tokens, total_cleaned = result - - cleanup_stats = { - "recovered_tokens": recovered_tokens, - "current_holders": current_holders, - "available_tokens": available_tokens, - "total_cleaned": total_cleaned, - } - - if recovered_tokens > 0 or total_cleaned > 0: - _logger.info( - f"Semaphore cleanup for '{self.key}': " - f"recovered {recovered_tokens} tokens, " - f"cleaned {total_cleaned} crashed holders, " - f"current state: {current_holders} holders, {available_tokens} available" - ) - - return cleanup_stats - - async def acquire(self) -> bool: - """ - Acquire the semaphore using blocking Redis operation. - - This method blocks until a semaphore slot becomes available or timeout. - Uses Redis BRPOP for true FIFO fairness with no starvation possible. - - Returns: - True if acquired successfully - - Raises: - SemaphoreAcquisitionError: If acquisition fails or times out - """ - await self._load_scripts() - - if self.enable_auto_cleanup: - await self._start_cleanup_task() - - if self._acquired: - raise SemaphoreAcquisitionError( - "Semaphore already acquired by this instance" - ) - - ttl_seconds = max(1, int(self.ttl.total_seconds())) - timeout_seconds = int(self.timeout.total_seconds()) - - _logger.debug( - f"Attempting to acquire fair semaphore '{self.key}' " - f"(timeout: {timeout_seconds}s, ttl: {ttl_seconds}s)" - ) - - try: - result = await self.__class__._acquire_script( - keys=[self.tokens_key, self.holders_key, self.holder_key], - args=[self.instance_id, self.capacity, ttl_seconds, timeout_seconds], - ) - - exit_code, status, token, current_count = result - - _logger.debug( - f"Fair semaphore acquisition result for '{self.key}'", - extra={ - "instance_id": self.instance_id, - "exit_code": exit_code, - "status": status, - "token": token, - "current_count": current_count, - }, - ) - - if exit_code == SCRIPT_OK_EXIT_CODE: # Success - self._acquired = True - self._token = token - _logger.info( - f"Acquired fair semaphore '{self.key}' with token '{token}'" - ) - return True - # Timeout or error - raise SemaphoreAcquisitionError(f"Failed to acquire semaphore: {status}") - - except Exception as e: - _logger.error(f"Error acquiring semaphore '{self.key}': {e}") - raise SemaphoreAcquisitionError(f"Redis error during acquisition: {e}") - - async def release(self) -> bool: - """ - Release the semaphore and return token to pool. - - This automatically makes the semaphore available to waiting clients. - The token is returned to the pool, unblocking any BRPOP waiters. - - Returns: - True if released successfully - - Raises: - SemaphoreNotAcquiredError: If semaphore not held by this instance - """ - await self._load_scripts() - - if not self._acquired: - raise SemaphoreNotAcquiredError("Semaphore not acquired by this instance") - - try: - result = await self.__class__._release_script( - keys=[self.tokens_key, self.holders_key, self.holder_key], - args=[self.instance_id], - ) - - exit_code, status, current_count = result - - _logger.debug( - f"Fair semaphore release result for '{self.key}'", - extra={ - "instance_id": self.instance_id, - "exit_code": exit_code, - "status": status, - "current_count": current_count, - }, - ) - - if exit_code == SCRIPT_OK_EXIT_CODE: # Success - self._acquired = False - _logger.info( - f"Released fair semaphore '{self.key}' with token '{self._token}'" - ) - self._token = None - return True - # Error - self._acquired = False # Mark as not acquired even on error - raise SemaphoreNotAcquiredError(f"Failed to release semaphore: {status}") - - except Exception as e: - _logger.error(f"Error releasing semaphore '{self.key}': {e}") - self._acquired = False # Mark as not acquired on error - raise SemaphoreNotAcquiredError(f"Redis error during release: {e}") - - async def renew(self) -> bool: - """ - Renew the semaphore TTL. - - Returns: - True if renewed successfully - - Raises: - SemaphoreLostError: If semaphore was lost (expired or not held) - """ - await self._load_scripts() - - if not self._acquired: - raise SemaphoreNotAcquiredError("Semaphore not acquired by this instance") - - ttl_seconds = max(1, int(self.ttl.total_seconds())) - - try: - result = await self.__class__._renew_script( - keys=[self.holders_key, self.holder_key], - args=[self.instance_id, ttl_seconds], - ) - - exit_code, status, current_count = result - - if exit_code == SCRIPT_OK_EXIT_CODE: - _logger.debug(f"Renewed semaphore '{self.key}' TTL") - return True - self._acquired = False - raise SemaphoreLostError(f"Semaphore was lost: {status}") - - except Exception as e: - _logger.error(f"Error renewing semaphore '{self.key}': {e}") - # Don't mark as not acquired on network errors - raise SemaphoreLostError(f"Redis error during renewal: {e}") - - async def count(self) -> dict: - """ - Get semaphore usage statistics. - - Returns: - Dictionary with current_holders, available_tokens, capacity - """ - await self._load_scripts() - - result = await self.__class__._count_script( - keys=[self.holders_key, self.tokens_key], args=[self.capacity] - ) - - current_holders, available_tokens, capacity = result - - return { - "current_holders": current_holders, - "available_tokens": available_tokens, - "capacity": capacity, - "utilization": current_holders / capacity if capacity > 0 else 0.0, - } - - async def health_check(self) -> dict: - """Get comprehensive semaphore health information.""" - count_info = await self.count() - cleanup_stats = await self._recover_crashed_tokens() - - total_accounted = count_info["current_holders"] + count_info["available_tokens"] - - return { - **count_info, - **cleanup_stats, - "total_accounted": total_accounted, - "is_healthy": total_accounted == self.capacity, - "cleanup_enabled": self.enable_auto_cleanup, - "instance_acquired": self._acquired, - } - - async def force_cleanup(self) -> dict: - """Manually trigger cleanup and return recovery statistics.""" - return await self._recover_crashed_tokens() - - async def __aenter__(self): - """Async context manager entry.""" - await self.acquire() - return self - - async def __aexit__(self, exc_type, exc_val, exc_tb): - """Async context manager exit.""" - if self._acquired: - try: - await self.release() - except Exception as e: - _logger.error(f"Error releasing semaphore in __aexit__: {e}") - - # Cancel cleanup task when exiting - if self._cleanup_task and not self._cleanup_task.done(): - self._cleanup_task.cancel() - try: - await self._cleanup_task - except asyncio.CancelledError: - pass - - @property - def acquired(self) -> bool: - """Check if semaphore is currently acquired.""" - return self._acquired diff --git a/packages/service-library/src/servicelib/redis/fair_semaphore_decorator.py b/packages/service-library/src/servicelib/redis/fair_semaphore_decorator.py deleted file mode 100644 index 8202ebf79d56..000000000000 --- a/packages/service-library/src/servicelib/redis/fair_semaphore_decorator.py +++ /dev/null @@ -1,272 +0,0 @@ -"""Fair semaphore decorator with automatic renewal and crash recovery.""" - -import asyncio -import datetime -import functools -import logging -from collections.abc import Callable, Coroutine -from contextlib import asynccontextmanager -from typing import Any, ParamSpec, TypeVar - -from common_library.logging.logging_errors import create_troubleshooting_log_kwargs - -from ._constants import ( - DEFAULT_EXPECTED_LOCK_OVERALL_TIME, - DEFAULT_SEMAPHORE_TTL, - DEFAULT_SOCKET_TIMEOUT, -) -from ._errors import ( - SemaphoreAcquisitionError, - SemaphoreLostError, - SemaphoreNotAcquiredError, -) -from .fair_semaphore import FairSemaphore - -_logger = logging.getLogger(__name__) - -P = ParamSpec("P") -R = TypeVar("R") - - -@asynccontextmanager -async def _managed_fair_semaphore_execution( - semaphore: FairSemaphore, - semaphore_key: str, - ttl: datetime.timedelta, - execution_context: str, - enable_auto_renewal: bool = True, -): - """Context manager for fair semaphore with auto-renewal.""" - - async def _auto_renewal(): - """Background task to automatically renew semaphore.""" - if not enable_auto_renewal: - return - - renewal_interval = ttl.total_seconds() / 3 # Renew at 1/3 TTL - - while semaphore.acquired: - try: - await asyncio.sleep(renewal_interval) - if semaphore.acquired: # Check again after sleep - await semaphore.renew() - _logger.debug(f"Renewed fair semaphore {semaphore_key}") - except SemaphoreLostError: - _logger.error( - f"Fair semaphore {semaphore_key} was lost during execution" - ) - break - except Exception as e: - _logger.warning(f"Failed to renew fair semaphore {semaphore_key}: {e}") - break - - renewal_task = None - try: - # Acquire the semaphore (blocks until available) - if not await semaphore.acquire(): - raise SemaphoreAcquisitionError( - f"Failed to acquire fair semaphore {semaphore_key}" - ) - - _logger.info(f"Acquired fair semaphore {semaphore_key} for {execution_context}") - - # Start auto-renewal task if enabled - if enable_auto_renewal: - renewal_task = asyncio.create_task(_auto_renewal()) - - yield - - except Exception as e: - _logger.error( - f"Error in fair semaphore-protected execution: {e}", - extra=create_troubleshooting_log_kwargs( - context=execution_context, - semaphore_key=semaphore_key, - ), - ) - raise - finally: - # Cancel renewal task - if renewal_task and not renewal_task.done(): - renewal_task.cancel() - try: - await renewal_task - except asyncio.CancelledError: - pass - - # Release semaphore - if semaphore.acquired: - try: - await semaphore.release() - _logger.info(f"Released fair semaphore {semaphore_key}") - except Exception as e: - _logger.error(f"Failed to release fair semaphore {semaphore_key}: {e}") - - -def fair_semaphore( - *, - key: str, - capacity: int, - ttl: datetime.timedelta = DEFAULT_SEMAPHORE_TTL, - timeout: datetime.timedelta = DEFAULT_SOCKET_TIMEOUT, - expected_execution_time: datetime.timedelta = DEFAULT_EXPECTED_LOCK_OVERALL_TIME, - cleanup_interval: datetime.timedelta = datetime.timedelta(seconds=30), - enable_auto_cleanup: bool = True, - enable_auto_renewal: bool = True, -) -> Callable[ - [Callable[P, Coroutine[Any, Any, R]]], Callable[P, Coroutine[Any, Any, R]] -]: - """ - Decorator that protects async functions with a fair distributed semaphore. - - Uses Redis BRPOP for true FIFO fairness - first requester gets first slot. - No starvation possible, automatic crash recovery. - - Args: - key: Unique semaphore identifier - capacity: Maximum concurrent executions allowed - ttl: How long each holder can keep the semaphore - timeout: How long to wait for semaphore (0 = infinite wait) - expected_execution_time: Expected total execution time (unused, kept for compatibility) - cleanup_interval: How often to run cleanup for crashed clients - enable_auto_cleanup: Whether to run background cleanup - enable_auto_renewal: Whether to automatically renew TTL during execution - - Example: - @fair_semaphore( - key="api_calls", - capacity=10, - ttl=datetime.timedelta(seconds=30), - timeout=datetime.timedelta(seconds=60) - ) - async def call_external_api(): - # This will block fairly until semaphore available - # Maximum 10 concurrent executions - # First-come-first-served ordering guaranteed - pass - """ - - def decorator( - func: Callable[P, Coroutine[Any, Any, R]], - ) -> Callable[P, Coroutine[Any, Any, R]]: - @functools.wraps(func) - async def wrapper(*args: P.args, **kwargs: P.kwargs) -> R: - semaphore = FairSemaphore( - key=key, - capacity=capacity, - ttl=ttl, - timeout=timeout, - cleanup_interval=cleanup_interval, - enable_auto_cleanup=enable_auto_cleanup, - ) - - execution_context = f"{func.__module__}.{func.__qualname__}" - - async with _managed_fair_semaphore_execution( - semaphore=semaphore, - semaphore_key=key, - ttl=ttl, - execution_context=execution_context, - enable_auto_renewal=enable_auto_renewal, - ): - return await func(*args, **kwargs) - - return wrapper - - return decorator - - -class FairSemaphoreContext: - """Async context manager for manual fair semaphore control.""" - - def __init__( - self, - key: str, - capacity: int, - ttl: datetime.timedelta = DEFAULT_SEMAPHORE_TTL, - timeout: datetime.timedelta = DEFAULT_SOCKET_TIMEOUT, - cleanup_interval: datetime.timedelta = datetime.timedelta(seconds=30), - enable_auto_cleanup: bool = True, - enable_auto_renewal: bool = True, - ): - self.semaphore = FairSemaphore( - key=key, - capacity=capacity, - ttl=ttl, - timeout=timeout, - cleanup_interval=cleanup_interval, - enable_auto_cleanup=enable_auto_cleanup, - ) - self.ttl = ttl - self.enable_auto_renewal = enable_auto_renewal - self._renewal_task: Optional[asyncio.Task] = None - - async def __aenter__(self) -> FairSemaphore: - """Acquire semaphore and start auto-renewal.""" - await self.semaphore.acquire() - - # Start auto-renewal if enabled - if self.enable_auto_renewal: - - async def _auto_renewal(): - renewal_interval = self.ttl.total_seconds() / 3 - while self.semaphore.acquired: - try: - await asyncio.sleep(renewal_interval) - if self.semaphore.acquired: - await self.semaphore.renew() - except (SemaphoreLostError, SemaphoreNotAcquiredError): - break - except Exception as e: - _logger.warning(f"Auto-renewal failed: {e}") - - self._renewal_task = asyncio.create_task(_auto_renewal()) - - return self.semaphore - - async def __aexit__(self, exc_type, exc_val, exc_tb): - """Stop renewal and release semaphore.""" - if self._renewal_task and not self._renewal_task.done(): - self._renewal_task.cancel() - try: - await self._renewal_task - except asyncio.CancelledError: - pass - - if self.semaphore.acquired: - await self.semaphore.release() - - -# Convenience function for creating fair semaphore contexts -def fair_semaphore_context( - key: str, - capacity: int, - ttl: datetime.timedelta = DEFAULT_SEMAPHORE_TTL, - timeout: datetime.timedelta = DEFAULT_SOCKET_TIMEOUT, - cleanup_interval: datetime.timedelta = datetime.timedelta(seconds=30), - enable_auto_cleanup: bool = True, - enable_auto_renewal: bool = True, -) -> FairSemaphoreContext: - """ - Create an async context manager for fair semaphore usage. - - Example: - async with fair_semaphore_context( - "my_resource", - capacity=5, - timeout=datetime.timedelta(seconds=30) - ) as sem: - # Protected code here - guaranteed fair access - # sem is the FairSemaphore instance - stats = await sem.count() - print(f"Current holders: {stats['current_holders']}") - """ - return FairSemaphoreContext( - key=key, - capacity=capacity, - ttl=ttl, - timeout=timeout, - cleanup_interval=cleanup_interval, - enable_auto_cleanup=enable_auto_cleanup, - enable_auto_renewal=enable_auto_renewal, - ) diff --git a/packages/service-library/src/servicelib/redis/pure_brpop_semaphore.py b/packages/service-library/src/servicelib/redis/pure_brpop_semaphore.py deleted file mode 100644 index 0eb6a781f87e..000000000000 --- a/packages/service-library/src/servicelib/redis/pure_brpop_semaphore.py +++ /dev/null @@ -1,417 +0,0 @@ -"""Pure Python BRPOP-based fair semaphore implementation.""" - -import asyncio -import datetime -import logging -import uuid -from types import TracebackType -from typing import Annotated, ClassVar - -from common_library.basic_types import DEFAULT_FACTORY -from pydantic import ( - BaseModel, - Field, - PositiveInt, - computed_field, - field_validator, -) -from redis.commands.core import AsyncScript - -from ._client import RedisClientSDK -from ._constants import ( - DEFAULT_SEMAPHORE_TTL, - DEFAULT_SOCKET_TIMEOUT, - SEMAPHORE_HOLDER_KEY_PREFIX, - SEMAPHORE_KEY_PREFIX, -) -from ._errors import ( - SemaphoreAcquisitionError, - SemaphoreLostError, - SemaphoreNotAcquiredError, -) -from ._semaphore_lua import ( - CLEANUP_FAIR_SEMAPHORE_V2_SCRIPT, - COUNT_FAIR_SEMAPHORE_V2_SCRIPT, - REGISTER_SEMAPHORE_HOLDER_SCRIPT, - RELEASE_FAIR_SEMAPHORE_V2_SCRIPT, - RENEW_FAIR_SEMAPHORE_V2_SCRIPT, - SCRIPT_OK_EXIT_CODE, -) - -_logger = logging.getLogger(__name__) - - -class PureBRPOPSemaphore(BaseModel): - """ - A pure Python BRPOP-based fair semaphore implementation. - - This approach uses Redis BRPOP directly from Python for true blocking fairness, - with minimal Lua scripts only for registration and cleanup. - - Features: - - True FIFO fairness guaranteed by Redis BRPOP - - Native Redis blocking - no Python-side polling needed - - Crash recovery through TTL-based cleanup - - Maximum simplicity and reliability - """ - - model_config = { - "arbitrary_types_allowed": True, # For RedisClientSDK - } - - # Configuration fields - redis_client: RedisClientSDK - key: Annotated[ - str, Field(min_length=1, description="Unique identifier for the semaphore") - ] - capacity: Annotated[ - PositiveInt, Field(description="Maximum number of concurrent holders") - ] - ttl: datetime.timedelta = DEFAULT_SEMAPHORE_TTL - blocking_timeout: Annotated[ - datetime.timedelta | None, - Field(description="Maximum time to wait when blocking"), - ] = DEFAULT_SOCKET_TIMEOUT - - enable_auto_cleanup: bool = Field( - default=True, - description="Whether to enable automatic cleanup of crashed holders", - ) - - instance_id: Annotated[ - str, - Field( - description="Unique instance identifier", - default_factory=lambda: f"{uuid.uuid4()}", - ), - ] = DEFAULT_FACTORY - - # Class-level script storage - register_script: ClassVar[AsyncScript | None] = None - release_script: ClassVar[AsyncScript | None] = None - renew_script: ClassVar[AsyncScript | None] = None - count_script: ClassVar[AsyncScript | None] = None - cleanup_script: ClassVar[AsyncScript | None] = None - - # Private state - _acquired: bool = False - _token: str | None = None - _cleanup_task: asyncio.Task | None = None - - @classmethod - def _register_scripts(cls, redis_client: RedisClientSDK) -> None: - """Register minimal Lua scripts with Redis.""" - if cls.register_script is None: - cls.register_script = redis_client.redis.register_script( - REGISTER_SEMAPHORE_HOLDER_SCRIPT - ) - cls.release_script = redis_client.redis.register_script( - RELEASE_FAIR_SEMAPHORE_V2_SCRIPT - ) - cls.renew_script = redis_client.redis.register_script( - RENEW_FAIR_SEMAPHORE_V2_SCRIPT - ) - cls.count_script = redis_client.redis.register_script( - COUNT_FAIR_SEMAPHORE_V2_SCRIPT - ) - cls.cleanup_script = redis_client.redis.register_script( - CLEANUP_FAIR_SEMAPHORE_V2_SCRIPT - ) - - def __init__(self, **data) -> None: - super().__init__(**data) - self.__class__._register_scripts(self.redis_client) - - # Start cleanup task if enabled - if self.enable_auto_cleanup: - self._start_cleanup_task() - - def _start_cleanup_task(self) -> None: - """Start background cleanup task for crashed holders.""" - if self._cleanup_task is None or self._cleanup_task.done(): - self._cleanup_task = asyncio.create_task(self._cleanup_loop()) - - async def _cleanup_loop(self) -> None: - """Background task to clean up crashed holders.""" - try: - while True: - await asyncio.sleep(30) # Cleanup every 30 seconds - try: - await self._recover_crashed_tokens() - except Exception as e: - _logger.warning(f"Cleanup failed for semaphore {self.key}: {e}") - except asyncio.CancelledError: - pass - - async def _recover_crashed_tokens(self) -> None: - """Recover tokens from crashed clients.""" - cls = type(self) - assert cls.cleanup_script is not None - - result = await cls.cleanup_script( - keys=[self.tokens_key, self.holders_key, self.holder_prefix], - args=[self.capacity], - client=self.redis_client.redis, - ) - - recovered_tokens, current_holders, available_tokens, total_cleaned = result - - if recovered_tokens > 0 or total_cleaned > 0: - _logger.info( - f"Recovered {recovered_tokens} tokens from {total_cleaned} crashed holders " - f"for semaphore '{self.key}'" - ) - - @computed_field - @property - def tokens_key(self) -> str: - """Redis key for the token pool LIST.""" - return f"{SEMAPHORE_KEY_PREFIX}{self.key}:tokens" - - @computed_field - @property - def holders_key(self) -> str: - """Redis key for the holders SET.""" - return f"{SEMAPHORE_KEY_PREFIX}{self.key}:holders" - - @computed_field - @property - def holder_key(self) -> str: - """Redis key for this instance's holder entry.""" - return f"{SEMAPHORE_HOLDER_KEY_PREFIX}{self.key}:{self.instance_id}" - - @computed_field - @property - def holder_prefix(self) -> str: - """Prefix for holder keys (used in cleanup).""" - return f"{SEMAPHORE_HOLDER_KEY_PREFIX}{self.key}:" - - @field_validator("ttl") - @classmethod - def validate_ttl(cls, v: datetime.timedelta) -> datetime.timedelta: - if v.total_seconds() <= 0: - raise ValueError("TTL must be positive") - return v - - @field_validator("blocking_timeout") - @classmethod - def validate_timeout( - cls, v: datetime.timedelta | None - ) -> datetime.timedelta | None: - if v is not None and v.total_seconds() <= 0: - raise ValueError("Timeout must be positive") - return v - - async def acquire(self) -> bool: - """ - Acquire the semaphore using pure Python BRPOP. - - This is the cleanest possible approach: - 1. Call Redis BRPOP directly from Python (guaranteed FIFO fairness) - 2. Use minimal Lua script only to register as holder - 3. No complex retry logic or notifications needed - - Returns: - True if acquired successfully - - Raises: - SemaphoreAcquisitionError: If acquisition fails or times out - """ - if self._acquired: - raise SemaphoreAcquisitionError(name=self.key, capacity=self.capacity) - - timeout_seconds = ( - int(self.blocking_timeout.total_seconds()) if self.blocking_timeout else 0 - ) - ttl_seconds = int(self.ttl.total_seconds()) - - _logger.debug( - f"Attempting to acquire semaphore '{self.key}' using BRPOP " - f"(timeout: {timeout_seconds}s)" - ) - - try: - # Use Redis BRPOP directly from Python - this is perfectly legal! - # BRPOP blocks until a token is available or timeout occurs - result = await self.redis_client.redis.brpop( - self.tokens_key, timeout=timeout_seconds - ) - - if result is None: - # Timeout occurred - raise SemaphoreAcquisitionError(name=self.key, capacity=self.capacity) - - # result is (key, token) tuple - _, token = result - token = token.decode("utf-8") if isinstance(token, bytes) else token - - # Register as holder using minimal Lua script - cls = type(self) - assert cls.register_script is not None - - register_result = await cls.register_script( - keys=[self.tokens_key, self.holders_key, self.holder_key], - args=[self.instance_id, self.capacity, ttl_seconds, token], - client=self.redis_client.redis, - ) - - exit_code, status, current_count = register_result - - if exit_code == SCRIPT_OK_EXIT_CODE: - self._acquired = True - self._token = token - - _logger.info( - f"Acquired semaphore '{self.key}' with token '{token}' " - f"(instance: {self.instance_id}, count: {current_count})" - ) - return True - else: - # Registration failed - this shouldn't happen but be safe - # Return the token to the pool - await self.redis_client.redis.lpush(self.tokens_key, token) - raise SemaphoreAcquisitionError(name=self.key, capacity=self.capacity) - - except TimeoutError: - raise SemaphoreAcquisitionError(name=self.key, capacity=self.capacity) - except Exception as e: - _logger.error(f"Error acquiring semaphore '{self.key}': {e}") - raise SemaphoreAcquisitionError(name=self.key, capacity=self.capacity) - - async def release(self) -> None: - """ - Release the semaphore and return token to pool. - - Raises: - SemaphoreNotAcquiredError: If semaphore was not acquired by this instance - """ - if not self._acquired: - raise SemaphoreNotAcquiredError(name=self.key) - - try: - # Use existing release script - cls = type(self) - assert cls.release_script is not None - - result = await cls.release_script( - keys=[self.tokens_key, self.holders_key, self.holder_key], - args=[self.instance_id], - client=self.redis_client.redis, - ) - - exit_code, status, current_count = result - - if exit_code == SCRIPT_OK_EXIT_CODE: - released_token = self._token - self._acquired = False - self._token = None - - _logger.info( - f"Released semaphore '{self.key}' with token '{released_token}' " - f"(instance: {self.instance_id}, count: {current_count})" - ) - return - - # Release failed - _logger.error( - f"Failed to release semaphore '{self.key}' - {status} " - f"(instance: {self.instance_id}, count: {current_count})" - ) - # Mark as not acquired anyway to prevent stuck state - self._acquired = False - self._token = None - raise SemaphoreNotAcquiredError(name=self.key) - - except Exception as e: - _logger.error(f"Error releasing semaphore '{self.key}': {e}") - # Mark as not acquired to prevent stuck state - self._acquired = False - self._token = None - raise SemaphoreNotAcquiredError(name=self.key) - - async def renew(self) -> None: - """ - Renew the semaphore TTL. - - Raises: - SemaphoreLostError: If the semaphore was lost or expired - """ - if not self._acquired: - raise SemaphoreNotAcquiredError(name=self.key) - - ttl_seconds = int(self.ttl.total_seconds()) - - try: - cls = type(self) - assert cls.renew_script is not None - - result = await cls.renew_script( - keys=[self.holders_key, self.holder_key], - args=[self.instance_id, ttl_seconds], - client=self.redis_client.redis, - ) - - exit_code, status, current_count = result - - if exit_code == SCRIPT_OK_EXIT_CODE: - _logger.debug(f"Renewed semaphore '{self.key}' TTL") - return - - # Renewal failed - semaphore was lost - _logger.warning( - f"Semaphore '{self.key}' was lost during renewal - {status} " - f"(instance: {self.instance_id})" - ) - self._acquired = False - self._token = None - raise SemaphoreLostError(name=self.key, instance_id=self.instance_id) - - except Exception as e: - _logger.error(f"Error renewing semaphore '{self.key}': {e}") - raise SemaphoreLostError(name=self.key, instance_id=self.instance_id) - - async def get_current_count(self) -> int: - """Get the current number of semaphore holders.""" - cls = type(self) - assert cls.count_script is not None - - result = await cls.count_script( - keys=[self.holders_key, self.tokens_key], - args=[self.capacity], - client=self.redis_client.redis, - ) - - current_holders, available_tokens, capacity = result - return int(current_holders) - - async def get_available_count(self) -> int: - """Get the number of available semaphore slots.""" - current_count = await self.get_current_count() - return max(0, self.capacity - current_count) - - @property - def acquired(self) -> bool: - """Check if semaphore is currently acquired.""" - return self._acquired - - # Context manager support - async def __aenter__(self) -> "PureBRPOPSemaphore": - await self.acquire() - return self - - async def __aexit__( - self, - exc_type: type[BaseException] | None, - exc_val: BaseException | None, - exc_tb: TracebackType | None, - ) -> None: - if self._acquired: - await self.release() - - # Clean up background task - if self._cleanup_task and not self._cleanup_task.done(): - self._cleanup_task.cancel() - try: - await self._cleanup_task - except asyncio.CancelledError: - pass From 0f9cbe2066ea9c90e336e2bf8bb594f29e06fc85 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 22 Sep 2025 17:36:49 +0200 Subject: [PATCH 19/68] cleanup --- packages/service-library/src/servicelib/redis/_semaphore.py | 6 +++--- packages/service-library/tests/redis/test_semaphore.py | 2 +- .../service-library/tests/redis/test_semaphore_decorator.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/packages/service-library/src/servicelib/redis/_semaphore.py b/packages/service-library/src/servicelib/redis/_semaphore.py index 5ed0d3c8c14d..68f494a45122 100644 --- a/packages/service-library/src/servicelib/redis/_semaphore.py +++ b/packages/service-library/src/servicelib/redis/_semaphore.py @@ -144,15 +144,15 @@ def tokens_key(self) -> str: @computed_field # type: ignore[prop-decorator] @property - def holders_key(self) -> str: + def holders_set(self) -> str: """Redis key for the holders SET.""" - return f"{SEMAPHORE_KEY_PREFIX}{self.key}:holders" + return f"{SEMAPHORE_KEY_PREFIX}{self.key}:holders_set" @computed_field # type: ignore[prop-decorator] @property def holder_key(self) -> str: """Redis key for this instance's holder entry.""" - return f"{SEMAPHORE_KEY_PREFIX}{self.key}:holders_:{self.instance_id}" + return f"{SEMAPHORE_KEY_PREFIX}{self.key}:holders:{self.instance_id}" # Additional validation @field_validator("ttl") diff --git a/packages/service-library/tests/redis/test_semaphore.py b/packages/service-library/tests/redis/test_semaphore.py index 7d8fc43fa367..3da3b4a57a48 100644 --- a/packages/service-library/tests/redis/test_semaphore.py +++ b/packages/service-library/tests/redis/test_semaphore.py @@ -59,7 +59,7 @@ async def test_semaphore_initialization( assert semaphore.instance_id is not None assert semaphore.semaphore_key == f"{SEMAPHORE_KEY_PREFIX}{semaphore_name}" assert semaphore.holder_key.startswith( - f"{SEMAPHORE_KEY_PREFIX}{semaphore_name}:holders_:" + f"{SEMAPHORE_KEY_PREFIX}{semaphore_name}:holders:" ) diff --git a/packages/service-library/tests/redis/test_semaphore_decorator.py b/packages/service-library/tests/redis/test_semaphore_decorator.py index 5481891b7f8a..b2d3e60b17a4 100644 --- a/packages/service-library/tests/redis/test_semaphore_decorator.py +++ b/packages/service-library/tests/redis/test_semaphore_decorator.py @@ -134,7 +134,7 @@ async def coro_that_should_fail() -> Literal["should not reach here"]: # Find and delete all holder keys for this semaphore holder_keys = await redis_client_sdk.redis.keys( - f"{SEMAPHORE_KEY_PREFIX}{semaphore_name}:holders_:*" + f"{SEMAPHORE_KEY_PREFIX}{semaphore_name}:holders:*" ) assert holder_keys, "Holder keys should exist before deletion" await redis_client_sdk.redis.delete(*holder_keys) @@ -745,7 +745,7 @@ async def use_failing_cm() -> None: # Find and delete all holder keys for this semaphore holder_keys = await redis_client_sdk.redis.keys( - f"{SEMAPHORE_KEY_PREFIX}{semaphore_name}:holders_:*" + f"{SEMAPHORE_KEY_PREFIX}{semaphore_name}:holders:*" ) assert holder_keys, "Holder keys should exist before deletion" await redis_client_sdk.redis.delete(*holder_keys) From 41550f91c35ad7ec497b71c87ee60fef733a37eb Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 22 Sep 2025 17:37:30 +0200 Subject: [PATCH 20/68] cleanup --- .../service-library/src/servicelib/redis/_semaphore.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/packages/service-library/src/servicelib/redis/_semaphore.py b/packages/service-library/src/servicelib/redis/_semaphore.py index 68f494a45122..c6b8f41faacf 100644 --- a/packages/service-library/src/servicelib/redis/_semaphore.py +++ b/packages/service-library/src/servicelib/redis/_semaphore.py @@ -179,7 +179,7 @@ async def _ensure_semaphore_initialized(self) -> None: cls = type(self) assert cls.register_semaphore is not None # nosec await cls.register_semaphore( # pylint: disable=not-callable - keys=[self.tokens_key, self.holders_key], + keys=[self.tokens_key, self.holders_set], args=[self.capacity, ttl_seconds], client=self.redis_client.redis, ) @@ -260,7 +260,7 @@ async def _try_acquire() -> list[str] | None: cls = type(self) assert cls.acquire_script is not None # nosec result = await cls.acquire_script( # pylint: disable=not-callable - keys=[self.holders_key, self.holder_key], + keys=[self.holders_set, self.holder_key], args=[ token, self.instance_id, @@ -297,7 +297,7 @@ async def release(self) -> None: cls = type(self) assert cls.release_script is not None # nosec result = await cls.release_script( # pylint: disable=not-callable - keys=[self.tokens_key, self.holders_key, self.holder_key], + keys=[self.tokens_key, self.holders_set, self.holder_key], args=[self.instance_id], client=self.redis_client.redis, ) @@ -342,7 +342,7 @@ async def reacquire(self) -> None: cls = type(self) assert cls.renew_script is not None # nosec result = await cls.renew_script( # pylint: disable=not-callable - keys=[self.holders_key, self.holder_key], + keys=[self.holders_set, self.holder_key], args=[self.instance_id, ttl_seconds], client=self.redis_client.redis, ) @@ -383,7 +383,7 @@ async def is_acquired(self) -> bool: async def get_current_count(self) -> int: """Get the current number of semaphore holders""" return await handle_redis_returns_union_types( - self.redis_client.redis.scard(self.holders_key) + self.redis_client.redis.scard(self.holders_set) ) async def get_available_count(self) -> int: From c5f3384794522e2b8de0bb4a45026f17107915e7 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 22 Sep 2025 18:28:42 +0200 Subject: [PATCH 21/68] mypy --- .../src/servicelib/redis/_semaphore.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/packages/service-library/src/servicelib/redis/_semaphore.py b/packages/service-library/src/servicelib/redis/_semaphore.py index c6b8f41faacf..b4308a01ff24 100644 --- a/packages/service-library/src/servicelib/redis/_semaphore.py +++ b/packages/service-library/src/servicelib/redis/_semaphore.py @@ -207,16 +207,17 @@ async def acquire(self) -> bool: ttl_seconds = int(self.ttl.total_seconds()) try: - stop_condition = stop_after_delay(0) # non-blocking by default - if self.blocking: - stop_condition = ( - stop_after_delay(self.blocking_timeout) - if self.blocking_timeout - else stop_never - ) @retry( - stop=stop_condition, + stop=( + stop_after_delay(0) + if not self.blocking + else ( + stop_after_delay(self.blocking_timeout) + if self.blocking_timeout + else stop_never + ) + ), wait=wait_random_exponential(min=0.1), retry=retry_if_exception_type(redis.exceptions.TimeoutError), ) @@ -373,7 +374,7 @@ async def reacquire(self) -> None: async def is_acquired(self) -> bool: """Check if the semaphore is currently acquired by this instance.""" - return ( + return bool( await handle_redis_returns_union_types( self.redis_client.redis.exists(self.holder_key) ) From 8e1bfc30b8af8511cc504a33cea327bccec6050c Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 22 Sep 2025 18:31:28 +0200 Subject: [PATCH 22/68] ensure we do not wait too much --- packages/service-library/src/servicelib/redis/_semaphore.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/service-library/src/servicelib/redis/_semaphore.py b/packages/service-library/src/servicelib/redis/_semaphore.py index b4308a01ff24..136366c3099e 100644 --- a/packages/service-library/src/servicelib/redis/_semaphore.py +++ b/packages/service-library/src/servicelib/redis/_semaphore.py @@ -218,7 +218,7 @@ async def acquire(self) -> bool: else stop_never ) ), - wait=wait_random_exponential(min=0.1), + wait=wait_random_exponential(min=0.1, max=0.5), retry=retry_if_exception_type(redis.exceptions.TimeoutError), ) async def _try_acquire() -> list[str] | None: From 055dd8ced720b72ddb26a960fc07581f4adc9e86 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 22 Sep 2025 18:37:51 +0200 Subject: [PATCH 23/68] sonar --- .../src/servicelib/redis/_semaphore.py | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/packages/service-library/src/servicelib/redis/_semaphore.py b/packages/service-library/src/servicelib/redis/_semaphore.py index 136366c3099e..fdb6df64d872 100644 --- a/packages/service-library/src/servicelib/redis/_semaphore.py +++ b/packages/service-library/src/servicelib/redis/_semaphore.py @@ -22,6 +22,7 @@ stop_never, wait_random_exponential, ) +from tenacity.stop import stop_base from ._client import RedisClientSDK from ._constants import ( @@ -206,18 +207,19 @@ async def acquire(self) -> bool: ttl_seconds = int(self.ttl.total_seconds()) + # Determine retry stop condition based on blocking configuration + stop_condition: stop_base = stop_after_delay(0) + if self.blocking: + stop_condition = ( + stop_after_delay(self.blocking_timeout) + if self.blocking_timeout + else stop_never + ) + try: @retry( - stop=( - stop_after_delay(0) - if not self.blocking - else ( - stop_after_delay(self.blocking_timeout) - if self.blocking_timeout - else stop_never - ) - ), + stop=stop_condition, wait=wait_random_exponential(min=0.1, max=0.5), retry=retry_if_exception_type(redis.exceptions.TimeoutError), ) From 39ba892c33d46a12dd7337e51e2fdf411c9fc8b2 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 22 Sep 2025 18:45:49 +0200 Subject: [PATCH 24/68] improve testing --- .../service-library/tests/redis/test_semaphore.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/packages/service-library/tests/redis/test_semaphore.py b/packages/service-library/tests/redis/test_semaphore.py index 3da3b4a57a48..f962d1f94c3c 100644 --- a/packages/service-library/tests/redis/test_semaphore.py +++ b/packages/service-library/tests/redis/test_semaphore.py @@ -281,10 +281,16 @@ async def delayed_release() -> None: await semaphore2.release() +@pytest.mark.parametrize( + "exception", + [RuntimeError, asyncio.CancelledError], + ids=str, +) async def test_semaphore_context_manager_with_exception( redis_client_sdk: RedisClientSDK, semaphore_name: str, semaphore_capacity: int, + exception: type[Exception | asyncio.CancelledError], ): captured_semaphore: DistributedSemaphore | None = None @@ -296,10 +302,9 @@ async def _raising_context(): ) as sem: nonlocal captured_semaphore captured_semaphore = sem - msg = "Test exception" - raise RuntimeError(msg) + raise exception("Test") - with pytest.raises(RuntimeError, match="Test exception"): + with pytest.raises(exception, match="Test"): await _raising_context() # Should be released even after exception From ef9e8b08da9d7ccb6c4f71f69960707f8f4fecca Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 22 Sep 2025 18:50:16 +0200 Subject: [PATCH 25/68] ongoing --- .../tests/redis/test_semaphore.py | 21 ++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/packages/service-library/tests/redis/test_semaphore.py b/packages/service-library/tests/redis/test_semaphore.py index f962d1f94c3c..236363bbc78c 100644 --- a/packages/service-library/tests/redis/test_semaphore.py +++ b/packages/service-library/tests/redis/test_semaphore.py @@ -37,7 +37,7 @@ def with_short_default_semaphore_ttl( ) -> datetime.timedelta: short_ttl = datetime.timedelta(seconds=0.5) mocker.patch( - "servicelib.redis._semaphore._DEFAULT_SEMAPHORE_TTL", + "servicelib.redis._semaphore.DEFAULT_SEMAPHORE_TTL", short_ttl, ) return short_ttl @@ -99,6 +99,7 @@ async def test_semaphore_acquire_release_single( redis_client_sdk: RedisClientSDK, semaphore_name: str, semaphore_capacity: int, + with_short_default_semaphore_ttl: datetime.timedelta, ): semaphore = DistributedSemaphore( redis_client=redis_client_sdk, @@ -143,6 +144,24 @@ async def test_semaphore_acquire_release_single( ): await semaphore.reacquire() + # now check what happens once TTL is expired + await semaphore.acquire() + assert await semaphore.get_current_count() == 1 + assert await semaphore.get_available_count() == semaphore_capacity - 1 + await asyncio.sleep(with_short_default_semaphore_ttl.total_seconds() + 0.1) + # TTL expired, reacquire should fail + with pytest.raises( + SemaphoreLostError, + match=f"Semaphore '{semaphore_name}' was lost by this instance", + ): + await semaphore.reacquire() + # and release should also fail + with pytest.raises( + SemaphoreNotAcquiredError, + match=f"Semaphore '{semaphore_name}' was not acquired by this instance", + ): + await semaphore.release() + async def test_semaphore_context_manager( redis_client_sdk: RedisClientSDK, From d80df656727fc8395e8ab1118cfcf9ebb1041d26 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 22 Sep 2025 18:51:03 +0200 Subject: [PATCH 26/68] ongoing --- packages/service-library/src/servicelib/redis/_semaphore.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/packages/service-library/src/servicelib/redis/_semaphore.py b/packages/service-library/src/servicelib/redis/_semaphore.py index fdb6df64d872..a112d92fc49a 100644 --- a/packages/service-library/src/servicelib/redis/_semaphore.py +++ b/packages/service-library/src/servicelib/redis/_semaphore.py @@ -330,8 +330,7 @@ async def release(self) -> None: async def reacquire(self) -> None: """ - Atomically renew a semaphore entry using Lua script. - + Atomically renew a semaphore This function is intended to be called by decorators or external renewal mechanisms. From f4dc251a43729ce23a650221fef9eda60b51b041 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 23 Sep 2025 08:23:19 +0200 Subject: [PATCH 27/68] fixing test --- .../service-library/src/servicelib/redis/_semaphore.py | 8 ++++---- packages/service-library/tests/redis/test_semaphore.py | 3 ++- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/packages/service-library/src/servicelib/redis/_semaphore.py b/packages/service-library/src/servicelib/redis/_semaphore.py index a112d92fc49a..a5f754806357 100644 --- a/packages/service-library/src/servicelib/redis/_semaphore.py +++ b/packages/service-library/src/servicelib/redis/_semaphore.py @@ -159,7 +159,7 @@ def holder_key(self) -> str: @field_validator("ttl") @classmethod def validate_ttl(cls, v: datetime.timedelta) -> datetime.timedelta: - if v.total_seconds() <= 0: + if v.total_seconds() < 1: msg = "TTL must be positive" raise ValueError(msg) return v @@ -176,7 +176,7 @@ def validate_timeout( async def _ensure_semaphore_initialized(self) -> None: """Initializes the semaphore in Redis if not already done.""" - ttl_seconds = int(self.ttl.total_seconds()) + ttl_seconds = self.ttl.total_seconds() cls = type(self) assert cls.register_semaphore is not None # nosec await cls.register_semaphore( # pylint: disable=not-callable @@ -205,7 +205,7 @@ async def acquire(self) -> bool: ) return True - ttl_seconds = int(self.ttl.total_seconds()) + ttl_seconds = self.ttl.total_seconds() # Determine retry stop condition based on blocking configuration stop_condition: stop_base = stop_after_delay(0) @@ -338,7 +338,7 @@ async def reacquire(self) -> None: SemaphoreLostError: If the semaphore was lost or expired """ - ttl_seconds = int(self.ttl.total_seconds()) + ttl_seconds = self.ttl.total_seconds() # Execute the renewal Lua script atomically cls = type(self) diff --git a/packages/service-library/tests/redis/test_semaphore.py b/packages/service-library/tests/redis/test_semaphore.py index 236363bbc78c..e40268669fe8 100644 --- a/packages/service-library/tests/redis/test_semaphore.py +++ b/packages/service-library/tests/redis/test_semaphore.py @@ -35,7 +35,7 @@ def with_short_default_semaphore_ttl( mocker: MockerFixture, ) -> datetime.timedelta: - short_ttl = datetime.timedelta(seconds=0.5) + short_ttl = datetime.timedelta(seconds=1) mocker.patch( "servicelib.redis._semaphore.DEFAULT_SEMAPHORE_TTL", short_ttl, @@ -105,6 +105,7 @@ async def test_semaphore_acquire_release_single( redis_client=redis_client_sdk, key=semaphore_name, capacity=semaphore_capacity, + ttl=with_short_default_semaphore_ttl, ) # Initially not acquired From 5a98fe0afa9874564115911e558018fb12d3298b Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 23 Sep 2025 08:23:58 +0200 Subject: [PATCH 28/68] adjust --- packages/service-library/tests/redis/test_semaphore.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/service-library/tests/redis/test_semaphore.py b/packages/service-library/tests/redis/test_semaphore.py index e40268669fe8..9d1bbd62d349 100644 --- a/packages/service-library/tests/redis/test_semaphore.py +++ b/packages/service-library/tests/redis/test_semaphore.py @@ -35,7 +35,7 @@ def with_short_default_semaphore_ttl( mocker: MockerFixture, ) -> datetime.timedelta: - short_ttl = datetime.timedelta(seconds=1) + short_ttl = datetime.timedelta(seconds=2) mocker.patch( "servicelib.redis._semaphore.DEFAULT_SEMAPHORE_TTL", short_ttl, From a189d9d1b0e493dca48794e8350dd34ff4fed6ec Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 23 Sep 2025 08:36:05 +0200 Subject: [PATCH 29/68] clean --- .../src/servicelib/redis/_semaphore.py | 13 +++---- .../tests/redis/test_semaphore.py | 36 +++++++++---------- .../tests/redis/test_semaphore_decorator.py | 18 +++++----- 3 files changed, 34 insertions(+), 33 deletions(-) diff --git a/packages/service-library/src/servicelib/redis/_semaphore.py b/packages/service-library/src/servicelib/redis/_semaphore.py index a5f754806357..7883fd494663 100644 --- a/packages/service-library/src/servicelib/redis/_semaphore.py +++ b/packages/service-library/src/servicelib/redis/_semaphore.py @@ -155,7 +155,6 @@ def holder_key(self) -> str: """Redis key for this instance's holder entry.""" return f"{SEMAPHORE_KEY_PREFIX}{self.key}:holders:{self.instance_id}" - # Additional validation @field_validator("ttl") @classmethod def validate_ttl(cls, v: datetime.timedelta) -> datetime.timedelta: @@ -255,11 +254,13 @@ async def _try_acquire() -> list[str] | None: ) from e return False + # If we got here it means we acquired a token assert tokens_key_token is not None # nosec assert len(tokens_key_token) == 2 # nosec # noqa: PLR2004 assert tokens_key_token[0] == self.tokens_key # nosec token = tokens_key_token[1] + # set up the semaphore holder with a TTL cls = type(self) assert cls.acquire_script is not None # nosec result = await cls.acquire_script( # pylint: disable=not-callable @@ -290,7 +291,7 @@ async def _try_acquire() -> list[str] | None: async def release(self) -> None: """ - Release the semaphore atomically using Lua script. + Release the semaphore Raises: SemaphoreNotAcquiredError: If semaphore was not acquired by this instance @@ -330,7 +331,7 @@ async def release(self) -> None: async def reacquire(self) -> None: """ - Atomically renew a semaphore + Re-acquire a semaphore This function is intended to be called by decorators or external renewal mechanisms. @@ -382,14 +383,14 @@ async def is_acquired(self) -> bool: == 1 ) - async def get_current_count(self) -> int: + async def current_count(self) -> int: """Get the current number of semaphore holders""" return await handle_redis_returns_union_types( self.redis_client.redis.scard(self.holders_set) ) - async def get_available_count(self) -> int: - """Get the number of available semaphore slots""" + async def size(self) -> int: + """Get the size of the semaphore (number of available tokens)""" await self._ensure_semaphore_initialized() return await handle_redis_returns_union_types( self.redis_client.redis.llen(self.tokens_key) diff --git a/packages/service-library/tests/redis/test_semaphore.py b/packages/service-library/tests/redis/test_semaphore.py index 9d1bbd62d349..505755bb3c32 100644 --- a/packages/service-library/tests/redis/test_semaphore.py +++ b/packages/service-library/tests/redis/test_semaphore.py @@ -109,8 +109,8 @@ async def test_semaphore_acquire_release_single( ) # Initially not acquired - assert await semaphore.get_current_count() == 0 - assert await semaphore.get_available_count() == semaphore_capacity + assert await semaphore.current_count() == 0 + assert await semaphore.size() == semaphore_capacity assert await semaphore.is_acquired() is False # Acquire successfully @@ -118,25 +118,25 @@ async def test_semaphore_acquire_release_single( assert result is True # Check Redis state - assert await semaphore.get_current_count() == 1 - assert await semaphore.get_available_count() == semaphore_capacity - 1 + assert await semaphore.current_count() == 1 + assert await semaphore.size() == semaphore_capacity - 1 assert await semaphore.is_acquired() is True # Acquire again on same instance should return True immediately and keep the same count (reentrant) result = await semaphore.acquire() assert result is True - assert await semaphore.get_current_count() == 1 - assert await semaphore.get_available_count() == semaphore_capacity - 1 + assert await semaphore.current_count() == 1 + assert await semaphore.size() == semaphore_capacity - 1 # reacquire should just work await semaphore.reacquire() - assert await semaphore.get_current_count() == 1 - assert await semaphore.get_available_count() == semaphore_capacity - 1 + assert await semaphore.current_count() == 1 + assert await semaphore.size() == semaphore_capacity - 1 # Release await semaphore.release() - assert await semaphore.get_current_count() == 0 - assert await semaphore.get_available_count() == semaphore_capacity + assert await semaphore.current_count() == 0 + assert await semaphore.size() == semaphore_capacity # reacquire after release should fail with pytest.raises( @@ -147,8 +147,8 @@ async def test_semaphore_acquire_release_single( # now check what happens once TTL is expired await semaphore.acquire() - assert await semaphore.get_current_count() == 1 - assert await semaphore.get_available_count() == semaphore_capacity - 1 + assert await semaphore.current_count() == 1 + assert await semaphore.size() == semaphore_capacity - 1 await asyncio.sleep(with_short_default_semaphore_ttl.total_seconds() + 0.1) # TTL expired, reacquire should fail with pytest.raises( @@ -172,10 +172,10 @@ async def test_semaphore_context_manager( async with DistributedSemaphore( redis_client=redis_client_sdk, key=semaphore_name, capacity=semaphore_capacity ) as semaphore: - assert await semaphore.get_current_count() == 1 + assert await semaphore.current_count() == 1 # Should be released after context - assert await semaphore.get_current_count() == 0 + assert await semaphore.current_count() == 0 async def test_semaphore_release_without_acquire_raises( @@ -216,13 +216,13 @@ async def test_semaphore_multiple_instances_capacity_limit( assert await semaphore.acquire() is False # Check counts - assert await semaphores[0].get_current_count() == 2 - assert await semaphores[0].get_available_count() == 0 + assert await semaphores[0].current_count() == 2 + assert await semaphores[0].size() == 0 # Release one await semaphores[0].release() - assert await semaphores[0].get_current_count() == 1 - assert await semaphores[0].get_available_count() == 1 + assert await semaphores[0].current_count() == 1 + assert await semaphores[0].size() == 1 # Now third can acquire assert await semaphores[2].acquire() is True diff --git a/packages/service-library/tests/redis/test_semaphore_decorator.py b/packages/service-library/tests/redis/test_semaphore_decorator.py index b2d3e60b17a4..2b9be00f4720 100644 --- a/packages/service-library/tests/redis/test_semaphore_decorator.py +++ b/packages/service-library/tests/redis/test_semaphore_decorator.py @@ -92,8 +92,8 @@ async def long_running_work() -> Literal["success"]: capacity=semaphore_capacity, ttl=short_ttl, ) - assert await temp_semaphore.get_current_count() == 1 - assert await temp_semaphore.get_available_count() == semaphore_capacity - 1 + assert await temp_semaphore.current_count() == 1 + assert await temp_semaphore.size() == semaphore_capacity - 1 # Wait for work to complete result = await task @@ -101,8 +101,8 @@ async def long_running_work() -> Literal["success"]: assert work_completed.is_set() # After completion, semaphore should be released - assert await temp_semaphore.get_current_count() == 0 - assert await temp_semaphore.get_available_count() == semaphore_capacity + assert await temp_semaphore.current_count() == 0 + assert await temp_semaphore.size() == semaphore_capacity async def test_auto_renewal_lose_semaphore_raises( @@ -335,7 +335,7 @@ async def failing_function(): ttl=short_ttl, ) assert ( - await test_semaphore.get_current_count() == 0 + await test_semaphore.current_count() == 0 ), "Semaphore should be released after exception" @@ -605,16 +605,16 @@ async def use_long_running_cm(): capacity=semaphore_capacity, ttl=short_ttl, ) - assert await temp_semaphore.get_current_count() == 1 - assert await temp_semaphore.get_available_count() == semaphore_capacity - 1 + assert await temp_semaphore.current_count() == 1 + assert await temp_semaphore.size() == semaphore_capacity - 1 # Wait for work to complete await task assert work_completed.is_set() # After completion, semaphore should be released - assert await temp_semaphore.get_current_count() == 0 - assert await temp_semaphore.get_available_count() == semaphore_capacity + assert await temp_semaphore.current_count() == 0 + assert await temp_semaphore.size() == semaphore_capacity async def test_context_manager_with_callable_parameters( From 56d3611c67e06ab76434e503ea3a43ed30d5bdfe Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 23 Sep 2025 08:57:12 +0200 Subject: [PATCH 30/68] minor --- .../src/servicelib/redis/_errors.py | 8 ++++-- .../src/servicelib/redis/_semaphore.py | 11 ++++++-- .../tests/redis/test_semaphore.py | 28 ++++++++++++++++--- 3 files changed, 38 insertions(+), 9 deletions(-) diff --git a/packages/service-library/src/servicelib/redis/_errors.py b/packages/service-library/src/servicelib/redis/_errors.py index e83b40e4ec62..3aada3b0e96d 100644 --- a/packages/service-library/src/servicelib/redis/_errors.py +++ b/packages/service-library/src/servicelib/redis/_errors.py @@ -27,11 +27,15 @@ class LockLostError(BaseRedisError): class SemaphoreAcquisitionError(BaseRedisError): - msg_template: str = "Could not acquire semaphore '{name}' (capacity: {capacity})" + msg_template: str = ( + "Could not acquire semaphore '{name}' by this instance `{instance_id}`" + ) class SemaphoreNotAcquiredError(BaseRedisError): - msg_template: str = "Semaphore '{name}' was not acquired by this instance" + msg_template: str = ( + "Semaphore '{name}' was not acquired by this instance `{instance_id}`" + ) class SemaphoreLostError(BaseRedisError): diff --git a/packages/service-library/src/servicelib/redis/_semaphore.py b/packages/service-library/src/servicelib/redis/_semaphore.py index 7883fd494663..86659ededfee 100644 --- a/packages/service-library/src/servicelib/redis/_semaphore.py +++ b/packages/service-library/src/servicelib/redis/_semaphore.py @@ -250,7 +250,7 @@ async def _try_acquire() -> list[str] | None: ) if self.blocking: raise SemaphoreAcquisitionError( - name=self.key, capacity=self.capacity + name=self.key, instance_id=self.instance_id ) from e return False @@ -327,7 +327,10 @@ async def release(self) -> None: self.instance_id, current_count, ) - raise SemaphoreNotAcquiredError(name=self.key) + if status == "not_held": + raise SemaphoreNotAcquiredError(name=self.key, instance_id=self.instance_id) + assert status == "already_expired" # nosec + raise SemaphoreLostError(name=self.key, instance_id=self.instance_id) async def reacquire(self) -> None: """ @@ -371,7 +374,9 @@ async def reacquire(self) -> None: status, current_count, ) - + if status == "not_held": + raise SemaphoreNotAcquiredError(name=self.key, instance_id=self.instance_id) + assert status == "already_expired" # nosec raise SemaphoreLostError(name=self.key, instance_id=self.instance_id) async def is_acquired(self) -> bool: diff --git a/packages/service-library/tests/redis/test_semaphore.py b/packages/service-library/tests/redis/test_semaphore.py index 505755bb3c32..9317167e23f0 100644 --- a/packages/service-library/tests/redis/test_semaphore.py +++ b/packages/service-library/tests/redis/test_semaphore.py @@ -95,7 +95,7 @@ async def test_invalid_semaphore_initialization( ) -async def test_semaphore_acquire_release_single( +async def test_semaphore_acquire_release_basic( redis_client_sdk: RedisClientSDK, semaphore_name: str, semaphore_capacity: int, @@ -145,11 +145,31 @@ async def test_semaphore_acquire_release_single( ): await semaphore.reacquire() - # now check what happens once TTL is expired + with pytest.raises( + SemaphoreNotAcquiredError, + match=f"Semaphore '{semaphore_name}' was not acquired by this instance", + ): + await semaphore.release() + + +async def test_semaphore_acquire_release_with_ttl_expiry( + redis_client_sdk: RedisClientSDK, + semaphore_name: str, + semaphore_capacity: int, + with_short_default_semaphore_ttl: datetime.timedelta, +): + semaphore = DistributedSemaphore( + redis_client=redis_client_sdk, + key=semaphore_name, + capacity=semaphore_capacity, + ttl=with_short_default_semaphore_ttl, + ) await semaphore.acquire() assert await semaphore.current_count() == 1 assert await semaphore.size() == semaphore_capacity - 1 + # wait for TTL to expire await asyncio.sleep(with_short_default_semaphore_ttl.total_seconds() + 0.1) + # TTL expired, reacquire should fail with pytest.raises( SemaphoreLostError, @@ -255,7 +275,7 @@ async def test_semaphore_blocking_timeout( with pytest.raises( SemaphoreAcquisitionError, - match=f"Could not acquire semaphore '{semaphore_name}' \\(capacity: {capacity}\\)", + match=f"Could not acquire semaphore '{semaphore_name}' by this instance", ): await semaphore2.acquire() @@ -330,7 +350,7 @@ async def _raising_context(): # Should be released even after exception assert captured_semaphore is not None # captured_semaphore is guaranteed to be not None by the assert above - assert await captured_semaphore.get_current_count() == 0 + assert await captured_semaphore.current_count() == 0 async def test_multiple_semaphores_different_keys( From 5f4f043adc6d7958bd4bc972e295812814de9a87 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 23 Sep 2025 09:00:11 +0200 Subject: [PATCH 31/68] check ttl expiration --- packages/service-library/src/servicelib/redis/_semaphore.py | 2 +- packages/service-library/tests/redis/test_semaphore.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/packages/service-library/src/servicelib/redis/_semaphore.py b/packages/service-library/src/servicelib/redis/_semaphore.py index 86659ededfee..2a06bea4c06c 100644 --- a/packages/service-library/src/servicelib/redis/_semaphore.py +++ b/packages/service-library/src/servicelib/redis/_semaphore.py @@ -376,7 +376,7 @@ async def reacquire(self) -> None: ) if status == "not_held": raise SemaphoreNotAcquiredError(name=self.key, instance_id=self.instance_id) - assert status == "already_expired" # nosec + assert status == "expired" # nosec raise SemaphoreLostError(name=self.key, instance_id=self.instance_id) async def is_acquired(self) -> bool: diff --git a/packages/service-library/tests/redis/test_semaphore.py b/packages/service-library/tests/redis/test_semaphore.py index 9317167e23f0..28c54b154486 100644 --- a/packages/service-library/tests/redis/test_semaphore.py +++ b/packages/service-library/tests/redis/test_semaphore.py @@ -140,8 +140,8 @@ async def test_semaphore_acquire_release_basic( # reacquire after release should fail with pytest.raises( - SemaphoreLostError, - match=f"Semaphore '{semaphore_name}' was lost by this instance", + SemaphoreNotAcquiredError, + match=f"Semaphore '{semaphore_name}' was not acquired by this instance", ): await semaphore.reacquire() From 2b223a1c364b99fe29121357df3e7f9807b8f703 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 23 Sep 2025 09:03:30 +0200 Subject: [PATCH 32/68] adjust test --- .../tests/redis/test_semaphore.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/packages/service-library/tests/redis/test_semaphore.py b/packages/service-library/tests/redis/test_semaphore.py index 28c54b154486..2ec711ffe867 100644 --- a/packages/service-library/tests/redis/test_semaphore.py +++ b/packages/service-library/tests/redis/test_semaphore.py @@ -84,6 +84,13 @@ async def test_invalid_semaphore_initialization( capacity=1, ttl=datetime.timedelta(seconds=0), ) + with pytest.raises(ValueError, match="TTL must be positive"): + DistributedSemaphore( + redis_client=redis_client_sdk, + key=semaphore_name, + capacity=1, + ttl=datetime.timedelta(seconds=0.5), + ) with pytest.raises(ValueError, match="Timeout must be positive"): DistributedSemaphore( redis_client=redis_client_sdk, @@ -113,11 +120,9 @@ async def test_semaphore_acquire_release_basic( assert await semaphore.size() == semaphore_capacity assert await semaphore.is_acquired() is False - # Acquire successfully + # Acquire result = await semaphore.acquire() assert result is True - - # Check Redis state assert await semaphore.current_count() == 1 assert await semaphore.size() == semaphore_capacity - 1 assert await semaphore.is_acquired() is True @@ -127,16 +132,19 @@ async def test_semaphore_acquire_release_basic( assert result is True assert await semaphore.current_count() == 1 assert await semaphore.size() == semaphore_capacity - 1 + assert await semaphore.is_acquired() is True # reacquire should just work await semaphore.reacquire() assert await semaphore.current_count() == 1 assert await semaphore.size() == semaphore_capacity - 1 + assert await semaphore.is_acquired() is True # Release await semaphore.release() assert await semaphore.current_count() == 0 assert await semaphore.size() == semaphore_capacity + assert await semaphore.is_acquired() is False # reacquire after release should fail with pytest.raises( @@ -145,6 +153,7 @@ async def test_semaphore_acquire_release_basic( ): await semaphore.reacquire() + # so does release again with pytest.raises( SemaphoreNotAcquiredError, match=f"Semaphore '{semaphore_name}' was not acquired by this instance", @@ -178,8 +187,8 @@ async def test_semaphore_acquire_release_with_ttl_expiry( await semaphore.reacquire() # and release should also fail with pytest.raises( - SemaphoreNotAcquiredError, - match=f"Semaphore '{semaphore_name}' was not acquired by this instance", + SemaphoreLostError, + match=f"Semaphore '{semaphore_name}' was lost by this instance", ): await semaphore.release() From 7698eda9d0e4d98dda15a624a19e3d9715df0f5e Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 23 Sep 2025 09:11:27 +0200 Subject: [PATCH 33/68] refactor lua scripts --- .../redis/lua/release_fair_semaphore_v2.lua | 21 +++++++------------ .../redis/lua/renew_fair_semaphore_v2.lua | 13 ++++-------- 2 files changed, 12 insertions(+), 22 deletions(-) diff --git a/packages/service-library/src/servicelib/redis/lua/release_fair_semaphore_v2.lua b/packages/service-library/src/servicelib/redis/lua/release_fair_semaphore_v2.lua index d244810bae42..69148d33eeba 100644 --- a/packages/service-library/src/servicelib/redis/lua/release_fair_semaphore_v2.lua +++ b/packages/service-library/src/servicelib/redis/lua/release_fair_semaphore_v2.lua @@ -19,21 +19,17 @@ local instance_id = ARGV[1] local is_holder = redis.call('SISMEMBER', holders_key, instance_id) if is_holder == 0 then -- Not in holders set - check if holder key exists - local exists = redis.call('EXISTS', holder_key) - if exists == 1 then - -- Holder key exists but not in set - clean it up - redis.call('DEL', holder_key) - return {255, 'already_expired', redis.call('SCARD', holders_key)} - else - return {255, 'not_held', redis.call('SCARD', holders_key)} - end + return {255, 'not_held', redis.call('SCARD', holders_key)} end --- Step 2: Get the token from holder key before releasing +-- Step 2: Get the token from holder key local token = redis.call('GET', holder_key) if not token then - -- Fallback token if somehow missing - token = 'token_default' + -- the token expired but we are still in the holders set + -- this indicates a lost semaphore (e.g. due to TTL expiry) + -- remove from holders set and return error + redis.call('SREM', holders_key, instance_id) + return {255, 'already_expired', redis.call('SCARD', holders_key)} end -- Step 3: Release the semaphore @@ -44,6 +40,5 @@ redis.call('DEL', holder_key) -- This automatically unblocks any waiting BRPOP calls redis.call('LPUSH', tokens_key, token) -local new_count = redis.call('SCARD', holders_key) -return {0, 'released', new_count} +return {0, 'released', redis.call('SCARD', holders_key)} diff --git a/packages/service-library/src/servicelib/redis/lua/renew_fair_semaphore_v2.lua b/packages/service-library/src/servicelib/redis/lua/renew_fair_semaphore_v2.lua index 3c897ed8d90f..9526fa414b54 100644 --- a/packages/service-library/src/servicelib/redis/lua/renew_fair_semaphore_v2.lua +++ b/packages/service-library/src/servicelib/redis/lua/renew_fair_semaphore_v2.lua @@ -18,23 +18,18 @@ local ttl_seconds = tonumber(ARGV[2]) local is_holder = redis.call('SISMEMBER', holders_key, instance_id) if is_holder == 0 then -- Not in holders set - local current_count = redis.call('SCARD', holders_key) - return {255, 'not_held', current_count} + return {255, 'not_held', redis.call('SCARD', holders_key)} end -- Step 2: Check if holder key exists (to detect if it expired) local exists = redis.call('EXISTS', holder_key) if exists == 0 then - -- Holder key expired - remove from set and fail renewal - redis.call('SREM', holders_key, instance_id) - local current_count = redis.call('SCARD', holders_key) - return {255, 'expired', current_count} + -- Holder key expired + return {255, 'expired', redis.call('SCARD', holders_key)} end -- Step 3: Renew the holder key TTL local token = redis.call('GET', holder_key) redis.call('SETEX', holder_key, ttl_seconds, token) -local current_count = redis.call('SCARD', holders_key) - -return {0, 'renewed', current_count} +return {0, 'renewed', redis.call('SCARD', holders_key)} From 3a3a725370bc9850656f3a5b131f07475c80a73d Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 23 Sep 2025 09:12:41 +0200 Subject: [PATCH 34/68] minor --- packages/service-library/src/servicelib/redis/_semaphore.py | 2 +- .../src/servicelib/redis/lua/release_fair_semaphore_v2.lua | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/packages/service-library/src/servicelib/redis/_semaphore.py b/packages/service-library/src/servicelib/redis/_semaphore.py index 2a06bea4c06c..433678ba3125 100644 --- a/packages/service-library/src/servicelib/redis/_semaphore.py +++ b/packages/service-library/src/servicelib/redis/_semaphore.py @@ -329,7 +329,7 @@ async def release(self) -> None: ) if status == "not_held": raise SemaphoreNotAcquiredError(name=self.key, instance_id=self.instance_id) - assert status == "already_expired" # nosec + assert status == "expired" # nosec raise SemaphoreLostError(name=self.key, instance_id=self.instance_id) async def reacquire(self) -> None: diff --git a/packages/service-library/src/servicelib/redis/lua/release_fair_semaphore_v2.lua b/packages/service-library/src/servicelib/redis/lua/release_fair_semaphore_v2.lua index 69148d33eeba..7b431aa37c7e 100644 --- a/packages/service-library/src/servicelib/redis/lua/release_fair_semaphore_v2.lua +++ b/packages/service-library/src/servicelib/redis/lua/release_fair_semaphore_v2.lua @@ -7,7 +7,7 @@ -- -- Returns: {exit_code, status, current_count} -- exit_code: 0 if released, 255 if failed --- status: 'released', 'not_held', or 'already_expired' +-- status: 'released', 'not_held', or 'expired' local tokens_key = KEYS[1] local holders_key = KEYS[2] @@ -29,7 +29,7 @@ if not token then -- this indicates a lost semaphore (e.g. due to TTL expiry) -- remove from holders set and return error redis.call('SREM', holders_key, instance_id) - return {255, 'already_expired', redis.call('SCARD', holders_key)} + return {255, 'expired', redis.call('SCARD', holders_key)} end -- Step 3: Release the semaphore From 4cef3693fe5327697210068e878f485196aac2f7 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 23 Sep 2025 09:40:58 +0200 Subject: [PATCH 35/68] work with lost tokens --- .../src/servicelib/redis/_semaphore.py | 11 ++- .../redis/lua/release_fair_semaphore_v2.lua | 7 ++ .../tests/redis/test_semaphore.py | 97 ++++++++++++++++++- 3 files changed, 109 insertions(+), 6 deletions(-) diff --git a/packages/service-library/src/servicelib/redis/_semaphore.py b/packages/service-library/src/servicelib/redis/_semaphore.py index 433678ba3125..47c6fa803b6f 100644 --- a/packages/service-library/src/servicelib/redis/_semaphore.py +++ b/packages/service-library/src/servicelib/redis/_semaphore.py @@ -107,6 +107,8 @@ class DistributedSemaphore(BaseModel): release_script: ClassVar[AsyncScript | None] = None renew_script: ClassVar[AsyncScript | None] = None + _token: str | None = None # currently held token, if any + @classmethod def _register_scripts(cls, redis_client: RedisClientSDK) -> None: """Register Lua scripts with Redis if not already done. @@ -258,7 +260,7 @@ async def _try_acquire() -> list[str] | None: assert tokens_key_token is not None # nosec assert len(tokens_key_token) == 2 # nosec # noqa: PLR2004 assert tokens_key_token[0] == self.tokens_key # nosec - token = tokens_key_token[1] + self._token = tokens_key_token[1] # set up the semaphore holder with a TTL cls = type(self) @@ -266,7 +268,7 @@ async def _try_acquire() -> list[str] | None: result = await cls.acquire_script( # pylint: disable=not-callable keys=[self.holders_set, self.holder_key], args=[ - token, + self._token, self.instance_id, ttl_seconds, ], @@ -300,9 +302,12 @@ async def release(self) -> None: # Execute the release Lua script atomically cls = type(self) assert cls.release_script is not None # nosec + release_args = [self.instance_id] + if self._token is not None: + release_args.append(self._token) result = await cls.release_script( # pylint: disable=not-callable keys=[self.tokens_key, self.holders_set, self.holder_key], - args=[self.instance_id], + args=release_args, client=self.redis_client.redis, ) diff --git a/packages/service-library/src/servicelib/redis/lua/release_fair_semaphore_v2.lua b/packages/service-library/src/servicelib/redis/lua/release_fair_semaphore_v2.lua index 7b431aa37c7e..088662c83842 100644 --- a/packages/service-library/src/servicelib/redis/lua/release_fair_semaphore_v2.lua +++ b/packages/service-library/src/servicelib/redis/lua/release_fair_semaphore_v2.lua @@ -4,6 +4,7 @@ -- KEYS[3]: holder_key (individual holder TTL key for this instance) -- ARGV[1]: instance_id +-- ARGV[2]: passed_token (the token held by this instance or nil if unknown) -- -- Returns: {exit_code, status, current_count} -- exit_code: 0 if released, 255 if failed @@ -14,6 +15,7 @@ local holders_key = KEYS[2] local holder_key = KEYS[3] local instance_id = ARGV[1] +local passed_token = ARGV[2] -- Step 1: Check if this instance is currently a holder local is_holder = redis.call('SISMEMBER', holders_key, instance_id) @@ -29,6 +31,11 @@ if not token then -- this indicates a lost semaphore (e.g. due to TTL expiry) -- remove from holders set and return error redis.call('SREM', holders_key, instance_id) + -- if the token was passed return it to the pool + if passed_token then + redis.call('LPUSH', tokens_key, passed_token) + end + -- Note: we do NOT push a recovered token since we don't know its state return {255, 'expired', redis.call('SCARD', holders_key)} end diff --git a/packages/service-library/tests/redis/test_semaphore.py b/packages/service-library/tests/redis/test_semaphore.py index 2ec711ffe867..4f6153be768f 100644 --- a/packages/service-library/tests/redis/test_semaphore.py +++ b/packages/service-library/tests/redis/test_semaphore.py @@ -58,9 +58,9 @@ async def test_semaphore_initialization( assert semaphore.blocking is True assert semaphore.instance_id is not None assert semaphore.semaphore_key == f"{SEMAPHORE_KEY_PREFIX}{semaphore_name}" - assert semaphore.holder_key.startswith( - f"{SEMAPHORE_KEY_PREFIX}{semaphore_name}:holders:" - ) + assert semaphore.tokens_key.startswith(f"{semaphore.semaphore_key}:") + assert semaphore.holders_set.startswith(f"{semaphore.semaphore_key}:") + assert semaphore.holder_key.startswith(f"{semaphore.semaphore_key}:") async def test_invalid_semaphore_initialization( @@ -102,6 +102,28 @@ async def test_invalid_semaphore_initialization( ) +async def _assert_semaphore_redis_state( + redis_client_sdk: RedisClientSDK, + semaphore: DistributedSemaphore, + *, + expected_count: int, + expected_free_tokens: int, + expected_expired: bool = False, +): + """Helper to assert the internal Redis state of the semaphore""" + holders = await redis_client_sdk.redis.smembers(semaphore.holders_set) + assert len(holders) == expected_count + if expected_count > 0: + assert semaphore.instance_id in holders + holder_key_exists = await redis_client_sdk.redis.exists(semaphore.holder_key) + if expected_expired: + assert holder_key_exists == 0 + else: + assert holder_key_exists == 1 + tokens = await redis_client_sdk.redis.lrange(semaphore.tokens_key, 0, -1) + assert len(tokens) == expected_free_tokens + + async def test_semaphore_acquire_release_basic( redis_client_sdk: RedisClientSDK, semaphore_name: str, @@ -119,6 +141,12 @@ async def test_semaphore_acquire_release_basic( assert await semaphore.current_count() == 0 assert await semaphore.size() == semaphore_capacity assert await semaphore.is_acquired() is False + await _assert_semaphore_redis_state( + redis_client_sdk, + semaphore, + expected_count=0, + expected_free_tokens=semaphore_capacity, + ) # Acquire result = await semaphore.acquire() @@ -126,6 +154,12 @@ async def test_semaphore_acquire_release_basic( assert await semaphore.current_count() == 1 assert await semaphore.size() == semaphore_capacity - 1 assert await semaphore.is_acquired() is True + await _assert_semaphore_redis_state( + redis_client_sdk, + semaphore, + expected_count=1, + expected_free_tokens=semaphore_capacity - 1, + ) # Acquire again on same instance should return True immediately and keep the same count (reentrant) result = await semaphore.acquire() @@ -133,18 +167,36 @@ async def test_semaphore_acquire_release_basic( assert await semaphore.current_count() == 1 assert await semaphore.size() == semaphore_capacity - 1 assert await semaphore.is_acquired() is True + await _assert_semaphore_redis_state( + redis_client_sdk, + semaphore, + expected_count=1, + expected_free_tokens=semaphore_capacity - 1, + ) # reacquire should just work await semaphore.reacquire() assert await semaphore.current_count() == 1 assert await semaphore.size() == semaphore_capacity - 1 assert await semaphore.is_acquired() is True + await _assert_semaphore_redis_state( + redis_client_sdk, + semaphore, + expected_count=1, + expected_free_tokens=semaphore_capacity - 1, + ) # Release await semaphore.release() assert await semaphore.current_count() == 0 assert await semaphore.size() == semaphore_capacity assert await semaphore.is_acquired() is False + await _assert_semaphore_redis_state( + redis_client_sdk, + semaphore, + expected_count=0, + expected_free_tokens=semaphore_capacity, + ) # reacquire after release should fail with pytest.raises( @@ -152,6 +204,12 @@ async def test_semaphore_acquire_release_basic( match=f"Semaphore '{semaphore_name}' was not acquired by this instance", ): await semaphore.reacquire() + await _assert_semaphore_redis_state( + redis_client_sdk, + semaphore, + expected_count=0, + expected_free_tokens=semaphore_capacity, + ) # so does release again with pytest.raises( @@ -159,6 +217,12 @@ async def test_semaphore_acquire_release_basic( match=f"Semaphore '{semaphore_name}' was not acquired by this instance", ): await semaphore.release() + await _assert_semaphore_redis_state( + redis_client_sdk, + semaphore, + expected_count=0, + expected_free_tokens=semaphore_capacity, + ) async def test_semaphore_acquire_release_with_ttl_expiry( @@ -176,8 +240,22 @@ async def test_semaphore_acquire_release_with_ttl_expiry( await semaphore.acquire() assert await semaphore.current_count() == 1 assert await semaphore.size() == semaphore_capacity - 1 + await _assert_semaphore_redis_state( + redis_client_sdk, + semaphore, + expected_count=1, + expected_free_tokens=semaphore_capacity - 1, + ) + # wait for TTL to expire await asyncio.sleep(with_short_default_semaphore_ttl.total_seconds() + 0.1) + await _assert_semaphore_redis_state( + redis_client_sdk, + semaphore, + expected_count=1, + expected_free_tokens=semaphore_capacity - 1, + expected_expired=True, + ) # TTL expired, reacquire should fail with pytest.raises( @@ -185,12 +263,25 @@ async def test_semaphore_acquire_release_with_ttl_expiry( match=f"Semaphore '{semaphore_name}' was lost by this instance", ): await semaphore.reacquire() + await _assert_semaphore_redis_state( + redis_client_sdk, + semaphore, + expected_count=1, + expected_free_tokens=semaphore_capacity - 1, + expected_expired=True, + ) # and release should also fail with pytest.raises( SemaphoreLostError, match=f"Semaphore '{semaphore_name}' was lost by this instance", ): await semaphore.release() + await _assert_semaphore_redis_state( + redis_client_sdk, + semaphore, + expected_count=0, + expected_free_tokens=semaphore_capacity, + ) async def test_semaphore_context_manager( From 46a0baab6ee01931f1d6c7024620868bcec67716 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 23 Sep 2025 09:44:09 +0200 Subject: [PATCH 36/68] release pushes the token back in --- .../tests/redis/test_semaphore.py | 30 +++++++++---------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/packages/service-library/tests/redis/test_semaphore.py b/packages/service-library/tests/redis/test_semaphore.py index 4f6153be768f..313c6c267617 100644 --- a/packages/service-library/tests/redis/test_semaphore.py +++ b/packages/service-library/tests/redis/test_semaphore.py @@ -199,10 +199,7 @@ async def test_semaphore_acquire_release_basic( ) # reacquire after release should fail - with pytest.raises( - SemaphoreNotAcquiredError, - match=f"Semaphore '{semaphore_name}' was not acquired by this instance", - ): + with pytest.raises(SemaphoreNotAcquiredError): await semaphore.reacquire() await _assert_semaphore_redis_state( redis_client_sdk, @@ -212,10 +209,7 @@ async def test_semaphore_acquire_release_basic( ) # so does release again - with pytest.raises( - SemaphoreNotAcquiredError, - match=f"Semaphore '{semaphore_name}' was not acquired by this instance", - ): + with pytest.raises(SemaphoreNotAcquiredError): await semaphore.release() await _assert_semaphore_redis_state( redis_client_sdk, @@ -258,10 +252,7 @@ async def test_semaphore_acquire_release_with_ttl_expiry( ) # TTL expired, reacquire should fail - with pytest.raises( - SemaphoreLostError, - match=f"Semaphore '{semaphore_name}' was lost by this instance", - ): + with pytest.raises(SemaphoreLostError): await semaphore.reacquire() await _assert_semaphore_redis_state( redis_client_sdk, @@ -271,10 +262,17 @@ async def test_semaphore_acquire_release_with_ttl_expiry( expected_expired=True, ) # and release should also fail - with pytest.raises( - SemaphoreLostError, - match=f"Semaphore '{semaphore_name}' was lost by this instance", - ): + with pytest.raises(SemaphoreLostError): + await semaphore.release() + await _assert_semaphore_redis_state( + redis_client_sdk, + semaphore, + expected_count=0, + expected_free_tokens=semaphore_capacity, + ) + + # and release again should also fail with different error + with pytest.raises(SemaphoreNotAcquiredError): await semaphore.release() await _assert_semaphore_redis_state( redis_client_sdk, From c6079e511e0c802a820303588cede1efc4fc9d2b Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 23 Sep 2025 09:44:36 +0200 Subject: [PATCH 37/68] release pushes the token back in --- packages/service-library/src/servicelib/redis/_semaphore.py | 1 + 1 file changed, 1 insertion(+) diff --git a/packages/service-library/src/servicelib/redis/_semaphore.py b/packages/service-library/src/servicelib/redis/_semaphore.py index 47c6fa803b6f..4b9f81cc8634 100644 --- a/packages/service-library/src/servicelib/redis/_semaphore.py +++ b/packages/service-library/src/servicelib/redis/_semaphore.py @@ -310,6 +310,7 @@ async def release(self) -> None: args=release_args, client=self.redis_client.redis, ) + self._token = None assert isinstance(result, list) # nosec exit_code, status, current_count = result From be8b3ff525f4d4091f1eda41e2bd10b3a9d315d9 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 23 Sep 2025 10:30:08 +0200 Subject: [PATCH 38/68] re-created context manager --- .../src/servicelib/redis/_semaphore.py | 126 ++++++++++++++++-- .../servicelib/redis/_semaphore_decorator.py | 4 +- .../tests/redis/test_semaphore.py | 30 ----- 3 files changed, 118 insertions(+), 42 deletions(-) diff --git a/packages/service-library/src/servicelib/redis/_semaphore.py b/packages/service-library/src/servicelib/redis/_semaphore.py index 4b9f81cc8634..bdf296c8eacf 100644 --- a/packages/service-library/src/servicelib/redis/_semaphore.py +++ b/packages/service-library/src/servicelib/redis/_semaphore.py @@ -1,11 +1,17 @@ +import asyncio +import contextlib import datetime import logging +import socket import uuid -from types import TracebackType +from collections.abc import AsyncIterator from typing import Annotated, ClassVar +import arrow import redis.exceptions +from common_library.async_tools import cancel_wait_task from common_library.basic_types import DEFAULT_FACTORY +from common_library.logging.logging_errors import create_troubleshooting_log_kwargs from pydantic import ( BaseModel, Field, @@ -24,8 +30,10 @@ ) from tenacity.stop import stop_base +from ..background_task import periodic from ._client import RedisClientSDK from ._constants import ( + DEFAULT_EXPECTED_LOCK_OVERALL_TIME, DEFAULT_SEMAPHORE_TTL, DEFAULT_SOCKET_TIMEOUT, SEMAPHORE_KEY_PREFIX, @@ -407,15 +415,111 @@ async def size(self) -> int: self.redis_client.redis.llen(self.tokens_key) ) - # Context manager support - async def __aenter__(self) -> "DistributedSemaphore": - await self.acquire() - return self - async def __aexit__( - self, - exc_type: type[BaseException] | None, - exc_val: BaseException | None, - exc_tb: TracebackType | None, +@contextlib.asynccontextmanager +async def distributed_semaphore( + redis_client: RedisClientSDK, + *, + key: str, + capacity: PositiveInt, + ttl: datetime.timedelta = DEFAULT_SEMAPHORE_TTL, + blocking: bool = True, + blocking_timeout: datetime.timedelta | None = DEFAULT_SOCKET_TIMEOUT, + expected_lock_overall_time: datetime.timedelta = DEFAULT_EXPECTED_LOCK_OVERALL_TIME, +) -> AsyncIterator[DistributedSemaphore]: + """ + Async context manager for DistributedSemaphore. + + Example: + async with distributed_semaphore(redis_client, "my_resource", capacity=3) as sem: + # Only 3 instances can execute this block concurrently + await do_limited_work() + """ + semaphore = DistributedSemaphore( + redis_client=redis_client, + key=key, + capacity=capacity, + ttl=ttl, + blocking=blocking, + blocking_timeout=blocking_timeout, + ) + + @periodic(interval=semaphore.ttl / 3, raise_on_error=True) + async def _periodic_reacquisition( + semaphore: DistributedSemaphore, started: asyncio.Event ) -> None: - await self.release() + await semaphore.reacquire() + if not started.is_set(): + started.set() + + try: + if not await semaphore.acquire(): + raise SemaphoreAcquisitionError(name=key, instance_id=semaphore.instance_id) + lock_acquisition_time = arrow.utcnow() + + async with ( + asyncio.TaskGroup() as tg + ): # NOTE: using task group ensures proper cancellation propagation of parent task + auto_reacquisition_started = asyncio.Event() + auto_reacquisition_task = tg.create_task( + _periodic_reacquisition(semaphore, auto_reacquisition_started), + name=f"semaphore/auto_reacquisition_task_{semaphore.key}_{semaphore.instance_id}", + ) + await auto_reacquisition_started.wait() + + yield semaphore + + await cancel_wait_task(auto_reacquisition_task) + except BaseExceptionGroup as eg: + semaphore_lost_errors, other_errors = eg.split( + SemaphoreLostError | SemaphoreNotAcquiredError + ) + if other_errors: + assert len(other_errors.exceptions) == 1 # nosec + raise other_errors from eg + assert semaphore_lost_errors is not None # nosec + assert len(semaphore_lost_errors.exceptions) == 1 # nosec + raise semaphore_lost_errors.exceptions[0] from eg + finally: + try: + await semaphore.release() + except SemaphoreNotAcquiredError as exc: + _logger.exception( + **create_troubleshooting_log_kwargs( + f"Unexpected error while releasing semaphore '{semaphore.key}'", + error=exc, + error_context={ + "semaphore_key": semaphore.key, + "semaphore_instance_id": semaphore.instance_id, + "hostname": socket.gethostname(), + }, + tip="This indicates a logic error in the code using the semaphore", + ) + ) + except SemaphoreLostError as exc: + _logger.exception( + **create_troubleshooting_log_kwargs( + f"Unexpected error while releasing semaphore '{semaphore.key}'", + error=exc, + error_context={ + "semaphore_key": semaphore.key, + "semaphore_instance_id": semaphore.instance_id, + "hostname": socket.gethostname(), + }, + tip="This indicates that the semaphore was lost or expired before release. " + "Look for synchronouse code or the loop is very busy and cannot schedule the reacquisition task.", + ) + ) + finally: + lock_release_time = arrow.utcnow() + locking_time = lock_release_time - lock_acquisition_time + if locking_time > expected_lock_overall_time: + _logger.warning( + "Semaphore '%s' was held for %s by %s which is longer than expected (%s). " + "TIP: consider reducing the locking time by optimizing the code inside " + "the critical section or increasing the default locking time", + semaphore.key, + locking_time, + semaphore.instance_id, + expected_lock_overall_time, + ) diff --git a/packages/service-library/src/servicelib/redis/_semaphore_decorator.py b/packages/service-library/src/servicelib/redis/_semaphore_decorator.py index 529eb33fb22a..f458fb37b3d8 100644 --- a/packages/service-library/src/servicelib/redis/_semaphore_decorator.py +++ b/packages/service-library/src/servicelib/redis/_semaphore_decorator.py @@ -43,7 +43,9 @@ async def _managed_semaphore_execution( """Common semaphore management logic with auto-renewal.""" # Acquire the semaphore first if not await semaphore.acquire(): - raise SemaphoreAcquisitionError(name=semaphore_key, capacity=semaphore.capacity) + raise SemaphoreAcquisitionError( + name=semaphore_key, instance_id=semaphore.instance_id + ) lock_acquisition_time = arrow.utcnow() try: diff --git a/packages/service-library/tests/redis/test_semaphore.py b/packages/service-library/tests/redis/test_semaphore.py index 313c6c267617..f826034961fb 100644 --- a/packages/service-library/tests/redis/test_semaphore.py +++ b/packages/service-library/tests/redis/test_semaphore.py @@ -282,36 +282,6 @@ async def test_semaphore_acquire_release_with_ttl_expiry( ) -async def test_semaphore_context_manager( - redis_client_sdk: RedisClientSDK, - semaphore_name: str, - semaphore_capacity: int, -): - async with DistributedSemaphore( - redis_client=redis_client_sdk, key=semaphore_name, capacity=semaphore_capacity - ) as semaphore: - assert await semaphore.current_count() == 1 - - # Should be released after context - assert await semaphore.current_count() == 0 - - -async def test_semaphore_release_without_acquire_raises( - redis_client_sdk: RedisClientSDK, - semaphore_name: str, - semaphore_capacity: int, -): - semaphore = DistributedSemaphore( - redis_client=redis_client_sdk, key=semaphore_name, capacity=semaphore_capacity - ) - - with pytest.raises( - SemaphoreNotAcquiredError, - match=f"Semaphore '{semaphore_name}' was not acquired by this instance", - ): - await semaphore.release() - - async def test_semaphore_multiple_instances_capacity_limit( redis_client_sdk: RedisClientSDK, semaphore_name: str, From 48a4eb41b676a52ec1ac3e4f42d9f6c40c83861e Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 23 Sep 2025 10:30:59 +0200 Subject: [PATCH 39/68] ruff --- packages/service-library/tests/redis/test_semaphore.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/packages/service-library/tests/redis/test_semaphore.py b/packages/service-library/tests/redis/test_semaphore.py index f826034961fb..971242a63b79 100644 --- a/packages/service-library/tests/redis/test_semaphore.py +++ b/packages/service-library/tests/redis/test_semaphore.py @@ -22,6 +22,7 @@ SemaphoreAcquisitionError, SemaphoreNotAcquiredError, ) +from servicelib.redis._utils import handle_redis_returns_union_types pytest_simcore_core_services_selection = [ "redis", @@ -111,7 +112,9 @@ async def _assert_semaphore_redis_state( expected_expired: bool = False, ): """Helper to assert the internal Redis state of the semaphore""" - holders = await redis_client_sdk.redis.smembers(semaphore.holders_set) + holders = await handle_redis_returns_union_types( + redis_client_sdk.redis.smembers(semaphore.holders_set) + ) assert len(holders) == expected_count if expected_count > 0: assert semaphore.instance_id in holders @@ -120,7 +123,9 @@ async def _assert_semaphore_redis_state( assert holder_key_exists == 0 else: assert holder_key_exists == 1 - tokens = await redis_client_sdk.redis.lrange(semaphore.tokens_key, 0, -1) + tokens = await handle_redis_returns_union_types( + redis_client_sdk.redis.lrange(semaphore.tokens_key, 0, -1) + ) assert len(tokens) == expected_free_tokens From 57452410f56c55402ee8ee59018b50dd2666b3f6 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 23 Sep 2025 10:43:09 +0200 Subject: [PATCH 40/68] ongoing --- .../tests/redis/test_semaphore.py | 62 +++++++++++++++---- 1 file changed, 50 insertions(+), 12 deletions(-) diff --git a/packages/service-library/tests/redis/test_semaphore.py b/packages/service-library/tests/redis/test_semaphore.py index 971242a63b79..44ca253d5cc8 100644 --- a/packages/service-library/tests/redis/test_semaphore.py +++ b/packages/service-library/tests/redis/test_semaphore.py @@ -21,6 +21,7 @@ DistributedSemaphore, SemaphoreAcquisitionError, SemaphoreNotAcquiredError, + distributed_semaphore, ) from servicelib.redis._utils import handle_redis_returns_union_types @@ -301,31 +302,58 @@ async def test_semaphore_multiple_instances_capacity_limit( # Acquire first two should succeed assert await semaphores[0].acquire() is True + assert await semaphores[0].is_acquired() is True + await _assert_semaphore_redis_state( + redis_client_sdk, + semaphores[0], + expected_count=1, + expected_free_tokens=capacity - 1, + ) + assert await semaphores[1].is_acquired() is False + for sem in semaphores[:4]: + assert await sem.current_count() == 1 + assert await sem.size() == capacity - 1 + + # acquire second assert await semaphores[1].acquire() is True + for sem in semaphores[:2]: + assert await sem.is_acquired() is True + assert await sem.current_count() == 2 + assert await sem.size() == capacity - 2 + await _assert_semaphore_redis_state( + redis_client_sdk, + sem, + expected_count=2, + expected_free_tokens=capacity - 2, + ) # Third and fourth should fail in non-blocking mode - for semaphore in semaphores[2:]: - semaphore.blocking = False - assert await semaphore.acquire() is False - - # Check counts - assert await semaphores[0].current_count() == 2 - assert await semaphores[0].size() == 0 + for sem in semaphores[2:]: + sem.blocking = False + assert await sem.acquire() is False + assert await sem.is_acquired() is False + assert await sem.current_count() == 2 + assert await sem.size() == capacity - 2 # Release one await semaphores[0].release() - assert await semaphores[0].current_count() == 1 - assert await semaphores[0].size() == 1 + assert await semaphores[0].is_acquired() is False + for sem in semaphores[:4]: + assert await sem.current_count() == 1 + assert await sem.size() == capacity - 1 # Now third can acquire assert await semaphores[2].acquire() is True + for sem in semaphores[:4]: + assert await sem.current_count() == 2 + assert await sem.size() == capacity - 2 # Clean up await semaphores[1].release() await semaphores[2].release() -async def test_semaphore_blocking_timeout( +async def test_semaphore_context_manager( redis_client_sdk: RedisClientSDK, semaphore_name: str, ): @@ -333,11 +361,21 @@ async def test_semaphore_blocking_timeout( timeout = datetime.timedelta(seconds=0.1) # First semaphore acquires - async with DistributedSemaphore( + async with distributed_semaphore( redis_client=redis_client_sdk, key=semaphore_name, capacity=capacity, - ): + ) as semaphore1: + assert await semaphore1.is_acquired() is True + assert await semaphore1.current_count() == 1 + assert await semaphore1.size() == 0 + await _assert_semaphore_redis_state( + redis_client_sdk, + semaphore1, + expected_count=1, + expected_free_tokens=0, + ) + # Second semaphore should timeout semaphore2 = DistributedSemaphore( redis_client=redis_client_sdk, From f7f8c86e1fec3505aa7d450e10afe66178b9b96c Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 23 Sep 2025 10:44:03 +0200 Subject: [PATCH 41/68] rename --- .../src/servicelib/redis/_semaphore.py | 2 +- .../tests/redis/test_semaphore.py | 24 +++++++++---------- .../tests/redis/test_semaphore_decorator.py | 8 +++---- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/packages/service-library/src/servicelib/redis/_semaphore.py b/packages/service-library/src/servicelib/redis/_semaphore.py index bdf296c8eacf..c8ff487358be 100644 --- a/packages/service-library/src/servicelib/redis/_semaphore.py +++ b/packages/service-library/src/servicelib/redis/_semaphore.py @@ -408,7 +408,7 @@ async def current_count(self) -> int: self.redis_client.redis.scard(self.holders_set) ) - async def size(self) -> int: + async def available_tokens(self) -> int: """Get the size of the semaphore (number of available tokens)""" await self._ensure_semaphore_initialized() return await handle_redis_returns_union_types( diff --git a/packages/service-library/tests/redis/test_semaphore.py b/packages/service-library/tests/redis/test_semaphore.py index 44ca253d5cc8..ba5cdeb47603 100644 --- a/packages/service-library/tests/redis/test_semaphore.py +++ b/packages/service-library/tests/redis/test_semaphore.py @@ -145,7 +145,7 @@ async def test_semaphore_acquire_release_basic( # Initially not acquired assert await semaphore.current_count() == 0 - assert await semaphore.size() == semaphore_capacity + assert await semaphore.available_tokens() == semaphore_capacity assert await semaphore.is_acquired() is False await _assert_semaphore_redis_state( redis_client_sdk, @@ -158,7 +158,7 @@ async def test_semaphore_acquire_release_basic( result = await semaphore.acquire() assert result is True assert await semaphore.current_count() == 1 - assert await semaphore.size() == semaphore_capacity - 1 + assert await semaphore.available_tokens() == semaphore_capacity - 1 assert await semaphore.is_acquired() is True await _assert_semaphore_redis_state( redis_client_sdk, @@ -171,7 +171,7 @@ async def test_semaphore_acquire_release_basic( result = await semaphore.acquire() assert result is True assert await semaphore.current_count() == 1 - assert await semaphore.size() == semaphore_capacity - 1 + assert await semaphore.available_tokens() == semaphore_capacity - 1 assert await semaphore.is_acquired() is True await _assert_semaphore_redis_state( redis_client_sdk, @@ -183,7 +183,7 @@ async def test_semaphore_acquire_release_basic( # reacquire should just work await semaphore.reacquire() assert await semaphore.current_count() == 1 - assert await semaphore.size() == semaphore_capacity - 1 + assert await semaphore.available_tokens() == semaphore_capacity - 1 assert await semaphore.is_acquired() is True await _assert_semaphore_redis_state( redis_client_sdk, @@ -195,7 +195,7 @@ async def test_semaphore_acquire_release_basic( # Release await semaphore.release() assert await semaphore.current_count() == 0 - assert await semaphore.size() == semaphore_capacity + assert await semaphore.available_tokens() == semaphore_capacity assert await semaphore.is_acquired() is False await _assert_semaphore_redis_state( redis_client_sdk, @@ -239,7 +239,7 @@ async def test_semaphore_acquire_release_with_ttl_expiry( ) await semaphore.acquire() assert await semaphore.current_count() == 1 - assert await semaphore.size() == semaphore_capacity - 1 + assert await semaphore.available_tokens() == semaphore_capacity - 1 await _assert_semaphore_redis_state( redis_client_sdk, semaphore, @@ -312,14 +312,14 @@ async def test_semaphore_multiple_instances_capacity_limit( assert await semaphores[1].is_acquired() is False for sem in semaphores[:4]: assert await sem.current_count() == 1 - assert await sem.size() == capacity - 1 + assert await sem.available_tokens() == capacity - 1 # acquire second assert await semaphores[1].acquire() is True for sem in semaphores[:2]: assert await sem.is_acquired() is True assert await sem.current_count() == 2 - assert await sem.size() == capacity - 2 + assert await sem.available_tokens() == capacity - 2 await _assert_semaphore_redis_state( redis_client_sdk, sem, @@ -333,20 +333,20 @@ async def test_semaphore_multiple_instances_capacity_limit( assert await sem.acquire() is False assert await sem.is_acquired() is False assert await sem.current_count() == 2 - assert await sem.size() == capacity - 2 + assert await sem.available_tokens() == capacity - 2 # Release one await semaphores[0].release() assert await semaphores[0].is_acquired() is False for sem in semaphores[:4]: assert await sem.current_count() == 1 - assert await sem.size() == capacity - 1 + assert await sem.available_tokens() == capacity - 1 # Now third can acquire assert await semaphores[2].acquire() is True for sem in semaphores[:4]: assert await sem.current_count() == 2 - assert await sem.size() == capacity - 2 + assert await sem.available_tokens() == capacity - 2 # Clean up await semaphores[1].release() @@ -368,7 +368,7 @@ async def test_semaphore_context_manager( ) as semaphore1: assert await semaphore1.is_acquired() is True assert await semaphore1.current_count() == 1 - assert await semaphore1.size() == 0 + assert await semaphore1.available_tokens() == 0 await _assert_semaphore_redis_state( redis_client_sdk, semaphore1, diff --git a/packages/service-library/tests/redis/test_semaphore_decorator.py b/packages/service-library/tests/redis/test_semaphore_decorator.py index 2b9be00f4720..4be6d30fb02b 100644 --- a/packages/service-library/tests/redis/test_semaphore_decorator.py +++ b/packages/service-library/tests/redis/test_semaphore_decorator.py @@ -93,7 +93,7 @@ async def long_running_work() -> Literal["success"]: ttl=short_ttl, ) assert await temp_semaphore.current_count() == 1 - assert await temp_semaphore.size() == semaphore_capacity - 1 + assert await temp_semaphore.available_tokens() == semaphore_capacity - 1 # Wait for work to complete result = await task @@ -102,7 +102,7 @@ async def long_running_work() -> Literal["success"]: # After completion, semaphore should be released assert await temp_semaphore.current_count() == 0 - assert await temp_semaphore.size() == semaphore_capacity + assert await temp_semaphore.available_tokens() == semaphore_capacity async def test_auto_renewal_lose_semaphore_raises( @@ -606,7 +606,7 @@ async def use_long_running_cm(): ttl=short_ttl, ) assert await temp_semaphore.current_count() == 1 - assert await temp_semaphore.size() == semaphore_capacity - 1 + assert await temp_semaphore.available_tokens() == semaphore_capacity - 1 # Wait for work to complete await task @@ -614,7 +614,7 @@ async def use_long_running_cm(): # After completion, semaphore should be released assert await temp_semaphore.current_count() == 0 - assert await temp_semaphore.size() == semaphore_capacity + assert await temp_semaphore.available_tokens() == semaphore_capacity async def test_context_manager_with_callable_parameters( From 738750fb6f3c015648868dbd15a95eddd9407494 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 23 Sep 2025 10:55:49 +0200 Subject: [PATCH 42/68] minor --- .../tests/redis/test_semaphore.py | 128 ++++++++++++------ 1 file changed, 89 insertions(+), 39 deletions(-) diff --git a/packages/service-library/tests/redis/test_semaphore.py b/packages/service-library/tests/redis/test_semaphore.py index ba5cdeb47603..7a451847663f 100644 --- a/packages/service-library/tests/redis/test_semaphore.py +++ b/packages/service-library/tests/redis/test_semaphore.py @@ -353,9 +353,75 @@ async def test_semaphore_multiple_instances_capacity_limit( await semaphores[2].release() +async def test_semaphore_with_timeout( + redis_client_sdk: RedisClientSDK, + semaphore_name: str, +): + timeout = datetime.timedelta(seconds=0.5) + semaphore1 = DistributedSemaphore( + redis_client=redis_client_sdk, + key=semaphore_name, + capacity=1, + blocking_timeout=timeout, + ) + assert await semaphore1.acquire() is True + assert await semaphore1.is_acquired() is True + await _assert_semaphore_redis_state( + redis_client_sdk, + semaphore1, + expected_count=1, + expected_free_tokens=0, + ) + semaphore2 = DistributedSemaphore( + redis_client=redis_client_sdk, + key=semaphore_name, + capacity=1, + blocking_timeout=timeout, + ) + # Second should timeout + with pytest.raises(SemaphoreAcquisitionError): + await semaphore2.acquire() + assert await semaphore2.is_acquired() is False + await _assert_semaphore_redis_state( + redis_client_sdk, + semaphore1, + expected_count=1, + expected_free_tokens=0, + ) + + async def test_semaphore_context_manager( redis_client_sdk: RedisClientSDK, semaphore_name: str, +): + async with distributed_semaphore( + redis_client=redis_client_sdk, + key=semaphore_name, + capacity=1, + ) as semaphore1: + assert await semaphore1.is_acquired() is True + assert await semaphore1.current_count() == 1 + assert await semaphore1.available_tokens() == 0 + await _assert_semaphore_redis_state( + redis_client_sdk, + semaphore1, + expected_count=1, + expected_free_tokens=0, + ) + assert await semaphore1.is_acquired() is False + assert await semaphore1.current_count() == 0 + assert await semaphore1.available_tokens() == 1 + await _assert_semaphore_redis_state( + redis_client_sdk, + semaphore1, + expected_count=0, + expected_free_tokens=1, + ) + + +async def test_semaphore_context_manager_with_timeout( + redis_client_sdk: RedisClientSDK, + semaphore_name: str, ): capacity = 1 timeout = datetime.timedelta(seconds=0.1) @@ -375,20 +441,34 @@ async def test_semaphore_context_manager( expected_count=1, expected_free_tokens=0, ) - - # Second semaphore should timeout + # Second semaphore should raise on timeout + with pytest.raises(SemaphoreAcquisitionError): + async with distributed_semaphore( + redis_client=redis_client_sdk, + key=semaphore_name, + capacity=capacity, + blocking=True, + blocking_timeout=timeout, + ): + ... + + # non-blocking should also raise when used with context manager + with pytest.raises(SemaphoreAcquisitionError): + async with distributed_semaphore( + redis_client=redis_client_sdk, + key=semaphore_name, + capacity=capacity, + blocking=False, + ): + ... + # using the semaphore directly should in non-blocking mode should return False semaphore2 = DistributedSemaphore( redis_client=redis_client_sdk, key=semaphore_name, capacity=capacity, - blocking_timeout=timeout, + blocking=False, ) - - with pytest.raises( - SemaphoreAcquisitionError, - match=f"Could not acquire semaphore '{semaphore_name}' by this instance", - ): - await semaphore2.acquire() + assert await semaphore2.acquire() is False # now try infinite timeout semaphore3 = DistributedSemaphore( @@ -402,36 +482,6 @@ async def test_semaphore_context_manager( assert not acquire_task.done() -async def test_semaphore_blocking_acquire_waits( - redis_client_sdk: RedisClientSDK, - semaphore_name: str, -): - capacity = 1 - semaphore1 = DistributedSemaphore( - redis_client=redis_client_sdk, key=semaphore_name, capacity=capacity - ) - semaphore2 = DistributedSemaphore( - redis_client=redis_client_sdk, key=semaphore_name, capacity=capacity - ) - - # First acquires immediately - await semaphore1.acquire() - - # Second will wait - async def delayed_release() -> None: - await asyncio.sleep(0.1) - await semaphore1.release() - - acquire_task = asyncio.create_task(semaphore2.acquire()) - release_task = asyncio.create_task(delayed_release()) - - # Both should complete successfully - results = await asyncio.gather(acquire_task, release_task) - assert results[0] is True # acquire succeeded - - await semaphore2.release() - - @pytest.mark.parametrize( "exception", [RuntimeError, asyncio.CancelledError], From 5be7c644506c2e8dfc64e8d8c4492207f16b589f Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 23 Sep 2025 11:04:55 +0200 Subject: [PATCH 43/68] getting there --- .../src/servicelib/redis/_errors.py | 12 +++++++++--- .../src/servicelib/redis/_semaphore.py | 15 +++++++-------- .../service-library/tests/redis/test_semaphore.py | 6 +++--- 3 files changed, 19 insertions(+), 14 deletions(-) diff --git a/packages/service-library/src/servicelib/redis/_errors.py b/packages/service-library/src/servicelib/redis/_errors.py index 3aada3b0e96d..2d09a3730869 100644 --- a/packages/service-library/src/servicelib/redis/_errors.py +++ b/packages/service-library/src/servicelib/redis/_errors.py @@ -26,17 +26,23 @@ class LockLostError(BaseRedisError): ProjectLockError: TypeAlias = redis.exceptions.LockError # NOTE: backwards compatible -class SemaphoreAcquisitionError(BaseRedisError): +class SemaphoreError(BaseRedisError): + msg_template: str = ( + "Unexpected error with semaphore '{name}' by this instance `{instance_id}`" + ) + + +class SemaphoreAcquisitionError(SemaphoreError): msg_template: str = ( "Could not acquire semaphore '{name}' by this instance `{instance_id}`" ) -class SemaphoreNotAcquiredError(BaseRedisError): +class SemaphoreNotAcquiredError(SemaphoreError): msg_template: str = ( "Semaphore '{name}' was not acquired by this instance `{instance_id}`" ) -class SemaphoreLostError(BaseRedisError): +class SemaphoreLostError(SemaphoreError): msg_template: str = "Semaphore '{name}' was lost by this instance `{instance_id}`" diff --git a/packages/service-library/src/servicelib/redis/_semaphore.py b/packages/service-library/src/servicelib/redis/_semaphore.py index c8ff487358be..86862f25b6f6 100644 --- a/packages/service-library/src/servicelib/redis/_semaphore.py +++ b/packages/service-library/src/servicelib/redis/_semaphore.py @@ -40,6 +40,7 @@ ) from ._errors import ( SemaphoreAcquisitionError, + SemaphoreError, SemaphoreLostError, SemaphoreNotAcquiredError, ) @@ -452,10 +453,10 @@ async def _periodic_reacquisition( if not started.is_set(): started.set() + lock_acquisition_time = arrow.utcnow() try: if not await semaphore.acquire(): raise SemaphoreAcquisitionError(name=key, instance_id=semaphore.instance_id) - lock_acquisition_time = arrow.utcnow() async with ( asyncio.TaskGroup() as tg @@ -471,15 +472,13 @@ async def _periodic_reacquisition( await cancel_wait_task(auto_reacquisition_task) except BaseExceptionGroup as eg: - semaphore_lost_errors, other_errors = eg.split( - SemaphoreLostError | SemaphoreNotAcquiredError - ) + semaphore_errors, other_errors = eg.split(SemaphoreError) if other_errors: assert len(other_errors.exceptions) == 1 # nosec - raise other_errors from eg - assert semaphore_lost_errors is not None # nosec - assert len(semaphore_lost_errors.exceptions) == 1 # nosec - raise semaphore_lost_errors.exceptions[0] from eg + raise other_errors.exceptions[0] from eg + assert semaphore_errors is not None # nosec + assert len(semaphore_errors.exceptions) == 1 # nosec + raise semaphore_errors.exceptions[0] from eg finally: try: await semaphore.release() diff --git a/packages/service-library/tests/redis/test_semaphore.py b/packages/service-library/tests/redis/test_semaphore.py index 7a451847663f..1d0512fa4328 100644 --- a/packages/service-library/tests/redis/test_semaphore.py +++ b/packages/service-library/tests/redis/test_semaphore.py @@ -496,7 +496,7 @@ async def test_semaphore_context_manager_with_exception( captured_semaphore: DistributedSemaphore | None = None async def _raising_context(): - async with DistributedSemaphore( + async with distributed_semaphore( redis_client=redis_client_sdk, key=semaphore_name, capacity=semaphore_capacity, @@ -524,10 +524,10 @@ async def test_multiple_semaphores_different_keys( capacity = 1 async with ( - DistributedSemaphore( + distributed_semaphore( redis_client=redis_client_sdk, key=key1, capacity=capacity ), - DistributedSemaphore( + distributed_semaphore( redis_client=redis_client_sdk, key=key2, capacity=capacity ), ): From 5acf5a278e0692cff5630178f40f91156b173c17 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 23 Sep 2025 11:17:03 +0200 Subject: [PATCH 44/68] getting there --- .../tests/redis/test_semaphore.py | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/packages/service-library/tests/redis/test_semaphore.py b/packages/service-library/tests/redis/test_semaphore.py index 1d0512fa4328..776bd253fb04 100644 --- a/packages/service-library/tests/redis/test_semaphore.py +++ b/packages/service-library/tests/redis/test_semaphore.py @@ -514,6 +514,39 @@ async def _raising_context(): assert await captured_semaphore.current_count() == 0 +async def test_semaphore_context_manager_lost_renewal( + redis_client_sdk: RedisClientSDK, + semaphore_name: str, + with_short_default_semaphore_ttl: datetime.timedelta, +): + with pytest.raises(SemaphoreLostError): # noqa: PT012 + async with distributed_semaphore( + redis_client=redis_client_sdk, + key=semaphore_name, + capacity=1, + ttl=with_short_default_semaphore_ttl, + ) as semaphore: + assert await semaphore.is_acquired() is True + assert await semaphore.current_count() == 1 + assert await semaphore.available_tokens() == 0 + await _assert_semaphore_redis_state( + redis_client_sdk, + semaphore, + expected_count=1, + expected_free_tokens=0, + ) + + # now simulate lost renewal by deleting the holder key + await redis_client_sdk.redis.delete(semaphore.holder_key) + # wait a bit to let the auto-renewal task detect the lost lock + # the sleep will be interrupted by the exception and the context manager will exit + with pytest.raises(asyncio.CancelledError): + await asyncio.sleep( + with_short_default_semaphore_ttl.total_seconds() + 0.5 + ) + raise asyncio.CancelledError + + async def test_multiple_semaphores_different_keys( redis_client_sdk: RedisClientSDK, faker: Faker, From a9b0649ea1bd573ab52ad1496199eddea29cbc84 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 23 Sep 2025 11:29:56 +0200 Subject: [PATCH 45/68] ensure we test it all --- .../tests/redis/test_semaphore.py | 56 +++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/packages/service-library/tests/redis/test_semaphore.py b/packages/service-library/tests/redis/test_semaphore.py index 776bd253fb04..890acd351900 100644 --- a/packages/service-library/tests/redis/test_semaphore.py +++ b/packages/service-library/tests/redis/test_semaphore.py @@ -7,6 +7,7 @@ import asyncio import datetime +import logging import pytest from faker import Faker @@ -547,6 +548,61 @@ async def test_semaphore_context_manager_lost_renewal( raise asyncio.CancelledError +async def test_semaphore_context_manager_auto_renewal( + redis_client_sdk: RedisClientSDK, + semaphore_name: str, + with_short_default_semaphore_ttl: datetime.timedelta, +): + async with distributed_semaphore( + redis_client=redis_client_sdk, + key=semaphore_name, + capacity=1, + ttl=with_short_default_semaphore_ttl, + ) as semaphore: + assert await semaphore.is_acquired() is True + assert await semaphore.current_count() == 1 + assert await semaphore.available_tokens() == 0 + await _assert_semaphore_redis_state( + redis_client_sdk, + semaphore, + expected_count=1, + expected_free_tokens=0, + ) + + # wait for a few TTLs to ensure auto-renewal is working + total_wait = with_short_default_semaphore_ttl.total_seconds() * 3 + await asyncio.sleep(total_wait) + + # should still be acquired + assert await semaphore.is_acquired() is True + assert await semaphore.current_count() == 1 + assert await semaphore.available_tokens() == 0 + await _assert_semaphore_redis_state( + redis_client_sdk, + semaphore, + expected_count=1, + expected_free_tokens=0, + ) + + +async def test_semaphore_context_manager_logs_warning_when_hold_too_long( + redis_client_sdk: RedisClientSDK, + semaphore_name: str, + caplog: pytest.LogCaptureFixture, +): + """Test that a warning is logged when holding the semaphore for too long""" + with caplog.at_level(logging.WARNING): + async with distributed_semaphore( + redis_client=redis_client_sdk, + key=semaphore_name, + capacity=1, + expected_lock_overall_time=datetime.timedelta(milliseconds=200), + ): + await asyncio.sleep(0.3) + assert caplog.records + assert "longer than expected" in caplog.messages[-1] + + async def test_multiple_semaphores_different_keys( redis_client_sdk: RedisClientSDK, faker: Faker, From 73d783dc2419c66a0431f52234cd0b4c99fb6dd8 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 23 Sep 2025 11:30:20 +0200 Subject: [PATCH 46/68] clean --- packages/service-library/tests/redis/test_semaphore_decorator.py | 1 - 1 file changed, 1 deletion(-) diff --git a/packages/service-library/tests/redis/test_semaphore_decorator.py b/packages/service-library/tests/redis/test_semaphore_decorator.py index 4be6d30fb02b..1c3b8f36346a 100644 --- a/packages/service-library/tests/redis/test_semaphore_decorator.py +++ b/packages/service-library/tests/redis/test_semaphore_decorator.py @@ -402,7 +402,6 @@ async def test_long_locking_logs_warning( redis_client_sdk: RedisClientSDK, semaphore_name: str, caplog: pytest.LogCaptureFixture, - mocker: MockerFixture, ): @with_limited_concurrency( redis_client_sdk, From 51b326801bca87ffa5d18cdd3a8ccbbcc3d57ebb Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 23 Sep 2025 12:34:56 +0200 Subject: [PATCH 47/68] fixed blocking and non-blocking --- .../src/servicelib/redis/_semaphore.py | 113 +++++++++--------- .../tests/redis/test_semaphore.py | 14 ++- 2 files changed, 69 insertions(+), 58 deletions(-) diff --git a/packages/service-library/src/servicelib/redis/_semaphore.py b/packages/service-library/src/servicelib/redis/_semaphore.py index 86862f25b6f6..82e2c885888c 100644 --- a/packages/service-library/src/servicelib/redis/_semaphore.py +++ b/packages/service-library/src/servicelib/redis/_semaphore.py @@ -21,14 +21,10 @@ ) from redis.commands.core import AsyncScript from tenacity import ( - RetryError, retry, retry_if_exception_type, - stop_after_delay, - stop_never, wait_random_exponential, ) -from tenacity.stop import stop_base from ..background_task import periodic from ._client import RedisClientSDK @@ -195,6 +191,55 @@ async def _ensure_semaphore_initialized(self) -> None: client=self.redis_client.redis, ) + async def _blocking_acquire(self) -> str | None: + @retry( + wait=wait_random_exponential(min=0.1, max=0.5), + retry=retry_if_exception_type(redis.exceptions.TimeoutError), + ) + async def _acquire_forever_on_socket_timeout() -> list[str] | None: + # NOTE: brpop returns None on timeout + + tokens_key_token: list[str] | None = await handle_redis_returns_union_types( + self.redis_client.redis.brpop( + [self.tokens_key], + timeout=None, # NOTE: we always block forever since tenacity takes care of timing out + ) + ) + return tokens_key_token + + try: + # NOTE: redis-py library timeouts when the defined socket timeout triggers (e.g. DEFAULT_SOCKET_TIMEOUT) + # The BRPOP command itself could timeout but the redis-py socket timeout defeats the purpose + # so we always block forever on BRPOP, tenacity takes care of retrying when a socket timeout happens + # and we use asyncio.timeout to enforce the blocking_timeout if defined + async with asyncio.timeout( + self.blocking_timeout.total_seconds() if self.blocking_timeout else None + ): + tokens_key_token = await _acquire_forever_on_socket_timeout() + assert tokens_key_token is not None # nosec + assert len(tokens_key_token) == 2 # nosec # noqa: PLR2004 + assert tokens_key_token[0] == self.tokens_key # nosec + return tokens_key_token[1] + except TimeoutError as e: + raise SemaphoreAcquisitionError( + name=self.key, instance_id=self.instance_id + ) from e + + async def _non_blocking_acquire(self) -> str | None: + token: str | list[str] | None = await handle_redis_returns_union_types( + self.redis_client.redis.rpop(self.tokens_key) + ) + if token is None: + _logger.debug( + "Semaphore '%s' not acquired (no tokens available) (instance: %s)", + self.key, + self.instance_id, + ) + return None + + assert isinstance(token, str) # nosec + return token + async def acquire(self) -> bool: """ Acquire the semaphore. @@ -217,60 +262,14 @@ async def acquire(self) -> bool: ttl_seconds = self.ttl.total_seconds() - # Determine retry stop condition based on blocking configuration - stop_condition: stop_base = stop_after_delay(0) - if self.blocking: - stop_condition = ( - stop_after_delay(self.blocking_timeout) - if self.blocking_timeout - else stop_never - ) - - try: - - @retry( - stop=stop_condition, - wait=wait_random_exponential(min=0.1, max=0.5), - retry=retry_if_exception_type(redis.exceptions.TimeoutError), - ) - async def _try_acquire() -> list[str] | None: - # NOTE: brpop returns None on timeout - # NOTE: redis-py library timeouts when the socket times out which is defined - # elsewhere on the client (e.g. DEFAULT_SOCKET_TIMEOUT) - # we always block forever since tenacity takes care of timing out - # therefore we can distinguish between a proper timeout (returns None) and a socket - # timeout (raises an exception) - - tokens_key_token: list[str] | None = ( - await handle_redis_returns_union_types( - self.redis_client.redis.brpop( - [self.tokens_key], - timeout=None, # NOTE: we always block forever since tenacity takes care of timing out - ) - ) - ) - return tokens_key_token - - tokens_key_token = await _try_acquire() - except RetryError as e: - # NOTE: if we end up here that means we could not acquire the semaphore - _logger.debug( - "Timeout acquiring semaphore '%s' (instance: %s)", - self.key, - self.instance_id, - ) - if self.blocking: - raise SemaphoreAcquisitionError( - name=self.key, instance_id=self.instance_id - ) from e - return False - - # If we got here it means we acquired a token - assert tokens_key_token is not None # nosec - assert len(tokens_key_token) == 2 # nosec # noqa: PLR2004 - assert tokens_key_token[0] == self.tokens_key # nosec - self._token = tokens_key_token[1] + if self.blocking is False: + self._token = await self._non_blocking_acquire() + if not self._token: + return False + else: + self._token = await self._blocking_acquire() + assert self._token is not None # nosec # set up the semaphore holder with a TTL cls = type(self) assert cls.acquire_script is not None # nosec diff --git a/packages/service-library/tests/redis/test_semaphore.py b/packages/service-library/tests/redis/test_semaphore.py index 890acd351900..38dc2ebb77a3 100644 --- a/packages/service-library/tests/redis/test_semaphore.py +++ b/packages/service-library/tests/redis/test_semaphore.py @@ -354,11 +354,23 @@ async def test_semaphore_multiple_instances_capacity_limit( await semaphores[2].release() +@pytest.fixture +def with_slow_redis_socket_timeout( + mock_redis_socket_timeout: None, mocker: MockerFixture +) -> None: + # put back to higher value to allow normal operations + mocker.patch( + "servicelib.redis._client.DEFAULT_SOCKET_TIMEOUT", + datetime.timedelta(seconds=30), + ) + + async def test_semaphore_with_timeout( + with_slow_redis_socket_timeout, redis_client_sdk: RedisClientSDK, semaphore_name: str, ): - timeout = datetime.timedelta(seconds=0.5) + timeout = datetime.timedelta(seconds=1) semaphore1 = DistributedSemaphore( redis_client=redis_client_sdk, key=semaphore_name, From 42a31e0e4735e470ae84937ebf769c1aa65d3d90 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 23 Sep 2025 13:11:13 +0200 Subject: [PATCH 48/68] change test --- .../service-library/tests/redis/test_semaphore_decorator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/service-library/tests/redis/test_semaphore_decorator.py b/packages/service-library/tests/redis/test_semaphore_decorator.py index 1c3b8f36346a..99ac1cadf335 100644 --- a/packages/service-library/tests/redis/test_semaphore_decorator.py +++ b/packages/service-library/tests/redis/test_semaphore_decorator.py @@ -364,7 +364,7 @@ async def test_with_large_capacity( redis_client_sdk: RedisClientSDK, semaphore_name: str, ): - large_capacity = 20 + large_capacity = 100 concurrent_count = 0 max_concurrent = 0 sleep_time_s = 10 From 62bc8f3b8caf7ecb69a1216146f29cc43160ff1e Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 23 Sep 2025 13:58:27 +0200 Subject: [PATCH 49/68] use underlying context manager --- .../servicelib/redis/_semaphore_decorator.py | 202 ++++-------------- 1 file changed, 40 insertions(+), 162 deletions(-) diff --git a/packages/service-library/src/servicelib/redis/_semaphore_decorator.py b/packages/service-library/src/servicelib/redis/_semaphore_decorator.py index f458fb37b3d8..5f0a064a2403 100644 --- a/packages/service-library/src/servicelib/redis/_semaphore_decorator.py +++ b/packages/service-library/src/servicelib/redis/_semaphore_decorator.py @@ -1,29 +1,17 @@ -import asyncio import datetime import functools import logging -import socket from collections.abc import AsyncIterator, Callable, Coroutine from contextlib import AbstractAsyncContextManager, asynccontextmanager from typing import Any, ParamSpec, TypeVar -import arrow -from common_library.async_tools import cancel_wait_task -from common_library.logging.logging_errors import create_troubleshooting_log_kwargs - -from ..background_task import periodic from ._client import RedisClientSDK from ._constants import ( DEFAULT_EXPECTED_LOCK_OVERALL_TIME, DEFAULT_SEMAPHORE_TTL, DEFAULT_SOCKET_TIMEOUT, ) -from ._errors import ( - SemaphoreAcquisitionError, - SemaphoreLostError, - SemaphoreNotAcquiredError, -) -from ._semaphore import DistributedSemaphore +from ._semaphore import distributed_semaphore _logger = logging.getLogger(__name__) @@ -32,126 +20,6 @@ R = TypeVar("R") -@asynccontextmanager -async def _managed_semaphore_execution( - semaphore: DistributedSemaphore, - semaphore_key: str, - ttl: datetime.timedelta, - execution_context: str, - expected_lock_overall_time: datetime.timedelta, -) -> AsyncIterator: - """Common semaphore management logic with auto-renewal.""" - # Acquire the semaphore first - if not await semaphore.acquire(): - raise SemaphoreAcquisitionError( - name=semaphore_key, instance_id=semaphore.instance_id - ) - - lock_acquisition_time = arrow.utcnow() - try: - # NOTE: Use TaskGroup for proper exception propagation, this ensures that in case of error the context manager will be properly exited - # and the semaphore released. - # If we use create_task() directly, exceptions in the task are not propagated to the parent task - # and the context manager may never exit, leading to semaphore leaks. - async with asyncio.TaskGroup() as tg: - started_event = asyncio.Event() - - # Create auto-renewal task - @periodic(interval=ttl / 3, raise_on_error=True) - async def _periodic_renewer() -> None: - await semaphore.reacquire() - if not started_event.is_set(): - started_event.set() - - # Start the renewal task - renewal_task = tg.create_task( - _periodic_renewer(), - name=f"semaphore/autorenewal_{semaphore_key}_{semaphore.instance_id}", - ) - await started_event.wait() - - yield - - # NOTE: if we do not explicitely await the task inside the context manager - # it sometimes hangs forever (Python issue?) - await cancel_wait_task(renewal_task, max_delay=None) - - except BaseExceptionGroup as eg: - semaphore_lost_errors, other_errors = eg.split(SemaphoreLostError) - # If there are any other errors, re-raise them - if other_errors: - assert len(other_errors.exceptions) == 1 # nosec - raise other_errors.exceptions[0] from eg - - assert semaphore_lost_errors is not None # nosec - assert len(semaphore_lost_errors.exceptions) == 1 # nosec - raise semaphore_lost_errors.exceptions[0] from eg - - finally: - # Always attempt to release the semaphore - try: - await semaphore.release() - except SemaphoreNotAcquiredError as exc: - _logger.exception( - **create_troubleshooting_log_kwargs( - f"Unexpected error while releasing semaphore '{semaphore_key}'", - error=exc, - error_context={ - "semaphore_key": semaphore_key, - "client_name": semaphore.redis_client.client_name, - "hostname": socket.gethostname(), - "execution_context": execution_context, - }, - tip="This might happen if the semaphore was lost before releasing it. " - "Look for synchronous code that prevents refreshing the semaphore or asyncio loop overload.", - ) - ) - finally: - lock_release_time = arrow.utcnow() - locking_time = lock_release_time - lock_acquisition_time - if locking_time > expected_lock_overall_time: - _logger.warning( - "Semaphore '%s' was held for %s which is longer than expected (%s). " - "TIP: consider reducing the locking time by optimizing the code inside " - "the critical section or increasing the default locking time", - semaphore_key, - locking_time, - expected_lock_overall_time, - ) - - -def _create_semaphore( - redis_client: RedisClientSDK | Callable[..., RedisClientSDK], - args: tuple[Any, ...], - *, - key: str | Callable[..., str], - capacity: int | Callable[..., int], - ttl: datetime.timedelta, - blocking: bool, - blocking_timeout: datetime.timedelta | None, - kwargs: dict[str, Any], -) -> tuple[DistributedSemaphore, str]: - """Create and configure a distributed semaphore from callable or static parameters.""" - semaphore_key = key(*args, **kwargs) if callable(key) else key - semaphore_capacity = capacity(*args, **kwargs) if callable(capacity) else capacity - client = redis_client(*args, **kwargs) if callable(redis_client) else redis_client - - assert isinstance(semaphore_key, str) # nosec - assert isinstance(semaphore_capacity, int) # nosec - assert isinstance(client, RedisClientSDK) # nosec - - semaphore = DistributedSemaphore( - redis_client=client, - key=semaphore_key, - capacity=semaphore_capacity, - ttl=ttl, - blocking=blocking, - blocking_timeout=blocking_timeout, - ) - - return semaphore, semaphore_key - - def with_limited_concurrency( redis_client: RedisClientSDK | Callable[..., RedisClientSDK], *, @@ -202,23 +70,28 @@ def _decorator( ) -> Callable[P, Coroutine[Any, Any, R]]: @functools.wraps(coro) async def _wrapper(*args: P.args, **kwargs: P.kwargs) -> R: - semaphore, semaphore_key = _create_semaphore( - redis_client, - args, - key=key, - capacity=capacity, + semaphore_key = key(*args, **kwargs) if callable(key) else key + semaphore_capacity = ( + capacity(*args, **kwargs) if callable(capacity) else capacity + ) + client = ( + redis_client(*args, **kwargs) + if callable(redis_client) + else redis_client + ) + + assert isinstance(semaphore_key, str) # nosec + assert isinstance(semaphore_capacity, int) # nosec + assert isinstance(client, RedisClientSDK) # nosec + + async with distributed_semaphore( + redis_client=client, + key=semaphore_key, + capacity=semaphore_capacity, ttl=ttl, blocking=blocking, blocking_timeout=blocking_timeout, - kwargs=kwargs, - ) - - async with _managed_semaphore_execution( - semaphore, - semaphore_key, - ttl, - f"coroutine_{coro.__name__}", - expected_lock_overall_time, + expected_lock_overall_time=expected_lock_overall_time, ): return await coro(*args, **kwargs) @@ -279,24 +152,29 @@ def _decorator( @functools.wraps(cm_func) @asynccontextmanager async def _wrapper(*args: P.args, **kwargs: P.kwargs) -> AsyncIterator[R]: - semaphore, semaphore_key = _create_semaphore( - redis_client, - args, - key=key, - capacity=capacity, - ttl=ttl, - blocking=blocking, - blocking_timeout=blocking_timeout, - kwargs=kwargs, + semaphore_key = key(*args, **kwargs) if callable(key) else key + semaphore_capacity = ( + capacity(*args, **kwargs) if callable(capacity) else capacity ) + client = ( + redis_client(*args, **kwargs) + if callable(redis_client) + else redis_client + ) + + assert isinstance(semaphore_key, str) # nosec + assert isinstance(semaphore_capacity, int) # nosec + assert isinstance(client, RedisClientSDK) # nosec async with ( - _managed_semaphore_execution( - semaphore, - semaphore_key, - ttl, - f"context_manager_{cm_func.__name__}", - expected_lock_overall_time, + distributed_semaphore( + redis_client=client, + key=semaphore_key, + capacity=semaphore_capacity, + ttl=ttl, + blocking=blocking, + blocking_timeout=blocking_timeout, + expected_lock_overall_time=expected_lock_overall_time, ), cm_func(*args, **kwargs) as value, ): From 063c4524a553bd50cc333cfbb8626577028eaaa3 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 23 Sep 2025 13:58:45 +0200 Subject: [PATCH 50/68] ensure cancellation happens for real --- .../src/servicelib/redis/_semaphore.py | 43 +++++++++++-------- 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/packages/service-library/src/servicelib/redis/_semaphore.py b/packages/service-library/src/servicelib/redis/_semaphore.py index 82e2c885888c..48c1331d06db 100644 --- a/packages/service-library/src/servicelib/redis/_semaphore.py +++ b/packages/service-library/src/servicelib/redis/_semaphore.py @@ -446,29 +446,37 @@ async def distributed_semaphore( @periodic(interval=semaphore.ttl / 3, raise_on_error=True) async def _periodic_reacquisition( - semaphore: DistributedSemaphore, started: asyncio.Event + semaphore: DistributedSemaphore, + started: asyncio.Event, + cancellation_event: asyncio.Event, ) -> None: - await semaphore.reacquire() + if cancellation_event.is_set(): + raise asyncio.CancelledError if not started.is_set(): started.set() + await semaphore.reacquire() - lock_acquisition_time = arrow.utcnow() try: if not await semaphore.acquire(): raise SemaphoreAcquisitionError(name=key, instance_id=semaphore.instance_id) + lock_acquisition_time = arrow.utcnow() + async with ( asyncio.TaskGroup() as tg ): # NOTE: using task group ensures proper cancellation propagation of parent task auto_reacquisition_started = asyncio.Event() + cancellation_event = asyncio.Event() auto_reacquisition_task = tg.create_task( - _periodic_reacquisition(semaphore, auto_reacquisition_started), + _periodic_reacquisition( + semaphore, auto_reacquisition_started, cancellation_event + ), name=f"semaphore/auto_reacquisition_task_{semaphore.key}_{semaphore.instance_id}", ) await auto_reacquisition_started.wait() yield semaphore - + cancellation_event.set() # NOTE: this ensure cancellation is effective await cancel_wait_task(auto_reacquisition_task) except BaseExceptionGroup as eg: semaphore_errors, other_errors = eg.split(SemaphoreError) @@ -508,16 +516,15 @@ async def _periodic_reacquisition( "Look for synchronouse code or the loop is very busy and cannot schedule the reacquisition task.", ) ) - finally: - lock_release_time = arrow.utcnow() - locking_time = lock_release_time - lock_acquisition_time - if locking_time > expected_lock_overall_time: - _logger.warning( - "Semaphore '%s' was held for %s by %s which is longer than expected (%s). " - "TIP: consider reducing the locking time by optimizing the code inside " - "the critical section or increasing the default locking time", - semaphore.key, - locking_time, - semaphore.instance_id, - expected_lock_overall_time, - ) + lock_release_time = arrow.utcnow() + locking_time = lock_release_time - lock_acquisition_time + if locking_time > expected_lock_overall_time: + _logger.warning( + "Semaphore '%s' was held for %s by %s which is longer than expected (%s). " + "TIP: consider reducing the locking time by optimizing the code inside " + "the critical section or increasing the default locking time", + semaphore.key, + locking_time, + semaphore.instance_id, + expected_lock_overall_time, + ) From ba10d178b47200305ebec8acd8958736ef1cf2eb Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 23 Sep 2025 14:01:24 +0200 Subject: [PATCH 51/68] ok --- .../tests/redis/test_semaphore_decorator.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/packages/service-library/tests/redis/test_semaphore_decorator.py b/packages/service-library/tests/redis/test_semaphore_decorator.py index 99ac1cadf335..031a60247147 100644 --- a/packages/service-library/tests/redis/test_semaphore_decorator.py +++ b/packages/service-library/tests/redis/test_semaphore_decorator.py @@ -361,6 +361,7 @@ async def function_raising_cancelled_error(): @pytest.mark.heavy_load async def test_with_large_capacity( + with_slow_redis_socket_timeout: None, redis_client_sdk: RedisClientSDK, semaphore_name: str, ): @@ -377,16 +378,18 @@ async def test_with_large_capacity( blocking=True, blocking_timeout=None, ) - async def limited_function() -> None: + async def limited_function(task_id: int) -> None: nonlocal concurrent_count, max_concurrent concurrent_count += 1 max_concurrent = max(max_concurrent, concurrent_count) - with log_context(logging.INFO, f"task with {concurrent_count=}"): + with log_context(logging.INFO, f"{task_id=}") as ctx: + ctx.logger.info("started %s with %s", task_id, concurrent_count) await asyncio.sleep(sleep_time_s) + ctx.logger.info("done %s with %s", task_id, concurrent_count) concurrent_count -= 1 # Start tasks equal to the large capacity - tasks = [asyncio.create_task(limited_function()) for _ in range(num_tasks)] + tasks = [asyncio.create_task(limited_function(i)) for i in range(num_tasks)] done, pending = await asyncio.wait( tasks, timeout=float(num_tasks) / float(large_capacity) * 10.0 * float(sleep_time_s), From 8736204c1322b23995456b184890fc47093954ee Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 23 Sep 2025 14:37:04 +0200 Subject: [PATCH 52/68] fixed variable not accessed --- .../src/servicelib/redis/_semaphore.py | 26 ++++++++++--------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/packages/service-library/src/servicelib/redis/_semaphore.py b/packages/service-library/src/servicelib/redis/_semaphore.py index 48c1331d06db..1f4a3cfcffb2 100644 --- a/packages/service-library/src/servicelib/redis/_semaphore.py +++ b/packages/service-library/src/servicelib/redis/_semaphore.py @@ -456,6 +456,7 @@ async def _periodic_reacquisition( started.set() await semaphore.reacquire() + lock_acquisition_time = None try: if not await semaphore.acquire(): raise SemaphoreAcquisitionError(name=key, instance_id=semaphore.instance_id) @@ -516,15 +517,16 @@ async def _periodic_reacquisition( "Look for synchronouse code or the loop is very busy and cannot schedule the reacquisition task.", ) ) - lock_release_time = arrow.utcnow() - locking_time = lock_release_time - lock_acquisition_time - if locking_time > expected_lock_overall_time: - _logger.warning( - "Semaphore '%s' was held for %s by %s which is longer than expected (%s). " - "TIP: consider reducing the locking time by optimizing the code inside " - "the critical section or increasing the default locking time", - semaphore.key, - locking_time, - semaphore.instance_id, - expected_lock_overall_time, - ) + if lock_acquisition_time is not None: + lock_release_time = arrow.utcnow() + locking_time = lock_release_time - lock_acquisition_time + if locking_time > expected_lock_overall_time: + _logger.warning( + "Semaphore '%s' was held for %s by %s which is longer than expected (%s). " + "TIP: consider reducing the locking time by optimizing the code inside " + "the critical section or increasing the default locking time", + semaphore.key, + locking_time, + semaphore.instance_id, + expected_lock_overall_time, + ) From 2a3e7e8e2bc9517b527e42e23879634b2d453f82 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 23 Sep 2025 14:58:19 +0200 Subject: [PATCH 53/68] found a problem --- .../service-library/tests/redis/test_semaphore.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/packages/service-library/tests/redis/test_semaphore.py b/packages/service-library/tests/redis/test_semaphore.py index 38dc2ebb77a3..33b9005db1a7 100644 --- a/packages/service-library/tests/redis/test_semaphore.py +++ b/packages/service-library/tests/redis/test_semaphore.py @@ -506,26 +506,17 @@ async def test_semaphore_context_manager_with_exception( semaphore_capacity: int, exception: type[Exception | asyncio.CancelledError], ): - captured_semaphore: DistributedSemaphore | None = None - async def _raising_context(): async with distributed_semaphore( redis_client=redis_client_sdk, key=semaphore_name, capacity=semaphore_capacity, - ) as sem: - nonlocal captured_semaphore - captured_semaphore = sem + ): raise exception("Test") with pytest.raises(exception, match="Test"): await _raising_context() - # Should be released even after exception - assert captured_semaphore is not None - # captured_semaphore is guaranteed to be not None by the assert above - assert await captured_semaphore.current_count() == 0 - async def test_semaphore_context_manager_lost_renewal( redis_client_sdk: RedisClientSDK, From 44d16aea3feee8e4469a56beff16d9179e55028b Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 23 Sep 2025 15:21:36 +0200 Subject: [PATCH 54/68] should be fixed now --- .../src/servicelib/redis/_semaphore.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/packages/service-library/src/servicelib/redis/_semaphore.py b/packages/service-library/src/servicelib/redis/_semaphore.py index 1f4a3cfcffb2..32a5d35a9dbc 100644 --- a/packages/service-library/src/servicelib/redis/_semaphore.py +++ b/packages/service-library/src/servicelib/redis/_semaphore.py @@ -450,7 +450,7 @@ async def _periodic_reacquisition( started: asyncio.Event, cancellation_event: asyncio.Event, ) -> None: - if cancellation_event.is_set(): + if cancellation_event.is_set() or asyncio.current_task().cancelled(): raise asyncio.CancelledError if not started.is_set(): started.set() @@ -475,10 +475,11 @@ async def _periodic_reacquisition( name=f"semaphore/auto_reacquisition_task_{semaphore.key}_{semaphore.instance_id}", ) await auto_reacquisition_started.wait() - - yield semaphore - cancellation_event.set() # NOTE: this ensure cancellation is effective - await cancel_wait_task(auto_reacquisition_task) + try: + yield semaphore + finally: + cancellation_event.set() # NOTE: this ensure cancellation is effective + await cancel_wait_task(auto_reacquisition_task) except BaseExceptionGroup as eg: semaphore_errors, other_errors = eg.split(SemaphoreError) if other_errors: From bd8e19175c71498ab51f2d66db28e2323aeab738 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 23 Sep 2025 15:21:58 +0200 Subject: [PATCH 55/68] clean --- packages/service-library/src/servicelib/redis/_semaphore.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/service-library/src/servicelib/redis/_semaphore.py b/packages/service-library/src/servicelib/redis/_semaphore.py index 32a5d35a9dbc..d8c291e70ae3 100644 --- a/packages/service-library/src/servicelib/redis/_semaphore.py +++ b/packages/service-library/src/servicelib/redis/_semaphore.py @@ -450,7 +450,7 @@ async def _periodic_reacquisition( started: asyncio.Event, cancellation_event: asyncio.Event, ) -> None: - if cancellation_event.is_set() or asyncio.current_task().cancelled(): + if cancellation_event.is_set(): raise asyncio.CancelledError if not started.is_set(): started.set() From c4d46a390dbd6492c320e7e960182f65069483ce Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 23 Sep 2025 15:23:04 +0200 Subject: [PATCH 56/68] add some docs --- packages/service-library/src/servicelib/redis/_semaphore.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/packages/service-library/src/servicelib/redis/_semaphore.py b/packages/service-library/src/servicelib/redis/_semaphore.py index d8c291e70ae3..a009258a0486 100644 --- a/packages/service-library/src/servicelib/redis/_semaphore.py +++ b/packages/service-library/src/servicelib/redis/_semaphore.py @@ -476,6 +476,8 @@ async def _periodic_reacquisition( ) await auto_reacquisition_started.wait() try: + # NOTE: this try/finally ensures that cancellation_event is set when we exit the context + # even in case of exceptions yield semaphore finally: cancellation_event.set() # NOTE: this ensure cancellation is effective From fc0c2d045dc7a6f22454bde5f006eb1299ccb087 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 23 Sep 2025 15:23:57 +0200 Subject: [PATCH 57/68] ruff --- packages/service-library/src/servicelib/redis/_semaphore.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/service-library/src/servicelib/redis/_semaphore.py b/packages/service-library/src/servicelib/redis/_semaphore.py index a009258a0486..1e95b9925460 100644 --- a/packages/service-library/src/servicelib/redis/_semaphore.py +++ b/packages/service-library/src/servicelib/redis/_semaphore.py @@ -417,7 +417,7 @@ async def available_tokens(self) -> int: @contextlib.asynccontextmanager -async def distributed_semaphore( +async def distributed_semaphore( # noqa: C901 redis_client: RedisClientSDK, *, key: str, From e3e2afd49686ff5452b1927035db52d92f0d1c80 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 23 Sep 2025 15:44:38 +0200 Subject: [PATCH 58/68] removed socket timeout on redis clients as this is a dangerous setting --- .../src/pytest_simcore/redis_service.py | 9 --------- .../service-library/src/servicelib/redis/_client.py | 3 +-- .../src/servicelib/redis/_constants.py | 4 +++- .../src/servicelib/redis/_semaphore.py | 7 +++---- .../src/servicelib/redis/_semaphore_decorator.py | 5 ++--- packages/service-library/tests/conftest.py | 4 ++-- .../tests/deferred_tasks/test_deferred_tasks.py | 11 ----------- packages/service-library/tests/redis/test_client.py | 5 +---- .../tests/redis/test_clients_manager.py | 1 - .../service-library/tests/redis/test_semaphore.py | 12 ------------ .../tests/redis/test_semaphore_decorator.py | 13 ------------- .../01/notifications/test_rabbitmq_consumers.py | 1 - 12 files changed, 12 insertions(+), 63 deletions(-) diff --git a/packages/pytest-simcore/src/pytest_simcore/redis_service.py b/packages/pytest-simcore/src/pytest_simcore/redis_service.py index 824d61a57fb9..04177c7f9e28 100644 --- a/packages/pytest-simcore/src/pytest_simcore/redis_service.py +++ b/packages/pytest-simcore/src/pytest_simcore/redis_service.py @@ -4,7 +4,6 @@ import logging from collections.abc import AsyncIterator -from datetime import timedelta import pytest import tenacity @@ -116,14 +115,6 @@ async def wait_till_redis_responsive(redis_url: URL | str) -> None: await client.aclose(close_connection_pool=True) -@pytest.fixture -def mock_redis_socket_timeout(mocker: MockerFixture) -> None: - # lowered to allow CI to properly shutdown RedisClientSDK instances - mocker.patch( - "servicelib.redis._client.DEFAULT_SOCKET_TIMEOUT", timedelta(seconds=0.25) - ) - - @pytest.fixture async def use_in_memory_redis(mocker: MockerFixture) -> RedisSettings: mocker.patch("redis.asyncio.from_url", FakeAsyncRedis) diff --git a/packages/service-library/src/servicelib/redis/_client.py b/packages/service-library/src/servicelib/redis/_client.py index de407a74fe8d..4b725cf1948d 100644 --- a/packages/service-library/src/servicelib/redis/_client.py +++ b/packages/service-library/src/servicelib/redis/_client.py @@ -20,7 +20,6 @@ DEFAULT_DECODE_RESPONSES, DEFAULT_HEALTH_CHECK_INTERVAL, DEFAULT_LOCK_TTL, - DEFAULT_SOCKET_TIMEOUT, ) _logger = logging.getLogger(__name__) @@ -65,7 +64,7 @@ def __post_init__(self) -> None: redis.exceptions.ConnectionError, ], retry_on_timeout=True, - socket_timeout=DEFAULT_SOCKET_TIMEOUT.total_seconds(), + socket_timeout=None, # NOTE: setting a timeout here can lead to issues with long running commands encoding="utf-8", decode_responses=self.decode_responses, client_name=self.client_name, diff --git a/packages/service-library/src/servicelib/redis/_constants.py b/packages/service-library/src/servicelib/redis/_constants.py index 456d86b4d2a6..845e70d7fa8b 100644 --- a/packages/service-library/src/servicelib/redis/_constants.py +++ b/packages/service-library/src/servicelib/redis/_constants.py @@ -7,8 +7,10 @@ seconds=30 ) DEFAULT_LOCK_TTL: Final[datetime.timedelta] = datetime.timedelta(seconds=10) -DEFAULT_SOCKET_TIMEOUT: Final[datetime.timedelta] = datetime.timedelta(seconds=30) +DEFAULT_SEMAPHORE_BLOCK_TIMEOUT: Final[datetime.timedelta] = datetime.timedelta( + seconds=30 +) DEFAULT_SEMAPHORE_TTL: Final[datetime.timedelta] = datetime.timedelta(seconds=10) SEMAPHORE_KEY_PREFIX: Final[str] = "semaphores:" diff --git a/packages/service-library/src/servicelib/redis/_semaphore.py b/packages/service-library/src/servicelib/redis/_semaphore.py index 1e95b9925460..aa48cdec6b74 100644 --- a/packages/service-library/src/servicelib/redis/_semaphore.py +++ b/packages/service-library/src/servicelib/redis/_semaphore.py @@ -31,7 +31,6 @@ from ._constants import ( DEFAULT_EXPECTED_LOCK_OVERALL_TIME, DEFAULT_SEMAPHORE_TTL, - DEFAULT_SOCKET_TIMEOUT, SEMAPHORE_KEY_PREFIX, ) from ._errors import ( @@ -97,7 +96,7 @@ class DistributedSemaphore(BaseModel): blocking_timeout: Annotated[ datetime.timedelta | None, Field(description="Maximum time to wait when blocking"), - ] = DEFAULT_SOCKET_TIMEOUT + ] = None instance_id: Annotated[ str, Field( @@ -208,7 +207,7 @@ async def _acquire_forever_on_socket_timeout() -> list[str] | None: return tokens_key_token try: - # NOTE: redis-py library timeouts when the defined socket timeout triggers (e.g. DEFAULT_SOCKET_TIMEOUT) + # NOTE: redis-py library timeouts when the defined socket timeout triggers # The BRPOP command itself could timeout but the redis-py socket timeout defeats the purpose # so we always block forever on BRPOP, tenacity takes care of retrying when a socket timeout happens # and we use asyncio.timeout to enforce the blocking_timeout if defined @@ -424,7 +423,7 @@ async def distributed_semaphore( # noqa: C901 capacity: PositiveInt, ttl: datetime.timedelta = DEFAULT_SEMAPHORE_TTL, blocking: bool = True, - blocking_timeout: datetime.timedelta | None = DEFAULT_SOCKET_TIMEOUT, + blocking_timeout: datetime.timedelta | None = None, expected_lock_overall_time: datetime.timedelta = DEFAULT_EXPECTED_LOCK_OVERALL_TIME, ) -> AsyncIterator[DistributedSemaphore]: """ diff --git a/packages/service-library/src/servicelib/redis/_semaphore_decorator.py b/packages/service-library/src/servicelib/redis/_semaphore_decorator.py index 5f0a064a2403..72e7fd9d309e 100644 --- a/packages/service-library/src/servicelib/redis/_semaphore_decorator.py +++ b/packages/service-library/src/servicelib/redis/_semaphore_decorator.py @@ -9,7 +9,6 @@ from ._constants import ( DEFAULT_EXPECTED_LOCK_OVERALL_TIME, DEFAULT_SEMAPHORE_TTL, - DEFAULT_SOCKET_TIMEOUT, ) from ._semaphore import distributed_semaphore @@ -27,7 +26,7 @@ def with_limited_concurrency( capacity: int | Callable[..., int], ttl: datetime.timedelta = DEFAULT_SEMAPHORE_TTL, blocking: bool = True, - blocking_timeout: datetime.timedelta | None = DEFAULT_SOCKET_TIMEOUT, + blocking_timeout: datetime.timedelta | None = None, expected_lock_overall_time: datetime.timedelta = DEFAULT_EXPECTED_LOCK_OVERALL_TIME, ) -> Callable[ [Callable[P, Coroutine[Any, Any, R]]], Callable[P, Coroutine[Any, Any, R]] @@ -107,7 +106,7 @@ def with_limited_concurrency_cm( capacity: int | Callable[..., int], ttl: datetime.timedelta = DEFAULT_SEMAPHORE_TTL, blocking: bool = True, - blocking_timeout: datetime.timedelta | None = DEFAULT_SOCKET_TIMEOUT, + blocking_timeout: datetime.timedelta | None = None, expected_lock_overall_time: datetime.timedelta = DEFAULT_EXPECTED_LOCK_OVERALL_TIME, ) -> Callable[ [Callable[P, AbstractAsyncContextManager[R]]], diff --git a/packages/service-library/tests/conftest.py b/packages/service-library/tests/conftest.py index c4f63a18a1ba..f06739423b20 100644 --- a/packages/service-library/tests/conftest.py +++ b/packages/service-library/tests/conftest.py @@ -112,7 +112,7 @@ async def _cleanup_redis_data(clients_manager: RedisClientsManager) -> None: @pytest.fixture async def get_redis_client_sdk( - mock_redis_socket_timeout: None, use_in_memory_redis: RedisSettings + use_in_memory_redis: RedisSettings, ) -> AsyncIterable[ Callable[[RedisDatabase], AbstractAsyncContextManager[RedisClientSDK]] ]: @@ -122,7 +122,7 @@ async def get_redis_client_sdk( @pytest.fixture async def get_in_process_redis_client_sdk( - mock_redis_socket_timeout: None, redis_service: RedisSettings + redis_service: RedisSettings, ) -> AsyncIterable[ Callable[[RedisDatabase], AbstractAsyncContextManager[RedisClientSDK]] ]: diff --git a/packages/service-library/tests/deferred_tasks/test_deferred_tasks.py b/packages/service-library/tests/deferred_tasks/test_deferred_tasks.py index 7e60c71cb30e..0bb6254542e9 100644 --- a/packages/service-library/tests/deferred_tasks/test_deferred_tasks.py +++ b/packages/service-library/tests/deferred_tasks/test_deferred_tasks.py @@ -3,7 +3,6 @@ import asyncio import contextlib -import datetime import itertools import json import random @@ -19,7 +18,6 @@ from common_library.json_serialization import json_dumps from common_library.serialization import model_dump_with_secrets from pydantic import NonNegativeFloat, NonNegativeInt -from pytest_mock import MockerFixture from servicelib.rabbitmq import RabbitMQClient from servicelib.redis import RedisClientSDK from servicelib.sequences_utils import partition_gen @@ -385,19 +383,10 @@ async def pause_redis(self) -> AsyncIterator[None]: yield -@pytest.fixture -def mock_default_socket_timeout(mocker: MockerFixture) -> None: - mocker.patch( - "servicelib.redis._client.DEFAULT_SOCKET_TIMEOUT", - datetime.timedelta(seconds=0.25), - ) - - @pytest.mark.parametrize("max_workers", [10]) @pytest.mark.parametrize("deferred_tasks_to_start", [100]) @pytest.mark.parametrize("service", ["rabbit", "redis"]) async def test_workflow_with_third_party_services_outages( - mock_default_socket_timeout: None, paused_container: Callable[[str], AbstractAsyncContextManager[None]], redis_client_sdk_deferred_tasks: RedisClientSDK, rabbit_client: RabbitMQClient, diff --git a/packages/service-library/tests/redis/test_client.py b/packages/service-library/tests/redis/test_client.py index 91ff29e5f38e..580c47d0facb 100644 --- a/packages/service-library/tests/redis/test_client.py +++ b/packages/service-library/tests/redis/test_client.py @@ -104,9 +104,7 @@ async def test_redis_lock_with_ttl( assert not await ttl_lock.locked() -async def test_redis_client_sdk_setup_shutdown( - mock_redis_socket_timeout: None, redis_service: RedisSettings -): +async def test_redis_client_sdk_setup_shutdown(redis_service: RedisSettings): # setup redis_resources_dns = redis_service.build_redis_dsn(RedisDatabase.RESOURCES) client = RedisClientSDK(redis_resources_dns, client_name="pytest") @@ -130,7 +128,6 @@ async def test_redis_client_sdk_setup_shutdown( async def test_regression_fails_on_redis_service_outage( - mock_redis_socket_timeout: None, paused_container: Callable[[str], AbstractAsyncContextManager[None]], redis_client_sdk: RedisClientSDK, ): diff --git a/packages/service-library/tests/redis/test_clients_manager.py b/packages/service-library/tests/redis/test_clients_manager.py index eeb110557e33..4bf5bc454f46 100644 --- a/packages/service-library/tests/redis/test_clients_manager.py +++ b/packages/service-library/tests/redis/test_clients_manager.py @@ -16,7 +16,6 @@ async def test_redis_client_sdks_manager( - mock_redis_socket_timeout: None, redis_service: RedisSettings, ): all_redis_configs: set[RedisManagerDBConfig] = { diff --git a/packages/service-library/tests/redis/test_semaphore.py b/packages/service-library/tests/redis/test_semaphore.py index 33b9005db1a7..6dd557370bf4 100644 --- a/packages/service-library/tests/redis/test_semaphore.py +++ b/packages/service-library/tests/redis/test_semaphore.py @@ -354,19 +354,7 @@ async def test_semaphore_multiple_instances_capacity_limit( await semaphores[2].release() -@pytest.fixture -def with_slow_redis_socket_timeout( - mock_redis_socket_timeout: None, mocker: MockerFixture -) -> None: - # put back to higher value to allow normal operations - mocker.patch( - "servicelib.redis._client.DEFAULT_SOCKET_TIMEOUT", - datetime.timedelta(seconds=30), - ) - - async def test_semaphore_with_timeout( - with_slow_redis_socket_timeout, redis_client_sdk: RedisClientSDK, semaphore_name: str, ): diff --git a/packages/service-library/tests/redis/test_semaphore_decorator.py b/packages/service-library/tests/redis/test_semaphore_decorator.py index 031a60247147..545cfd36e30c 100644 --- a/packages/service-library/tests/redis/test_semaphore_decorator.py +++ b/packages/service-library/tests/redis/test_semaphore_decorator.py @@ -361,7 +361,6 @@ async def function_raising_cancelled_error(): @pytest.mark.heavy_load async def test_with_large_capacity( - with_slow_redis_socket_timeout: None, redis_client_sdk: RedisClientSDK, semaphore_name: str, ): @@ -424,19 +423,7 @@ async def limited_function() -> None: assert "longer than expected" in caplog.messages[-1] -@pytest.fixture -def with_slow_redis_socket_timeout( - mock_redis_socket_timeout: None, mocker: MockerFixture -) -> None: - # put back to higher value to allow normal operations - mocker.patch( - "servicelib.redis._client.DEFAULT_SOCKET_TIMEOUT", - datetime.timedelta(seconds=30), - ) - - async def test_semaphore_fair_queuing( - with_slow_redis_socket_timeout: None, redis_client_sdk: RedisClientSDK, semaphore_name: str, ): diff --git a/services/web/server/tests/integration/01/notifications/test_rabbitmq_consumers.py b/services/web/server/tests/integration/01/notifications/test_rabbitmq_consumers.py index 99ef45fdd1d1..d29c362f8a77 100644 --- a/services/web/server/tests/integration/01/notifications/test_rabbitmq_consumers.py +++ b/services/web/server/tests/integration/01/notifications/test_rabbitmq_consumers.py @@ -132,7 +132,6 @@ async def _assert_handler_called_with_json( @pytest.fixture async def client( docker_registry: str, - mock_redis_socket_timeout: None, aiohttp_client: Callable, app_config: dict[str, Any], rabbit_service: RabbitSettings, From b0f4018ee232e0a8b5630ec87644008a757e5052 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 23 Sep 2025 16:00:40 +0200 Subject: [PATCH 59/68] ensure we have auto expiry on keys --- .../src/servicelib/redis/_semaphore.py | 29 ++++++++++++++----- .../redis/lua/acquire_fair_semaphore_v2.lua | 4 ++- .../redis/lua/register_semaphore_holder.lua | 2 +- .../redis/lua/renew_fair_semaphore_v2.lua | 10 +++++++ .../tests/redis/test_semaphore.py | 2 +- 5 files changed, 37 insertions(+), 10 deletions(-) diff --git a/packages/service-library/src/servicelib/redis/_semaphore.py b/packages/service-library/src/servicelib/redis/_semaphore.py index aa48cdec6b74..4fa79774268b 100644 --- a/packages/service-library/src/servicelib/redis/_semaphore.py +++ b/packages/service-library/src/servicelib/redis/_semaphore.py @@ -155,6 +155,18 @@ def holders_set(self) -> str: """Redis key for the holders SET.""" return f"{SEMAPHORE_KEY_PREFIX}{self.key}:holders_set" + @computed_field # type: ignore[prop-decorator] + @property + def holders_set_ttl(self) -> datetime.timedelta: + """TTL for the holders SET""" + return self.ttl * 5 + + @computed_field # type: ignore[prop-decorator] + @property + def tokens_set_ttl(self) -> datetime.timedelta: + """TTL for the tokens SET""" + return self.ttl * 5 + @computed_field # type: ignore[prop-decorator] @property def holder_key(self) -> str: @@ -181,12 +193,11 @@ def validate_timeout( async def _ensure_semaphore_initialized(self) -> None: """Initializes the semaphore in Redis if not already done.""" - ttl_seconds = self.ttl.total_seconds() cls = type(self) assert cls.register_semaphore is not None # nosec await cls.register_semaphore( # pylint: disable=not-callable keys=[self.tokens_key, self.holders_set], - args=[self.capacity, ttl_seconds], + args=[self.capacity, self.holders_set_ttl.total_seconds()], client=self.redis_client.redis, ) @@ -259,8 +270,6 @@ async def acquire(self) -> bool: ) return True - ttl_seconds = self.ttl.total_seconds() - if self.blocking is False: self._token = await self._non_blocking_acquire() if not self._token: @@ -277,7 +286,8 @@ async def acquire(self) -> bool: args=[ self._token, self.instance_id, - ttl_seconds, + self.ttl.total_seconds(), + self.holders_set_ttl.total_seconds(), ], client=self.redis_client.redis, ) @@ -361,8 +371,13 @@ async def reacquire(self) -> None: cls = type(self) assert cls.renew_script is not None # nosec result = await cls.renew_script( # pylint: disable=not-callable - keys=[self.holders_set, self.holder_key], - args=[self.instance_id, ttl_seconds], + keys=[self.holders_set, self.holder_key, self.tokens_key], + args=[ + self.instance_id, + ttl_seconds, + self.holders_set_ttl.total_seconds(), + self.tokens_set_ttl.total_seconds(), + ], client=self.redis_client.redis, ) diff --git a/packages/service-library/src/servicelib/redis/lua/acquire_fair_semaphore_v2.lua b/packages/service-library/src/servicelib/redis/lua/acquire_fair_semaphore_v2.lua index 26ea1827b2ec..396a2ec34df4 100644 --- a/packages/service-library/src/servicelib/redis/lua/acquire_fair_semaphore_v2.lua +++ b/packages/service-library/src/servicelib/redis/lua/acquire_fair_semaphore_v2.lua @@ -5,6 +5,7 @@ -- ARGV[1]: token (the token received from BRPOP) -- ARGV[2]: instance_id (the instance trying to acquire the semaphore) -- ARGV[3]: ttl_seconds (for the holder_key) +-- ARGV[4]: holders_set_ttl_seconds (to set expiry on holders set) -- -- Returns: {exit_code, status, token, current_count} -- exit_code: 0 if acquired @@ -16,6 +17,7 @@ local holder_key = KEYS[2] local token = ARGV[1] local instance_id = ARGV[2] local ttl_seconds = tonumber(ARGV[3]) +local holders_set_ttl_seconds = tonumber(ARGV[4]) @@ -24,7 +26,7 @@ redis.call('SADD', holders_key, instance_id) redis.call('SETEX', holder_key, ttl_seconds, token) -- Step 2: Set expiry on holders set to prevent infinite growth -redis.call('EXPIRE', holders_key, ttl_seconds * 10) +redis.call('EXPIRE', holders_key, holders_set_ttl_seconds) local current_count = redis.call('SCARD', holders_key) diff --git a/packages/service-library/src/servicelib/redis/lua/register_semaphore_holder.lua b/packages/service-library/src/servicelib/redis/lua/register_semaphore_holder.lua index b9ddd7335b9f..c41c53ab9fc7 100644 --- a/packages/service-library/src/servicelib/redis/lua/register_semaphore_holder.lua +++ b/packages/service-library/src/servicelib/redis/lua/register_semaphore_holder.lua @@ -23,7 +23,7 @@ if tokens_exist == 0 and holders_exist == 0 then redis.call('LPUSH', tokens_key, 'token_' .. i) end -- Set expiry on tokens list to prevent infinite growth - -- redis.call('EXPIRE', tokens_key, ttl_seconds) + redis.call('EXPIRE', tokens_key, ttl_seconds) end return 0 diff --git a/packages/service-library/src/servicelib/redis/lua/renew_fair_semaphore_v2.lua b/packages/service-library/src/servicelib/redis/lua/renew_fair_semaphore_v2.lua index 9526fa414b54..ef229ef91b0b 100644 --- a/packages/service-library/src/servicelib/redis/lua/renew_fair_semaphore_v2.lua +++ b/packages/service-library/src/servicelib/redis/lua/renew_fair_semaphore_v2.lua @@ -1,8 +1,11 @@ -- Renew semaphore holder TTL (simplified for token pool design) -- KEYS[1]: holders_key (SET of current holders) -- KEYS[2]: holder_key (individual holder TTL key for this instance) +-- KEYS[3]: tokens_key (LIST of available tokens) -- ARGV[1]: instance_id -- ARGV[2]: ttl_seconds +-- ARGV[3]: holders_ttl_seconds (to renew holders set) +-- ARGV[4]: tokens_ttl_seconds (to renew tokens list) -- -- Returns: {exit_code, status, current_count} -- exit_code: 0 if renewed, 255 if failed @@ -10,9 +13,12 @@ local holders_key = KEYS[1] local holder_key = KEYS[2] +local tokens_key = KEYS[3] local instance_id = ARGV[1] local ttl_seconds = tonumber(ARGV[2]) +local holders_ttl_seconds = tonumber(ARGV[3]) +local tokens_ttl_seconds = tonumber(ARGV[4]) -- Step 1: Check if this instance is currently a holder local is_holder = redis.call('SISMEMBER', holders_key, instance_id) @@ -32,4 +38,8 @@ end local token = redis.call('GET', holder_key) redis.call('SETEX', holder_key, ttl_seconds, token) +-- Step 4: Renew the holders set and tokens list TTLs to prevent infinite growth +redis.call('EXPIRE', holders_key, holders_ttl_seconds) +redis.call('EXPIRE', tokens_key, tokens_ttl_seconds) + return {0, 'renewed', redis.call('SCARD', holders_key)} diff --git a/packages/service-library/tests/redis/test_semaphore.py b/packages/service-library/tests/redis/test_semaphore.py index 6dd557370bf4..d3dd0eb60714 100644 --- a/packages/service-library/tests/redis/test_semaphore.py +++ b/packages/service-library/tests/redis/test_semaphore.py @@ -38,7 +38,7 @@ def with_short_default_semaphore_ttl( mocker: MockerFixture, ) -> datetime.timedelta: - short_ttl = datetime.timedelta(seconds=2) + short_ttl = datetime.timedelta(seconds=5) mocker.patch( "servicelib.redis._semaphore.DEFAULT_SEMAPHORE_TTL", short_ttl, From b8f9b8fc8a136db7e1a8d558dbda523fd876bab7 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 23 Sep 2025 17:25:03 +0200 Subject: [PATCH 60/68] ensure we keep docker up --- packages/celery-library/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/packages/celery-library/Makefile b/packages/celery-library/Makefile index 04596d3d1249..ccce149f8530 100644 --- a/packages/celery-library/Makefile +++ b/packages/celery-library/Makefile @@ -27,6 +27,7 @@ tests: ## runs unit tests --durations=10 \ --exitfirst \ --failed-first \ + --keep-docker-up \ --pdb \ -vv \ $(CURDIR)/tests @@ -41,6 +42,7 @@ tests-ci: ## runs unit tests --cov-report=term-missing \ --cov-report=xml \ --junitxml=junit.xml -o junit_family=legacy \ + --keep-docker-up \ --cov=celery_library \ --durations=10 \ --log-date-format="%Y-%m-%d %H:%M:%S" \ From d39b298e7362a2687755da448f86d12f3e5ac50b Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 23 Sep 2025 17:25:12 +0200 Subject: [PATCH 61/68] ensure cancellation is done --- packages/service-library/src/servicelib/redis/_client.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/packages/service-library/src/servicelib/redis/_client.py b/packages/service-library/src/servicelib/redis/_client.py index 4b725cf1948d..4e32d47c9fc5 100644 --- a/packages/service-library/src/servicelib/redis/_client.py +++ b/packages/service-library/src/servicelib/redis/_client.py @@ -71,6 +71,7 @@ def __post_init__(self) -> None: ) self._is_healthy = False self._started_event_task_health_check = asyncio.Event() + self._cancelled_event_task_health_check = asyncio.Event() async def setup(self) -> None: @periodic(interval=self.health_check_interval) @@ -78,6 +79,8 @@ async def _periodic_check_health() -> None: assert self._started_event_task_health_check # nosec self._started_event_task_health_check.set() self._is_healthy = await self.ping() + if self._cancelled_event_task_health_check.is_set(): + raise asyncio.CancelledError self._task_health_check = asyncio.create_task( _periodic_check_health(), @@ -99,10 +102,8 @@ async def shutdown(self) -> None: if self._task_health_check: assert self._started_event_task_health_check # nosec await self._started_event_task_health_check.wait() - - await cancel_wait_task( - self._task_health_check, max_delay=_HEALTHCHECK_TIMEOUT_S - ) + self._cancelled_event_task_health_check.set() + await cancel_wait_task(self._task_health_check, max_delay=None) await self._client.aclose(close_connection_pool=True) From 6933f970826beb6729ba55e4b371e087730544e9 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 23 Sep 2025 17:35:05 +0200 Subject: [PATCH 62/68] missing declaration --- packages/service-library/src/servicelib/redis/_client.py | 1 + 1 file changed, 1 insertion(+) diff --git a/packages/service-library/src/servicelib/redis/_client.py b/packages/service-library/src/servicelib/redis/_client.py index 4e32d47c9fc5..84907d51f645 100644 --- a/packages/service-library/src/servicelib/redis/_client.py +++ b/packages/service-library/src/servicelib/redis/_client.py @@ -48,6 +48,7 @@ class RedisClientSDK: _client: aioredis.Redis = field(init=False) _task_health_check: Task | None = None _started_event_task_health_check: asyncio.Event | None = None + _cancelled_event_task_health_check: asyncio.Event | None = None _is_healthy: bool = False @property From 0dc0e93b4197c1ff987c00ee9238c54477ec5936 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 23 Sep 2025 17:47:00 +0200 Subject: [PATCH 63/68] mypy --- packages/service-library/src/servicelib/redis/_client.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/packages/service-library/src/servicelib/redis/_client.py b/packages/service-library/src/servicelib/redis/_client.py index 84907d51f645..ee4e9a2040e0 100644 --- a/packages/service-library/src/servicelib/redis/_client.py +++ b/packages/service-library/src/servicelib/redis/_client.py @@ -78,6 +78,7 @@ async def setup(self) -> None: @periodic(interval=self.health_check_interval) async def _periodic_check_health() -> None: assert self._started_event_task_health_check # nosec + assert self._cancelled_event_task_health_check # nosec self._started_event_task_health_check.set() self._is_healthy = await self.ping() if self._cancelled_event_task_health_check.is_set(): @@ -103,6 +104,7 @@ async def shutdown(self) -> None: if self._task_health_check: assert self._started_event_task_health_check # nosec await self._started_event_task_health_check.wait() + assert self._cancelled_event_task_health_check # nosec self._cancelled_event_task_health_check.set() await cancel_wait_task(self._task_health_check, max_delay=None) From 421cbfc9ece9e5b633d9751ba75f98fa2ce2c10e Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 23 Sep 2025 18:03:53 +0200 Subject: [PATCH 64/68] ensure changing capacity changes the key as well --- .../src/servicelib/redis/_semaphore.py | 18 +++++++++--------- .../tests/redis/test_semaphore.py | 5 ++++- .../tests/redis/test_semaphore_decorator.py | 4 ++-- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/packages/service-library/src/servicelib/redis/_semaphore.py b/packages/service-library/src/servicelib/redis/_semaphore.py index 4fa79774268b..e4962cc46b90 100644 --- a/packages/service-library/src/servicelib/redis/_semaphore.py +++ b/packages/service-library/src/servicelib/redis/_semaphore.py @@ -141,19 +141,25 @@ def __init__(self, **data) -> None: @property def semaphore_key(self) -> str: """Redis key for the semaphore sorted set.""" - return f"{SEMAPHORE_KEY_PREFIX}{self.key}" + return f"{SEMAPHORE_KEY_PREFIX}{self.key}_cap{self.capacity}" @computed_field # type: ignore[prop-decorator] @property def tokens_key(self) -> str: """Redis key for the token pool LIST.""" - return f"{SEMAPHORE_KEY_PREFIX}{self.key}:tokens" + return f"{self.semaphore_key}:tokens" @computed_field # type: ignore[prop-decorator] @property def holders_set(self) -> str: """Redis key for the holders SET.""" - return f"{SEMAPHORE_KEY_PREFIX}{self.key}:holders_set" + return f"{self.semaphore_key}:holders_set" + + @computed_field # type: ignore[prop-decorator] + @property + def holder_key(self) -> str: + """Redis key for this instance's holder entry.""" + return f"{self.semaphore_key}:holders:{self.instance_id}" @computed_field # type: ignore[prop-decorator] @property @@ -167,12 +173,6 @@ def tokens_set_ttl(self) -> datetime.timedelta: """TTL for the tokens SET""" return self.ttl * 5 - @computed_field # type: ignore[prop-decorator] - @property - def holder_key(self) -> str: - """Redis key for this instance's holder entry.""" - return f"{SEMAPHORE_KEY_PREFIX}{self.key}:holders:{self.instance_id}" - @field_validator("ttl") @classmethod def validate_ttl(cls, v: datetime.timedelta) -> datetime.timedelta: diff --git a/packages/service-library/tests/redis/test_semaphore.py b/packages/service-library/tests/redis/test_semaphore.py index d3dd0eb60714..755ce716bfe2 100644 --- a/packages/service-library/tests/redis/test_semaphore.py +++ b/packages/service-library/tests/redis/test_semaphore.py @@ -60,7 +60,10 @@ async def test_semaphore_initialization( assert semaphore.ttl == DEFAULT_SEMAPHORE_TTL assert semaphore.blocking is True assert semaphore.instance_id is not None - assert semaphore.semaphore_key == f"{SEMAPHORE_KEY_PREFIX}{semaphore_name}" + assert ( + semaphore.semaphore_key + == f"{SEMAPHORE_KEY_PREFIX}{semaphore_name}_cap{semaphore_capacity}" + ) assert semaphore.tokens_key.startswith(f"{semaphore.semaphore_key}:") assert semaphore.holders_set.startswith(f"{semaphore.semaphore_key}:") assert semaphore.holder_key.startswith(f"{semaphore.semaphore_key}:") diff --git a/packages/service-library/tests/redis/test_semaphore_decorator.py b/packages/service-library/tests/redis/test_semaphore_decorator.py index 545cfd36e30c..f42af1368dbf 100644 --- a/packages/service-library/tests/redis/test_semaphore_decorator.py +++ b/packages/service-library/tests/redis/test_semaphore_decorator.py @@ -134,7 +134,7 @@ async def coro_that_should_fail() -> Literal["should not reach here"]: # Find and delete all holder keys for this semaphore holder_keys = await redis_client_sdk.redis.keys( - f"{SEMAPHORE_KEY_PREFIX}{semaphore_name}:holders:*" + f"{SEMAPHORE_KEY_PREFIX}{semaphore_name}_cap{semaphore_capacity}:holders:*" ) assert holder_keys, "Holder keys should exist before deletion" await redis_client_sdk.redis.delete(*holder_keys) @@ -734,7 +734,7 @@ async def use_failing_cm() -> None: # Find and delete all holder keys for this semaphore holder_keys = await redis_client_sdk.redis.keys( - f"{SEMAPHORE_KEY_PREFIX}{semaphore_name}:holders:*" + f"{SEMAPHORE_KEY_PREFIX}{semaphore_name}_cap{semaphore_capacity}:holders:*" ) assert holder_keys, "Holder keys should exist before deletion" await redis_client_sdk.redis.delete(*holder_keys) From d063248dca307d6b49f9557bc99e013c3481ee4d Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 23 Sep 2025 18:14:22 +0200 Subject: [PATCH 65/68] rename key --- .../tests/redis/test_semaphore_decorator.py | 39 +++---------------- 1 file changed, 6 insertions(+), 33 deletions(-) diff --git a/packages/service-library/tests/redis/test_semaphore_decorator.py b/packages/service-library/tests/redis/test_semaphore_decorator.py index f42af1368dbf..7a8164bb36da 100644 --- a/packages/service-library/tests/redis/test_semaphore_decorator.py +++ b/packages/service-library/tests/redis/test_semaphore_decorator.py @@ -458,37 +458,6 @@ async def limited_function(call_id: int): async def test_context_manager_basic_functionality( redis_client_sdk: RedisClientSDK, semaphore_name: str, -): - call_count = 0 - - @with_limited_concurrency_cm( - redis_client_sdk, - key=semaphore_name, - capacity=1, - ) - @asynccontextmanager - async def limited_context_manager(): - nonlocal call_count - call_count += 1 - yield call_count - - # Multiple concurrent context managers - async def use_context_manager() -> int: - async with limited_context_manager() as value: - await asyncio.sleep(0.1) - return value - - tasks = [asyncio.create_task(use_context_manager()) for _ in range(3)] - results = await asyncio.gather(*tasks) - - # All should complete successfully - assert len(results) == 3 - assert all(isinstance(r, int) for r in results) - - -async def test_context_manager_capacity_enforcement( - redis_client_sdk: RedisClientSDK, - semaphore_name: str, ): concurrent_count = 0 max_concurrent = 0 @@ -510,13 +479,17 @@ async def limited_context_manager(): finally: concurrent_count -= 1 - async def use_context_manager() -> None: + async def use_context_manager() -> int: async with limited_context_manager(): await asyncio.sleep(0.1) + return 1 # Start concurrent context managers tasks = [asyncio.create_task(use_context_manager()) for _ in range(20)] - await asyncio.gather(*tasks) + results = await asyncio.gather(*tasks) + # All should complete successfully + assert len(results) == 20 + assert all(isinstance(r, int) for r in results) # Should never exceed capacity of 2 assert max_concurrent <= 2 From c5d5b3db9f99361312eb5125e187d876095f0bce Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 23 Sep 2025 18:22:46 +0200 Subject: [PATCH 66/68] clean the cleanup script --- .../redis/lua/cleanup_fair_semaphore_v2.lua | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/packages/service-library/src/servicelib/redis/lua/cleanup_fair_semaphore_v2.lua b/packages/service-library/src/servicelib/redis/lua/cleanup_fair_semaphore_v2.lua index 88da6ca5ca1b..34a3b87dd28d 100644 --- a/packages/service-library/src/servicelib/redis/lua/cleanup_fair_semaphore_v2.lua +++ b/packages/service-library/src/servicelib/redis/lua/cleanup_fair_semaphore_v2.lua @@ -4,7 +4,7 @@ -- KEYS[3]: holder_prefix (prefix for holder keys, e.g. "semaphores:holders:key:") -- ARGV[1]: capacity (total semaphore capacity) -- --- Returns: {recovered_tokens, current_holders, available_tokens, total_cleaned} +-- Returns: {recovered_tokens, missing_tokens, excess_tokens} -- This script should be run periodically to recover tokens from crashed clients local tokens_key = KEYS[1] @@ -52,15 +52,5 @@ for i = 1, excess_tokens do redis.call('RPOP', tokens_key) end --- Step 4: Refresh expiry on data structures to prevent cleanup -local final_holders = redis.call('SCARD', holders_key) -local final_available = redis.call('LLEN', tokens_key) -if final_holders > 0 then - redis.call('EXPIRE', holders_key, 3600) -- 1 hour expiry -end -if final_available > 0 then - redis.call('EXPIRE', tokens_key, 3600) -- 1 hour expiry -end - -return {recovered_tokens, final_holders, final_available, #cleaned_holders} +return {recovered_tokens, missing_tokens, excess_tokens} From 84c7212f518ce697411c7612c2ebb3705c1d21b4 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 23 Sep 2025 19:55:38 +0200 Subject: [PATCH 67/68] @pcrespov review: rename scripts --- .../src/servicelib/redis/_semaphore.py | 16 ++++++++-------- .../src/servicelib/redis/_semaphore_lua.py | 10 +++++----- ...ir_semaphore_v2.lua => acquire_semaphore.lua} | 0 ...ir_semaphore_v2.lua => cleanup_semaphore.lua} | 0 ..._holder.lua => register_semaphore_tokens.lua} | 0 ...ir_semaphore_v2.lua => release_semaphore.lua} | 0 ...fair_semaphore_v2.lua => renew_semaphore.lua} | 0 7 files changed, 13 insertions(+), 13 deletions(-) rename packages/service-library/src/servicelib/redis/lua/{acquire_fair_semaphore_v2.lua => acquire_semaphore.lua} (100%) rename packages/service-library/src/servicelib/redis/lua/{cleanup_fair_semaphore_v2.lua => cleanup_semaphore.lua} (100%) rename packages/service-library/src/servicelib/redis/lua/{register_semaphore_holder.lua => register_semaphore_tokens.lua} (100%) rename packages/service-library/src/servicelib/redis/lua/{release_fair_semaphore_v2.lua => release_semaphore.lua} (100%) rename packages/service-library/src/servicelib/redis/lua/{renew_fair_semaphore_v2.lua => renew_semaphore.lua} (100%) diff --git a/packages/service-library/src/servicelib/redis/_semaphore.py b/packages/service-library/src/servicelib/redis/_semaphore.py index e4962cc46b90..05c3a7b1def5 100644 --- a/packages/service-library/src/servicelib/redis/_semaphore.py +++ b/packages/service-library/src/servicelib/redis/_semaphore.py @@ -40,10 +40,10 @@ SemaphoreNotAcquiredError, ) from ._semaphore_lua import ( - ACQUIRE_FAIR_SEMAPHORE_V2_SCRIPT, - REGISTER_FAIR_SEMAPHORE_SCRIPT, - RELEASE_FAIR_SEMAPHORE_V2_SCRIPT, - RENEW_FAIR_SEMAPHORE_V2_SCRIPT, + ACQUIRE_SEMAPHORE_SCRIPT, + REGISTER_SEMAPHORE_TOKEN_SCRIPT, + RELEASE_SEMAPHORE_SCRIPT, + RENEW_SEMAPHORE_SCRIPT, SCRIPT_BAD_EXIT_CODE, SCRIPT_OK_EXIT_CODE, ) @@ -121,16 +121,16 @@ def _register_scripts(cls, redis_client: RedisClientSDK) -> None: the script is only registered once.""" if cls.acquire_script is None: cls.register_semaphore = redis_client.redis.register_script( - REGISTER_FAIR_SEMAPHORE_SCRIPT + REGISTER_SEMAPHORE_TOKEN_SCRIPT ) cls.acquire_script = redis_client.redis.register_script( - ACQUIRE_FAIR_SEMAPHORE_V2_SCRIPT + ACQUIRE_SEMAPHORE_SCRIPT ) cls.release_script = redis_client.redis.register_script( - RELEASE_FAIR_SEMAPHORE_V2_SCRIPT + RELEASE_SEMAPHORE_SCRIPT ) cls.renew_script = redis_client.redis.register_script( - RENEW_FAIR_SEMAPHORE_V2_SCRIPT + RENEW_SEMAPHORE_SCRIPT ) def __init__(self, **data) -> None: diff --git a/packages/service-library/src/servicelib/redis/_semaphore_lua.py b/packages/service-library/src/servicelib/redis/_semaphore_lua.py index 1ce94c486f90..71f29fa88817 100644 --- a/packages/service-library/src/servicelib/redis/_semaphore_lua.py +++ b/packages/service-library/src/servicelib/redis/_semaphore_lua.py @@ -27,11 +27,11 @@ def _load_script(script_name: str) -> str: # fair semaphore scripts (token pool based) -REGISTER_FAIR_SEMAPHORE_SCRIPT: Final[str] = _load_script("register_semaphore_holder") -ACQUIRE_FAIR_SEMAPHORE_V2_SCRIPT: Final[str] = _load_script("acquire_fair_semaphore_v2") -RELEASE_FAIR_SEMAPHORE_V2_SCRIPT: Final[str] = _load_script("release_fair_semaphore_v2") -CLEANUP_FAIR_SEMAPHORE_V2_SCRIPT: Final[str] = _load_script("cleanup_fair_semaphore_v2") -RENEW_FAIR_SEMAPHORE_V2_SCRIPT: Final[str] = _load_script("renew_fair_semaphore_v2") +REGISTER_SEMAPHORE_TOKEN_SCRIPT: Final[str] = _load_script("register_semaphore_tokens") +ACQUIRE_SEMAPHORE_SCRIPT: Final[str] = _load_script("acquire_semaphore") +RELEASE_SEMAPHORE_SCRIPT: Final[str] = _load_script("release_semaphore") +CLEANUP_SEMAPHORE_SCRIPT: Final[str] = _load_script("cleanup_semaphore") +RENEW_SEMAPHORE_SCRIPT: Final[str] = _load_script("renew_semaphore") SCRIPT_OK_EXIT_CODE: Final[int] = 0 diff --git a/packages/service-library/src/servicelib/redis/lua/acquire_fair_semaphore_v2.lua b/packages/service-library/src/servicelib/redis/lua/acquire_semaphore.lua similarity index 100% rename from packages/service-library/src/servicelib/redis/lua/acquire_fair_semaphore_v2.lua rename to packages/service-library/src/servicelib/redis/lua/acquire_semaphore.lua diff --git a/packages/service-library/src/servicelib/redis/lua/cleanup_fair_semaphore_v2.lua b/packages/service-library/src/servicelib/redis/lua/cleanup_semaphore.lua similarity index 100% rename from packages/service-library/src/servicelib/redis/lua/cleanup_fair_semaphore_v2.lua rename to packages/service-library/src/servicelib/redis/lua/cleanup_semaphore.lua diff --git a/packages/service-library/src/servicelib/redis/lua/register_semaphore_holder.lua b/packages/service-library/src/servicelib/redis/lua/register_semaphore_tokens.lua similarity index 100% rename from packages/service-library/src/servicelib/redis/lua/register_semaphore_holder.lua rename to packages/service-library/src/servicelib/redis/lua/register_semaphore_tokens.lua diff --git a/packages/service-library/src/servicelib/redis/lua/release_fair_semaphore_v2.lua b/packages/service-library/src/servicelib/redis/lua/release_semaphore.lua similarity index 100% rename from packages/service-library/src/servicelib/redis/lua/release_fair_semaphore_v2.lua rename to packages/service-library/src/servicelib/redis/lua/release_semaphore.lua diff --git a/packages/service-library/src/servicelib/redis/lua/renew_fair_semaphore_v2.lua b/packages/service-library/src/servicelib/redis/lua/renew_semaphore.lua similarity index 100% rename from packages/service-library/src/servicelib/redis/lua/renew_fair_semaphore_v2.lua rename to packages/service-library/src/servicelib/redis/lua/renew_semaphore.lua From 454417900b20673eda45d1e88304c8cdceaa34f1 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 23 Sep 2025 20:04:47 +0200 Subject: [PATCH 68/68] use self call --- .../src/servicelib/redis/_semaphore.py | 20 ++++++++----------- 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/packages/service-library/src/servicelib/redis/_semaphore.py b/packages/service-library/src/servicelib/redis/_semaphore.py index 05c3a7b1def5..3fab1b459e60 100644 --- a/packages/service-library/src/servicelib/redis/_semaphore.py +++ b/packages/service-library/src/servicelib/redis/_semaphore.py @@ -193,9 +193,8 @@ def validate_timeout( async def _ensure_semaphore_initialized(self) -> None: """Initializes the semaphore in Redis if not already done.""" - cls = type(self) - assert cls.register_semaphore is not None # nosec - await cls.register_semaphore( # pylint: disable=not-callable + assert self.register_semaphore is not None # nosec + await self.register_semaphore( # pylint: disable=not-callable keys=[self.tokens_key, self.holders_set], args=[self.capacity, self.holders_set_ttl.total_seconds()], client=self.redis_client.redis, @@ -279,9 +278,8 @@ async def acquire(self) -> bool: assert self._token is not None # nosec # set up the semaphore holder with a TTL - cls = type(self) - assert cls.acquire_script is not None # nosec - result = await cls.acquire_script( # pylint: disable=not-callable + assert self.acquire_script is not None # nosec + result = await self.acquire_script( # pylint: disable=not-callable keys=[self.holders_set, self.holder_key], args=[ self._token, @@ -317,12 +315,11 @@ async def release(self) -> None: """ # Execute the release Lua script atomically - cls = type(self) - assert cls.release_script is not None # nosec + assert self.release_script is not None # nosec release_args = [self.instance_id] if self._token is not None: release_args.append(self._token) - result = await cls.release_script( # pylint: disable=not-callable + result = await self.release_script( # pylint: disable=not-callable keys=[self.tokens_key, self.holders_set, self.holder_key], args=release_args, client=self.redis_client.redis, @@ -368,9 +365,8 @@ async def reacquire(self) -> None: ttl_seconds = self.ttl.total_seconds() # Execute the renewal Lua script atomically - cls = type(self) - assert cls.renew_script is not None # nosec - result = await cls.renew_script( # pylint: disable=not-callable + assert self.renew_script is not None # nosec + result = await self.renew_script( # pylint: disable=not-callable keys=[self.holders_set, self.holder_key, self.tokens_key], args=[ self.instance_id,