Commit 5382408

feat: Enhance Redis client management and implement rate limiting for token requests
1 parent 213531a commit 5382408

File tree

6 files changed, +172 -17 lines changed

  app/api/endpoints/health.py
  app/core/app.py
  app/core/config.py
  app/core/version.py
  app/services/catalog_updater.py
  app/services/token_store.py

app/api/endpoints/health.py

Lines changed: 28 additions & 0 deletions

@@ -1,8 +1,36 @@
 from fastapi import APIRouter
+from loguru import logger
+
+from app.services.token_store import token_store
 
 router = APIRouter(tags=["health"])
 
 
 @router.get("/health", summary="Simple readiness probe")
 async def health_check() -> dict[str, str]:
     return {"status": "ok"}
+
+
+@router.get("/metrics", summary="Runtime metrics (lightweight)")
+async def metrics() -> dict:
+    """Return lightweight runtime metrics useful for diagnosing Redis connection growth."""
+    try:
+        client = await token_store._get_client()
+    except Exception as exc:
+        logger.warning(f"Failed to fetch Redis client for metrics: {exc}")
+        return {"redis": "unavailable"}
+
+    metrics: dict = {}
+    try:
+        info = await client.info(section="clients")
+        metrics["redis_connected_clients"] = int(info.get("connected_clients", 0))
+    except Exception as exc:
+        logger.warning(f"Failed to read Redis INFO clients: {exc}")
+        metrics["redis_connected_clients"] = "error"
+
+    try:
+        metrics["per_request_redis_calls_last"] = token_store.get_call_count()
+    except Exception:
+        metrics["per_request_redis_calls_last"] = "error"
+
+    return metrics
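A quick way to exercise the new endpoint once the service is up -- a minimal sketch, assuming the addon listens on http://localhost:8000 and that httpx is available (neither is part of this commit):

# Minimal sketch: poll the new /metrics endpoint while load-testing to watch
# Redis connection growth. The base URL is an assumption, not a repo default.
import asyncio

import httpx


async def poll_metrics() -> None:
    async with httpx.AsyncClient(base_url="http://localhost:8000") as client:
        resp = await client.get("/metrics")
        resp.raise_for_status()
        # Values fall back to "error"/"unavailable" per the handler above
        print(resp.json())


asyncio.run(poll_metrics())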

app/core/app.py

Lines changed: 36 additions & 0 deletions

@@ -3,6 +3,7 @@
 from contextlib import asynccontextmanager
 from pathlib import Path
 
+from cachetools import TTLCache
 from fastapi import FastAPI, Request
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import HTMLResponse
@@ -61,6 +62,12 @@ def _on_done(t: asyncio.Task):
         await catalog_updater.stop()
         catalog_updater = None
         logger.info("Background catalog updates stopped")
+    # Close shared token store Redis client
+    try:
+        await token_store.close()
+        logger.info("TokenStore Redis client closed")
+    except Exception as exc:
+        logger.warning(f"Failed to close TokenStore Redis client: {exc}")
 
 
 if settings.APP_ENV != "development":
@@ -73,6 +80,8 @@ def _on_done(t: asyncio.Task):
     description="Stremio catalog addon for movie and series recommendations",
     version=__version__,
     lifespan=lifespan,
+    docs_url=None if settings.APP_ENV != "development" else "/docs",
+    redoc_url=None if settings.APP_ENV != "development" else "/redoc",
 )
 
 app.add_middleware(
@@ -84,6 +93,33 @@ def _on_done(t: asyncio.Task):
 )
 
 
+# Simple IP-based rate limiter for repeated probes of missing tokens.
+# Tracks recent failure counts per IP to avoid expensive repeated requests.
+_ip_failure_cache: TTLCache = TTLCache(maxsize=10000, ttl=600)
+_IP_FAILURE_THRESHOLD = 8
+
+
+@app.middleware("http")
+async def block_missing_token_middleware(request: Request, call_next):
+    # Extract first path segment which is commonly the token in addon routes
+    path = request.url.path.lstrip("/")
+    seg = path.split("/", 1)[0] if path else ""
+    try:
+        # If token is known-missing, short-circuit and track IP failures
+        if seg and seg in token_store._missing_tokens:
+            ip = request.client.host if request.client else "unknown"
+            try:
+                _ip_failure_cache[ip] = _ip_failure_cache.get(ip, 0) + 1
+            except Exception:
+                pass
+            if _ip_failure_cache.get(ip, 0) > _IP_FAILURE_THRESHOLD:
+                return HTMLResponse(content="Too many requests", status_code=429)
+            return HTMLResponse(content="Invalid token", status_code=401)
+    except Exception:
+        pass
+    return await call_next(request)
+
+
 # Middleware to track per-request Redis calls and attach as response header for diagnostics
 @app.middleware("http")
 async def redis_calls_middleware(request: Request, call_next):
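The escalation from 401 to 429 can be checked in isolation. A minimal, self-contained sketch that mirrors the middleware above, with a plain set standing in for token_store._missing_tokens and a made-up token name:

# Minimal sketch of the 401 -> 429 escalation; runs without Redis or the
# token store. "deadbeef" and the /catalog path are illustrative only.
from cachetools import TTLCache
from fastapi import FastAPI, Request
from fastapi.responses import HTMLResponse
from fastapi.testclient import TestClient

app = FastAPI()
missing_tokens = {"deadbeef"}  # stand-in for token_store._missing_tokens
failures: TTLCache = TTLCache(maxsize=10000, ttl=600)
THRESHOLD = 8


@app.middleware("http")
async def block_missing_token(request: Request, call_next):
    seg = request.url.path.lstrip("/").split("/", 1)[0]
    if seg in missing_tokens:
        ip = request.client.host if request.client else "unknown"
        failures[ip] = failures.get(ip, 0) + 1
        if failures[ip] > THRESHOLD:
            return HTMLResponse("Too many requests", status_code=429)
        return HTMLResponse("Invalid token", status_code=401)
    return await call_next(request)


client = TestClient(app)
codes = [client.get("/deadbeef/catalog").status_code for _ in range(10)]
print(codes)  # first eight probes -> 401, then 429 once the threshold is crossed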

app/core/config.py

Lines changed: 6 additions & 0 deletions

@@ -20,6 +20,12 @@ class Settings(BaseSettings):
     ADDON_ID: str = "com.bimal.watchly"
     ADDON_NAME: str = "Watchly"
     REDIS_URL: str = "redis://redis:6379/0"
+    # Maximum number of connections Redis client will open per process
+    # Set conservatively to avoid unbounded connection growth under high concurrency
+    REDIS_MAX_CONNECTIONS: int = 20
+    # If total connected clients reported by Redis exceeds this, background
+    # Redis-heavy jobs will back off. Tune according to your Redis capacity.
+    REDIS_CONNECTIONS_THRESHOLD: int = 100
     REDIS_TOKEN_KEY: str = "watchly:token:"
     TOKEN_SALT: str = "change-me"
     TOKEN_TTL_SECONDS: int = 0  # 0 = never expire
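Since Settings extends BaseSettings, the two new limits should be tunable per deployment through environment variables. A minimal sketch, with illustrative values only:

# Minimal sketch: override the new limits via the environment before the
# config module is imported. Values here are examples, not recommendations.
import os

os.environ["REDIS_MAX_CONNECTIONS"] = "50"         # per-process pool cap
os.environ["REDIS_CONNECTIONS_THRESHOLD"] = "200"  # back-off trigger for background jobs

from app.core.config import Settings  # module path as in this repo

settings = Settings()
print(settings.REDIS_MAX_CONNECTIONS, settings.REDIS_CONNECTIONS_THRESHOLD)  # 50 200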

app/core/version.py

Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-__version__ = "1.1.0"
+__version__ = "1.1.1"

app/services/catalog_updater.py

Lines changed: 15 additions & 0 deletions

@@ -143,6 +143,21 @@ async def _update_safe(key: str, payload: dict[str, Any]) -> None:
             logger.error(f"Background refresh failed for {redact_token(key)}: {exc}", exc_info=True)
 
         try:
+            # Check Redis connected clients and back off if overloaded
+            try:
+                client = await token_store._get_client()
+                info = await client.info(section="clients")
+                connected = int(info.get("connected_clients", 0))
+                threshold = getattr(settings, "REDIS_CONNECTIONS_THRESHOLD", 1000)
+                if connected > threshold:
+                    logger.warning(
+                        f"Redis connected clients {connected} exceed threshold {threshold}; skipping"
+                        " background refresh."
+                    )
+                    return
+            except Exception as exc:
+                logger.warning(f"Failed to check Redis client info before refresh: {exc}")
+
             async for key, payload in token_store.iter_payloads():
                 # Extract token from redis key prefix
                 prefix = token_store.KEY_PREFIX
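Worth noting: INFO's clients section reports connected_clients for the whole Redis server, not just this process's pool, so the threshold guards overall server load. A standalone sketch of the same check, assuming a local Redis on the default port:

# Minimal sketch of the back-off check in isolation. The URL and threshold
# are assumptions; the real code reads settings.REDIS_CONNECTIONS_THRESHOLD.
import asyncio

import redis.asyncio as redis


async def should_back_off(url: str = "redis://localhost:6379/0", threshold: int = 100) -> bool:
    client = redis.from_url(url, decode_responses=True)
    try:
        info = await client.info(section="clients")
        # connected_clients counts all clients on the server, not just ours
        return int(info.get("connected_clients", 0)) > threshold
    finally:
        await client.close()


print(asyncio.run(should_back_off()))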

app/services/token_store.py

Lines changed: 86 additions & 16 deletions

@@ -5,6 +5,7 @@
 from typing import Any
 
 import redis.asyncio as redis
+from async_lru import alru_cache
 from cachetools import TTLCache
 from cryptography.fernet import Fernet, InvalidToken
 from cryptography.hazmat.primitives import hashes
@@ -22,9 +23,9 @@ class TokenStore:
 
     def __init__(self) -> None:
         self._client: redis.Redis | None = None
-        # Cache decrypted payloads for 1 day (86400s) to reduce Redis hits
-        # Max size 5000 allows many active users without eviction
-        self._payload_cache: TTLCache = TTLCache(maxsize=5000, ttl=86400)
+        # Negative cache for missing tokens to avoid repeated Redis GETs
+        # when external probes request non-existent tokens.
+        self._missing_tokens: TTLCache = TTLCache(maxsize=10000, ttl=3600)
         # per-request redis call counter (context-local)
         self._redis_calls_var: contextvars.ContextVar[int] = contextvars.ContextVar("watchly_redis_calls", default=0)
 
@@ -66,15 +67,59 @@ def decrypt_token(self, enc: str) -> str:
     async def _get_client(self) -> redis.Redis:
         if self._client is None:
             # Add socket timeouts to avoid hanging on Redis operations
+            import traceback
+
+            logger.info("Creating shared Redis client")
+            # Limit the number of pooled connections to avoid unbounded growth;
+            # `max_connections` is forwarded to ConnectionPool.from_url
             self._client = redis.from_url(
                 settings.REDIS_URL,
                 decode_responses=True,
                 encoding="utf-8",
                 socket_connect_timeout=5,
                 socket_timeout=5,
+                max_connections=getattr(settings, "REDIS_MAX_CONNECTIONS", 100),
+                health_check_interval=30,
+                socket_keepalive=True,
             )
+            # If _get_client is called multiple times in different contexts it
+            # could indicate multiple processes/threads or a bug opening
+            # additional clients; log a stack trace for debugging.
+            if getattr(self, "_creation_count", None) is None:
+                self._creation_count = 1
+            else:
+                self._creation_count += 1
+                logger.warning(
+                    f"Redis client creation invoked again (count={self._creation_count})."
+                    f" Stack:\n{''.join(traceback.format_stack())}"
+                )
         return self._client
 
+    async def close(self) -> None:
+        """Close and disconnect the shared Redis client (call on shutdown)."""
+        if self._client is None:
+            return
+        try:
+            logger.info("Closing shared Redis client")
+            # Close the client, then disconnect the underlying pool
+            try:
+                await self._client.close()
+            except Exception:
+                pass
+            try:
+                pool = getattr(self._client, "connection_pool", None)
+                if pool is not None:
+                    # connection_pool.disconnect may be a coroutine in some redis implementations
+                    disconnect = getattr(pool, "disconnect", None)
+                    if disconnect:
+                        res = disconnect()
+                        if hasattr(res, "__await__"):
+                            await res
            except Exception:
                pass
        finally:
            self._client = None
+
     def _format_key(self, token: str) -> str:
         """Format Redis key from token."""
         return f"{self.KEY_PREFIX}{token}"
@@ -109,30 +154,49 @@ async def store_user_data(self, user_id: str, payload: dict[str, Any]) -> str:
         self._incr_calls()
         await client.set(key, json_str)
 
-        # Update cache with the payload
-        self._payload_cache[token] = payload
+        # Invalidate async LRU cached reads so future reads use the updated payload
+        try:
+            self.get_user_data.cache_clear()
+        except Exception:
+            pass
+
+        # Ensure we remove from the negative cache so the new value is read next time
+        try:
+            if token in self._missing_tokens:
+                del self._missing_tokens[token]
+        except Exception:
+            pass
 
         return token
 
+    @alru_cache(maxsize=5000)
     async def get_user_data(self, token: str) -> dict[str, Any] | None:
-        if token in self._payload_cache:
-            logger.info(f"[REDIS] Using cached redis data {token}")
-            return self._payload_cache[token]
-        logger.info(f"[REDIS]Caching Failed. Fetching data from redis for {token}")
+        # Short-circuit for tokens known to be missing
+        try:
+            if token in self._missing_tokens:
+                logger.debug(f"[REDIS] Negative cache hit for missing token {token}")
+                return None
+        except Exception:
+            pass
 
+        logger.debug(f"[REDIS] Cache miss. Fetching data from redis for {token}")
         key = self._format_key(token)
         client = await self._get_client()
         self._incr_calls()
         data_raw = await client.get(key)
 
         if not data_raw:
+            # remember the negative result briefly
+            try:
+                self._missing_tokens[token] = True
+            except Exception:
+                pass
             return None
 
         try:
             data = json.loads(data_raw)
             if data.get("authKey"):
                 data["authKey"] = self.decrypt_token(data["authKey"])
-            self._payload_cache[token] = data
             return data
         except (json.JSONDecodeError, InvalidToken):
             return None
@@ -147,9 +211,17 @@ async def delete_token(self, token: str = None, key: str = None) -> None:
         self._incr_calls()
         await client.delete(key)
 
-        # Invalidate local cache
-        if token and token in self._payload_cache:
-            del self._payload_cache[token]
+        # Invalidate async LRU cached reads
+        try:
+            self.get_user_data.cache_clear()
+        except Exception:
+            pass
+        # Remove from the negative cache as the token is deleted
+        try:
+            if token and token in self._missing_tokens:
+                del self._missing_tokens[token]
+        except Exception:
+            pass
 
     async def iter_payloads(self, batch_size: int = 200) -> AsyncIterator[tuple[str, dict[str, Any]]]:
         try:
@@ -185,9 +257,8 @@ async def iter_payloads(self, batch_size: int = 200) -> AsyncIterator[tuple[str, dict[str, Any]]]:
                     payload["authKey"] = self.decrypt_token(payload["authKey"])
                 except Exception:
                     pass
-                # Update L1 cache (token only)
+                # Token payload ready for consumer
                 tok = k[len(self.KEY_PREFIX) :] if k.startswith(self.KEY_PREFIX) else k  # noqa
-                self._payload_cache[tok] = payload
                 yield k, payload
             buffer.clear()
@@ -213,7 +284,6 @@ async def iter_payloads(self, batch_size: int = 200) -> AsyncIterator[tuple[str, dict[str, Any]]]:
             except Exception:
                 pass
             tok = k[len(self.KEY_PREFIX) :] if k.startswith(self.KEY_PREFIX) else k  # noqa
-            self._payload_cache[tok] = payload
             yield k, payload
         except (redis.RedisError, OSError) as exc:
             logger.warning(f"Failed to scan credential tokens: {exc}")
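One subtlety in the new design: @alru_cache memoizes negative results (None) as well as positive ones, so _missing_tokens mainly covers reads that reach the function body, e.g. after a cache_clear() or an LRU eviction. A self-contained sketch of the two-layer pattern, with a plain dict standing in for Redis:

# Minimal sketch of the caching pattern token_store now uses: an async LRU
# for reads plus a short-TTL negative cache for known-missing keys.
import asyncio

from async_lru import alru_cache
from cachetools import TTLCache

_backing: dict[str, str] = {"tok-1": "payload-1"}  # stands in for Redis
_missing: TTLCache = TTLCache(maxsize=10000, ttl=3600)


@alru_cache(maxsize=5000)
async def get(token: str) -> str | None:
    if token in _missing:       # negative-cache hit: skip the backend entirely
        return None
    value = _backing.get(token)
    if value is None:
        _missing[token] = True  # remember the miss briefly
    return value


async def main() -> None:
    print(await get("tok-1"))   # backend read, then memoized by alru_cache
    print(await get("nope"))    # backend miss, recorded in the negative cache
    _backing["nope"] = "late"   # writes must invalidate both layers...
    get.cache_clear()
    del _missing["nope"]
    print(await get("nope"))    # ...so the new value is visible


asyncio.run(main())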
