diff --git a/.github/workflows/backend-ci.yml b/.github/workflows/backend-ci.yml
index f3c56abd..82db3ee2 100644
--- a/.github/workflows/backend-ci.yml
+++ b/.github/workflows/backend-ci.yml
@@ -5,21 +5,24 @@ on:
     branches: [main, dev]
     paths:
       - 'backend/**'
+      - 'docker-compose.yaml'
+      - 'docker-bake.hcl'
       - '.github/workflows/backend-ci.yml'
-      - 'docker-compose.ci.yaml'
   pull_request:
     branches: [main, dev]
     paths:
       - 'backend/**'
+      - 'docker-compose.yaml'
+      - 'docker-bake.hcl'
       - '.github/workflows/backend-ci.yml'
-      - 'docker-compose.ci.yaml'
   workflow_dispatch:
 
 # Pin image versions for cache key consistency
 env:
   MONGO_IMAGE: mongo:8.0
   REDIS_IMAGE: redis:7-alpine
-  KAFKA_IMAGE: apache/kafka:3.9.0
+  ZOOKEEPER_IMAGE: confluentinc/cp-zookeeper:7.5.0
+  KAFKA_IMAGE: confluentinc/cp-kafka:7.5.0
   SCHEMA_REGISTRY_IMAGE: confluentinc/cp-schema-registry:7.5.0
 
 jobs:
@@ -71,7 +74,7 @@ jobs:
       - name: Cache and load Docker images
         uses: ./.github/actions/docker-cache
         with:
-          images: ${{ env.MONGO_IMAGE }} ${{ env.REDIS_IMAGE }} ${{ env.KAFKA_IMAGE }} ${{ env.SCHEMA_REGISTRY_IMAGE }}
+          images: ${{ env.MONGO_IMAGE }} ${{ env.REDIS_IMAGE }} ${{ env.ZOOKEEPER_IMAGE }} ${{ env.KAFKA_IMAGE }} ${{ env.SCHEMA_REGISTRY_IMAGE }}
 
       - name: Set up uv
         uses: astral-sh/setup-uv@v7
@@ -86,9 +89,13 @@ jobs:
           uv sync --frozen
 
       - name: Start infrastructure services
+        env:
+          KAFKA_HEAP_OPTS: "-Xms256M -Xmx512M"
         run: |
-          docker compose -f docker-compose.ci.yaml up -d --wait --wait-timeout 120
-          docker compose -f docker-compose.ci.yaml ps
+          # Start only infra services (no workers, no build)
+          docker compose up -d --wait --wait-timeout 180 \
+            mongo redis zookeeper-certgen zookeeper kafka schema-registry
+          docker compose ps
 
       - name: Create Kafka topics
         timeout-minutes: 2
@@ -134,15 +141,15 @@ jobs:
         if: failure()
         run: |
           mkdir -p logs
-          docker compose -f docker-compose.ci.yaml logs > logs/docker-compose.log 2>&1
-          docker compose -f docker-compose.ci.yaml logs kafka > logs/kafka.log 2>&1
-          docker compose -f docker-compose.ci.yaml logs schema-registry > logs/schema-registry.log 2>&1
+          docker compose logs > logs/docker-compose.log 2>&1
+          docker compose logs kafka > logs/kafka.log 2>&1
+          docker compose logs schema-registry > logs/schema-registry.log 2>&1
 
       - name: Upload logs
         if: failure()
         uses: actions/upload-artifact@v6
         with:
-          name: backend-logs
+          name: integration-logs
           path: logs/
 
   e2e:
@@ -152,10 +159,31 @@ jobs:
     steps:
       - uses: actions/checkout@v6
 
+      # Cache third-party images (mongo, redis, kafka, etc.)
       - name: Cache and load Docker images
         uses: ./.github/actions/docker-cache
         with:
-          images: ${{ env.MONGO_IMAGE }} ${{ env.REDIS_IMAGE }} ${{ env.KAFKA_IMAGE }} ${{ env.SCHEMA_REGISTRY_IMAGE }}
+          images: ${{ env.MONGO_IMAGE }} ${{ env.REDIS_IMAGE }} ${{ env.ZOOKEEPER_IMAGE }} ${{ env.KAFKA_IMAGE }} ${{ env.SCHEMA_REGISTRY_IMAGE }}
+
+      # Set up Docker Buildx for bake action (use latest for GHA cache v2 support)
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+        with:
+          version: latest
+
+      # Build all backend images using bake with GitHub Actions cache
+      - name: Build images with cache
+        uses: docker/bake-action@v6
+        with:
+          files: docker-bake.hcl
+          targets: backend-e2e
+          load: true
+          set: |
+            *.cache-from=type=gha
+            *.cache-to=type=gha,mode=max
+
+      - name: Prune Docker build cache
+        run: docker builder prune -af
 
       - name: Set up uv
         uses: astral-sh/setup-uv@v7
@@ -169,29 +197,48 @@ jobs:
           uv python install 3.12
           uv sync --frozen
 
-      - name: Start infrastructure services
-        run: |
-          docker compose -f docker-compose.ci.yaml up -d --wait --wait-timeout 120
-          docker compose -f docker-compose.ci.yaml ps
-
+      # Setup K3s before starting services (workers need kubeconfig)
       - name: Setup Kubernetes (k3s)
         run: |
-          curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="--disable=traefik" sh -
+          curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="--disable=traefik --tls-san host.docker.internal" sh -
           mkdir -p /home/runner/.kube
           sudo k3s kubectl config view --raw > /home/runner/.kube/config
           sudo chmod 600 /home/runner/.kube/config
           export KUBECONFIG=/home/runner/.kube/config
           timeout 90 bash -c 'until sudo k3s kubectl cluster-info; do sleep 5; done'
           kubectl create namespace integr8scode --dry-run=client -o yaml | kubectl apply -f -
+          # Create kubeconfig for containers: use kubectl config view which is more reliable
+          sudo k3s kubectl config view --raw | sed 's/127.0.0.1/host.docker.internal/g' > backend/kubeconfig.yaml
+          # Verify the kubeconfig is valid
+          echo "=== Verifying kubeconfig ==="
+          grep -q "current-context" backend/kubeconfig.yaml && echo "OK: current-context found" || (echo "ERROR: current-context missing"; cat backend/kubeconfig.yaml; exit 1)
+          grep -q "host.docker.internal" backend/kubeconfig.yaml && echo "OK: host.docker.internal found" || (echo "ERROR: host.docker.internal missing"; exit 1)
 
-      - name: Create Kafka topics
-        timeout-minutes: 2
+      # Start all services (images already built by bake)
+      - name: Start services
         env:
-          KAFKA_BOOTSTRAP_SERVERS: localhost:9092
-          KAFKA_TOPIC_PREFIX: "ci.${{ github.run_id }}."
+          MONGO_ROOT_USER: root
+          MONGO_ROOT_PASSWORD: rootpassword
+          ENABLE_TRACING: "false"
+          KAFKA_HEAP_OPTS: "-Xms256M -Xmx512M"
         run: |
-          cd backend
-          uv run python -m scripts.create_topics
+          # Start cert generation first (backend needs certs)
+          docker compose up -d --no-build shared-ca
+          docker compose up -d --no-build cert-generator
+
+          # Wait for certs to be generated
+          timeout 60 bash -c 'until [ -f backend/certs/server.key ]; do sleep 2; done'
+          echo "Certificates generated"
+
+          # Start infra
+          docker compose up -d --no-build --wait --wait-timeout 180 \
+            mongo redis zookeeper-certgen zookeeper kafka schema-registry
+
+          # Start backend and workers (Docker Compose handles init job dependencies via service_completed_successfully)
+          docker compose up -d --no-build --wait --wait-timeout 180 \
+            backend coordinator saga-orchestrator k8s-worker pod-monitor result-processor
+
+          docker compose ps
 
       - name: Run E2E tests
         timeout-minutes: 10
@@ -200,13 +247,18 @@ jobs:
           MONGO_ROOT_PASSWORD: rootpassword
           MONGODB_URL: mongodb://root:rootpassword@127.0.0.1:27017/?authSource=admin
           KAFKA_BOOTSTRAP_SERVERS: localhost:9092
-          KAFKA_TOPIC_PREFIX: "ci.${{ github.run_id }}."
+          KAFKA_TOPIC_PREFIX: ""
           SCHEMA_REGISTRY_URL: http://localhost:8081
           REDIS_HOST: localhost
           REDIS_PORT: 6379
-          SCHEMA_SUBJECT_PREFIX: "ci.${{ github.run_id }}."
+          SCHEMA_SUBJECT_PREFIX: ""
           KUBECONFIG: /home/runner/.kube/config
           K8S_NAMESPACE: integr8scode
+          # Tests connect to backend running in container (HTTPS)
+          BACKEND_URL: https://localhost:443
+          # Trust self-signed certs
+          REQUESTS_CA_BUNDLE: ""
+          CURL_CA_BUNDLE: ""
         run: |
           cd backend
           uv run pytest tests/e2e -v -rs \
@@ -228,13 +280,18 @@ jobs:
         if: failure()
         run: |
           mkdir -p logs
-          docker compose -f docker-compose.ci.yaml logs > logs/docker-compose.log 2>&1
+          docker compose logs > logs/docker-compose.log 2>&1
+          docker compose logs backend > logs/backend.log 2>&1
+          docker compose logs saga-orchestrator > logs/saga-orchestrator.log 2>&1
+          docker compose logs k8s-worker > logs/k8s-worker.log 2>&1
+          docker compose logs pod-monitor > logs/pod-monitor.log 2>&1
           kubectl get events --sort-by='.metadata.creationTimestamp' -A > logs/k8s-events.log 2>&1 || true
           kubectl describe pods -A > logs/k8s-describe-pods.log 2>&1 || true
+          kubectl logs -l app=executor -n integr8scode --tail=100 > logs/executor-pods.log 2>&1 || true
 
       - name: Upload logs
         if: failure()
         uses: actions/upload-artifact@v6
         with:
-          name: k8s-logs
+          name: e2e-logs
           path: logs/
diff --git a/.github/workflows/frontend-ci.yml b/.github/workflows/frontend-ci.yml
index c36fff8a..672201c2 100644
--- a/.github/workflows/frontend-ci.yml
+++ b/.github/workflows/frontend-ci.yml
@@ -5,16 +5,25 @@ on:
     branches: [main, dev]
     paths:
       - 'frontend/**'
+      - 'docker-compose.yaml'
+      - 'docker-bake.hcl'
       - '.github/workflows/frontend-ci.yml'
-      - 'docker-compose.ci.yaml'
   pull_request:
     branches: [main, dev]
     paths:
       - 'frontend/**'
+      - 'docker-compose.yaml'
+      - 'docker-bake.hcl'
       - '.github/workflows/frontend-ci.yml'
-      - 'docker-compose.ci.yaml'
   workflow_dispatch:
 
+env:
+  MONGO_IMAGE: mongo:8.0
+  REDIS_IMAGE: redis:7-alpine
+  ZOOKEEPER_IMAGE: confluentinc/cp-zookeeper:7.5.0
+  KAFKA_IMAGE: confluentinc/cp-kafka:7.5.0
+  SCHEMA_REGISTRY_IMAGE: confluentinc/cp-schema-registry:7.5.0
+
 jobs:
   unit:
     name: Unit Tests
@@ -52,26 +61,13 @@ jobs:
     needs: unit
     runs-on: ubuntu-latest
 
-    # Local registry for buildx to reference base image (docker-container driver is isolated)
-    services:
-      registry:
-        image: registry:2
-        ports:
-          - 5000:5000
-
-    env:
-      MONGO_IMAGE: mongo:8.0
-      REDIS_IMAGE: redis:7-alpine
-      KAFKA_IMAGE: apache/kafka:3.9.0
-      SCHEMA_REGISTRY_IMAGE: confluentinc/cp-schema-registry:7.5.0
-
     steps:
       - uses: actions/checkout@v6
 
       - name: Cache and load Docker images
         uses: ./.github/actions/docker-cache
         with:
-          images: ${{ env.MONGO_IMAGE }} ${{ env.REDIS_IMAGE }} ${{ env.KAFKA_IMAGE }} ${{ env.SCHEMA_REGISTRY_IMAGE }}
+          images: ${{ env.MONGO_IMAGE }} ${{ env.REDIS_IMAGE }} ${{ env.ZOOKEEPER_IMAGE }} ${{ env.KAFKA_IMAGE }} ${{ env.SCHEMA_REGISTRY_IMAGE }}
 
       - name: Setup Node.js
         uses: actions/setup-node@v6
@@ -88,10 +84,24 @@ jobs:
         working-directory: frontend
         run: npx playwright install chromium
 
-      - name: Setup Docker Buildx
+      - name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v3
         with:
-          driver-opts: network=host
+          version: latest
+
+      # Build all images using bake with GitHub Actions cache
+      - name: Build images with cache
+        uses: docker/bake-action@v6
+        with:
+          files: docker-bake.hcl
+          targets: all
+          load: true
+          set: |
+            *.cache-from=type=gha
+            *.cache-to=type=gha,mode=max
+
+      - name: Prune Docker build cache
+        run: docker builder prune -af
 
       - name: Setup Kubernetes (k3s)
         run: |
@@ -101,72 +111,41 @@ jobs:
           sudo chmod 600 /home/runner/.kube/config
           export KUBECONFIG=/home/runner/.kube/config
           timeout 90 bash -c 'until sudo k3s kubectl cluster-info; do sleep 5; done'
+          kubectl create namespace integr8scode --dry-run=client -o yaml | kubectl apply -f -
+          # Create kubeconfig for containers: use kubectl config view which is more reliable
+          sudo k3s kubectl config view --raw | sed 's/127.0.0.1/host.docker.internal/g' > backend/kubeconfig.yaml
+          # Verify the kubeconfig is valid
+          echo "=== Verifying kubeconfig ==="
+          grep -q "current-context" backend/kubeconfig.yaml && echo "OK: current-context found" || (echo "ERROR: current-context missing"; cat backend/kubeconfig.yaml; exit 1)
+          grep -q "host.docker.internal" backend/kubeconfig.yaml && echo "OK: host.docker.internal found" || (echo "ERROR: host.docker.internal missing"; exit 1)
 
-      - name: Create kubeconfig for Docker containers
-        run: |
-          # Copy k3s kubeconfig with host.docker.internal for container networking
-          sed 's|https://127.0.0.1:6443|https://host.docker.internal:6443|g' \
-            /home/runner/.kube/config > backend/kubeconfig.yaml
-          chmod 644 backend/kubeconfig.yaml
-
-      # Build images with GitHub Actions cache for faster subsequent builds
-      # Base image pushed to local registry so buildx can reference it
-      - name: Build and push base image
-        uses: docker/build-push-action@v6
-        with:
-          context: ./backend
-          file: ./backend/Dockerfile.base
-          push: true
-          tags: localhost:5000/integr8scode-base:latest
-          cache-from: type=gha,scope=backend-base
-          cache-to: type=gha,mode=max,scope=backend-base
-
-      # Pull base to Docker daemon (needed for docker-compose)
-      - name: Load base image to Docker daemon
+      - name: Start full stack
+        env:
+          MONGO_ROOT_USER: root
+          MONGO_ROOT_PASSWORD: rootpassword
+          ENABLE_TRACING: "false"
+          KAFKA_HEAP_OPTS: "-Xms256M -Xmx512M"
         run: |
-          docker pull localhost:5000/integr8scode-base:latest
-          docker tag localhost:5000/integr8scode-base:latest integr8scode-base:latest
+          # Start cert generation first
+          docker compose up -d --no-build shared-ca
+          docker compose up -d --no-build cert-generator
 
-      - name: Build backend image
-        uses: docker/build-push-action@v6
-        with:
-          context: ./backend
-          file: ./backend/Dockerfile
-          load: true
-          tags: integr8scode-backend:latest
-          build-contexts: |
-            base=docker-image://localhost:5000/integr8scode-base:latest
-          cache-from: type=gha,scope=backend
-          cache-to: type=gha,mode=max,scope=backend
-
-      - name: Build cert-generator image
-        uses: docker/build-push-action@v6
-        with:
-          context: ./cert-generator
-          file: ./cert-generator/Dockerfile
-          load: true
-          tags: integr8scode-cert-generator:latest
-          cache-from: type=gha,scope=cert-generator
-          cache-to: type=gha,mode=max,scope=cert-generator
+          # Wait for certs
+          timeout 60 bash -c 'until [ -f backend/certs/server.key ]; do sleep 2; done'
+          echo "Certificates generated"
 
-      - name: Build frontend image
-        uses: docker/build-push-action@v6
-        with:
-          context: ./frontend
-          file: ./frontend/Dockerfile
-          load: true
-          tags: integr8scode-frontend:latest
-          cache-from: type=gha,scope=frontend
-          cache-to: type=gha,mode=max,scope=frontend
+          # Start infra
+          docker compose up -d --no-build --wait --wait-timeout 180 \
+            mongo redis zookeeper-certgen zookeeper kafka schema-registry
 
-      - name: Start full stack
-        run: |
-          docker compose -f docker-compose.ci.yaml --profile full up -d --wait --wait-timeout 300
-          docker compose -f docker-compose.ci.yaml ps
+          # Start backend + workers (Docker Compose handles init job dependencies via service_completed_successfully)
+          docker compose up -d --no-build --wait --wait-timeout 180 \
+            backend coordinator saga-orchestrator k8s-worker pod-monitor result-processor
 
-      - name: Seed test users
-        run: |
-          docker compose -f docker-compose.ci.yaml exec -T backend uv run python scripts/seed_users.py
+          # Start frontend
+          docker compose up -d --no-build --wait --wait-timeout 60 frontend
+
+          docker compose ps
 
       - name: Run E2E tests
         working-directory: frontend
@@ -185,10 +164,10 @@ jobs:
         if: failure()
         run: |
           mkdir -p logs
-          docker compose -f docker-compose.ci.yaml logs > logs/docker-compose.log 2>&1
-          docker compose -f docker-compose.ci.yaml logs backend > logs/backend.log 2>&1
-          docker compose -f docker-compose.ci.yaml logs frontend > logs/frontend.log 2>&1
-          docker compose -f docker-compose.ci.yaml logs kafka > logs/kafka.log 2>&1
+          docker compose logs > logs/docker-compose.log 2>&1
+          docker compose logs backend > logs/backend.log 2>&1
+          docker compose logs frontend > logs/frontend.log 2>&1
+          docker compose logs kafka > logs/kafka.log 2>&1
           kubectl get events --sort-by='.metadata.creationTimestamp' -A > logs/k8s-events.log 2>&1 || true
 
       - name: Upload logs
diff --git a/backend/app/api/routes/execution.py b/backend/app/api/routes/execution.py
index 37723a01..86d4336f 100644
--- a/backend/app/api/routes/execution.py
+++ b/backend/app/api/routes/execution.py
@@ -163,12 +163,6 @@ async def cancel_execution(
     cancel_request: CancelExecutionRequest,
     event_service: FromDishka[KafkaEventService],
 ) -> CancelResponse:
-    # Handle terminal states
-    terminal_states = [ExecutionStatus.COMPLETED, ExecutionStatus.FAILED, ExecutionStatus.TIMEOUT]
-
-    if execution.status in terminal_states:
-        raise HTTPException(status_code=400, detail=f"Cannot cancel execution in {str(execution.status)} state")
-
     # Handle idempotency - if already cancelled, return success
     if execution.status == ExecutionStatus.CANCELLED:
         return CancelResponse(
@@ -178,6 +172,10 @@ async def cancel_execution(
             event_id="-1",  # exact event_id unknown
         )
 
+    # Reject cancellation for other terminal states
+    if execution.status.is_terminal:
+        raise HTTPException(status_code=400, detail=f"Cannot cancel execution in {execution.status} state")
+
     settings = get_settings()
     payload = {
         "execution_id": execution.execution_id,
diff --git a/backend/app/core/container.py b/backend/app/core/container.py
index 97411a49..3faf31e5 100644
--- a/backend/app/core/container.py
+++ b/backend/app/core/container.py
@@ -118,6 +118,7 @@ def create_pod_monitor_container(settings: Settings) -> AsyncContainer:
         SettingsProvider(),
         LoggingProvider(),
         DatabaseProvider(),
+        RedisProvider(),
         CoreServicesProvider(),
         MetricsProvider(),
         RepositoryProvider(),
diff --git a/backend/app/core/k8s_clients.py b/backend/app/core/k8s_clients.py
index 2a475df3..051c459f 100644
--- a/backend/app/core/k8s_clients.py
+++ b/backend/app/core/k8s_clients.py
@@ -1,33 +1,36 @@
 import logging
 from dataclasses import dataclass
 
-from kubernetes import client as k8s_client
-from kubernetes import config as k8s_config
+from kubernetes_asyncio import client as k8s_client
+from kubernetes_asyncio import config as k8s_config
 
 
 @dataclass(frozen=True)
 class K8sClients:
+    """Container for Kubernetes API clients (kubernetes_asyncio)."""
+
     api_client: k8s_client.ApiClient
     v1: k8s_client.CoreV1Api
     apps_v1: k8s_client.AppsV1Api
     networking_v1: k8s_client.NetworkingV1Api
 
 
-def create_k8s_clients(
+async def create_k8s_clients(
     logger: logging.Logger, kubeconfig_path: str | None = None, in_cluster: bool | None = None
 ) -> K8sClients:
+    """Create Kubernetes API clients (async for kubernetes_asyncio)."""
     if in_cluster:
         k8s_config.load_incluster_config()
-    elif kubeconfig_path:
-        k8s_config.load_kube_config(config_file=kubeconfig_path)
     else:
-        k8s_config.load_kube_config()
+        await k8s_config.load_kube_config(config_file=kubeconfig_path)  # None → default ~/.kube/config
+
+    # Create API client for kubernetes_asyncio
+    api_client = k8s_client.ApiClient()
+    configuration = api_client.configuration
 
-    configuration = k8s_client.Configuration.get_default_copy()
     logger.info(f"Kubernetes API host: {configuration.host}")
     logger.info(f"SSL CA configured: {configuration.ssl_ca_cert is not None}")
 
-    api_client = k8s_client.ApiClient(configuration)
     return K8sClients(
         api_client=api_client,
         v1=k8s_client.CoreV1Api(api_client),
@@ -36,7 +39,7 @@ def create_k8s_clients(
     )
 
 
-def close_k8s_clients(clients: K8sClients) -> None:
-    close = getattr(clients.api_client, "close", None)
-    if callable(close):
-        close()
+async def close_k8s_clients(clients: K8sClients) -> None:
+    """Close Kubernetes API client (async for kubernetes_asyncio)."""
+    if clients.api_client:
+        await clients.api_client.close()
diff --git a/backend/app/core/providers.py b/backend/app/core/providers.py
index c1f29693..2d73ad8a 100644
--- a/backend/app/core/providers.py
+++ b/backend/app/core/providers.py
@@ -155,7 +155,7 @@ async def get_kafka_producer(
         self, settings: Settings, schema_registry: SchemaRegistryManager, logger: logging.Logger
     ) -> AsyncIterator[UnifiedProducer]:
         config = ProducerConfig(bootstrap_servers=settings.KAFKA_BOOTSTRAP_SERVERS)
-        async with UnifiedProducer(config, schema_registry, logger) as producer:
+        async with UnifiedProducer(config, schema_registry, settings, logger) as producer:
             yield producer
 
     @provide
@@ -227,11 +227,11 @@ class KubernetesProvider(Provider):
 
     @provide
     async def get_k8s_clients(self, settings: Settings, logger: logging.Logger) -> AsyncIterator[K8sClients]:
-        clients = create_k8s_clients(logger)
+        clients = await create_k8s_clients(logger)
         try:
             yield clients
         finally:
-            close_k8s_clients(clients)
+            await close_k8s_clients(clients)
 
 
 class MetricsProvider(Provider):
@@ -651,7 +651,6 @@ async def get_kubernetes_worker(
         kafka_producer: UnifiedProducer,
         schema_registry: SchemaRegistryManager,
         settings: Settings,
-        event_store: EventStore,
         idempotency_manager: IdempotencyManager,
         logger: logging.Logger,
     ) -> AsyncIterator[KubernetesWorker]:
@@ -661,7 +660,6 @@ async def get_kubernetes_worker(
             producer=kafka_producer,
             schema_registry_manager=schema_registry,
             settings=settings,
-            event_store=event_store,
             idempotency_manager=idempotency_manager,
             logger=logger,
         ) as worker:
diff --git a/backend/app/db/docs/replay.py b/backend/app/db/docs/replay.py
index b707cd0e..a399f630 100644
--- a/backend/app/db/docs/replay.py
+++ b/backend/app/db/docs/replay.py
@@ -13,17 +13,27 @@
 class ReplayFilter(BaseModel):
     """Replay filter configuration (embedded document).
 
-    Copied from domain/replay/models.py ReplayFilter.
+    Must match domain/replay/models.py ReplayFilter exactly.
     """
 
+    # Event selection filters
+    event_ids: List[str] | None = None
     execution_id: str | None = None
+    correlation_id: str | None = None
+    aggregate_id: str | None = None
     event_types: List[EventType] | None = None
+    exclude_event_types: List[EventType] | None = None
+
+    # Time range
     start_time: datetime | None = None
     end_time: datetime | None = None
+
+    # Metadata filters
     user_id: str | None = None
     service_name: str | None = None
+
+    # Escape hatch for complex queries
     custom_query: Dict[str, Any] | None = None
-    exclude_event_types: List[EventType] | None = None
 
     model_config = ConfigDict(from_attributes=True)
 
@@ -43,7 +53,7 @@ class ReplayConfig(BaseModel):
     batch_size: int = Field(default=100, ge=1, le=1000)
     max_events: int | None = Field(default=None, ge=1)
 
-    target_topics: Dict[str, str] | None = None  # EventType -> topic mapping as strings
+    target_topics: Dict[EventType, str] | None = None
     target_file_path: str | None = None
 
     skip_errors: bool = True
diff --git a/backend/app/db/repositories/event_repository.py b/backend/app/db/repositories/event_repository.py
index b7cbc245..7cb2a48e 100644
--- a/backend/app/db/repositories/event_repository.py
+++ b/backend/app/db/repositories/event_repository.py
@@ -1,10 +1,10 @@
 import logging
-from dataclasses import asdict
-from datetime import datetime, timedelta, timezone
+from dataclasses import asdict, fields
+from datetime import datetime, timezone
 from typing import Any, Mapping
 
 from beanie.odm.enums import SortDirection
-from beanie.operators import GTE, LT, LTE, In, Not, Or, RegEx
+from beanie.operators import GTE, LTE, In, Not, Or, RegEx
 
 from app.core.tracing import EventAttributes
 from app.core.tracing.utils import add_span_attributes
@@ -55,50 +55,12 @@ async def store_event(self, event: Event) -> str:
         self.logger.debug(f"Stored event {event.event_id} of type {event.event_type}")
         return event.event_id
 
-    async def store_events_batch(self, events: list[Event]) -> list[str]:
-        if not events:
-            return []
-        now = datetime.now(timezone.utc)
-        docs = []
-        for event in events:
-            data = asdict(event)
-            if not data.get("stored_at"):
-                data["stored_at"] = now
-            # Remove None values so EventDocument defaults can apply
-            data = {k: v for k, v in data.items() if v is not None}
-            docs.append(EventDocument(**data))
-        await EventDocument.insert_many(docs)
-        add_span_attributes(**{"events.batch.count": len(events)})
-        self.logger.info(f"Stored {len(events)} events in batch")
-        return [event.event_id for event in events]
-
     async def get_event(self, event_id: str) -> Event | None:
         doc = await EventDocument.find_one({"event_id": event_id})
         if not doc:
             return None
         return Event(**doc.model_dump(exclude={"id", "revision_id"}))
 
-    async def get_events_by_type(
-        self,
-        event_type: str,
-        start_time: datetime | None = None,
-        end_time: datetime | None = None,
-        limit: int = 100,
-        skip: int = 0,
-    ) -> list[Event]:
-        conditions = [
-            EventDocument.event_type == event_type,
-            *self._time_conditions(start_time, end_time),
-        ]
-        docs = (
-            await EventDocument.find(*conditions)
-            .sort([("timestamp", SortDirection.DESCENDING)])
-            .skip(skip)
-            .limit(limit)
-            .to_list()
-        )
-        return [Event(**d.model_dump(exclude={"id", "revision_id"})) for d in docs]
-
     async def get_events_by_aggregate(
         self, aggregate_id: str, event_types: list[EventType] | None = None, limit: int = 100
     ) -> list[Event]:
@@ -125,30 +87,6 @@ async def get_events_by_correlation(self, correlation_id: str, limit: int = 100,
             has_more=(skip + limit) < total_count,
         )
 
-    async def get_events_by_user(
-        self,
-        user_id: str,
-        event_types: list[str] | None = None,
-        start_time: datetime | None = None,
-        end_time: datetime | None = None,
-        limit: int = 100,
-        skip: int = 0,
-    ) -> list[Event]:
-        conditions = [
-            EventDocument.metadata.user_id == user_id,
-            In(EventDocument.event_type, event_types) if event_types else None,
-            *self._time_conditions(start_time, end_time),
-        ]
-        conditions = [c for c in conditions if c is not None]
-        docs = (
-            await EventDocument.find(*conditions)
-            .sort([("timestamp", SortDirection.DESCENDING)])
-            .skip(skip)
-            .limit(limit)
-            .to_list()
-        )
-        return [Event(**d.model_dump(exclude={"id", "revision_id"})) for d in docs]
-
     async def get_execution_events(
         self, execution_id: str, limit: int = 100, skip: int = 0, exclude_system_events: bool = False
     ) -> EventListResult:
@@ -240,26 +178,6 @@ async def get_event_statistics(
 
         return EventStatistics(total_events=0, events_by_type={}, events_by_service={}, events_by_hour=[])
 
-    async def cleanup_old_events(
-        self, older_than_days: int = 30, event_types: list[str] | None = None, dry_run: bool = False
-    ) -> int:
-        cutoff_dt = datetime.now(timezone.utc) - timedelta(days=older_than_days)
-        conditions: list[Any] = [
-            LT(EventDocument.timestamp, cutoff_dt),
-            In(EventDocument.event_type, event_types) if event_types else None,
-        ]
-        conditions = [c for c in conditions if c is not None]
-
-        if dry_run:
-            count = await EventDocument.find(*conditions).count()
-            self.logger.info(f"Would delete {count} events older than {older_than_days} days")
-            return count
-
-        result = await EventDocument.find(*conditions).delete()
-        deleted_count = result.deleted_count if result else 0
-        self.logger.info(f"Deleted {deleted_count} events older than {older_than_days} days")
-        return deleted_count
-
     async def get_user_events_paginated(
         self,
         user_id: str,
@@ -290,9 +208,6 @@ async def get_user_events_paginated(
             has_more=(skip + limit) < total_count,
         )
 
-    async def count_events(self, *conditions: Any) -> int:
-        return await EventDocument.find(*conditions).count()
-
     async def query_events(
         self,
         query: dict[str, Any],
@@ -338,15 +253,7 @@ async def delete_event_with_archival(
 
         deleted_at = datetime.now(timezone.utc)
         archived_doc = EventArchiveDocument(
-            event_id=doc.event_id,
-            event_type=doc.event_type,
-            event_version=doc.event_version,
-            timestamp=doc.timestamp,
-            metadata=doc.metadata,
-            payload=doc.payload,
-            aggregate_id=doc.aggregate_id,
-            stored_at=doc.stored_at,
-            ttl_expires_at=doc.ttl_expires_at,
+            **doc.model_dump(exclude={"id", "revision_id"}),
             deleted_at=deleted_at,
             deleted_by=deleted_by,
             deletion_reason=deletion_reason,
@@ -360,9 +267,6 @@ async def delete_event_with_archival(
             deletion_reason=deletion_reason,
         )
 
-    async def get_aggregate_events_for_replay(self, aggregate_id: str, limit: int = 10000) -> list[Event]:
-        return await self.get_events_by_aggregate(aggregate_id=aggregate_id, limit=limit)
-
     async def get_aggregate_replay_info(self, aggregate_id: str) -> EventReplayInfo | None:
         pipeline = [
             {"$match": {"aggregate_id": aggregate_id}},
@@ -380,14 +284,12 @@ async def get_aggregate_replay_info(self, aggregate_id: str) -> EventReplayInfo
             {"$project": {"_id": 0}},
         ]
 
-        async for doc in EventDocument.aggregate(pipeline):
-            events = [Event(**e) for e in doc["events"]]
-            return EventReplayInfo(
-                events=events,
-                event_count=doc["event_count"],
-                event_types=doc["event_types"],
-                start_time=doc["start_time"],
-                end_time=doc["end_time"],
-            )
-
-        return None
+        doc = await anext(EventDocument.aggregate(pipeline), None)
+        if not doc:
+            return None
+        # Only pass keys that Event dataclass accepts (filters out _id, revision_id, etc.)
+        event_keys = {f.name for f in fields(Event)}
+        return EventReplayInfo(
+            events=[Event(**{k: v for k, v in e.items() if k in event_keys}) for e in doc["events"]],
+            **{k: v for k, v in doc.items() if k != "events"},
+        )
diff --git a/backend/app/db/repositories/replay_repository.py b/backend/app/db/repositories/replay_repository.py
index 387f489a..c0732bfa 100644
--- a/backend/app/db/repositories/replay_repository.py
+++ b/backend/app/db/repositories/replay_repository.py
@@ -66,9 +66,6 @@ async def delete_old_sessions(self, cutoff_time: datetime) -> int:
         ).delete()
         return result.deleted_count if result else 0
 
-    async def count_sessions(self, *conditions: Any) -> int:
-        return await ReplaySessionDocument.find(*conditions).count()
-
     async def update_replay_session(self, session_id: str, updates: ReplaySessionUpdate) -> bool:
         update_dict = {k: (v.value if hasattr(v, "value") else v) for k, v in asdict(updates).items() if v is not None}
         if not update_dict:
diff --git a/backend/app/dlq/manager.py b/backend/app/dlq/manager.py
index f64ca00f..f6f76705 100644
--- a/backend/app/dlq/manager.py
+++ b/backend/app/dlq/manager.py
@@ -6,6 +6,7 @@
 
 from confluent_kafka import Consumer, KafkaError, Message, Producer
 from opentelemetry.trace import SpanKind
+from pymongo.errors import DuplicateKeyError
 
 from app.core.lifecycle import LifecycleEnabled
 from app.core.metrics.context import get_dlq_metrics
@@ -251,8 +252,9 @@ async def _process_dlq_message(self, message: DLQMessage) -> None:
                 self.logger.info("Message filtered out", extra={"event_id": message.event_id})
                 return
 
-        # Store in MongoDB via Beanie
-        await self._store_message(message)
+        # Store in MongoDB via Beanie (returns False if already processed)
+        if not await self._store_message(message):
+            return
 
         # Get retry policy for topic
         retry_policy = self._retry_policies.get(message.original_topic, self.default_retry_policy)
@@ -275,18 +277,26 @@ async def _process_dlq_message(self, message: DLQMessage) -> None:
         if retry_policy.strategy == RetryStrategy.IMMEDIATE:
             await self._retry_message(message)
 
-    async def _store_message(self, message: DLQMessage) -> None:
-        # Ensure message has proper status and timestamps
+    async def _store_message(self, message: DLQMessage) -> bool:
+        """Store message. Skip only if already terminal (DISCARDED/RETRIED)."""
+        existing = await DLQMessageDocument.find_one({"event_id": message.event_id})
+
+        if existing and existing.status in {DLQMessageStatus.DISCARDED, DLQMessageStatus.RETRIED}:
+            return False
+
         message.status = DLQMessageStatus.PENDING
         message.last_updated = datetime.now(timezone.utc)
-
         doc = self._message_to_doc(message)
 
-        # Upsert using Beanie
-        existing = await DLQMessageDocument.find_one({"event_id": message.event_id})
         if existing:
             doc.id = existing.id
-        await doc.save()
+
+        try:
+            await doc.save()
+        except DuplicateKeyError:
+            return False  # Lost race - Kafka will redeliver
+
+        return True
 
     async def _update_message_status(self, event_id: str, update: DLQMessageUpdate) -> None:
         doc = await DLQMessageDocument.find_one({"event_id": event_id})
@@ -467,11 +477,13 @@ def create_dlq_manager(
     dlq_topic: KafkaTopic = KafkaTopic.DEAD_LETTER_QUEUE,
     retry_topic_suffix: str = "-retry",
     default_retry_policy: RetryPolicy | None = None,
+    group_id_suffix: str | None = None,
 ) -> DLQManager:
+    suffix = group_id_suffix or settings.KAFKA_GROUP_SUFFIX
     consumer = Consumer(
         {
             "bootstrap.servers": settings.KAFKA_BOOTSTRAP_SERVERS,
-            "group.id": f"{GroupId.DLQ_MANAGER}.{settings.KAFKA_GROUP_SUFFIX}",
+            "group.id": f"{GroupId.DLQ_MANAGER}.{suffix}",
             "enable.auto.commit": False,
             "auto.offset.reset": "earliest",
             "client.id": "dlq-manager-consumer",
diff --git a/backend/app/domain/enums/execution.py b/backend/app/domain/enums/execution.py
index abb4809d..28f4e4fb 100644
--- a/backend/app/domain/enums/execution.py
+++ b/backend/app/domain/enums/execution.py
@@ -12,3 +12,14 @@ class ExecutionStatus(StringEnum):
     TIMEOUT = "timeout"
     CANCELLED = "cancelled"
     ERROR = "error"
+
+    @property
+    def is_terminal(self) -> bool:
+        """True if this status represents a final state (no further transitions)."""
+        return self in (
+            ExecutionStatus.COMPLETED,
+            ExecutionStatus.FAILED,
+            ExecutionStatus.TIMEOUT,
+            ExecutionStatus.CANCELLED,
+            ExecutionStatus.ERROR,
+        )
diff --git a/backend/app/events/core/consumer.py b/backend/app/events/core/consumer.py
index ab5656d5..fb4be960 100644
--- a/backend/app/events/core/consumer.py
+++ b/backend/app/events/core/consumer.py
@@ -1,6 +1,7 @@
 import asyncio
 import json
 import logging
+import threading
 from collections.abc import Awaitable, Callable
 from datetime import datetime, timezone
 from typing import Any
@@ -20,6 +21,10 @@
 from .dispatcher import EventDispatcher
 from .types import ConsumerConfig, ConsumerMetrics, ConsumerMetricsSnapshot, ConsumerState, ConsumerStatus
 
+# Global lock to serialize Consumer initialization (workaround for librdkafka race condition)
+# See: https://github.com/confluentinc/confluent-kafka-python/issues/1797
+_consumer_init_lock = threading.Lock()
+
 
 class UnifiedConsumer:
     def __init__(
@@ -52,7 +57,9 @@ async def start(self, topics: list[KafkaTopic]) -> None:
         if self._stats_callback:
             consumer_config["stats_cb"] = self._handle_stats
 
-        self._consumer = Consumer(consumer_config)
+        # Serialize Consumer initialization to prevent librdkafka race condition
+        with _consumer_init_lock:
+            self._consumer = Consumer(consumer_config)
         topic_strings = [f"{self._topic_prefix}{str(topic)}" for topic in topics]
         self._consumer.subscribe(topic_strings)
         self._running = True
diff --git a/backend/app/events/core/producer.py b/backend/app/events/core/producer.py
index b45858ea..f62b7481 100644
--- a/backend/app/events/core/producer.py
+++ b/backend/app/events/core/producer.py
@@ -15,7 +15,7 @@
 from app.domain.enums.kafka import KafkaTopic
 from app.events.schema.schema_registry import SchemaRegistryManager
 from app.infrastructure.kafka.events import BaseEvent
-from app.settings import get_settings
+from app.settings import Settings
 
 from .types import ProducerConfig, ProducerMetrics, ProducerState
 
@@ -32,6 +32,7 @@ def __init__(
         self,
         config: ProducerConfig,
         schema_registry_manager: SchemaRegistryManager,
+        settings: Settings,
         logger: logging.Logger,
         stats_callback: StatsCallback | None = None,
     ):
@@ -45,8 +46,8 @@ def __init__(
         self._metrics = ProducerMetrics()
         self._event_metrics = get_event_metrics()  # Singleton for Kafka metrics
         self._poll_task: asyncio.Task[None] | None = None
-        # Topic prefix (for tests/local isolation); cached on init
-        self._topic_prefix = get_settings().KAFKA_TOPIC_PREFIX
+        # Topic prefix (for tests/local isolation); use injected settings
+        self._topic_prefix = settings.KAFKA_TOPIC_PREFIX
 
     @property
     def is_running(self) -> bool:
diff --git a/backend/app/services/k8s_worker/pod_builder.py b/backend/app/services/k8s_worker/pod_builder.py
index c4db7a48..8327bc53 100644
--- a/backend/app/services/k8s_worker/pod_builder.py
+++ b/backend/app/services/k8s_worker/pod_builder.py
@@ -1,4 +1,4 @@
-from kubernetes import client as k8s_client
+from kubernetes_asyncio import client as k8s_client
 
 from app.infrastructure.kafka.events.saga import CreatePodCommandEvent
 from app.services.k8s_worker.config import K8sWorkerConfig
diff --git a/backend/app/services/k8s_worker/worker.py b/backend/app/services/k8s_worker/worker.py
index 8bad97c2..3a73edb0 100644
--- a/backend/app/services/k8s_worker/worker.py
+++ b/backend/app/services/k8s_worker/worker.py
@@ -5,9 +5,9 @@
 from pathlib import Path
 from typing import Any
 
-from kubernetes import client as k8s_client
-from kubernetes import config as k8s_config
-from kubernetes.client.rest import ApiException
+from kubernetes_asyncio import client as k8s_client
+from kubernetes_asyncio import config as k8s_config
+from kubernetes_asyncio.client.exceptions import ApiException
 
 from app.core.lifecycle import LifecycleEnabled
 from app.core.metrics import ExecutionMetrics, KubernetesMetrics
@@ -15,7 +15,6 @@
 from app.domain.enums.kafka import KafkaTopic
 from app.domain.enums.storage import ExecutionErrorType
 from app.events.core import ConsumerConfig, EventDispatcher, UnifiedConsumer, UnifiedProducer
-from app.events.event_store import EventStore
 from app.events.schema.schema_registry import (
     SchemaRegistryManager,
 )
@@ -52,7 +51,6 @@ def __init__(
         producer: UnifiedProducer,
         schema_registry_manager: SchemaRegistryManager,
         settings: Settings,
-        event_store: EventStore,
         idempotency_manager: IdempotencyManager,
         logger: logging.Logger,
     ):
@@ -64,9 +62,9 @@ def __init__(
         self._settings = settings
 
         self.kafka_servers = self.config.kafka_bootstrap_servers or self._settings.KAFKA_BOOTSTRAP_SERVERS
-        self._event_store = event_store
 
-        # Kubernetes clients
+        # Kubernetes clients (kubernetes_asyncio)
+        self._api_client: k8s_client.ApiClient | None = None
         self.v1: k8s_client.CoreV1Api | None = None
         self.networking_v1: k8s_client.NetworkingV1Api | None = None
         self.apps_v1: k8s_client.AppsV1Api | None = None
@@ -94,8 +92,8 @@ async def _on_start(self) -> None:
                 "KubernetesWorker namespace 'default' is forbidden. Set K8S_NAMESPACE to a dedicated namespace."
             )
 
-        # Initialize Kubernetes client
-        self._initialize_kubernetes_client()
+        # Initialize Kubernetes client (async for kubernetes_asyncio)
+        await self._initialize_kubernetes_client()
         self.logger.info("DEBUG: Kubernetes client initialized")
 
         self.logger.info("Using provided producer")
@@ -166,45 +164,47 @@ async def _on_stop(self) -> None:
         # Close idempotency manager
         await self.idempotency_manager.close()
 
+        # Close Kubernetes API client (kubernetes_asyncio requires explicit close)
+        if self._api_client:
+            await self._api_client.close()
+            self._api_client = None
+
         # Note: producer is managed by DI container, not stopped here
 
         self.logger.info("KubernetesWorker service stopped")
 
-    def _initialize_kubernetes_client(self) -> None:
-        """Initialize Kubernetes API clients"""
+    async def _initialize_kubernetes_client(self) -> None:
+        """Initialize Kubernetes API clients (async for kubernetes_asyncio)."""
         try:
-            # Load config
+            # Load config (async for kubernetes_asyncio)
             if self.config.in_cluster:
                 self.logger.info("Using in-cluster Kubernetes configuration")
                 k8s_config.load_incluster_config()
             elif self.config.kubeconfig_path and os.path.exists(self.config.kubeconfig_path):
                 self.logger.info(f"Using kubeconfig from {self.config.kubeconfig_path}")
-                k8s_config.load_kube_config(config_file=self.config.kubeconfig_path)
+                await k8s_config.load_kube_config(config_file=self.config.kubeconfig_path)
+            elif os.path.exists("/var/run/secrets/kubernetes.io/serviceaccount"):
+                self.logger.info("Auto-detected in-cluster environment")
+                k8s_config.load_incluster_config()
             else:
-                # Try default locations
-                if os.path.exists("/var/run/secrets/kubernetes.io/serviceaccount"):
-                    self.logger.info("Detected in-cluster environment")
-                    k8s_config.load_incluster_config()
-                else:
-                    self.logger.info("Using default kubeconfig")
-                    k8s_config.load_kube_config()
+                self.logger.info("Using default kubeconfig")
+                await k8s_config.load_kube_config()  # None → ~/.kube/config
 
-            # Get the default configuration that was set by load_kube_config
-            configuration = k8s_client.Configuration.get_default_copy()
+            # Create API client for kubernetes_asyncio
+            self._api_client = k8s_client.ApiClient()
+            configuration = self._api_client.configuration
 
-            # The certificate data should already be configured by load_kube_config
             # Log the configuration for debugging
             self.logger.info(f"Kubernetes API host: {configuration.host}")
             self.logger.info(f"SSL CA cert configured: {configuration.ssl_ca_cert is not None}")
 
-            # Create API clients with the configuration
-            api_client = k8s_client.ApiClient(configuration)
-            self.v1 = k8s_client.CoreV1Api(api_client)
-            self.networking_v1 = k8s_client.NetworkingV1Api(api_client)
-            self.apps_v1 = k8s_client.AppsV1Api(api_client)
+            # Create API clients with the shared api_client
+            self.v1 = k8s_client.CoreV1Api(self._api_client)
+            self.networking_v1 = k8s_client.NetworkingV1Api(self._api_client)
+            self.apps_v1 = k8s_client.AppsV1Api(self._api_client)
 
-            # Test connection with namespace-scoped operation
-            _ = self.v1.list_namespaced_pod(namespace=self.config.namespace, limit=1)
+            # Test connection with namespace-scoped operation (native async)
+            await self.v1.list_namespaced_pod(namespace=self.config.namespace, limit=1)
             self.logger.info(f"Successfully connected to Kubernetes API, namespace {self.config.namespace} accessible")
 
         except Exception as e:
@@ -241,23 +241,20 @@ async def _handle_delete_pod_command(self, command: DeletePodCommandEvent) -> No
         self.logger.info(f"Deleting pod for execution {execution_id} due to: {command.reason}")
 
         try:
-            # Delete the pod
+            # Delete the pod (native async with kubernetes_asyncio)
             pod_name = f"executor-{execution_id}"
             if self.v1:
-                await asyncio.to_thread(
-                    self.v1.delete_namespaced_pod,
+                await self.v1.delete_namespaced_pod(
                     name=pod_name,
                     namespace=self.config.namespace,
                     grace_period_seconds=30,
                 )
                 self.logger.info(f"Successfully deleted pod {pod_name}")
 
-            # Delete associated ConfigMap
+            # Delete associated ConfigMap (native async)
             configmap_name = f"script-{execution_id}"
             if self.v1:
-                await asyncio.to_thread(
-                    self.v1.delete_namespaced_config_map, name=configmap_name, namespace=self.config.namespace
-                )
+                await self.v1.delete_namespaced_config_map(name=configmap_name, namespace=self.config.namespace)
                 self.logger.info(f"Successfully deleted ConfigMap {configmap_name}")
 
             # NetworkPolicy cleanup is managed via a static cluster policy; no per-execution NP deletion
@@ -344,13 +341,11 @@ async def _get_entrypoint_script(self) -> str:
 """
 
     async def _create_config_map(self, config_map: k8s_client.V1ConfigMap) -> None:
-        """Create ConfigMap in Kubernetes"""
+        """Create ConfigMap in Kubernetes (native async with kubernetes_asyncio)."""
         if not self.v1:
             raise RuntimeError("Kubernetes client not initialized")
         try:
-            await asyncio.to_thread(
-                self.v1.create_namespaced_config_map, namespace=self.config.namespace, body=config_map
-            )
+            await self.v1.create_namespaced_config_map(namespace=self.config.namespace, body=config_map)
             self.metrics.record_k8s_config_map_created("success")
             self.logger.debug(f"Created ConfigMap {config_map.metadata.name}")
         except ApiException as e:
@@ -362,11 +357,11 @@ async def _create_config_map(self, config_map: k8s_client.V1ConfigMap) -> None:
                 raise
 
     async def _create_pod(self, pod: k8s_client.V1Pod) -> None:
-        """Create Pod in Kubernetes"""
+        """Create Pod in Kubernetes (native async with kubernetes_asyncio)."""
         if not self.v1:
             raise RuntimeError("Kubernetes client not initialized")
         try:
-            await asyncio.to_thread(self.v1.create_namespaced_pod, namespace=self.config.namespace, body=pod)
+            await self.v1.create_namespaced_pod(namespace=self.config.namespace, body=pod)
             self.logger.debug(f"Created Pod {pod.metadata.name}")
         except ApiException as e:
             if e.status == 409:  # Already exists
@@ -478,20 +473,17 @@ async def ensure_image_pre_puller_daemonset(self) -> None:
             }
 
             try:
-                await asyncio.to_thread(
-                    self.apps_v1.read_namespaced_daemon_set, name=daemonset_name, namespace=namespace
-                )
+                # Native async calls with kubernetes_asyncio
+                await self.apps_v1.read_namespaced_daemon_set(name=daemonset_name, namespace=namespace)
                 self.logger.info(f"DaemonSet '{daemonset_name}' exists. Replacing to ensure it is up-to-date.")
-                await asyncio.to_thread(
-                    self.apps_v1.replace_namespaced_daemon_set, name=daemonset_name, namespace=namespace, body=manifest
+                await self.apps_v1.replace_namespaced_daemon_set(
+                    name=daemonset_name, namespace=namespace, body=manifest
                 )
                 self.logger.info(f"DaemonSet '{daemonset_name}' replaced successfully.")
             except ApiException as e:
                 if e.status == 404:
                     self.logger.info(f"DaemonSet '{daemonset_name}' not found. Creating...")
-                    await asyncio.to_thread(
-                        self.apps_v1.create_namespaced_daemon_set, namespace=namespace, body=manifest
-                    )
+                    await self.apps_v1.create_namespaced_daemon_set(namespace=namespace, body=manifest)
                     self.logger.info(f"DaemonSet '{daemonset_name}' created successfully.")
                 else:
                     raise
diff --git a/backend/app/services/pod_monitor/event_mapper.py b/backend/app/services/pod_monitor/event_mapper.py
index c608035a..db02623b 100644
--- a/backend/app/services/pod_monitor/event_mapper.py
+++ b/backend/app/services/pod_monitor/event_mapper.py
@@ -1,10 +1,11 @@
 import ast
 import json
 import logging
+from collections.abc import Awaitable
 from dataclasses import dataclass
 from typing import Protocol
 
-from kubernetes import client as k8s_client
+from kubernetes_asyncio import client as k8s_client
 
 from app.domain.enums.kafka import GroupId
 from app.domain.enums.storage import ExecutionErrorType
@@ -49,9 +50,9 @@ class PodLogs:
 
 
 class EventMapper(Protocol):
-    """Protocol for event mapping functions"""
+    """Protocol for async event mapping functions"""
 
-    def __call__(self, ctx: PodContext) -> BaseEvent | None: ...
+    def __call__(self, ctx: PodContext) -> Awaitable[BaseEvent | None]: ...
 
 
 class PodEventMapper:
@@ -75,8 +76,8 @@ def __init__(self, logger: logging.Logger, k8s_api: k8s_client.CoreV1Api | None
             "DELETED": [self._map_terminated],
         }
 
-    def map_pod_event(self, pod: k8s_client.V1Pod, event_type: str) -> EventList:
-        """Map a Kubernetes pod to application events"""
+    async def map_pod_event(self, pod: k8s_client.V1Pod, event_type: str) -> EventList:
+        """Map a Kubernetes pod to application events (async for kubernetes_asyncio)"""
         self.logger.info(
             f"POD-EVENT: type={event_type} name={getattr(pod.metadata, 'name', None)} "
             f"ns={getattr(pod.metadata, 'namespace', None)} phase={getattr(pod.status, 'phase', None)}"
@@ -114,7 +115,7 @@ def map_pod_event(self, pod: k8s_client.V1Pod, event_type: str) -> EventList:
         events: list[BaseEvent] = []
 
         # Check for timeout first - if pod timed out, only return timeout event
-        if timeout_event := self._check_timeout(ctx):
+        if timeout_event := await self._check_timeout(ctx):
             self.logger.info(
                 f"POD-EVENT: mapped TIMEOUT exec={ctx.execution_id} phase={ctx.phase} "
                 f"adl={getattr(getattr(pod, 'spec', None), 'active_deadline_seconds', None)}"
@@ -135,16 +136,16 @@ def map_pod_event(self, pod: k8s_client.V1Pod, event_type: str) -> EventList:
             )
             return events
 
-        # Phase-based mappers
+        # Phase-based mappers (async)
         for mapper in self._phase_mappers.get(phase, []):
-            if event := mapper(ctx):
+            if event := await mapper(ctx):
                 mapper_name = getattr(mapper, "__name__", repr(mapper))
                 self.logger.info(f"POD-EVENT: phase-map {mapper_name} -> {event.event_type} exec={ctx.execution_id}")
                 events.append(event)
 
-        # Event type mappers
+        # Event type mappers (async)
         for mapper in self._event_type_mappers.get(event_type, []):
-            if event := mapper(ctx):
+            if event := await mapper(ctx):
                 mapper_name = getattr(mapper, "__name__", repr(mapper))
                 self.logger.info(f"POD-EVENT: type-map {mapper_name} -> {event.event_type} exec={ctx.execution_id}")
                 events.append(event)
@@ -200,7 +201,7 @@ def _is_duplicate(self, pod_name: str, phase: PodPhase) -> bool:
         self._event_cache[pod_name] = phase
         return False
 
-    def _map_scheduled(self, ctx: PodContext) -> PodScheduledEvent | None:
+    async def _map_scheduled(self, ctx: PodContext) -> PodScheduledEvent | None:
         """Map pending pod to scheduled event"""
         # K8s API can return pods without status
         if not ctx.pod.status or not ctx.pod.status.conditions:
@@ -223,7 +224,7 @@ def _map_scheduled(self, ctx: PodContext) -> PodScheduledEvent | None:
         self.logger.debug(f"POD-EVENT: mapped scheduled -> {evt.event_type} exec={ctx.execution_id}")
         return evt
 
-    def _map_running(self, ctx: PodContext) -> PodRunningEvent | None:
+    async def _map_running(self, ctx: PodContext) -> PodRunningEvent | None:
         """Map running pod to running event"""
         # K8s API can return pods without status
         if not ctx.pod.status:
@@ -248,13 +249,13 @@ def _map_running(self, ctx: PodContext) -> PodRunningEvent | None:
         self.logger.debug(f"POD-EVENT: mapped running -> {evt.event_type} exec={ctx.execution_id}")
         return evt
 
-    def _map_completed(self, ctx: PodContext) -> ExecutionCompletedEvent | None:
+    async def _map_completed(self, ctx: PodContext) -> ExecutionCompletedEvent | None:
         """Map succeeded pod to completed event"""
         container = self._get_main_container(ctx.pod)
         if not container or not container.state or not container.state.terminated:
             return None
 
-        logs = self._extract_logs(ctx.pod)
+        logs = await self._extract_logs(ctx.pod)
         if not logs:
             self.logger.error(f"POD-EVENT: failed to extract logs for completed pod exec={ctx.execution_id}")
             return None
@@ -271,20 +272,20 @@ def _map_completed(self, ctx: PodContext) -> ExecutionCompletedEvent | None:
         self.logger.info(f"POD-EVENT: mapped completed exec={ctx.execution_id} exit_code={logs.exit_code}")
         return evt
 
-    def _map_failed_or_completed(self, ctx: PodContext) -> BaseEvent | None:
+    async def _map_failed_or_completed(self, ctx: PodContext) -> BaseEvent | None:
         """Map failed pod to either timeout, completed, or failed"""
         if ctx.pod.status and ctx.pod.status.reason == "DeadlineExceeded":
-            return self._check_timeout(ctx)
+            return await self._check_timeout(ctx)
 
         if self._all_containers_succeeded(ctx.pod):
-            return self._map_completed(ctx)
+            return await self._map_completed(ctx)
 
-        return self._map_failed(ctx)
+        return await self._map_failed(ctx)
 
-    def _map_failed(self, ctx: PodContext) -> ExecutionFailedEvent | None:
+    async def _map_failed(self, ctx: PodContext) -> ExecutionFailedEvent | None:
         """Map failed pod to failed event"""
         error_info = self._analyze_failure(ctx.pod)
-        logs = self._extract_logs(ctx.pod)
+        logs = await self._extract_logs(ctx.pod)
 
         # Use logs data if available, fallback to error_info
         stdout = logs.stdout if logs else ""
@@ -307,7 +308,7 @@ def _map_failed(self, ctx: PodContext) -> ExecutionFailedEvent | None:
         )
         return evt
 
-    def _map_terminated(self, ctx: PodContext) -> PodTerminatedEvent | None:
+    async def _map_terminated(self, ctx: PodContext) -> PodTerminatedEvent | None:
         """Map deleted pod to terminated event"""
         container = self._get_main_container(ctx.pod)
         if not container or not container.state or not container.state.terminated:
@@ -328,11 +329,11 @@ def _map_terminated(self, ctx: PodContext) -> PodTerminatedEvent | None:
         )
         return evt
 
-    def _check_timeout(self, ctx: PodContext) -> ExecutionTimeoutEvent | None:
+    async def _check_timeout(self, ctx: PodContext) -> ExecutionTimeoutEvent | None:
         if not (ctx.pod.status and ctx.pod.status.reason == "DeadlineExceeded"):
             return None
 
-        logs = self._extract_logs(ctx.pod)
+        logs = await self._extract_logs(ctx.pod)
         if not logs:
             self.logger.error(f"POD-EVENT: failed to extract logs for timed out pod exec={ctx.execution_id}")
             return None
@@ -443,7 +444,7 @@ def _analyze_failure(self, pod: k8s_client.V1Pod) -> FailureInfo:
 
         return default
 
-    def _extract_logs(self, pod: k8s_client.V1Pod) -> PodLogs | None:
+    async def _extract_logs(self, pod: k8s_client.V1Pod) -> PodLogs | None:
         """Extract and parse pod logs. Returns None if extraction fails."""
         # Without k8s API or metadata, can't fetch logs
         if not self._k8s_api or not pod.metadata:
@@ -459,7 +460,7 @@ def _extract_logs(self, pod: k8s_client.V1Pod) -> PodLogs | None:
             return None
 
         try:
-            logs = self._k8s_api.read_namespaced_pod_log(
+            logs = await self._k8s_api.read_namespaced_pod_log(
                 name=pod.metadata.name, namespace=pod.metadata.namespace or "integr8scode", tail_lines=10000
             )
 
diff --git a/backend/app/services/pod_monitor/monitor.py b/backend/app/services/pod_monitor/monitor.py
index bdc61583..7c857ab7 100644
--- a/backend/app/services/pod_monitor/monitor.py
+++ b/backend/app/services/pod_monitor/monitor.py
@@ -7,10 +7,10 @@
 from enum import auto
 from typing import Any
 
-from kubernetes import client as k8s_client
-from kubernetes import config as k8s_config
-from kubernetes import watch
-from kubernetes.client.rest import ApiException
+from kubernetes_asyncio import client as k8s_client
+from kubernetes_asyncio import config as k8s_config
+from kubernetes_asyncio import watch
+from kubernetes_asyncio.client.exceptions import ApiException
 
 from app.core.k8s_clients import K8sClients
 from app.core.lifecycle import LifecycleEnabled
@@ -112,8 +112,8 @@ def __init__(
         self.config = config or PodMonitorConfig()
 
         # Kubernetes clients (initialized on start)
+        self._api_client: k8s_client.ApiClient | None = None
         self._v1: k8s_client.CoreV1Api | None = None
-        self._watch: watch.Watch | None = None
         self._clients: K8sClients | None = k8s_clients
 
         # Components
@@ -142,8 +142,8 @@ async def _on_start(self) -> None:
         """Start the pod monitor."""
         self.logger.info("Starting PodMonitor service...")
 
-        # Initialize components
-        self._initialize_kubernetes_client()
+        # Initialize components (async for kubernetes_asyncio)
+        await self._initialize_kubernetes_client()
 
         # Start monitoring
         self._state = MonitorState.RUNNING
@@ -169,9 +169,10 @@ async def _on_stop(self) -> None:
         if tasks:
             await asyncio.gather(*tasks, return_exceptions=True)
 
-        # Close watch
-        if self._watch:
-            self._watch.stop()
+        # Close API client only if we created it (not injected)
+        if self._api_client and self._clients is None:
+            await self._api_client.close()
+        self._api_client = None
 
         # Clear state
         self._tracked_pods.clear()
@@ -180,31 +181,31 @@ async def _on_stop(self) -> None:
         self._state = MonitorState.STOPPED
         self.logger.info("PodMonitor service stopped")
 
-    def _initialize_kubernetes_client(self) -> None:
-        """Initialize Kubernetes API clients."""
+    async def _initialize_kubernetes_client(self) -> None:
+        """Initialize Kubernetes API clients (async for kubernetes_asyncio)."""
         if self._clients is None:
-            match (self.config.in_cluster, self.config.kubeconfig_path):
-                case (True, _):
-                    self.logger.info("Using in-cluster Kubernetes configuration")
-                    k8s_config.load_incluster_config()
-                case (False, path) if path:
-                    self.logger.info(f"Using kubeconfig from {path}")
-                    k8s_config.load_kube_config(config_file=path)
-                case _:
-                    self.logger.info("Using default kubeconfig")
-                    k8s_config.load_kube_config()
-
-            configuration = k8s_client.Configuration.get_default_copy()
+            if self.config.in_cluster:
+                self.logger.info("Using in-cluster Kubernetes configuration")
+                k8s_config.load_incluster_config()
+            else:
+                path = self.config.kubeconfig_path
+                self.logger.info(f"Using kubeconfig from {path or 'default location'}")
+                await k8s_config.load_kube_config(config_file=path)  # None → ~/.kube/config
+
+            # Create API client for kubernetes_asyncio
+            self._api_client = k8s_client.ApiClient()
+            self._v1 = k8s_client.CoreV1Api(self._api_client)
+
+            configuration = self._api_client.configuration
             self.logger.info(f"Kubernetes API host: {configuration.host}")
             self.logger.info(f"SSL CA cert configured: {configuration.ssl_ca_cert is not None}")
-
-            api_client = k8s_client.ApiClient(configuration)
-            self._v1 = k8s_client.CoreV1Api(api_client)
         else:
+            # Use injected clients (for testing)
+            self._api_client = self._clients.api_client
             self._v1 = self._clients.v1
 
-        self._watch = watch.Watch()
-        self._v1.get_api_resources()
+        # Test connection
+        await self._v1.get_api_resources()
         self.logger.info("Successfully connected to Kubernetes API")
         self._event_mapper = PodEventMapper(logger=self.logger, k8s_api=self._v1)
 
@@ -233,8 +234,9 @@ async def _watch_pods(self) -> None:
                 await self._handle_watch_error()
 
     async def _watch_pod_events(self) -> None:
-        """Watch for pod events."""
-        # self._v1 and self._watch are guaranteed initialized by start()
+        """Watch for pod events using async iteration (non-blocking)."""
+        if not self._v1:
+            raise RuntimeError("API not initialized")
 
         context = WatchContext(
             namespace=self.config.namespace,
@@ -246,8 +248,8 @@ async def _watch_pod_events(self) -> None:
 
         self.logger.info(f"Starting pod watch with selector: {context.label_selector}, namespace: {context.namespace}")
 
-        # Create watch stream
-        kwargs = {
+        # Create watch stream kwargs
+        kwargs: dict[str, Any] = {
             "namespace": context.namespace,
             "label_selector": context.label_selector,
             "timeout_seconds": context.timeout_seconds,
@@ -259,30 +261,26 @@ async def _watch_pod_events(self) -> None:
         if context.resource_version:
             kwargs["resource_version"] = context.resource_version
 
-        # Watch stream
-        if not self._watch or not self._v1:
-            raise RuntimeError("Watch or API not initialized")
-
-        stream = self._watch.stream(self._v1.list_namespaced_pod, **kwargs)
+        # Create new Watch instance for this iteration
+        w = watch.Watch()
 
         try:
-            for event in stream:
+            # Use async for - this is the KEY fix for non-blocking watch
+            async for event in w.stream(self._v1.list_namespaced_pod, **kwargs):
                 if self._state != MonitorState.RUNNING:
+                    w.stop()
                     break
 
                 await self._process_raw_event(event)
 
+                # Update resource version from watch for continuity
+                if w.resource_version:
+                    self._last_resource_version = w.resource_version
+
         finally:
-            # Store resource version for next watch
-            self._update_resource_version(stream)
+            # Proper cleanup for kubernetes_asyncio watch
+            await w.close()
 
-    def _update_resource_version(self, stream: Any) -> None:
-        """Update last resource version from stream."""
-        try:
-            if stream._stop_event and stream._stop_event.resource_version:
-                self._last_resource_version = stream._stop_event.resource_version
-        except AttributeError:
-            pass
 
     async def _process_raw_event(self, raw_event: KubeEvent) -> None:
         """Process a raw Kubernetes watch event."""
@@ -327,8 +325,8 @@ async def _process_pod_event(self, event: PodEvent) -> None:
             # Update metrics
             self._metrics.update_pod_monitor_pods_watched(len(self._tracked_pods))
 
-            # Map to application events
-            app_events = self._event_mapper.map_pod_event(event.pod, event.event_type)
+            # Map to application events (async for kubernetes_asyncio log fetching)
+            app_events = await self._event_mapper.map_pod_event(event.pod, event.event_type)
 
             # Publish events
             for app_event in app_events:
@@ -423,8 +421,9 @@ async def _reconcile_state(self) -> ReconciliationResult:
                     error="K8s API not initialized",
                 )
 
-            pods = await asyncio.to_thread(
-                self._v1.list_namespaced_pod, namespace=self.config.namespace, label_selector=self.config.label_selector
+            # Native async call with kubernetes_asyncio
+            pods = await self._v1.list_namespaced_pod(
+                namespace=self.config.namespace, label_selector=self.config.label_selector
             )
 
             # Get current pod names
diff --git a/backend/app/services/result_processor/resource_cleaner.py b/backend/app/services/result_processor/resource_cleaner.py
index db6ff518..2c51d426 100644
--- a/backend/app/services/result_processor/resource_cleaner.py
+++ b/backend/app/services/result_processor/resource_cleaner.py
@@ -1,12 +1,11 @@
 import asyncio
 import logging
 from datetime import datetime, timedelta, timezone
-from functools import partial
 from typing import Any
 
-from kubernetes import client as k8s_client
-from kubernetes import config as k8s_config
-from kubernetes.client.rest import ApiException
+from kubernetes_asyncio import client as k8s_client
+from kubernetes_asyncio import config as k8s_config
+from kubernetes_asyncio.client.exceptions import ApiException
 
 from app.domain.exceptions import InfrastructureError, InvalidStateError
 
@@ -16,16 +15,17 @@
 
 
 class ResourceCleaner:
-    """Service for cleaning up Kubernetes resources"""
+    """Service for cleaning up Kubernetes resources (uses kubernetes_asyncio)."""
 
     def __init__(self, logger: logging.Logger) -> None:
+        self._api_client: k8s_client.ApiClient | None = None
         self.v1: k8s_client.CoreV1Api | None = None
         self.networking_v1: k8s_client.NetworkingV1Api | None = None
         self._initialized = False
         self.logger = logger
 
     async def initialize(self) -> None:
-        """Initialize Kubernetes clients"""
+        """Initialize Kubernetes clients (async for kubernetes_asyncio)."""
         if self._initialized:
             return
 
@@ -34,17 +34,26 @@ async def initialize(self) -> None:
                 k8s_config.load_incluster_config()
                 self.logger.info("Using in-cluster Kubernetes config")
             except k8s_config.ConfigException:
-                k8s_config.load_kube_config()
+                await k8s_config.load_kube_config()
                 self.logger.info("Using kubeconfig")
 
-            self.v1 = k8s_client.CoreV1Api()
-            self.networking_v1 = k8s_client.NetworkingV1Api()
+            # Create API client for kubernetes_asyncio
+            self._api_client = k8s_client.ApiClient()
+            self.v1 = k8s_client.CoreV1Api(self._api_client)
+            self.networking_v1 = k8s_client.NetworkingV1Api(self._api_client)
             self._initialized = True
 
         except Exception as e:
             self.logger.error(f"Failed to initialize Kubernetes client: {e}")
             raise InfrastructureError(f"Kubernetes initialization failed: {e}") from e
 
+    async def close(self) -> None:
+        """Close Kubernetes API client."""
+        if self._api_client:
+            await self._api_client.close()
+            self._api_client = None
+        self._initialized = False
+
     async def cleanup_pod_resources(
         self,
         pod_name: str,
@@ -82,18 +91,13 @@ async def cleanup_pod_resources(
             raise InfrastructureError(f"Resource cleanup failed: {e}") from e
 
     async def _delete_pod(self, pod_name: str, namespace: str) -> None:
-        """Delete a pod"""
+        """Delete a pod (native async with kubernetes_asyncio)."""
         if not self.v1:
             raise InvalidStateError("Kubernetes client not initialized")
 
         try:
-            loop = asyncio.get_running_loop()
-            await loop.run_in_executor(None, self.v1.read_namespaced_pod, pod_name, namespace)
-
-            await loop.run_in_executor(
-                None, partial(self.v1.delete_namespaced_pod, pod_name, namespace, grace_period_seconds=30)
-            )
-
+            await self.v1.read_namespaced_pod(pod_name, namespace)
+            await self.v1.delete_namespaced_pod(pod_name, namespace, grace_period_seconds=30)
             self.logger.info(f"Deleted pod: {pod_name}")
 
         except ApiException as e:
@@ -132,15 +136,15 @@ async def _delete_pvcs(self, execution_id: str, namespace: str) -> None:
     async def _delete_labeled_resources(
         self, execution_id: str, namespace: str, list_func: Any, delete_func: Any, resource_type: str
     ) -> None:
-        """Generic function to delete labeled resources"""
+        """Generic function to delete labeled resources (native async with kubernetes_asyncio)."""
         try:
-            loop = asyncio.get_running_loop()
             label_selector = f"execution-id={execution_id}"
 
-            resources = await loop.run_in_executor(None, partial(list_func, namespace, label_selector=label_selector))
+            # Native async calls with kubernetes_asyncio
+            resources = await list_func(namespace, label_selector=label_selector)
 
             for resource in resources.items:
-                await loop.run_in_executor(None, delete_func, resource.metadata.name, namespace)
+                await delete_func(resource.metadata.name, namespace)
                 self.logger.info(f"Deleted {resource_type}: {resource.metadata.name}")
 
         except ApiException as e:
@@ -175,14 +179,12 @@ async def cleanup_orphaned_resources(
     async def _cleanup_orphaned_pods(
         self, namespace: str, cutoff_time: datetime, cleaned: ResourceDict, dry_run: bool
     ) -> None:
-        """Clean up orphaned pods"""
+        """Clean up orphaned pods (native async with kubernetes_asyncio)."""
         if not self.v1:
             raise InvalidStateError("Kubernetes client not initialized")
 
-        loop = asyncio.get_running_loop()
-        pods = await loop.run_in_executor(
-            None, partial(self.v1.list_namespaced_pod, namespace, label_selector="app=integr8s")
-        )
+        # Native async call
+        pods = await self.v1.list_namespaced_pod(namespace, label_selector="app=integr8s")
 
         terminal_phases = {"Succeeded", "Failed", "Unknown"}
 
@@ -202,14 +204,12 @@ async def _cleanup_orphaned_pods(
     async def _cleanup_orphaned_configmaps(
         self, namespace: str, cutoff_time: datetime, cleaned: ResourceDict, dry_run: bool
     ) -> None:
-        """Clean up orphaned ConfigMaps"""
+        """Clean up orphaned ConfigMaps (native async with kubernetes_asyncio)."""
         if not self.v1:
             raise InvalidStateError("Kubernetes client not initialized")
 
-        loop = asyncio.get_running_loop()
-        configmaps = await loop.run_in_executor(
-            None, partial(self.v1.list_namespaced_config_map, namespace, label_selector="app=integr8s")
-        )
+        # Native async call
+        configmaps = await self.v1.list_namespaced_config_map(namespace, label_selector="app=integr8s")
 
         for cm in configmaps.items:
             if cm.metadata.creation_timestamp.replace(tzinfo=timezone.utc) < cutoff_time:
@@ -217,19 +217,15 @@ async def _cleanup_orphaned_configmaps(
 
                 if not dry_run:
                     try:
-                        await loop.run_in_executor(
-                            None, self.v1.delete_namespaced_config_map, cm.metadata.name, namespace
-                        )
+                        await self.v1.delete_namespaced_config_map(cm.metadata.name, namespace)
                     except Exception as e:
                         self.logger.error(f"Failed to delete orphaned ConfigMap {cm.metadata.name}: {e}")
 
     async def get_resource_usage(self, namespace: str = "default") -> CountDict:
-        """Get current resource usage counts"""
+        """Get current resource usage counts (native async with kubernetes_asyncio)."""
         await self.initialize()
 
-        loop = asyncio.get_running_loop()
         label_selector = "app=integr8s"
-
         default_counts = {"pods": 0, "configmaps": 0, "network_policies": 0}
 
         try:
@@ -238,9 +234,7 @@ async def get_resource_usage(self, namespace: str = "default") -> CountDict:
                 if not self.v1:
                     raise InvalidStateError("Kubernetes client not initialized")
 
-                pods = await loop.run_in_executor(
-                    None, partial(self.v1.list_namespaced_pod, namespace, label_selector=label_selector)
-                )
+                pods = await self.v1.list_namespaced_pod(namespace, label_selector=label_selector)
                 pod_count = len(pods.items)
             except Exception as e:
                 self.logger.warning(f"Failed to get pods: {e}")
@@ -251,9 +245,7 @@ async def get_resource_usage(self, namespace: str = "default") -> CountDict:
                 if not self.v1:
                     raise InvalidStateError("Kubernetes client not initialized")
 
-                configmaps = await loop.run_in_executor(
-                    None, partial(self.v1.list_namespaced_config_map, namespace, label_selector=label_selector)
-                )
+                configmaps = await self.v1.list_namespaced_config_map(namespace, label_selector=label_selector)
                 configmap_count = len(configmaps.items)
             except Exception as e:
                 self.logger.warning(f"Failed to get configmaps: {e}")
@@ -264,11 +256,8 @@ async def get_resource_usage(self, namespace: str = "default") -> CountDict:
                 if not self.networking_v1:
                     raise InvalidStateError("Kubernetes networking client not initialized")
 
-                policies = await loop.run_in_executor(
-                    None,
-                    partial(
-                        self.networking_v1.list_namespaced_network_policy, namespace, label_selector=label_selector
-                    ),
+                policies = await self.networking_v1.list_namespaced_network_policy(
+                    namespace, label_selector=label_selector
                 )
                 policy_count = len(policies.items)
             except Exception as e:
diff --git a/backend/app/settings.py b/backend/app/settings.py
index 6e80b55f..7fd61e47 100644
--- a/backend/app/settings.py
+++ b/backend/app/settings.py
@@ -162,4 +162,4 @@ class Settings(BaseSettings):
 
 @lru_cache(maxsize=1)
 def get_settings() -> Settings:
-    return Settings()  # type: ignore[call-arg]
+    return Settings()
diff --git a/backend/pyproject.toml b/backend/pyproject.toml
index b8a3b5ec..475d4e3c 100644
--- a/backend/pyproject.toml
+++ b/backend/pyproject.toml
@@ -50,7 +50,6 @@ dependencies = [
     "itsdangerous==2.2.0",
     "Jinja2==3.1.6",
     "kiwisolver==1.4.9",
-    "kubernetes==31.0.0",
     "limits==3.13.0",
     "markdown-it-py==4.0.0",
     "MarkupSafe==3.0.2",
@@ -115,13 +114,14 @@ dependencies = [
     "tiktoken==0.11.0",
     "tomli==2.0.2",
     "typing_extensions==4.12.2",
-    "urllib3==2.6.2",
+    "urllib3==2.6.3",
     "uvicorn==0.34.2",
     "websocket-client==1.8.0",
     "Werkzeug==3.1.4",
     "wrapt==1.16.0",
     "yarl==1.20.1",
     "zipp==3.20.2",
+    "kubernetes-asyncio==33.3.0",
 ]
 
 [build-system]
@@ -134,6 +134,7 @@ packages = ["app", "workers"]
 [dependency-groups]
 dev = [
     "coverage==7.13.0",
+    "fakeredis>=2.33.0",
     "hypothesis==6.103.4",
     "iniconfig==2.0.0",
     "matplotlib==3.10.8",
@@ -182,8 +183,12 @@ warn_unused_configs = true
 disallow_untyped_defs = true
 disallow_incomplete_defs = true
 disable_error_code = ["import-untyped", "import-not-found"]
-# TODO: REMOVE NEXT LINE
-exclude = '(^tests/|/tests/)'
+plugins = ["pydantic.mypy"]
+
+[tool.pydantic-mypy]
+init_forbid_extra = true
+init_typed = true
+warn_required_dynamic_aliases = true
 
 # Pytest configuration
 [tool.pytest.ini_options]
diff --git a/backend/tests/conftest.py b/backend/tests/conftest.py
index 2b1b00a1..4427874f 100644
--- a/backend/tests/conftest.py
+++ b/backend/tests/conftest.py
@@ -1,22 +1,36 @@
 import os
 import uuid
+from collections.abc import AsyncGenerator, Callable, Coroutine
 from contextlib import asynccontextmanager
-from typing import AsyncGenerator
+from typing import Any
 
 import httpx
 import pytest
 import pytest_asyncio
 import redis.asyncio as redis
 from app.core.database_context import Database
+from app.domain.enums.user import UserRole
 from app.main import create_app
 from app.settings import Settings
 from dishka import AsyncContainer
+from fastapi import FastAPI
 from httpx import ASGITransport
 from pydantic_settings import SettingsConfigDict
 
+# Disable OpenTelemetry/tracing exporters to prevent stalls from reconnection attempts
+os.environ.setdefault("OTEL_EXPORTER_OTLP_ENDPOINT", "")
+os.environ.setdefault("OTEL_METRICS_EXPORTER", "none")
+os.environ.setdefault("OTEL_TRACES_EXPORTER", "none")
+os.environ.setdefault("OTEL_LOGS_EXPORTER", "none")
+# Disable Jaeger tracing (custom code uses JAEGER_AGENT_HOST to build endpoint)
+os.environ.setdefault("JAEGER_AGENT_HOST", "")
+os.environ.setdefault("ENABLE_TRACING", "false")
+# Disable rate limiting in tests (parallel workers share Redis, would hit 429s)
+os.environ.setdefault("RATE_LIMIT_ENABLED", "false")
+
 
 class TestSettings(Settings):
-    """Test configuration - loads from .env.test instead of .env"""
+    """Test configuration - loads from .env.test instead of .env."""
 
     model_config = SettingsConfigDict(
         env_file=".env.test",
@@ -26,79 +40,59 @@ class TestSettings(Settings):
     )
 
 
-# ===== Worker-specific isolation for pytest-xdist =====
-def _compute_worker_id() -> str:
-    return os.environ.get("PYTEST_XDIST_WORKER", "gw0")
-
-
-def _setup_worker_env() -> None:
-    """Set worker-specific environment variables for pytest-xdist isolation.
+# ===== Settings fixture with pytest-xdist worker isolation =====
+@pytest.fixture(scope="session")
+def test_settings(worker_id: str) -> Settings:
+    """Test settings with worker-specific isolation for pytest-xdist.
 
-    Must be called BEFORE TestSettings is instantiated so env vars are picked up.
+    Uses the built-in worker_id fixture from pytest-xdist.
+    - "master": non-xdist run, uses defaults from .env.test
+    - "gw0", "gw1", etc.: xdist workers get unique DB/Redis/Kafka config
     """
-    session_id = os.environ.get("PYTEST_SESSION_ID") or uuid.uuid4().hex[:8]
-    worker_id = _compute_worker_id()
-    os.environ["PYTEST_SESSION_ID"] = session_id
-
-    # Unique database name per worker
-    os.environ["DATABASE_NAME"] = f"integr8scode_test_{session_id}_{worker_id}"
-
-    # Distribute Redis DBs across workers (0-15)
-    try:
-        worker_num = int(worker_id[2:]) if worker_id.startswith("gw") else 0
-        os.environ["REDIS_DB"] = str(worker_num % 16)
-    except Exception:
-        os.environ.setdefault("REDIS_DB", "0")
-
-    # Unique Kafka consumer group per worker
-    os.environ["KAFKA_GROUP_SUFFIX"] = f"{session_id}.{worker_id}"
-
-    # Unique Schema Registry prefix per worker
-    os.environ["SCHEMA_SUBJECT_PREFIX"] = f"test.{session_id}.{worker_id}."
-
-    # Disable OpenTelemetry exporters to prevent "otel-collector:4317" retry noise
-    os.environ["OTEL_EXPORTER_OTLP_ENDPOINT"] = ""
-    os.environ["OTEL_METRICS_EXPORTER"] = "none"
-    os.environ["OTEL_TRACES_EXPORTER"] = "none"
-    os.environ["OTEL_LOGS_EXPORTER"] = "none"
+    if worker_id == "master":
+        return TestSettings()
 
+    # xdist worker: create isolated settings
+    worker_num = int(worker_id[2:]) if worker_id.startswith("gw") else 0
 
-# Set up worker env at module load time (before any Settings instantiation)
-_setup_worker_env()
+    # Set env var for schema registry (read directly from env, not Settings)
+    os.environ["SCHEMA_SUBJECT_PREFIX"] = f"test.{worker_id}."
 
-
-# ===== Settings fixture =====
-@pytest.fixture(scope="session")
-def test_settings() -> Settings:
-    """Provide TestSettings for tests that need to create their own components."""
-    return TestSettings()
+    return TestSettings(
+        DATABASE_NAME=f"integr8scode_test_{worker_id}",
+        REDIS_DB=worker_num % 16,
+        KAFKA_GROUP_SUFFIX=worker_id,
+    )
 
 
 # ===== App fixture =====
 @pytest_asyncio.fixture(scope="session")
-async def app():
-    """Create FastAPI app with TestSettings.
+async def app(test_settings: Settings) -> AsyncGenerator[FastAPI, None]:
+    """Create FastAPI app with worker-isolated settings.
 
     Session-scoped to avoid Pydantic schema validator memory issues when
     FastAPI recreates OpenAPI schemas hundreds of times with pytest-xdist.
+
+    Runs the app lifespan to initialize Beanie ODM, schema registry, etc.
     """
-    application = create_app(settings=TestSettings())
+    application = create_app(settings=test_settings)
 
-    yield application
+    # Run lifespan to trigger init_beanie() and other startup tasks
+    async with application.router.lifespan_context(application):
+        yield application
 
-    if hasattr(application.state, "dishka_container"):
-        await application.state.dishka_container.close()
+    await application.state.dishka_container.close()
 
 
 @pytest_asyncio.fixture(scope="session")
-async def app_container(app):
+async def app_container(app: FastAPI) -> AsyncContainer:
     """Expose the Dishka container attached to the app."""
     container: AsyncContainer = app.state.dishka_container
     return container
 
 
 @pytest_asyncio.fixture
-async def client(app) -> AsyncGenerator[httpx.AsyncClient, None]:
+async def client(app: FastAPI) -> AsyncGenerator[httpx.AsyncClient, None]:
     """HTTP client for testing API endpoints."""
     async with httpx.AsyncClient(
         transport=ASGITransport(app=app),
@@ -110,100 +104,101 @@ async def client(app) -> AsyncGenerator[httpx.AsyncClient, None]:
 
 
 @asynccontextmanager
-async def _container_scope(container: AsyncContainer):
+async def _container_scope(container: AsyncContainer) -> AsyncGenerator[AsyncContainer, None]:
     async with container() as scope:
         yield scope
 
 
 @pytest_asyncio.fixture
-async def scope(app_container: AsyncContainer):
+async def scope(app_container: AsyncContainer) -> AsyncGenerator[AsyncContainer, None]:
     async with _container_scope(app_container) as s:
         yield s
 
 
 @pytest_asyncio.fixture
-async def db(scope) -> AsyncGenerator[Database, None]:
-    database: Database = await scope.get(Database)
+async def db(scope: AsyncContainer) -> AsyncGenerator[Database, None]:
+    database = await scope.get(Database)
     yield database
 
 
 @pytest_asyncio.fixture
-async def redis_client(scope) -> AsyncGenerator[redis.Redis, None]:
-    client: redis.Redis = await scope.get(redis.Redis)
+async def redis_client(scope: AsyncContainer) -> AsyncGenerator[redis.Redis, None]:
+    client = await scope.get(redis.Redis)
     yield client
 
 
-# ===== HTTP helpers (auth) =====
-async def _http_login(client: httpx.AsyncClient, username: str, password: str) -> str:
-    data = {"username": username, "password": password}
-    resp = await client.post("/api/v1/auth/login", data=data)
-    resp.raise_for_status()
-    return resp.json().get("csrf_token", "")
+# ===== User creation & authentication =====
+async def _register_and_login(
+    client: httpx.AsyncClient, role: UserRole = UserRole.USER
+) -> dict[str, Any]:
+    """Create user with role, register, login, return user info with CSRF headers.
 
-
-@pytest.fixture
-def test_user_credentials():
+    Registration may fail with 400 if user already exists (no per-test cleanup).
+    This is fine - we just proceed to login with the same credentials.
+    """
     uid = uuid.uuid4().hex[:8]
-    return {
-        "username": f"test_user_{uid}",
-        "email": f"test_user_{uid}@example.com",
+    creds = {
+        "username": f"{role.value}_{uid}",
+        "email": f"{role.value}_{uid}@example.com",
         "password": "TestPass123!",
-        "role": "user",
+        "role": role.value,
     }
+    r = await client.post("/api/v1/auth/register", json=creds)
+    # 400 = user already exists (acceptable without per-test cleanup)
+    # 409 = email already exists (same reason)
+    if r.status_code not in (200, 201, 400, 409):
+        r.raise_for_status()
+
+    # Login - this should always succeed if registration succeeded or user exists
+    resp = await client.post(
+        "/api/v1/auth/login",
+        data={"username": creds["username"], "password": creds["password"]},
+    )
+    resp.raise_for_status()
+    csrf: str = resp.json().get("csrf_token", "")
+    return {**creds, "csrf_token": csrf, "headers": {"X-CSRF-Token": csrf}}
 
 
-@pytest.fixture
-def test_admin_credentials():
-    uid = uuid.uuid4().hex[:8]
-    return {
-        "username": f"admin_user_{uid}",
-        "email": f"admin_user_{uid}@example.com",
-        "password": "AdminPass123!",
-        "role": "admin",
-    }
+# Type alias for the make_user factory
+MakeUser = Callable[[UserRole], Coroutine[Any, Any, dict[str, Any]]]
 
 
 @pytest_asyncio.fixture
-async def test_user(client: httpx.AsyncClient, test_user_credentials):
-    """Function-scoped authenticated user."""
-    creds = test_user_credentials
-    r = await client.post("/api/v1/auth/register", json=creds)
-    if r.status_code not in (200, 201, 400):
-        pytest.fail(f"Cannot create test user (status {r.status_code}): {r.text}")
-    csrf = await _http_login(client, creds["username"], creds["password"])
-    return {**creds, "csrf_token": csrf, "headers": {"X-CSRF-Token": csrf}}
+async def make_user(client: httpx.AsyncClient) -> MakeUser:
+    """Factory to create users with any role. Use for isolation tests.
+
+    Example:
+        user1 = await make_user(UserRole.USER)
+        user2 = await make_user(UserRole.USER)  # another user
+        admin = await make_user(UserRole.ADMIN)
+    """
+
+    async def _make(role: UserRole = UserRole.USER) -> dict[str, Any]:
+        return await _register_and_login(client, role)
+
+    return _make
 
 
 @pytest_asyncio.fixture
-async def test_admin(client: httpx.AsyncClient, test_admin_credentials):
-    """Function-scoped authenticated admin."""
-    creds = test_admin_credentials
-    r = await client.post("/api/v1/auth/register", json=creds)
-    if r.status_code not in (200, 201, 400):
-        pytest.fail(f"Cannot create test admin (status {r.status_code}): {r.text}")
-    csrf = await _http_login(client, creds["username"], creds["password"])
-    return {**creds, "csrf_token": csrf, "headers": {"X-CSRF-Token": csrf}}
+async def authenticated_client(client: httpx.AsyncClient) -> httpx.AsyncClient:
+    """HTTP client logged in as regular user.
+
+    Note: This fixture mutates and returns the same `client` instance with
+    auth headers applied. Do NOT use both `client` and `authenticated_client`
+    in the same test. For multi-user tests, use `client` + `make_user` fixture.
+    """
+    user = await _register_and_login(client, UserRole.USER)
+    client.headers.update(user["headers"])
+    return client
 
 
 @pytest_asyncio.fixture
-async def another_user(client: httpx.AsyncClient):
-    username = f"test_user_{uuid.uuid4().hex[:8]}"
-    email = f"{username}@example.com"
-    password = "TestPass123!"
-    await client.post(
-        "/api/v1/auth/register",
-        json={
-            "username": username,
-            "email": email,
-            "password": password,
-            "role": "user",
-        },
-    )
-    csrf = await _http_login(client, username, password)
-    return {
-        "username": username,
-        "email": email,
-        "password": password,
-        "csrf_token": csrf,
-        "headers": {"X-CSRF-Token": csrf},
-    }
+async def authenticated_admin_client(client: httpx.AsyncClient) -> httpx.AsyncClient:
+    """HTTP client logged in as admin.
+
+    Note: This fixture mutates and returns the same `client` instance with
+    admin auth headers applied. For multi-user tests, use `make_user` fixture.
+    """
+    admin = await _register_and_login(client, UserRole.ADMIN)
+    client.headers.update(admin["headers"])
+    return client
diff --git a/backend/tests/e2e/conftest.py b/backend/tests/e2e/conftest.py
index e8243e1c..d1c0f9e1 100644
--- a/backend/tests/e2e/conftest.py
+++ b/backend/tests/e2e/conftest.py
@@ -1,18 +1,30 @@
-"""E2E tests conftest - with infrastructure cleanup."""
+"""E2E tests - hit real containers via HTTP."""
+import ssl
+from collections.abc import AsyncGenerator
+
+import httpx
+import pytest
 import pytest_asyncio
-import redis.asyncio as redis
+from app.settings import Settings
+
 
-from app.core.database_context import Database
-from tests.helpers.cleanup import cleanup_db_and_redis
+@pytest.fixture(scope="session")
+def test_settings() -> Settings:
+    """E2E tests use Settings matching containers (no worker isolation)."""
+    return Settings()
 
 
-@pytest_asyncio.fixture(autouse=True)
-async def _cleanup(db: Database, redis_client: redis.Redis):
-    """Clean DB and Redis before each E2E test.
+@pytest_asyncio.fixture
+async def client(test_settings: Settings) -> AsyncGenerator[httpx.AsyncClient, None]:
+    """HTTP client hitting real backend containers."""
+    ssl_context = ssl.create_default_context()
+    ssl_context.check_hostname = False
+    ssl_context.verify_mode = ssl.CERT_NONE
 
-    Only pre-test cleanup - post-test cleanup causes event loop issues
-    when SSE/streaming tests hold connections across loop boundaries.
-    """
-    await cleanup_db_and_redis(db, redis_client)
-    yield
-    # No post-test cleanup to avoid "Event loop is closed" errors
+    async with httpx.AsyncClient(
+        base_url=f"https://localhost:{test_settings.SERVER_PORT}",
+        timeout=60.0,
+        follow_redirects=True,
+        verify=ssl_context,
+    ) as c:
+        yield c
diff --git a/backend/tests/e2e/test_execution_routes.py b/backend/tests/e2e/test_execution_routes.py
index 2cb1fa7a..bb2d27f9 100644
--- a/backend/tests/e2e/test_execution_routes.py
+++ b/backend/tests/e2e/test_execution_routes.py
@@ -1,17 +1,13 @@
 import asyncio
-import os
-from typing import Dict
+from typing import Any
 from uuid import UUID
 
 import pytest
+from app.domain.enums.execution import ExecutionStatus as ExecutionStatusEnum
+from app.schemas_pydantic.execution import ExecutionResponse, ExecutionResult, ResourceLimits, ResourceUsage
 from httpx import AsyncClient
 
-from app.domain.enums.execution import ExecutionStatus as ExecutionStatusEnum
-from app.schemas_pydantic.execution import (
-    ExecutionResponse,
-    ExecutionResult,
-    ResourceUsage
-)
+from tests.helpers.sse import wait_for_execution_terminal
 
 pytestmark = [pytest.mark.e2e, pytest.mark.k8s]
 
@@ -37,24 +33,15 @@ async def test_execute_requires_authentication(self, client: AsyncClient) -> Non
                    for word in ["not authenticated", "unauthorized", "login"])
 
     @pytest.mark.asyncio
-    async def test_execute_simple_python_script(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_execute_simple_python_script(self, authenticated_client: AsyncClient) -> None:
         """Test executing a simple Python script."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
-        # Execute script
         execution_request = {
             "script": "print('Hello from real backend!')",
             "lang": "python",
             "lang_version": "3.11"
         }
 
-        response = await client.post("/api/v1/execute", json=execution_request)
+        response = await authenticated_client.post("/api/v1/execute", json=execution_request)
         assert response.status_code == 200
 
         # Validate response structure
@@ -80,30 +67,21 @@ async def test_execute_simple_python_script(self, client: AsyncClient, test_user
         ]
 
     @pytest.mark.asyncio
-    async def test_get_execution_result(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_get_execution_result(self, authenticated_client: AsyncClient) -> None:
         """Test getting execution result after completion using SSE (event-driven)."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
-        # Execute a simple script
         execution_request = {
             "script": "print('Test output')\nprint('Line 2')",
             "lang": "python",
             "lang_version": "3.11"
         }
 
-        exec_response = await client.post("/api/v1/execute", json=execution_request)
+        exec_response = await authenticated_client.post("/api/v1/execute", json=execution_request)
         assert exec_response.status_code == 200
 
         execution_id = exec_response.json()["execution_id"]
 
         # Immediately fetch result - no waiting
-        result_response = await client.get(f"/api/v1/result/{execution_id}")
+        result_response = await authenticated_client.get(f"/api/v1/result/{execution_id}")
         assert result_response.status_code == 200
 
         result_data = result_response.json()
@@ -114,48 +92,50 @@ async def test_get_execution_result(self, client: AsyncClient, test_user: Dict[s
 
         # Execution might be in any state - that's fine
         # If completed, validate output; if not, that's valid too
-        if execution_result.status == ExecutionStatusEnum.COMPLETED:
+        if execution_result.status == ExecutionStatusEnum.COMPLETED.value:
             assert execution_result.stdout is not None
             assert "Test output" in execution_result.stdout
             assert "Line 2" in execution_result.stdout
 
     @pytest.mark.asyncio
-    async def test_execute_with_error(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
-        """Test executing a script that produces an error."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
+    async def test_execute_with_error(self, authenticated_client: AsyncClient) -> None:
+        """Test executing a script that produces an error.
 
-        # Execute script with intentional error
+        Workers run as containers (docker-compose) for full pipeline:
+        API -> SagaOrchestrator -> KubernetesWorker -> PodMonitor -> SSE.
+        Uses SSE to wait for terminal state (event-driven, no polling).
+        """
         execution_request = {
             "script": "print('Before error')\nraise ValueError('Test error')\nprint('After error')",
             "lang": "python",
-            "lang_version": "3.11"
+            "lang_version": "3.11",
         }
 
-        exec_response = await client.post("/api/v1/execute", json=execution_request)
+        exec_response = await authenticated_client.post("/api/v1/execute", json=execution_request)
         assert exec_response.status_code == 200
 
         execution_id = exec_response.json()["execution_id"]
 
-        # No waiting - execution was accepted, error will be processed asynchronously
+        # Wait for terminal state via SSE (event-driven, no polling)
+        await wait_for_execution_terminal(
+            authenticated_client, execution_id, timeout=120.0
+        )
+
+        # Fetch final result to verify error was captured
+        result_response = await authenticated_client.get(f"/api/v1/result/{execution_id}")
+        assert result_response.status_code == 200
+        result: dict[str, Any] = result_response.json()
+
+        assert result["status"] in (ExecutionStatusEnum.FAILED.value, ExecutionStatusEnum.ERROR.value)
+        assert "ValueError" in (result.get("stderr") or result.get("stdout") or "")
 
     @pytest.mark.asyncio
-    async def test_execute_with_resource_tracking(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
-        """Test that execution tracks resource usage."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
+    async def test_execute_with_resource_tracking(self, authenticated_client: AsyncClient) -> None:
+        """Test that execution tracks resource usage.
 
-        # Execute script that uses some resources
+        Workers run as containers (docker-compose) for full pipeline:
+        API -> SagaOrchestrator -> KubernetesWorker -> PodMonitor -> SSE.
+        """
         execution_request = {
             "script": """
 import time
@@ -166,38 +146,33 @@ async def test_execute_with_resource_tracking(self, client: AsyncClient, test_us
 print('Done')
 """,
             "lang": "python",
-            "lang_version": "3.11"
+            "lang_version": "3.11",
         }
 
-        exec_response = await client.post("/api/v1/execute", json=execution_request)
+        exec_response = await authenticated_client.post("/api/v1/execute", json=execution_request)
         assert exec_response.status_code == 200
 
         execution_id = exec_response.json()["execution_id"]
 
-        # No waiting - execution was accepted, error will be processed asynchronously
+        # Wait for terminal state via SSE (event-driven, no polling)
+        await wait_for_execution_terminal(authenticated_client, execution_id, timeout=120.0)
+
+        # Fetch final result to verify resource tracking
+        result_response = await authenticated_client.get(f"/api/v1/result/{execution_id}")
+        assert result_response.status_code == 200
+        result: dict[str, Any] = result_response.json()
 
-        # Fetch result and validate resource usage if present
-        result_response = await client.get(f"/api/v1/result/{execution_id}")
-        if result_response.status_code == 200 and result_response.json().get("resource_usage"):
-            resource_usage = ResourceUsage(**result_response.json()["resource_usage"])
-            if resource_usage.execution_time_wall_seconds is not None:
-                assert resource_usage.execution_time_wall_seconds >= 0
-            if resource_usage.peak_memory_kb is not None:
-                assert resource_usage.peak_memory_kb >= 0
+        assert result["status"] == ExecutionStatusEnum.COMPLETED.value
+
+        # Resource usage must be present after completion
+        assert result.get("resource_usage") is not None, "resource_usage should be populated"
+        resource_usage = ResourceUsage(**result["resource_usage"])
+        assert resource_usage.execution_time_wall_seconds is not None
+        assert resource_usage.execution_time_wall_seconds >= 0
 
     @pytest.mark.asyncio
-    async def test_execute_with_different_language_versions(self, client: AsyncClient,
-                                                            test_user: Dict[str, str]) -> None:
+    async def test_execute_with_different_language_versions(self, authenticated_client: AsyncClient) -> None:
         """Test execution with different Python versions."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
-        # Test different Python versions (if supported)
         test_cases = [
             ("3.10", "import sys; print(f'Python {sys.version}')"),
             ("3.11", "import sys; print(f'Python {sys.version}')"),
@@ -211,7 +186,7 @@ async def test_execute_with_different_language_versions(self, client: AsyncClien
                 "lang_version": version
             }
 
-            response = await client.post("/api/v1/execute", json=execution_request)
+            response = await authenticated_client.post("/api/v1/execute", json=execution_request)
             # Should either accept (200) or reject unsupported version (400/422)
             assert response.status_code in [200, 400, 422]
 
@@ -220,17 +195,8 @@ async def test_execute_with_different_language_versions(self, client: AsyncClien
                 assert "execution_id" in data
 
     @pytest.mark.asyncio
-    async def test_execute_with_large_output(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_execute_with_large_output(self, authenticated_client: AsyncClient) -> None:
         """Test execution with large output."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
-        # Script that produces large output
         execution_request = {
             "script": """
 # Generate large output
@@ -242,14 +208,14 @@ async def test_execute_with_large_output(self, client: AsyncClient, test_user: D
             "lang_version": "3.11"
         }
 
-        exec_response = await client.post("/api/v1/execute", json=execution_request)
+        exec_response = await authenticated_client.post("/api/v1/execute", json=execution_request)
         assert exec_response.status_code == 200
 
         execution_id = exec_response.json()["execution_id"]
 
         # No waiting - execution was accepted, error will be processed asynchronously
         # Validate output from result endpoint (best-effort)
-        result_response = await client.get(f"/api/v1/result/{execution_id}")
+        result_response = await authenticated_client.get(f"/api/v1/result/{execution_id}")
         if result_response.status_code == 200:
             result_data = result_response.json()
             if result_data.get("status") == "COMPLETED":
@@ -258,17 +224,13 @@ async def test_execute_with_large_output(self, client: AsyncClient, test_user: D
                 assert "End of output" in result_data["stdout"] or len(result_data["stdout"]) > 10000
 
     @pytest.mark.asyncio
-    async def test_cancel_running_execution(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
-        """Test cancelling a running execution."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
+    async def test_cancel_running_execution(self, authenticated_client: AsyncClient) -> None:
+        """Test cancelling a running execution.
 
-        # Start a long-running script
+        Workers run as containers (docker-compose) for full pipeline:
+        API -> SagaOrchestrator -> KubernetesWorker -> PodMonitor -> SSE.
+        Submits a long-running script and immediately requests cancellation.
+        """
         execution_request = {
             "script": """
 import time
@@ -279,47 +241,34 @@ async def test_cancel_running_execution(self, client: AsyncClient, test_user: Di
 print('Should not reach here if cancelled')
 """,
             "lang": "python",
-            "lang_version": "3.11"
+            "lang_version": "3.11",
         }
 
-        exec_response = await client.post("/api/v1/execute", json=execution_request)
+        exec_response = await authenticated_client.post("/api/v1/execute", json=execution_request)
         assert exec_response.status_code == 200
 
         execution_id = exec_response.json()["execution_id"]
 
-        # Try to cancel immediately - no waiting
-        cancel_request = {
-            "reason": "Test cancellation"
-        }
-
-        try:
-            cancel_response = await client.post(f"/api/v1/{execution_id}/cancel", json=cancel_request)
-        except Exception:
-            pytest.skip("Cancel endpoint not available or connection dropped")
-        if cancel_response.status_code >= 500:
-            pytest.skip("Cancellation not wired; backend returned 5xx")
-        # Should succeed or fail if already completed
-        assert cancel_response.status_code in [200, 400, 404]
+        # Try to cancel immediately
+        cancel_request = {"reason": "Test cancellation"}
+        cancel_response = await authenticated_client.post(
+            f"/api/v1/{execution_id}/cancel", json=cancel_request
+        )
 
-        # Cancel response of 200 means cancellation was accepted
+        # Cancel should succeed (200), or fail if execution already completed (400/404)
+        # 5xx errors indicate a real bug in the cancellation endpoint
+        assert cancel_response.status_code in [200, 400, 404], (
+            f"Unexpected cancel response: {cancel_response.status_code} - {cancel_response.text}"
+        )
 
     @pytest.mark.asyncio
-    async def test_execution_with_timeout(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_execution_with_timeout(self, authenticated_client: AsyncClient) -> None:
         """Bounded check: long-running executions don't finish immediately.
 
         The backend's default timeout is 300s. To keep integration fast,
         assert that within a short window the execution is either still
         running or has transitioned to a terminal state due to platform limits.
         """
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
-        # Script that would run forever
         execution_request = {
             "script": """
 import time
@@ -332,26 +281,25 @@ async def test_execution_with_timeout(self, client: AsyncClient, test_user: Dict
             "lang_version": "3.11"
         }
 
-        exec_response = await client.post("/api/v1/execute", json=execution_request)
+        exec_response = await authenticated_client.post("/api/v1/execute", json=execution_request)
         assert exec_response.status_code == 200
 
         execution_id = exec_response.json()["execution_id"]
+        assert execution_id is not None
+        assert len(execution_id) > 0
+
+        # Verify the execution was created and is being tracked
+        result_response = await authenticated_client.get(f"/api/v1/result/{execution_id}")
+        assert result_response.status_code == 200
 
-        # Just verify the execution was created - it will run forever until timeout
-        # No need to wait or observe states
+        result_data = result_response.json()
+        assert result_data["execution_id"] == execution_id
+        # Execution should be in some valid state (likely queued/running since it's long-running)
+        assert result_data["status"] in [e.value for e in ExecutionStatusEnum]
 
     @pytest.mark.asyncio
-    async def test_sandbox_restrictions(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_sandbox_restrictions(self, authenticated_client: AsyncClient) -> None:
         """Test that dangerous operations are blocked by sandbox."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
-        # Try dangerous operations that should be blocked
         dangerous_scripts = [
             # File system access
             "open('/etc/passwd', 'r').read()",
@@ -370,14 +318,14 @@ async def test_sandbox_restrictions(self, client: AsyncClient, test_user: Dict[s
                 "lang_version": "3.11"
             }
 
-            exec_response = await client.post("/api/v1/execute", json=execution_request)
+            exec_response = await authenticated_client.post("/api/v1/execute", json=execution_request)
 
             # Should either reject immediately or fail during execution
             if exec_response.status_code == 200:
                 execution_id = exec_response.json()["execution_id"]
 
                 # Immediately check result - no waiting
-                result_resp = await client.get(f"/api/v1/result/{execution_id}")
+                result_resp = await authenticated_client.get(f"/api/v1/result/{execution_id}")
                 if result_resp.status_code == 200:
                     result_data = result_resp.json()
                     # Dangerous operations should either:
@@ -397,17 +345,8 @@ async def test_sandbox_restrictions(self, client: AsyncClient, test_user: Dict[s
                 assert exec_response.status_code in [400, 422]
 
     @pytest.mark.asyncio
-    async def test_concurrent_executions_by_same_user(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_concurrent_executions_by_same_user(self, authenticated_client: AsyncClient) -> None:
         """Test running multiple executions concurrently."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
-        # Submit multiple executions
         execution_request = {
             "script": "import time; time.sleep(1); print('Concurrent test')",
             "lang": "python",
@@ -415,8 +354,8 @@ async def test_concurrent_executions_by_same_user(self, client: AsyncClient, tes
         }
 
         tasks = []
-        for i in range(3):
-            task = client.post("/api/v1/execute", json=execution_request)
+        for _ in range(3):
+            task = authenticated_client.post("/api/v1/execute", json=execution_request)
             tasks.append(task)
 
         responses = await asyncio.gather(*tasks)
@@ -438,54 +377,40 @@ async def test_concurrent_executions_by_same_user(self, client: AsyncClient, tes
 
     @pytest.mark.asyncio
     async def test_get_example_scripts(self, client: AsyncClient) -> None:
-        """Example scripts endpoint returns available example scripts."""
+        """Test getting example scripts (public endpoint)."""
         response = await client.get("/api/v1/example-scripts")
+
         assert response.status_code == 200
         data = response.json()
-        assert isinstance(data, dict)
         assert "scripts" in data
         assert isinstance(data["scripts"], dict)
 
     @pytest.mark.asyncio
     async def test_get_k8s_resource_limits(self, client: AsyncClient) -> None:
-        """K8s limits endpoint returns cluster execution limits if configured."""
+        """Test getting K8s resource limits."""
         response = await client.get("/api/v1/k8s-limits")
         assert response.status_code == 200
-        limits = response.json()
-        # Validate ResourceLimits shape
-        for key in [
-            "cpu_limit",
-            "memory_limit",
-            "cpu_request",
-            "memory_request",
-            "execution_timeout",
-            "supported_runtimes",
-        ]:
-            assert key in limits
+
+        # Validate response matches schema
+        limits = ResourceLimits.model_validate(response.json())
+
+        # Verify sensible values
+        assert limits.execution_timeout > 0
+        assert len(limits.supported_runtimes) > 0
 
     @pytest.mark.asyncio
-    async def test_get_user_executions_list(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_get_user_executions_list(self, authenticated_client: AsyncClient) -> None:
         """User executions list returns paginated executions for current user."""
-        # Login first
-        login_data = {"username": test_user["username"], "password": test_user["password"]}
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
-        # List executions
-        response = await client.get("/api/v1/user/executions?limit=5&skip=0")
+        response = await authenticated_client.get("/api/v1/user/executions?limit=5&skip=0")
         assert response.status_code == 200
         payload = response.json()
         assert set(["executions", "total", "limit", "skip", "has_more"]).issubset(payload.keys())
 
     @pytest.mark.asyncio
-    async def test_execution_idempotency_same_key_returns_same_execution(self, client: AsyncClient,
-                                                                         test_user: Dict[str, str]) -> None:
+    async def test_execution_idempotency_same_key_returns_same_execution(
+        self, authenticated_client: AsyncClient
+    ) -> None:
         """Submitting the same request with the same Idempotency-Key yields the same execution_id."""
-        # Login first
-        login_data = {"username": test_user["username"], "password": test_user["password"]}
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         execution_request = {
             "script": "print('Idempotency integration test')",
             "lang": "python",
@@ -495,13 +420,12 @@ async def test_execution_idempotency_same_key_returns_same_execution(self, clien
         headers = {"Idempotency-Key": "it-idem-key-123"}
 
         # Use idempotency header on both requests to guarantee keying
-        r1 = await client.post("/api/v1/execute", json=execution_request, headers=headers)
-        assert r1.status_code == 200
+        r1 = await authenticated_client.post("/api/v1/execute", json=execution_request, headers=headers)
         assert r1.status_code == 200
         e1 = r1.json()["execution_id"]
 
         # Second request with same key must return the same execution id
-        r2 = await client.post("/api/v1/execute", json=execution_request, headers=headers)
+        r2 = await authenticated_client.post("/api/v1/execute", json=execution_request, headers=headers)
         assert r2.status_code == 200
         e2 = r2.json()["execution_id"]
 
diff --git a/backend/tests/e2e/test_k8s_worker_create_pod.py b/backend/tests/e2e/test_k8s_worker_create_pod.py
index 63c6c0ee..eb2ef2d0 100644
--- a/backend/tests/e2e/test_k8s_worker_create_pod.py
+++ b/backend/tests/e2e/test_k8s_worker_create_pod.py
@@ -1,10 +1,16 @@
+"""E2E test for KubernetesWorker pod creation.
+
+Requires:
+- K8S_NAMESPACE env var set to a non-default namespace
+- KUBECONFIG pointing to a valid kubeconfig or in-cluster config
+- Permissions to create/delete ConfigMaps and Pods in the namespace
+"""
 import logging
 import os
 import uuid
 
 import pytest
 from app.events.core import UnifiedProducer
-from app.events.event_store import EventStore
 from app.events.schema.schema_registry import SchemaRegistryManager
 from app.infrastructure.kafka.events.metadata import AvroEventMetadata
 from app.infrastructure.kafka.events.saga import CreatePodCommandEvent
@@ -13,7 +19,7 @@
 from app.services.k8s_worker.worker import KubernetesWorker
 from app.settings import Settings
 from dishka import AsyncContainer
-from kubernetes.client.rest import ApiException
+from kubernetes_asyncio.client.exceptions import ApiException
 
 pytestmark = [pytest.mark.e2e, pytest.mark.k8s]
 
@@ -24,6 +30,11 @@
 async def test_worker_creates_configmap_and_pod(
     scope: AsyncContainer, monkeypatch: pytest.MonkeyPatch, test_settings: Settings
 ) -> None:
+    """Test that KubernetesWorker can create ConfigMap and Pod resources.
+
+    This test requires a working Kubernetes cluster with proper permissions.
+    In CI, K3s is set up via .github/workflows/backend-ci.yml.
+    """
     # Ensure non-default namespace for worker validation
     ns = os.environ.get("K8S_NAMESPACE", "integr8scode")
     if ns == "default":
@@ -31,7 +42,6 @@ async def test_worker_creates_configmap_and_pod(
         monkeypatch.setenv("K8S_NAMESPACE", ns)
 
     schema: SchemaRegistryManager = await scope.get(SchemaRegistryManager)
-    store: EventStore = await scope.get(EventStore)
     producer: UnifiedProducer = await scope.get(UnifiedProducer)
     idem: IdempotencyManager = await scope.get(IdempotencyManager)
 
@@ -41,15 +51,17 @@ async def test_worker_creates_configmap_and_pod(
         producer=producer,
         schema_registry_manager=schema,
         settings=test_settings,
-        event_store=store,
         idempotency_manager=idem,
         logger=_test_logger,
     )
 
-    # Initialize k8s clients using worker's own method
-    worker._initialize_kubernetes_client()  # noqa: SLF001
-    if worker.v1 is None:
-        pytest.skip("Kubernetes cluster not available")
+    # Initialize k8s clients - must succeed for this E2E test
+    await worker._initialize_kubernetes_client()  # noqa: SLF001
+    assert worker.v1 is not None, (
+        "Kubernetes client initialization failed. "
+        "Ensure KUBECONFIG is set or running in-cluster. "
+        f"KUBECONFIG={os.environ.get('KUBECONFIG', 'not set')}"
+    )
 
     exec_id = uuid.uuid4().hex[:8]
     cmd = CreatePodCommandEvent(
@@ -79,19 +91,21 @@ async def test_worker_creates_configmap_and_pod(
     try:
         await worker._create_config_map(cm)  # noqa: SLF001
     except ApiException as e:
-        if e.status in (403, 404):
-            pytest.skip(f"Insufficient permissions or namespace not found: {e}")
-        raise
+        pytest.fail(
+            f"Failed to create ConfigMap: {e.status} {e.reason}. "
+            f"Ensure namespace '{ns}' exists and test has RBAC permissions. "
+            f"Create namespace: kubectl create namespace {ns}"
+        )
 
     pod = worker.pod_builder.build_pod_manifest(cmd)
     await worker._create_pod(pod)  # noqa: SLF001
 
     # Verify resources exist
-    got_cm = worker.v1.read_namespaced_config_map(name=f"script-{exec_id}", namespace=ns)
+    got_cm = await worker.v1.read_namespaced_config_map(name=f"script-{exec_id}", namespace=ns)
     assert got_cm is not None
-    got_pod = worker.v1.read_namespaced_pod(name=f"executor-{exec_id}", namespace=ns)
+    got_pod = await worker.v1.read_namespaced_pod(name=f"executor-{exec_id}", namespace=ns)
     assert got_pod is not None
 
     # Cleanup
-    worker.v1.delete_namespaced_pod(name=f"executor-{exec_id}", namespace=ns)
-    worker.v1.delete_namespaced_config_map(name=f"script-{exec_id}", namespace=ns)
+    await worker.v1.delete_namespaced_pod(name=f"executor-{exec_id}", namespace=ns)
+    await worker.v1.delete_namespaced_config_map(name=f"script-{exec_id}", namespace=ns)
diff --git a/backend/tests/e2e/test_resource_cleaner_k8s.py b/backend/tests/e2e/test_resource_cleaner_k8s.py
index 33e57386..805aa785 100644
--- a/backend/tests/e2e/test_resource_cleaner_k8s.py
+++ b/backend/tests/e2e/test_resource_cleaner_k8s.py
@@ -3,10 +3,8 @@
 import os
 
 import pytest
-
 from app.services.result_processor.resource_cleaner import ResourceCleaner
 
-
 pytestmark = [pytest.mark.e2e, pytest.mark.k8s]
 
 _test_logger = logging.getLogger("test.k8s.resource_cleaner_k8s")
diff --git a/backend/tests/e2e/test_resource_cleaner_orphan.py b/backend/tests/e2e/test_resource_cleaner_orphan.py
index 2cd36173..41020ad5 100644
--- a/backend/tests/e2e/test_resource_cleaner_orphan.py
+++ b/backend/tests/e2e/test_resource_cleaner_orphan.py
@@ -1,55 +1,59 @@
-import asyncio
 import logging
-from datetime import datetime, timedelta, timezone
+from datetime import datetime
 
+import backoff
 import pytest
-from kubernetes import client as k8s_client, config as k8s_config
-
 from app.services.result_processor.resource_cleaner import ResourceCleaner
-from tests.helpers.eventually import eventually
+from kubernetes_asyncio import client as k8s_client
+from kubernetes_asyncio import config as k8s_config
 
 pytestmark = [pytest.mark.e2e, pytest.mark.k8s]
 
 _test_logger = logging.getLogger("test.k8s.resource_cleaner_orphan")
 
 
-def _ensure_kubeconfig():
+async def _ensure_kubeconfig() -> k8s_client.ApiClient:
+    """Load kubeconfig and return an async API client."""
     try:
         k8s_config.load_incluster_config()
     except Exception:
-        k8s_config.load_kube_config()
+        await k8s_config.load_kube_config()
+    return k8s_client.ApiClient()
 
 
 @pytest.mark.asyncio
-async def test_cleanup_orphaned_configmaps_dry_run():
-    _ensure_kubeconfig()
-    v1 = k8s_client.CoreV1Api()
-    ns = "default"
-    name = f"int-test-cm-{int(datetime.now().timestamp())}"
-
-    # Create a configmap labeled like the app uses
-    metadata = k8s_client.V1ObjectMeta(
-        name=name,
-        labels={"app": "integr8s", "execution-id": "e-int-test"},
-    )
-    body = k8s_client.V1ConfigMap(metadata=metadata, data={"k": "v"})
-    v1.create_namespaced_config_map(namespace=ns, body=body)
-
+async def test_cleanup_orphaned_configmaps_dry_run() -> None:
+    api_client = await _ensure_kubeconfig()
+    name: str | None = None
     try:
+        v1 = k8s_client.CoreV1Api(api_client)
+        ns = "default"
+        name = f"int-test-cm-{int(datetime.now().timestamp())}"
+
+        # Create a configmap labeled like the app uses
+        metadata = k8s_client.V1ObjectMeta(
+            name=name,
+            labels={"app": "integr8s", "execution-id": "e-int-test"},
+        )
+        body = k8s_client.V1ConfigMap(metadata=metadata, data={"k": "v"})
+        await v1.create_namespaced_config_map(namespace=ns, body=body)
+
         cleaner = ResourceCleaner(logger=_test_logger)
-        # Force as orphaned by using a large cutoff
-        cleaned = await cleaner.cleanup_orphaned_resources(namespace=ns, max_age_hours=0, dry_run=True)
 
         # We expect our configmap to be a candidate; poll the response
-        async def _has_cm():
+        @backoff.on_exception(backoff.constant, AssertionError, max_time=2.0, interval=0.1)
+        async def _wait_has_cm() -> None:
             # If cleaner is non-deterministic across runs, re-invoke to reflect current state
             res = await cleaner.cleanup_orphaned_resources(namespace=ns, max_age_hours=0, dry_run=True)
             assert any(name == cm for cm in res.get("configmaps", []))
 
-        await eventually(_has_cm, timeout=2.0, interval=0.1)
+        await _wait_has_cm()
     finally:
-        # Cleanup resource
-        try:
-            v1.delete_namespaced_config_map(name=name, namespace=ns)
-        except Exception:
-            pass
+        # Cleanup resource (only if created)
+        if name:
+            try:
+                v1 = k8s_client.CoreV1Api(api_client)
+                await v1.delete_namespaced_config_map(name=name, namespace="default")
+            except Exception:
+                pass
+        await api_client.close()
diff --git a/backend/tests/helpers/__init__.py b/backend/tests/helpers/__init__.py
index f6e01139..3855ae83 100644
--- a/backend/tests/helpers/__init__.py
+++ b/backend/tests/helpers/__init__.py
@@ -1,3 +1,5 @@
-"""Helper utilities for tests (async polling, Kafka utilities, event factories)."""
+"""Helper utilities for tests (Kafka utilities, event factories)."""
 
-from .events import make_execution_requested_event  # re-export
+from .events import make_execution_requested_event
+
+__all__ = ["make_execution_requested_event"]
diff --git a/backend/tests/helpers/cleanup.py b/backend/tests/helpers/cleanup.py
deleted file mode 100644
index 33a4cdfd..00000000
--- a/backend/tests/helpers/cleanup.py
+++ /dev/null
@@ -1,23 +0,0 @@
-"""Shared cleanup utilities for integration and E2E tests."""
-import redis.asyncio as redis
-from beanie import init_beanie
-
-from app.core.database_context import Database
-from app.db.docs import ALL_DOCUMENTS
-
-
-async def cleanup_db_and_redis(db: Database, redis_client: redis.Redis) -> None:
-    """Clean DB and Redis before a test.
-
-    NOTE: With pytest-xdist, each worker uses a separate Redis database
-    (gw0→db0, gw1→db1, etc.), so flushdb() is safe and only affects
-    that worker's database. See tests/conftest.py for REDIS_DB setup.
-    """
-    collections = await db.list_collection_names()
-    for name in collections:
-        if not name.startswith("system."):
-            await db.drop_collection(name)
-
-    await redis_client.flushdb()
-
-    await init_beanie(database=db, document_models=ALL_DOCUMENTS)
diff --git a/backend/tests/helpers/eventually.py b/backend/tests/helpers/eventually.py
deleted file mode 100644
index f72689f3..00000000
--- a/backend/tests/helpers/eventually.py
+++ /dev/null
@@ -1,33 +0,0 @@
-import asyncio
-from typing import Awaitable, Callable, TypeVar
-
-T = TypeVar("T")
-
-
-async def eventually(
-    fn: Callable[[], Awaitable[T]] | Callable[[], T],
-    *,
-    timeout: float = 10.0,
-    interval: float = 0.1,
-    exceptions: tuple[type[BaseException], ...] = (AssertionError,),
-) -> T:
-    """Polls `fn` until it succeeds or timeout elapses.
-
-    - `fn` may be sync or async. If it raises one of `exceptions`, it is retried.
-    - Returns the value of `fn` on success.
-    - Raises the last exception after timeout.
-    """
-    deadline = asyncio.get_running_loop().time() + timeout
-    last_exc: BaseException | None = None
-    while True:
-        try:
-            res = fn()
-            if asyncio.iscoroutine(res):
-                return await res  # type: ignore[return-value]
-            return res  # type: ignore[return-value]
-        except exceptions as exc:  # type: ignore[misc]
-            last_exc = exc
-            if asyncio.get_running_loop().time() >= deadline:
-                raise
-            await asyncio.sleep(interval)
-
diff --git a/backend/tests/helpers/k8s_fakes.py b/backend/tests/helpers/k8s_fakes.py
index 835e29e3..368658cb 100644
--- a/backend/tests/helpers/k8s_fakes.py
+++ b/backend/tests/helpers/k8s_fakes.py
@@ -45,7 +45,12 @@ def __init__(self, reason: str, message: str | None = None) -> None:
 
 
 class State:
-    def __init__(self, terminated: Terminated | None = None, waiting: Waiting | None = None, running: Any | None = None) -> None:
+    def __init__(
+        self,
+        terminated: Terminated | None = None,
+        waiting: Waiting | None = None,
+        running: Any | None = None,
+    ) -> None:
         self.terminated = terminated
         self.waiting = waiting
         self.running = running
@@ -89,7 +94,13 @@ def __init__(
         annotations: dict[str, str] | None = None,
         resource_version: str | None = None,
     ) -> None:
-        self.metadata = Meta(name, namespace=namespace, labels=labels, annotations=annotations, resource_version=resource_version)
+        self.metadata = Meta(
+            name,
+            namespace=namespace,
+            labels=labels,
+            annotations=annotations,
+            resource_version=resource_version,
+        )
         self.status = Status(phase, reason, msg, cs)
         self.spec = Spec(adl)
 
@@ -128,33 +139,59 @@ def make_pod(
 
 
 class FakeApi:
+    """Fake K8s API for unit tests (async compatible with kubernetes_asyncio)."""
+
     def __init__(self, logs: str) -> None:
         self._logs = logs
 
-    def read_namespaced_pod_log(self, name: str, namespace: str, tail_lines: int = 10000):  # noqa: ARG002
+    async def read_namespaced_pod_log(self, name: str, namespace: str, tail_lines: int = 10000) -> str:  # noqa: ARG002
         return self._logs
 
+    async def get_api_resources(self) -> None:
+        """Async stub for API resources check."""
+        pass
+
+    async def list_namespaced_pod(self, namespace: str, **kwargs: Any) -> Any:  # noqa: ARG002
+        """Async stub for listing pods."""
+
+        class _PodList:
+            items: list[Pod] = []
+
+        return _PodList()
+
+
+class FakeAsyncWatch:
+    """Fake async Watch for kubernetes_asyncio compatibility in tests."""
+
+    def __init__(self, events: list[dict[str, Any]], resource_version: str = "rv2") -> None:
+        self._events = events
+        self.resource_version = resource_version
+        self._stopped = False
+
+    def stream(self, func: Any, **kwargs: Any) -> "FakeAsyncWatch":  # noqa: ARG002
+        """Return self to support async iteration."""
+        return self
 
-def make_watch(events: list[dict[str, Any]], resource_version: str = "rv2"):
-    class _StopEvent:
-        def __init__(self, rv: str) -> None:
-            self.resource_version = rv
+    def __aiter__(self) -> "FakeAsyncWatch":
+        self._index = 0
+        return self
 
-    class _Stream(list):
-        def __init__(self, ev: list[dict[str, Any]], rv: str) -> None:
-            super().__init__(ev)
-            self._stop_event = _StopEvent(rv)
+    async def __anext__(self) -> dict[str, Any]:
+        if self._stopped or self._index >= len(self._events):
+            raise StopAsyncIteration
+        event = self._events[self._index]
+        self._index += 1
+        return event
 
-    class _Watch:
-        def __init__(self, ev: list[dict[str, Any]], rv: str) -> None:
-            self._events = ev
-            self._rv = rv
+    def stop(self) -> None:
+        self._stopped = True
 
-        def stream(self, func, **kwargs):  # noqa: ARG002
-            return _Stream(list(self._events), self._rv)
+    async def close(self) -> None:
+        """Async close stub."""
+        pass
 
-        def stop(self) -> None:
-            return None
 
-    return _Watch(events, resource_version)
+def make_watch(events: list[dict[str, Any]], resource_version: str = "rv2") -> FakeAsyncWatch:
+    """Create a fake async watch for testing."""
+    return FakeAsyncWatch(events, resource_version)
 
diff --git a/backend/tests/helpers/kafka.py b/backend/tests/helpers/kafka.py
index 4ceefb22..42230281 100644
--- a/backend/tests/helpers/kafka.py
+++ b/backend/tests/helpers/kafka.py
@@ -1,19 +1,19 @@
-from typing import Awaitable, Callable
+from collections.abc import Awaitable, Callable
 
 import pytest
-
 from app.events.core import UnifiedProducer
 from app.infrastructure.kafka.events.base import BaseEvent
+from dishka import AsyncContainer
 
 
 @pytest.fixture(scope="function")
-async def producer(scope) -> UnifiedProducer:  # type: ignore[valid-type]
+async def producer(scope: AsyncContainer) -> UnifiedProducer:
     """Real Kafka producer from DI scope."""
     return await scope.get(UnifiedProducer)
 
 
 @pytest.fixture(scope="function")
-def send_event(producer: UnifiedProducer) -> Callable[[BaseEvent], Awaitable[None]]:  # type: ignore[valid-type]
+def send_event(producer: UnifiedProducer) -> Callable[[BaseEvent], Awaitable[None]]:
     async def _send(ev: BaseEvent) -> None:
         await producer.produce(ev)
     return _send
diff --git a/backend/tests/helpers/protocols.py b/backend/tests/helpers/protocols.py
new file mode 100644
index 00000000..cde65774
--- /dev/null
+++ b/backend/tests/helpers/protocols.py
@@ -0,0 +1,156 @@
+"""Protocol definitions for test fakes.
+
+These protocols define the interfaces that test fakes must implement,
+allowing proper type checking without using `# type: ignore` comments.
+"""
+
+from asyncio import Event
+from typing import Any, Protocol, runtime_checkable
+
+
+@runtime_checkable
+class SubscriptionProtocol(Protocol):
+    """Protocol for SSE subscription interface."""
+
+    async def get(self, model: type[Any], timeout: float = 0.5) -> Any | None:
+        """Get the next message from the subscription."""
+        ...
+
+    async def push(self, msg: dict[str, Any]) -> None:
+        """Push a message to the subscription (for testing)."""
+        ...
+
+    async def close(self) -> None:
+        """Close the subscription."""
+        ...
+
+
+@runtime_checkable
+class SSEBusProtocol(Protocol):
+    """Protocol for SSE bus interface."""
+
+    async def open_subscription(self, execution_id: str) -> SubscriptionProtocol:
+        """Open a subscription for an execution."""
+        ...
+
+    async def open_notification_subscription(self, user_id: str) -> SubscriptionProtocol:
+        """Open a notification subscription for a user."""
+        ...
+
+
+@runtime_checkable
+class ExecutionRepositoryProtocol(Protocol):
+    """Protocol for execution repository interface."""
+
+    async def get_execution_status(self, execution_id: str) -> Any:
+        """Get the status of an execution."""
+        ...
+
+    async def get_execution(self, execution_id: str) -> Any | None:
+        """Get an execution by ID."""
+        ...
+
+
+@runtime_checkable
+class ShutdownManagerProtocol(Protocol):
+    """Protocol for SSE shutdown manager interface."""
+
+    async def register_connection(
+        self, execution_id: str, connection_id: str
+    ) -> Event | None:
+        """Register a new SSE connection."""
+        ...
+
+    async def unregister_connection(
+        self, execution_id: str, connection_id: str
+    ) -> None:
+        """Unregister an SSE connection."""
+        ...
+
+    def is_shutting_down(self) -> bool:
+        """Check if shutdown has been initiated."""
+        ...
+
+    def get_shutdown_status(self) -> Any:
+        """Get the current shutdown status."""
+        ...
+
+
+@runtime_checkable
+class RouterProtocol(Protocol):
+    """Protocol for SSE router interface."""
+
+    def get_stats(self) -> dict[str, int | bool]:
+        """Get router statistics."""
+        ...
+
+
+@runtime_checkable
+class RouterWithStopProtocol(Protocol):
+    """Protocol for router with stop capability."""
+
+    async def stop(self) -> None:
+        """Stop the router."""
+        ...
+
+
+@runtime_checkable
+class RouterWithCloseProtocol(Protocol):
+    """Protocol for router with aclose capability."""
+
+    async def aclose(self) -> None:
+        """Close the router."""
+        ...
+
+
+@runtime_checkable
+class SettingsProtocol(Protocol):
+    """Protocol for settings interface used by SSE service."""
+
+    SSE_HEARTBEAT_INTERVAL: int
+
+
+@runtime_checkable
+class ResourceAllocationRepositoryProtocol(Protocol):
+    """Protocol for resource allocation repository interface."""
+
+    async def count_active(self, language: str) -> int:
+        """Count active allocations for a language."""
+        ...
+
+    async def create_allocation(self, create_data: Any) -> Any:
+        """Create a new resource allocation."""
+        ...
+
+    async def release_allocation(self, allocation_id: str) -> None:
+        """Release a resource allocation."""
+        ...
+
+
+@runtime_checkable
+class ProducerProtocol(Protocol):
+    """Protocol for event producer interface."""
+
+    async def produce(self, event: Any, key: str | None = None) -> None:
+        """Produce an event."""
+        ...
+
+
+@runtime_checkable
+class EventDispatcherProtocol(Protocol):
+    """Protocol for event dispatcher interface."""
+
+    def register_handler(self, event_type: Any, handler: Any) -> None:
+        """Register a handler for an event type."""
+        ...
+
+
+@runtime_checkable
+class K8sApiProtocol(Protocol):
+    """Protocol for Kubernetes API interface."""
+
+    def read_namespaced_pod_log(
+        self, name: str, namespace: str, tail_lines: int = 10000
+    ) -> str:
+        """Read logs from a pod."""
+        ...
diff --git a/backend/tests/helpers/sse.py b/backend/tests/helpers/sse.py
index e167467c..3d846937 100644
--- a/backend/tests/helpers/sse.py
+++ b/backend/tests/helpers/sse.py
@@ -1,11 +1,11 @@
 import asyncio
 import json
-from typing import AsyncIterator, Iterable
+from typing import Any, AsyncIterator, Iterable
 
 from httpx import AsyncClient
 
 
-async def stream_sse(client: AsyncClient, url: str, timeout: float = 20.0) -> AsyncIterator[dict]:
+async def stream_sse(client: AsyncClient, url: str, timeout: float = 20.0) -> AsyncIterator[dict[str, Any]]:
     """Yield parsed SSE event dicts from the given URL within a timeout.
 
     Expects lines in the form "data: {...json...}" and ignores keepalives.
@@ -31,7 +31,7 @@ async def wait_for_event_type(
     url: str,
     wanted_types: Iterable[str],
     timeout: float = 20.0,
-) -> dict:
+) -> dict[str, Any]:
     """Return first event whose type/event_type is in wanted_types, otherwise timeout."""
     wanted = {str(t).lower() for t in wanted_types}
     async for ev in stream_sse(client, url, timeout=timeout):
@@ -45,7 +45,7 @@ async def wait_for_execution_terminal(
     client: AsyncClient,
     execution_id: str,
     timeout: float = 30.0,
-) -> dict:
+) -> dict[str, Any]:
     terminal = {"execution_completed", "result_stored", "execution_failed", "execution_timeout", "execution_cancelled"}
     url = f"/api/v1/events/executions/{execution_id}"
     return await wait_for_event_type(client, url, terminal, timeout=timeout)
@@ -55,7 +55,7 @@ async def wait_for_execution_running(
     client: AsyncClient,
     execution_id: str,
     timeout: float = 15.0,
-) -> dict:
+) -> dict[str, Any]:
     running = {"execution_running", "execution_started", "execution_scheduled", "execution_queued"}
     url = f"/api/v1/events/executions/{execution_id}"
     return await wait_for_event_type(client, url, running, timeout=timeout)
diff --git a/backend/tests/integration/app/test_main_app.py b/backend/tests/integration/app/test_main_app.py
index 36af7d12..1354d933 100644
--- a/backend/tests/integration/app/test_main_app.py
+++ b/backend/tests/integration/app/test_main_app.py
@@ -2,39 +2,46 @@
 
 import pytest
 from fastapi import FastAPI
-from starlette.middleware.cors import CORSMiddleware
-
-from app.core.correlation import CorrelationMiddleware
-from app.core.middlewares import (
-    CacheControlMiddleware,
-    MetricsMiddleware,
-    RateLimitMiddleware,
-    RequestSizeLimitMiddleware,
-)
+from httpx import AsyncClient
+from starlette.routing import Route
 
 pytestmark = pytest.mark.integration
 
 
-def test_create_app_real_instance(app) -> None:  # type: ignore[valid-type]
+def test_create_app_real_instance(app: FastAPI) -> None:
     assert isinstance(app, FastAPI)
 
-    # Verify API routes are configured
-    paths = {r.path for r in app.router.routes}
+    # Verify API routes are configured (narrow BaseRoute to Route for path access)
+    paths = {r.path for r in app.router.routes if isinstance(r, Route)}
     assert any(p.startswith("/api/") for p in paths)
 
-    # Verify required middlewares are actually present in the stack
-    middleware_classes = {m.cls for m in app.user_middleware}
+    # Verify middleware stack has expected count (6 custom middlewares)
+    assert len(app.user_middleware) >= 6, "Expected at least 6 middlewares configured"
+
+
+@pytest.mark.asyncio
+async def test_middlewares_behavior(client: AsyncClient) -> None:
+    """Test middleware behavior via HTTP - the proper way to verify middleware config."""
+    # CORS middleware: responds to preflight OPTIONS with CORS headers for allowed origins
+    allowed_origin = "https://localhost:5001"
+    resp = await client.options(
+        "/api/v1/health",
+        headers={"Origin": allowed_origin, "Access-Control-Request-Method": "GET"},
+    )
+    assert resp.status_code == 200
+    assert resp.headers.get("access-control-allow-origin") == allowed_origin
+
+    # Correlation middleware: adds correlation ID header to responses
+    resp = await client.get("/api/v1/health")
+    assert "x-correlation-id" in resp.headers
 
-    # Check that all required middlewares are configured
-    assert CORSMiddleware in middleware_classes, "CORS middleware not configured"
-    assert CorrelationMiddleware in middleware_classes, "Correlation middleware not configured"
-    assert RequestSizeLimitMiddleware in middleware_classes, "Request size limit middleware not configured"
-    assert CacheControlMiddleware in middleware_classes, "Cache control middleware not configured"
-    assert MetricsMiddleware in middleware_classes, "Metrics middleware not configured"
-    assert RateLimitMiddleware in middleware_classes, "Rate limit middleware not configured"
+    # Cache-Control middleware: adds cache headers for configured endpoints
+    resp = await client.get("/api/v1/example-scripts")
+    assert resp.status_code == 200
+    assert "cache-control" in resp.headers
 
 
-def test_create_app_function_constructs(app) -> None:  # type: ignore[valid-type]
+def test_create_app_function_constructs(app: FastAPI) -> None:
     # Sanity: calling create_app returns a FastAPI instance (lazy import)
     inst = import_module("app.main").create_app()
     assert isinstance(inst, FastAPI)
diff --git a/backend/tests/integration/conftest.py b/backend/tests/integration/conftest.py
index a59a32a9..e6c4d276 100644
--- a/backend/tests/integration/conftest.py
+++ b/backend/tests/integration/conftest.py
@@ -1,18 +1,28 @@
-"""Integration tests conftest - with infrastructure cleanup."""
-import pytest_asyncio
-import redis.asyncio as redis
+"""Integration tests conftest."""
+import uuid
+from collections.abc import Callable
 
-from app.core.database_context import Database
-from tests.helpers.cleanup import cleanup_db_and_redis
+import pytest
 
 
-@pytest_asyncio.fixture(autouse=True)
-async def _cleanup(db: Database, redis_client: redis.Redis):
-    """Clean DB and Redis before each integration test.
+@pytest.fixture
+def unique_id(request: pytest.FixtureRequest) -> Callable[[str], str]:
+    """Generate unique IDs with a prefix for test isolation.
 
-    Only pre-test cleanup - post-test cleanup causes event loop issues
-    when SSE/streaming tests hold connections across loop boundaries.
+    Each call returns a new unique ID. The test name prefix ensures
+    isolation between tests; the counter ensures uniqueness within a test.
+
+    Usage:
+        def test_something(unique_id):
+            exec_id = unique_id("exec-")  # exec-test_somethin-a1b2-0
+            event_id = unique_id("evt-")  # evt-test_somethin-a1b2-1
     """
-    await cleanup_db_and_redis(db, redis_client)
-    yield
-    # No post-test cleanup to avoid "Event loop is closed" errors
+    base = f"{request.node.name[:15]}-{uuid.uuid4().hex[:4]}"
+    counter = [0]  # Mutable container for closure
+
+    def _make(prefix: str = "") -> str:
+        result = f"{prefix}{base}-{counter[0]}"
+        counter[0] += 1
+        return result
+
+    return _make
diff --git a/backend/tests/integration/core/__init__.py b/backend/tests/integration/core/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/backend/tests/integration/core/test_container.py b/backend/tests/integration/core/test_container.py
index 36bad89a..85ef5122 100644
--- a/backend/tests/integration/core/test_container.py
+++ b/backend/tests/integration/core/test_container.py
@@ -1,14 +1,13 @@
 import pytest
-from dishka import AsyncContainer
 from app.core.database_context import Database
-
 from app.services.event_service import EventService
+from dishka import AsyncContainer
 
 pytestmark = [pytest.mark.integration, pytest.mark.mongodb]
 
 
 @pytest.mark.asyncio
-async def test_container_resolves_services(app_container, scope) -> None:  # type: ignore[valid-type]
+async def test_container_resolves_services(app_container: AsyncContainer, scope: AsyncContainer) -> None:
     # Container is the real Dishka container
     assert isinstance(app_container, AsyncContainer)
 
diff --git a/backend/tests/integration/core/test_dishka_lifespan.py b/backend/tests/integration/core/test_dishka_lifespan.py
index bdb5c38c..4a6869f7 100644
--- a/backend/tests/integration/core/test_dishka_lifespan.py
+++ b/backend/tests/integration/core/test_dishka_lifespan.py
@@ -1,7 +1,7 @@
 from fastapi import FastAPI
 
 
-def test_lifespan_container_attached(app) -> None:  # type: ignore[valid-type]
+def test_lifespan_container_attached(app: FastAPI) -> None:
     # App fixture uses real lifespan; container is attached to app.state
     assert isinstance(app, FastAPI)
     assert hasattr(app.state, "dishka_container")
diff --git a/backend/tests/integration/db/__init__.py b/backend/tests/integration/db/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/backend/tests/integration/db/repositories/test_admin_settings_repository.py b/backend/tests/integration/db/repositories/test_admin_settings_repository.py
index 7c19cf50..1816d158 100644
--- a/backend/tests/integration/db/repositories/test_admin_settings_repository.py
+++ b/backend/tests/integration/db/repositories/test_admin_settings_repository.py
@@ -1,12 +1,14 @@
 import pytest
+from app.core.database_context import Database
 from app.db.repositories.admin.admin_settings_repository import AdminSettingsRepository
 from app.domain.admin import SystemSettings
+from dishka import AsyncContainer
 
 pytestmark = pytest.mark.integration
 
 
 @pytest.fixture()
-async def repo(scope) -> AdminSettingsRepository:  # type: ignore[valid-type]
+async def repo(scope: AsyncContainer) -> AdminSettingsRepository:
     return await scope.get(AdminSettingsRepository)
 
 
@@ -24,7 +26,7 @@ async def test_get_system_settings_existing(repo: AdminSettingsRepository) -> No
 
 
 @pytest.mark.asyncio
-async def test_update_and_reset_settings(repo: AdminSettingsRepository, db) -> None:  # type: ignore[valid-type]
+async def test_update_and_reset_settings(repo: AdminSettingsRepository, db: Database) -> None:  # noqa: F811
     s = SystemSettings()
     updated = await repo.update_system_settings(s, updated_by="admin", user_id="u1")
     assert isinstance(updated, SystemSettings)
diff --git a/backend/tests/integration/db/repositories/test_dlq_repository.py b/backend/tests/integration/db/repositories/test_dlq_repository.py
index 07d3711f..dfc18190 100644
--- a/backend/tests/integration/db/repositories/test_dlq_repository.py
+++ b/backend/tests/integration/db/repositories/test_dlq_repository.py
@@ -1,105 +1,205 @@
 import logging
+from collections.abc import Callable
 from datetime import datetime, timezone
 
 import pytest
+from app.core.database_context import Database
 from app.db.docs import DLQMessageDocument
 from app.db.repositories.dlq_repository import DLQRepository
 from app.dlq import DLQMessageStatus
 from app.domain.enums.events import EventType
 
-pytestmark = pytest.mark.integration
+pytestmark = [pytest.mark.integration, pytest.mark.mongodb]
 
 _test_logger = logging.getLogger("test.db.repositories.dlq_repository")
 
 
-@pytest.fixture()
+@pytest.fixture
 def repo() -> DLQRepository:
     return DLQRepository(_test_logger)
 
 
-async def insert_test_dlq_docs():
-    """Insert test DLQ documents using Beanie."""
-    now = datetime.now(timezone.utc)
-
-    docs = [
-        DLQMessageDocument(
-            event_id="id1",
-            event_type=str(EventType.USER_LOGGED_IN),
-            event={
-                "event_type": str(EventType.USER_LOGGED_IN),
-                "metadata": {"service_name": "svc", "service_version": "1"},
-                "user_id": "u1",
-                "login_method": "password",
-            },
-            original_topic="t1",
-            error="err",
-            retry_count=0,
-            failed_at=now,
-            status=DLQMessageStatus.PENDING,
-            producer_id="p1",
-        ),
-        DLQMessageDocument(
-            event_id="id2",
-            event_type=str(EventType.USER_LOGGED_IN),
-            event={
-                "event_type": str(EventType.USER_LOGGED_IN),
-                "metadata": {"service_name": "svc", "service_version": "1"},
-                "user_id": "u1",
-                "login_method": "password",
-            },
-            original_topic="t1",
-            error="err",
-            retry_count=0,
-            failed_at=now,
-            status=DLQMessageStatus.RETRIED,
-            producer_id="p1",
-        ),
-        DLQMessageDocument(
-            event_id="id3",
-            event_type=str(EventType.EXECUTION_STARTED),
-            event={
-                "event_type": str(EventType.EXECUTION_STARTED),
-                "metadata": {"service_name": "svc", "service_version": "1"},
-                "execution_id": "x1",
-                "pod_name": "p1",
-            },
-            original_topic="t2",
-            error="err",
-            retry_count=0,
-            failed_at=now,
-            status=DLQMessageStatus.PENDING,
-            producer_id="p1",
-        ),
-    ]
-
-    for doc in docs:
-        await doc.insert()
+async def create_dlq_doc(
+    event_id: str,
+    topic: str,
+    status: DLQMessageStatus = DLQMessageStatus.PENDING,
+    event_type: EventType = EventType.USER_LOGGED_IN,
+) -> DLQMessageDocument:
+    """Create and insert a DLQ document with given parameters."""
+    doc = DLQMessageDocument(
+        event_id=event_id,
+        event_type=str(event_type),
+        event={
+            "event_type": str(event_type),
+            "metadata": {"service_name": "test", "service_version": "1"},
+            "user_id": "u1",
+            "login_method": "password",
+        },
+        original_topic=topic,
+        error="test error",
+        retry_count=0,
+        failed_at=datetime.now(timezone.utc),
+        status=status,
+        producer_id="test",
+    )
+    await doc.insert()
+    return doc
 
 
 @pytest.mark.asyncio
-async def test_stats_list_get_and_updates(repo: DLQRepository) -> None:
-    await insert_test_dlq_docs()
+async def test_get_message_by_id(repo: DLQRepository, db: Database, unique_id: Callable[[str], str]) -> None:
+    event_id = unique_id("dlq-")
+    topic = unique_id("topic-")
+
+    await create_dlq_doc(event_id, topic)
+
+    msg = await repo.get_message_by_id(event_id)
+    assert msg is not None
+    assert msg.event_id == event_id
+    assert msg.original_topic == topic
+    assert msg.status == DLQMessageStatus.PENDING
+
+
+@pytest.mark.asyncio
+async def test_get_message_by_id_not_found(repo: DLQRepository, db: Database, unique_id: Callable[[str], str]) -> None:
+    msg = await repo.get_message_by_id(unique_id("nonexistent-"))
+    assert msg is None
+
+
+@pytest.mark.asyncio
+async def test_mark_message_retried(repo: DLQRepository, db: Database, unique_id: Callable[[str], str]) -> None:
+    event_id = unique_id("dlq-")
+    topic = unique_id("topic-")
+
+    await create_dlq_doc(event_id, topic, status=DLQMessageStatus.PENDING)
+
+    result = await repo.mark_message_retried(event_id)
+    assert result is True
+
+    # Verify status changed
+    msg = await repo.get_message_by_id(event_id)
+    assert msg is not None
+    assert msg.status == DLQMessageStatus.RETRIED
+    assert msg.retried_at is not None
+
+
+@pytest.mark.asyncio
+async def test_mark_message_retried_not_found(
+    repo: DLQRepository, db: Database, unique_id: Callable[[str], str]
+) -> None:
+    result = await repo.mark_message_retried(unique_id("nonexistent-"))
+    assert result is False
+
+
+@pytest.mark.asyncio
+async def test_mark_message_discarded(repo: DLQRepository, db: Database, unique_id: Callable[[str], str]) -> None:
+    event_id = unique_id("dlq-")
+    topic = unique_id("topic-")
+
+    await create_dlq_doc(event_id, topic, status=DLQMessageStatus.PENDING)
+
+    result = await repo.mark_message_discarded(event_id, "test reason")
+    assert result is True
+
+    # Verify status changed
+    msg = await repo.get_message_by_id(event_id)
+    assert msg is not None
+    assert msg.status == DLQMessageStatus.DISCARDED
+    assert msg.discarded_at is not None
+    assert msg.discard_reason == "test reason"
+
+
+@pytest.mark.asyncio
+async def test_mark_message_discarded_not_found(
+    repo: DLQRepository, db: Database, unique_id: Callable[[str], str]
+) -> None:
+    result = await repo.mark_message_discarded(unique_id("nonexistent-"), "reason")
+    assert result is False
+
+
+@pytest.mark.asyncio
+async def test_get_messages_with_pagination(repo: DLQRepository, db: Database, unique_id: Callable[[str], str]) -> None:
+    topic = unique_id("topic-")
+    event_ids = [unique_id(f"dlq-{i}-") for i in range(5)]
+
+    for eid in event_ids:
+        await create_dlq_doc(eid, topic)
+
+    # Get first page
+    result = await repo.get_messages(topic=topic, limit=2, offset=0)
+    assert result.total == 5
+    assert len(result.messages) == 2
+    assert result.limit == 2
+    assert result.offset == 0
+
+    # Get second page
+    result2 = await repo.get_messages(topic=topic, limit=2, offset=2)
+    assert result2.total == 5
+    assert len(result2.messages) == 2
+    assert result2.offset == 2
+
+
+@pytest.mark.asyncio
+async def test_get_messages_filter_by_status(
+    repo: DLQRepository, db: Database, unique_id: Callable[[str], str]
+) -> None:
+    topic = unique_id("topic-")
+
+    await create_dlq_doc(unique_id("dlq-1-"), topic, status=DLQMessageStatus.PENDING)
+    await create_dlq_doc(unique_id("dlq-2-"), topic, status=DLQMessageStatus.PENDING)
+    await create_dlq_doc(unique_id("dlq-3-"), topic, status=DLQMessageStatus.RETRIED)
+
+    pending = await repo.get_messages(topic=topic, status=DLQMessageStatus.PENDING)
+    assert pending.total == 2
+
+    retried = await repo.get_messages(topic=topic, status=DLQMessageStatus.RETRIED)
+    assert retried.total == 1
+
+
+@pytest.mark.asyncio
+async def test_get_dlq_stats(repo: DLQRepository, db: Database, unique_id: Callable[[str], str]) -> None:
+    topic = unique_id("topic-")
+
+    await create_dlq_doc(unique_id("dlq-1-"), topic, status=DLQMessageStatus.PENDING)
+    await create_dlq_doc(unique_id("dlq-2-"), topic, status=DLQMessageStatus.RETRIED)
 
     stats = await repo.get_dlq_stats()
-    assert isinstance(stats.by_status, dict) and len(stats.by_topic) >= 1
 
-    res = await repo.get_messages(limit=2)
-    assert res.total >= 3 and len(res.messages) <= 2
-    msg = await repo.get_message_by_id("id1")
-    assert msg and msg.event_id == "id1"
-    assert await repo.mark_message_retried("id1") in (True, False)
-    assert await repo.mark_message_discarded("id1", "r") in (True, False)
+    assert isinstance(stats.by_status, dict)
+    assert isinstance(stats.by_topic, list)
+    assert isinstance(stats.by_event_type, list)
+    assert stats.age_stats is not None
+
+
+@pytest.mark.asyncio
+async def test_get_topics_summary(repo: DLQRepository, db: Database, unique_id: Callable[[str], str]) -> None:
+    topic = unique_id("topic-")
+
+    await create_dlq_doc(unique_id("dlq-1-"), topic, status=DLQMessageStatus.PENDING)
+    await create_dlq_doc(unique_id("dlq-2-"), topic, status=DLQMessageStatus.PENDING)
+    await create_dlq_doc(unique_id("dlq-3-"), topic, status=DLQMessageStatus.RETRIED)
+
+    summaries = await repo.get_topics_summary()
+    topic_summary = next((s for s in summaries if s.topic == topic), None)
 
-    topics = await repo.get_topics_summary()
-    assert any(t.topic == "t1" for t in topics)
+    assert topic_summary is not None
+    assert topic_summary.total_messages == 3
+    assert topic_summary.status_breakdown[DLQMessageStatus.PENDING] == 2
+    assert topic_summary.status_breakdown[DLQMessageStatus.RETRIED] == 1
 
 
 @pytest.mark.asyncio
-async def test_retry_batch(repo: DLQRepository) -> None:
-    class Manager:
-        async def retry_message_manually(self, eid: str) -> bool:  # noqa: ARG002
+async def test_retry_messages_batch_not_found(
+    repo: DLQRepository, db: Database, unique_id: Callable[[str], str]
+) -> None:
+    class MockManager:
+        async def retry_message_manually(self, event_id: str) -> bool:
             return True
 
-    result = await repo.retry_messages_batch(["missing"], Manager())
-    # Missing messages cause failures
-    assert result.total == 1 and result.failed >= 1
+    result = await repo.retry_messages_batch([unique_id("missing-")], MockManager())  # type: ignore[arg-type]
+    assert result.total == 1
+    assert result.failed == 1
+    assert result.successful == 0
+    assert result.details[0].status == "failed"
+    assert result.details[0].error is not None
+    assert "not found" in result.details[0].error.lower()
diff --git a/backend/tests/integration/db/repositories/test_event_repository.py b/backend/tests/integration/db/repositories/test_event_repository.py
new file mode 100644
index 00000000..04696e32
--- /dev/null
+++ b/backend/tests/integration/db/repositories/test_event_repository.py
@@ -0,0 +1,308 @@
+import logging
+from collections.abc import Callable
+from datetime import datetime, timedelta, timezone
+from typing import Any
+
+import pytest
+from app.db.repositories.event_repository import EventRepository
+from app.domain.enums.events import EventType
+from app.domain.events import Event
+from app.domain.events.event_metadata import EventMetadata
+
+_test_logger = logging.getLogger("test.db.repositories.event_repository")
+
+pytestmark = pytest.mark.integration
+
+
+def _make_event(
+    event_id: str,
+    event_type: EventType = EventType.EXECUTION_REQUESTED,
+    aggregate_id: str | None = None,
+    correlation_id: str = "corr-test",
+    user_id: str | None = None,
+    service_name: str = "test-service",
+    timestamp: datetime | None = None,
+) -> Event:
+    """Factory for Event domain objects."""
+    return Event(
+        event_id=event_id,
+        event_type=event_type,
+        event_version="1.0",
+        timestamp=timestamp or datetime.now(timezone.utc),
+        metadata=EventMetadata(
+            service_name=service_name,
+            service_version="1.0.0",
+            correlation_id=correlation_id,
+            user_id=user_id,
+        ),
+        payload={"test": "data", "execution_id": aggregate_id},
+        aggregate_id=aggregate_id,
+    )
+
+
+@pytest.mark.asyncio
+async def test_store_and_get_event(unique_id: Callable[[str], str]) -> None:
+    """Store event and retrieve by ID."""
+    repo = EventRepository(logger=_test_logger)
+    event_id = unique_id("evt-")
+    event = _make_event(event_id=event_id)
+
+    stored_id = await repo.store_event(event)
+    assert stored_id == event_id
+
+    retrieved = await repo.get_event(event_id)
+    assert retrieved is not None
+    assert retrieved.event_id == event_id
+    assert retrieved.event_type == EventType.EXECUTION_REQUESTED
+
+
+@pytest.mark.asyncio
+async def test_get_event_not_found(unique_id: Callable[[str], str]) -> None:
+    """Returns None for non-existent event."""
+    repo = EventRepository(logger=_test_logger)
+    result = await repo.get_event(unique_id("nonexistent-"))
+    assert result is None
+
+
+@pytest.mark.asyncio
+async def test_get_events_by_aggregate(unique_id: Callable[[str], str]) -> None:
+    """Retrieve events by aggregate_id."""
+    repo = EventRepository(logger=_test_logger)
+    aggregate_id = unique_id("exec-")
+
+    # Store multiple events for same aggregate
+    events = [
+        _make_event(unique_id("evt-"), EventType.EXECUTION_REQUESTED, aggregate_id),
+        _make_event(unique_id("evt-"), EventType.EXECUTION_QUEUED, aggregate_id),
+        _make_event(unique_id("evt-"), EventType.EXECUTION_RUNNING, aggregate_id),
+    ]
+    for e in events:
+        await repo.store_event(e)
+
+    # Retrieve all
+    result = await repo.get_events_by_aggregate(aggregate_id)
+    assert len(result) >= 3
+
+    # Filter by event type
+    filtered = await repo.get_events_by_aggregate(
+        aggregate_id, event_types=[EventType.EXECUTION_QUEUED]
+    )
+    assert all(e.event_type == EventType.EXECUTION_QUEUED for e in filtered)
+
+
+@pytest.mark.asyncio
+async def test_get_events_by_correlation(unique_id: Callable[[str], str]) -> None:
+    """Retrieve events by correlation_id with pagination."""
+    repo = EventRepository(logger=_test_logger)
+    correlation_id = unique_id("corr-")
+
+    # Store events with same correlation
+    for i in range(5):
+        event = _make_event(
+            unique_id("evt-"),
+            correlation_id=correlation_id,
+            aggregate_id=unique_id("exec-"),
+        )
+        await repo.store_event(event)
+
+    result = await repo.get_events_by_correlation(correlation_id, limit=3, skip=0)
+    assert result.total >= 5
+    assert len(result.events) == 3
+    assert result.has_more is True
+
+    # Get second page
+    page2 = await repo.get_events_by_correlation(correlation_id, limit=3, skip=3)
+    assert len(page2.events) >= 2
+
+
+@pytest.mark.asyncio
+async def test_get_execution_events(unique_id: Callable[[str], str]) -> None:
+    """Retrieve events for an execution with system event filtering."""
+    repo = EventRepository(logger=_test_logger)
+    execution_id = unique_id("exec-")
+
+    # Store regular and system events
+    await repo.store_event(
+        _make_event(unique_id("evt-"), aggregate_id=execution_id, service_name="api")
+    )
+    await repo.store_event(
+        _make_event(unique_id("evt-"), aggregate_id=execution_id, service_name="system-monitor")
+    )
+
+    # Without filter
+    all_events = await repo.get_execution_events(execution_id, exclude_system_events=False)
+    assert all_events.total >= 2
+
+    # With filter
+    filtered = await repo.get_execution_events(execution_id, exclude_system_events=True)
+    assert all(not e.metadata.service_name.startswith("system-") for e in filtered.events)
+
+
+@pytest.mark.asyncio
+async def test_get_event_statistics(unique_id: Callable[[str], str]) -> None:
+    """Get aggregated statistics for events."""
+    repo = EventRepository(logger=_test_logger)
+    now = datetime.now(timezone.utc)
+
+    # Store events of different types
+    for event_type in [EventType.EXECUTION_REQUESTED, EventType.EXECUTION_COMPLETED]:
+        for _ in range(2):
+            event = _make_event(unique_id("evt-"), event_type, timestamp=now)
+            await repo.store_event(event)
+
+    stats = await repo.get_event_statistics(
+        start_time=now - timedelta(hours=1),
+        end_time=now + timedelta(hours=1),
+    )
+
+    assert stats.total_events > 0
+    assert isinstance(stats.events_by_type, dict)
+    assert isinstance(stats.events_by_service, dict)
+
+
+@pytest.mark.asyncio
+async def test_get_user_events_paginated(unique_id: Callable[[str], str]) -> None:
+    """Retrieve user's events with pagination and filtering."""
+    repo = EventRepository(logger=_test_logger)
+    user_id = unique_id("user-")
+
+    # Store events for user
+    for i in range(3):
+        event = _make_event(
+            unique_id("evt-"),
+            event_type=EventType.EXECUTION_REQUESTED if i % 2 == 0 else EventType.EXECUTION_COMPLETED,
+            user_id=user_id,
+        )
+        await repo.store_event(event)
+
+    # Get all user events
+    result = await repo.get_user_events_paginated(user_id, limit=10)
+    assert result.total == 3
+
+    # Filter by event type (i=0,2 are EXECUTION_REQUESTED, i=1 is EXECUTION_COMPLETED)
+    filtered = await repo.get_user_events_paginated(
+        user_id,
+        event_types=[EventType.EXECUTION_REQUESTED.value],
+        limit=10,
+    )
+    assert filtered.total == 2
+    assert all(e.event_type == EventType.EXECUTION_REQUESTED for e in filtered.events)
+
+
+@pytest.mark.asyncio
+async def test_query_events_with_filter(unique_id: Callable[[str], str]) -> None:
+    """Query events with arbitrary filter."""
+    repo = EventRepository(logger=_test_logger)
+    service_name = unique_id("svc-")
+
+    # Store events
+    for _ in range(3):
+        event = _make_event(unique_id("evt-"), service_name=service_name)
+        await repo.store_event(event)
+
+    result = await repo.query_events(
+        query={"metadata.service_name": service_name},
+        limit=10,
+    )
+    assert result.total >= 3
+    assert all(e.metadata.service_name == service_name for e in result.events)
+
+
+@pytest.mark.asyncio
+async def test_aggregate_events(unique_id: Callable[[str], str]) -> None:
+    """Run aggregation pipeline on events."""
+    repo = EventRepository(logger=_test_logger)
+    service_name = unique_id("svc-")
+
+    # Store events
+    for _ in range(3):
+        await repo.store_event(_make_event(unique_id("evt-"), service_name=service_name))
+
+    pipeline: list[dict[str, Any]] = [
+        {"$match": {"metadata.service_name": service_name}},
+        {"$group": {"_id": "$event_type", "count": {"$sum": 1}}},
+    ]
+    result = await repo.aggregate_events(pipeline, limit=100)
+    assert len(result.results) > 0
+
+
+@pytest.mark.asyncio
+async def test_list_event_types(unique_id: Callable[[str], str]) -> None:
+    """List distinct event types."""
+    repo = EventRepository(logger=_test_logger)
+    service_name = unique_id("svc-")
+
+    # Store events of different types
+    await repo.store_event(
+        _make_event(unique_id("evt-"), EventType.EXECUTION_REQUESTED, service_name=service_name)
+    )
+    await repo.store_event(
+        _make_event(unique_id("evt-"), EventType.EXECUTION_COMPLETED, service_name=service_name)
+    )
+
+    types = await repo.list_event_types(match={"metadata.service_name": service_name})
+    assert len(types) >= 2
+
+
+@pytest.mark.asyncio
+async def test_delete_event_with_archival(unique_id: Callable[[str], str]) -> None:
+    """Delete event with archival."""
+    repo = EventRepository(logger=_test_logger)
+    event_id = unique_id("evt-")
+    event = _make_event(event_id)
+
+    await repo.store_event(event)
+
+    archived = await repo.delete_event_with_archival(
+        event_id, deleted_by="admin", deletion_reason="Test cleanup"
+    )
+    assert archived is not None
+    assert archived.event_id == event_id
+    assert archived.deleted_by == "admin"
+
+    # Original should be gone
+    assert await repo.get_event(event_id) is None
+
+
+@pytest.mark.asyncio
+async def test_delete_nonexistent_event(unique_id: Callable[[str], str]) -> None:
+    """Returns None when deleting non-existent event."""
+    repo = EventRepository(logger=_test_logger)
+    result = await repo.delete_event_with_archival(
+        unique_id("nonexistent-"), "admin", "test"
+    )
+    assert result is None
+
+
+@pytest.mark.asyncio
+async def test_get_aggregate_replay_info(unique_id: Callable[[str], str]) -> None:
+    """Get replay info for an aggregate."""
+    repo = EventRepository(logger=_test_logger)
+    aggregate_id = unique_id("exec-")
+
+    # Store events with timestamps
+    base_time = datetime.now(timezone.utc)
+    for i, event_type in enumerate(
+        [EventType.EXECUTION_REQUESTED, EventType.EXECUTION_QUEUED, EventType.EXECUTION_COMPLETED]
+    ):
+        event = _make_event(
+            unique_id("evt-"),
+            event_type,
+            aggregate_id,
+            timestamp=base_time + timedelta(seconds=i),
+        )
+        await repo.store_event(event)
+
+    info = await repo.get_aggregate_replay_info(aggregate_id)
+    assert info is not None
+    assert info.event_count >= 3
+    assert len(info.event_types) >= 3
+    assert info.start_time <= info.end_time
+
+
+@pytest.mark.asyncio
+async def test_get_aggregate_replay_info_not_found(unique_id: Callable[[str], str]) -> None:
+    """Returns None for non-existent aggregate."""
+    repo = EventRepository(logger=_test_logger)
+    result = await repo.get_aggregate_replay_info(unique_id("nonexistent-"))
+    assert result is None
diff --git a/backend/tests/integration/db/repositories/test_execution_repository.py b/backend/tests/integration/db/repositories/test_execution_repository.py
index eb3bf2cb..a0beeeac 100644
--- a/backend/tests/integration/db/repositories/test_execution_repository.py
+++ b/backend/tests/integration/db/repositories/test_execution_repository.py
@@ -1,5 +1,5 @@
 import logging
-from uuid import uuid4
+from collections.abc import Callable
 
 import pytest
 from app.db.repositories.execution_repository import ExecutionRepository
@@ -12,9 +12,9 @@
 
 
 @pytest.mark.asyncio
-async def test_execution_crud_and_query() -> None:
+async def test_execution_crud_and_query(unique_id: Callable[[str], str]) -> None:
     repo = ExecutionRepository(logger=_test_logger)
-    user_id = str(uuid4())
+    user_id = unique_id("user-")
 
     # Create
     create_data = DomainExecutionCreate(
diff --git a/backend/tests/integration/db/repositories/test_notification_repository.py b/backend/tests/integration/db/repositories/test_notification_repository.py
new file mode 100644
index 00000000..8981dfe6
--- /dev/null
+++ b/backend/tests/integration/db/repositories/test_notification_repository.py
@@ -0,0 +1,283 @@
+"""Integration tests for NotificationRepository."""
+import logging
+from collections.abc import Callable
+
+import pytest
+from app.db.repositories.notification_repository import NotificationRepository
+from app.domain.enums.notification import NotificationChannel, NotificationSeverity, NotificationStatus
+from app.domain.notification import (
+    DomainNotificationCreate,
+    DomainNotificationUpdate,
+    DomainSubscriptionUpdate,
+)
+
+_test_logger = logging.getLogger("test.db.repositories.notification_repository")
+
+pytestmark = pytest.mark.integration
+
+
+def _make_notification_create(
+    user_id: str,
+    subject: str = "Test Notification",
+    body: str = "Test message content",
+    severity: NotificationSeverity = NotificationSeverity.MEDIUM,
+    tags: list[str] | None = None,
+) -> DomainNotificationCreate:
+    """Factory for notification create data."""
+    return DomainNotificationCreate(
+        user_id=user_id,
+        channel=NotificationChannel.IN_APP,
+        subject=subject,
+        body=body,
+        severity=severity,
+        tags=tags or ["test"],
+    )
+
+
+@pytest.mark.asyncio
+async def test_create_and_get_notification(unique_id: Callable[[str], str]) -> None:
+    """Create notification and retrieve by ID."""
+    repo = NotificationRepository(logger=_test_logger)
+    user_id = unique_id("user-")
+
+    create_data = _make_notification_create(user_id)
+    created = await repo.create_notification(create_data)
+
+    assert created.notification_id
+    assert created.user_id == user_id
+    assert created.status == NotificationStatus.PENDING
+
+    # Retrieve
+    retrieved = await repo.get_notification(created.notification_id, user_id)
+    assert retrieved is not None
+    assert retrieved.subject == "Test Notification"
+
+
+@pytest.mark.asyncio
+async def test_get_notification_wrong_user(unique_id: Callable[[str], str]) -> None:
+    """Cannot get notification belonging to another user."""
+    repo = NotificationRepository(logger=_test_logger)
+    user_id = unique_id("user-")
+
+    created = await repo.create_notification(_make_notification_create(user_id))
+
+    # Try to get with wrong user
+    result = await repo.get_notification(created.notification_id, unique_id("other-user-"))
+    assert result is None
+
+
+@pytest.mark.asyncio
+async def test_update_notification(unique_id: Callable[[str], str]) -> None:
+    """Update notification fields."""
+    repo = NotificationRepository(logger=_test_logger)
+    user_id = unique_id("user-")
+
+    created = await repo.create_notification(_make_notification_create(user_id))
+
+    # Update
+    update = DomainNotificationUpdate(status=NotificationStatus.DELIVERED)
+    success = await repo.update_notification(created.notification_id, user_id, update)
+    assert success is True
+
+    # Verify
+    updated = await repo.get_notification(created.notification_id, user_id)
+    assert updated is not None
+    assert updated.status == NotificationStatus.DELIVERED
+
+
+@pytest.mark.asyncio
+async def test_update_notification_not_found(unique_id: Callable[[str], str]) -> None:
+    """Update returns False for non-existent notification."""
+    repo = NotificationRepository(logger=_test_logger)
+    update = DomainNotificationUpdate(status=NotificationStatus.DELIVERED)
+    result = await repo.update_notification(unique_id("notif-"), unique_id("user-"), update)
+    assert result is False
+
+
+@pytest.mark.asyncio
+async def test_mark_as_read(unique_id: Callable[[str], str]) -> None:
+    """Mark notification as read."""
+    repo = NotificationRepository(logger=_test_logger)
+    user_id = unique_id("user-")
+
+    created = await repo.create_notification(_make_notification_create(user_id))
+    # Set to delivered first
+    await repo.update_notification(
+        created.notification_id, user_id, DomainNotificationUpdate(status=NotificationStatus.DELIVERED)
+    )
+
+    success = await repo.mark_as_read(created.notification_id, user_id)
+    assert success is True
+
+    notif = await repo.get_notification(created.notification_id, user_id)
+    assert notif is not None
+    assert notif.status == NotificationStatus.READ
+    assert notif.read_at is not None
+
+
+@pytest.mark.asyncio
+async def test_mark_all_as_read(unique_id: Callable[[str], str]) -> None:
+    """Mark all user notifications as read."""
+    repo = NotificationRepository(logger=_test_logger)
+    user_id = unique_id("user-")
+
+    # Create multiple notifications and set to delivered
+    for _ in range(3):
+        created = await repo.create_notification(_make_notification_create(user_id))
+        await repo.update_notification(
+            created.notification_id, user_id, DomainNotificationUpdate(status=NotificationStatus.DELIVERED)
+        )
+
+    count = await repo.mark_all_as_read(user_id)
+    assert count >= 3
+
+
+@pytest.mark.asyncio
+async def test_delete_notification(unique_id: Callable[[str], str]) -> None:
+    """Delete notification."""
+    repo = NotificationRepository(logger=_test_logger)
+    user_id = unique_id("user-")
+
+    created = await repo.create_notification(_make_notification_create(user_id))
+
+    success = await repo.delete_notification(created.notification_id, user_id)
+    assert success is True
+
+    # Verify deleted
+    assert await repo.get_notification(created.notification_id, user_id) is None
+
+
+@pytest.mark.asyncio
+async def test_list_notifications_with_filters(unique_id: Callable[[str], str]) -> None:
+    """List notifications with various filters."""
+    repo = NotificationRepository(logger=_test_logger)
+    user_id = unique_id("user-")
+
+    # Create notifications with different tags
+    n1 = await repo.create_notification(_make_notification_create(user_id, tags=["alert", "critical"]))
+    await repo.update_notification(n1.notification_id, user_id, DomainNotificationUpdate(status=NotificationStatus.DELIVERED))
+
+    await repo.create_notification(_make_notification_create(user_id, tags=["info"]))
+
+    n3 = await repo.create_notification(_make_notification_create(user_id, tags=["alert", "warning"]))
+    await repo.update_notification(n3.notification_id, user_id, DomainNotificationUpdate(status=NotificationStatus.DELIVERED))
+
+    # List all
+    all_notifs = await repo.list_notifications(user_id)
+    assert len(all_notifs) >= 3
+
+    # Filter by status
+    delivered = await repo.list_notifications(user_id, status=NotificationStatus.DELIVERED)
+    assert len(delivered) >= 2
+
+    # Filter by include_tags
+    alerts = await repo.list_notifications(user_id, include_tags=["alert"])
+    assert len(alerts) >= 2
+
+
+@pytest.mark.asyncio
+async def test_count_and_unread_count(unique_id: Callable[[str], str]) -> None:
+    """Count notifications and unread count."""
+    repo = NotificationRepository(logger=_test_logger)
+    user_id = unique_id("user-")
+
+    n1 = await repo.create_notification(_make_notification_create(user_id))
+    await repo.update_notification(n1.notification_id, user_id, DomainNotificationUpdate(status=NotificationStatus.DELIVERED))
+
+    n2 = await repo.create_notification(_make_notification_create(user_id))
+    await repo.update_notification(n2.notification_id, user_id, DomainNotificationUpdate(status=NotificationStatus.READ))
+
+    total = await repo.count_notifications(user_id)
+    assert total >= 2
+
+    unread = await repo.get_unread_count(user_id)
+    assert unread >= 1
+
+
+@pytest.mark.asyncio
+async def test_try_claim_pending(unique_id: Callable[[str], str]) -> None:
+    """Claim pending notification for processing."""
+    repo = NotificationRepository(logger=_test_logger)
+    user_id = unique_id("user-")
+
+    created = await repo.create_notification(_make_notification_create(user_id))
+
+    claimed = await repo.try_claim_pending(created.notification_id)
+    assert claimed is True
+
+    # Verify status changed
+    notif = await repo.get_notification(created.notification_id, user_id)
+    assert notif is not None
+    assert notif.status == NotificationStatus.SENDING
+
+
+@pytest.mark.asyncio
+async def test_try_claim_already_claimed(unique_id: Callable[[str], str]) -> None:
+    """Cannot claim already claimed notification."""
+    repo = NotificationRepository(logger=_test_logger)
+    user_id = unique_id("user-")
+
+    created = await repo.create_notification(_make_notification_create(user_id))
+    await repo.update_notification(
+        created.notification_id, user_id, DomainNotificationUpdate(status=NotificationStatus.SENDING)
+    )
+
+    claimed = await repo.try_claim_pending(created.notification_id)
+    assert claimed is False
+
+
+@pytest.mark.asyncio
+async def test_find_pending_notifications(unique_id: Callable[[str], str]) -> None:
+    """Find pending notifications ready for processing."""
+    repo = NotificationRepository(logger=_test_logger)
+    user_id = unique_id("user-")
+
+    # Create pending notifications
+    for _ in range(3):
+        await repo.create_notification(_make_notification_create(user_id))
+
+    pending = await repo.find_pending_notifications(batch_size=10)
+    assert len(pending) >= 3
+
+
+@pytest.mark.asyncio
+async def test_subscription_upsert_and_get(unique_id: Callable[[str], str]) -> None:
+    """Create and update subscription."""
+    repo = NotificationRepository(logger=_test_logger)
+    user_id = unique_id("user-")
+
+    # Create
+    update = DomainSubscriptionUpdate(enabled=True)
+    sub = await repo.upsert_subscription(user_id, NotificationChannel.IN_APP, update)
+    assert sub.enabled is True
+
+    # Update
+    update2 = DomainSubscriptionUpdate(enabled=False)
+    sub2 = await repo.upsert_subscription(user_id, NotificationChannel.IN_APP, update2)
+    assert sub2.enabled is False
+
+    # Get
+    retrieved = await repo.get_subscription(user_id, NotificationChannel.IN_APP)
+    assert retrieved is not None
+    assert retrieved.enabled is False
+
+
+@pytest.mark.asyncio
+async def test_get_all_subscriptions(unique_id: Callable[[str], str]) -> None:
+    """Get all channel subscriptions with defaults."""
+    repo = NotificationRepository(logger=_test_logger)
+    user_id = unique_id("user-")
+
+    # Set one subscription
+    await repo.upsert_subscription(
+        user_id, NotificationChannel.WEBHOOK, DomainSubscriptionUpdate(enabled=False)
+    )
+
+    subs = await repo.get_all_subscriptions(user_id)
+
+    # Should have all channels
+    assert len(subs) == len(NotificationChannel)
+    # Explicit one should be disabled
+    assert subs[NotificationChannel.WEBHOOK].enabled is False
+    # Default ones should be enabled
+    assert subs[NotificationChannel.IN_APP].enabled is True
diff --git a/backend/tests/integration/db/repositories/test_saga_repository.py b/backend/tests/integration/db/repositories/test_saga_repository.py
new file mode 100644
index 00000000..06b8db55
--- /dev/null
+++ b/backend/tests/integration/db/repositories/test_saga_repository.py
@@ -0,0 +1,274 @@
+import logging
+from collections.abc import Callable
+from datetime import datetime, timedelta, timezone
+
+import pytest
+from app.db.repositories.saga_repository import SagaRepository
+from app.domain.enums.saga import SagaState
+from app.domain.saga import Saga, SagaFilter
+
+_test_logger = logging.getLogger("test.db.repositories.saga_repository")
+
+pytestmark = pytest.mark.integration
+
+
+def _make_saga(
+    saga_id: str,
+    saga_name: str = "execution_saga",
+    execution_id: str | None = None,
+    state: SagaState = SagaState.RUNNING,
+    user_id: str | None = None,
+    error_message: str | None = None,
+) -> Saga:
+    """Factory for Saga domain objects."""
+    return Saga(
+        saga_id=saga_id,
+        saga_name=saga_name,
+        execution_id=execution_id or saga_id.replace("saga-", "exec-"),
+        state=state,
+        completed_steps=[],
+        context_data={"user_id": user_id} if user_id else {},
+        error_message=error_message,
+    )
+
+
+@pytest.mark.asyncio
+async def test_upsert_and_get_saga(unique_id: Callable[[str], str]) -> None:
+    """Create saga and retrieve by ID."""
+    repo = SagaRepository()
+    saga_id = unique_id("saga-")
+    saga = _make_saga(saga_id)
+
+    # Insert (upsert returns False for new)
+    is_update = await repo.upsert_saga(saga)
+    assert is_update is False
+
+    # Get
+    retrieved = await repo.get_saga(saga_id)
+    assert retrieved is not None
+    assert retrieved.saga_id == saga_id
+    assert retrieved.state == SagaState.RUNNING
+
+
+@pytest.mark.asyncio
+async def test_upsert_existing_saga(unique_id: Callable[[str], str]) -> None:
+    """Update existing saga via upsert."""
+    repo = SagaRepository()
+    saga_id = unique_id("saga-")
+    saga = _make_saga(saga_id)
+
+    await repo.upsert_saga(saga)
+
+    # Update state
+    saga.state = SagaState.COMPLETED
+    saga.completed_steps = ["step1", "step2"]
+    is_update = await repo.upsert_saga(saga)
+    assert is_update is True
+
+    # Verify
+    retrieved = await repo.get_saga(saga_id)
+    assert retrieved is not None
+    assert retrieved.state == SagaState.COMPLETED
+    assert len(retrieved.completed_steps) == 2
+
+
+@pytest.mark.asyncio
+async def test_get_saga_not_found(unique_id: Callable[[str], str]) -> None:
+    """Returns None for non-existent saga."""
+    repo = SagaRepository()
+    result = await repo.get_saga(unique_id("nonexistent-"))
+    assert result is None
+
+
+@pytest.mark.asyncio
+async def test_get_saga_by_execution_and_name(unique_id: Callable[[str], str]) -> None:
+    """Retrieve saga by execution_id and saga_name."""
+    repo = SagaRepository()
+    saga_id = unique_id("saga-")
+    execution_id = unique_id("exec-")
+    saga_name = "test_saga"
+
+    saga = _make_saga(saga_id, saga_name=saga_name, execution_id=execution_id)
+    await repo.upsert_saga(saga)
+
+    retrieved = await repo.get_saga_by_execution_and_name(execution_id, saga_name)
+    assert retrieved is not None
+    assert retrieved.saga_id == saga_id
+
+
+@pytest.mark.asyncio
+async def test_get_sagas_by_execution(unique_id: Callable[[str], str]) -> None:
+    """Retrieve sagas by execution_id with state filtering."""
+    repo = SagaRepository()
+    execution_id = unique_id("exec-")
+
+    # Create sagas with different states
+    await repo.upsert_saga(_make_saga(unique_id("saga-"), execution_id=execution_id, state=SagaState.RUNNING))
+    await repo.upsert_saga(_make_saga(unique_id("saga-"), execution_id=execution_id, state=SagaState.COMPLETED))
+    await repo.upsert_saga(_make_saga(unique_id("saga-"), execution_id=execution_id, state=SagaState.RUNNING))
+
+    # Get all
+    result = await repo.get_sagas_by_execution(execution_id)
+    assert result.total >= 3
+
+    # Filter by state
+    running = await repo.get_sagas_by_execution(execution_id, state=SagaState.RUNNING)
+    assert running.total >= 2
+
+
+@pytest.mark.asyncio
+async def test_list_sagas_with_filter(unique_id: Callable[[str], str]) -> None:
+    """List sagas with SagaFilter."""
+    repo = SagaRepository()
+    user_id = unique_id("user-")
+    saga_name = "filter_test_saga"
+
+    # Create sagas
+    for state in [SagaState.RUNNING, SagaState.COMPLETED, SagaState.FAILED]:
+        saga = _make_saga(
+            unique_id("saga-"),
+            saga_name=saga_name,
+            user_id=user_id,
+            state=state,
+            error_message="Test error" if state == SagaState.FAILED else None,
+        )
+        await repo.upsert_saga(saga)
+
+    # Filter by user_id
+    user_filter = SagaFilter(user_id=user_id)
+    result = await repo.list_sagas(user_filter)
+    assert result.total >= 3
+
+    # Filter by state
+    state_filter = SagaFilter(state=SagaState.COMPLETED)
+    completed = await repo.list_sagas(state_filter)
+    assert all(s.state == SagaState.COMPLETED for s in completed.sagas)
+
+    # Filter by saga_name
+    name_filter = SagaFilter(saga_name=saga_name)
+    named = await repo.list_sagas(name_filter)
+    assert all(s.saga_name == saga_name for s in named.sagas)
+
+    # Filter by error_status
+    error_filter = SagaFilter(error_status=True)
+    with_errors = await repo.list_sagas(error_filter)
+    assert all(s.error_message is not None for s in with_errors.sagas)
+
+
+@pytest.mark.asyncio
+async def test_list_sagas_pagination(unique_id: Callable[[str], str]) -> None:
+    """List sagas with pagination."""
+    repo = SagaRepository()
+    user_id = unique_id("user-")
+
+    # Create multiple sagas
+    for _ in range(5):
+        await repo.upsert_saga(_make_saga(unique_id("saga-"), user_id=user_id))
+
+    user_filter = SagaFilter(user_id=user_id)
+
+    # First page
+    page1 = await repo.list_sagas(user_filter, limit=2, skip=0)
+    assert len(page1.sagas) == 2
+    assert page1.total >= 5
+
+    # Second page
+    page2 = await repo.list_sagas(user_filter, limit=2, skip=2)
+    assert len(page2.sagas) == 2
+
+
+@pytest.mark.asyncio
+async def test_update_saga_state(unique_id: Callable[[str], str]) -> None:
+    """Update saga state."""
+    repo = SagaRepository()
+    saga_id = unique_id("saga-")
+    saga = _make_saga(saga_id)
+    await repo.upsert_saga(saga)
+
+    # Update state
+    success = await repo.update_saga_state(saga_id, SagaState.COMPLETED)
+    assert success is True
+
+    retrieved = await repo.get_saga(saga_id)
+    assert retrieved is not None
+    assert retrieved.state == SagaState.COMPLETED
+
+
+@pytest.mark.asyncio
+async def test_update_saga_state_with_error(unique_id: Callable[[str], str]) -> None:
+    """Update saga state with error message."""
+    repo = SagaRepository()
+    saga_id = unique_id("saga-")
+    saga = _make_saga(saga_id)
+    await repo.upsert_saga(saga)
+
+    success = await repo.update_saga_state(saga_id, SagaState.FAILED, "Step 2 failed: timeout")
+    assert success is True
+
+    retrieved = await repo.get_saga(saga_id)
+    assert retrieved is not None
+    assert retrieved.state == SagaState.FAILED
+    assert retrieved.error_message == "Step 2 failed: timeout"
+
+
+@pytest.mark.asyncio
+async def test_update_saga_state_not_found(unique_id: Callable[[str], str]) -> None:
+    """Update returns False for non-existent saga."""
+    repo = SagaRepository()
+    result = await repo.update_saga_state(unique_id("nonexistent-"), SagaState.COMPLETED)
+    assert result is False
+
+
+@pytest.mark.asyncio
+async def test_count_sagas_by_state(unique_id: Callable[[str], str]) -> None:
+    """Count sagas grouped by state."""
+    repo = SagaRepository()
+
+    # Create sagas in different states
+    for state in [SagaState.RUNNING, SagaState.COMPLETED, SagaState.FAILED]:
+        await repo.upsert_saga(_make_saga(unique_id("saga-"), state=state))
+
+    counts = await repo.count_sagas_by_state()
+    assert isinstance(counts, dict)
+    # Should have entries for the states we created
+    assert len(counts) > 0
+
+
+@pytest.mark.asyncio
+async def test_find_timed_out_sagas(unique_id: Callable[[str], str]) -> None:
+    """Find sagas that have timed out."""
+    repo = SagaRepository()
+
+    # Create running saga with old timestamp
+    saga = _make_saga(unique_id("saga-"), state=SagaState.RUNNING)
+    saga.created_at = datetime.now(timezone.utc) - timedelta(hours=2)
+    await repo.upsert_saga(saga)
+
+    # Find timed out
+    cutoff = datetime.now(timezone.utc) - timedelta(hours=1)
+    timed_out = await repo.find_timed_out_sagas(cutoff)
+    assert len(timed_out) >= 1
+    assert all(s.created_at < cutoff for s in timed_out)
+
+
+@pytest.mark.asyncio
+async def test_get_saga_statistics(unique_id: Callable[[str], str]) -> None:
+    """Get saga statistics."""
+    repo = SagaRepository()
+    user_id = unique_id("user-")
+
+    # Create sagas
+    for state in [SagaState.RUNNING, SagaState.COMPLETED]:
+        saga = _make_saga(unique_id("saga-"), state=state, user_id=user_id)
+        if state == SagaState.COMPLETED:
+            saga.completed_at = datetime.now(timezone.utc)
+        await repo.upsert_saga(saga)
+
+    # Get stats with filter
+    saga_filter = SagaFilter(user_id=user_id)
+    stats = await repo.get_saga_statistics(saga_filter)
+
+    assert "total" in stats
+    assert "by_state" in stats
+    assert "average_duration_seconds" in stats
+    assert stats["total"] >= 2
diff --git a/backend/tests/integration/db/repositories/test_saved_script_repository.py b/backend/tests/integration/db/repositories/test_saved_script_repository.py
index 85fc2b58..58ebfd90 100644
--- a/backend/tests/integration/db/repositories/test_saved_script_repository.py
+++ b/backend/tests/integration/db/repositories/test_saved_script_repository.py
@@ -1,12 +1,13 @@
 import pytest
 from app.db.repositories.saved_script_repository import SavedScriptRepository
 from app.domain.saved_script import DomainSavedScriptCreate, DomainSavedScriptUpdate
+from dishka import AsyncContainer
 
 pytestmark = pytest.mark.integration
 
 
 @pytest.fixture()
-async def repo(scope) -> SavedScriptRepository:  # type: ignore[valid-type]
+async def repo(scope: AsyncContainer) -> SavedScriptRepository:
     return await scope.get(SavedScriptRepository)
 
 
diff --git a/backend/tests/integration/dlq/conftest.py b/backend/tests/integration/dlq/conftest.py
new file mode 100644
index 00000000..9fb79b2b
--- /dev/null
+++ b/backend/tests/integration/dlq/conftest.py
@@ -0,0 +1,22 @@
+import logging
+from collections.abc import Callable
+
+import pytest
+from app.dlq.manager import DLQManager, create_dlq_manager
+from app.events.schema.schema_registry import create_schema_registry_manager
+from app.settings import Settings
+
+_logger = logging.getLogger("test.dlq")
+
+
+@pytest.fixture
+def dlq_manager(test_settings: Settings, unique_id: Callable[[str], str]) -> DLQManager:
+    """DLQ manager with unique consumer group per test."""
+    schema_registry = create_schema_registry_manager(test_settings, _logger)
+    group_suffix = unique_id("dlq-")
+    return create_dlq_manager(
+        settings=test_settings,
+        schema_registry=schema_registry,
+        logger=_logger,
+        group_id_suffix=group_suffix,
+    )
diff --git a/backend/tests/integration/dlq/test_dlq_discard_policy.py b/backend/tests/integration/dlq/test_dlq_discard_policy.py
index ba625f58..3f8b87d9 100644
--- a/backend/tests/integration/dlq/test_dlq_discard_policy.py
+++ b/backend/tests/integration/dlq/test_dlq_discard_policy.py
@@ -1,38 +1,38 @@
 import json
-import logging
-import uuid
+from collections.abc import Callable
 from datetime import datetime, timezone
 
+import backoff
 import pytest
+from app.core.database_context import Database
 from app.db.docs import DLQMessageDocument
-from app.dlq.manager import create_dlq_manager
+from app.dlq.manager import DLQManager
 from app.dlq.models import DLQMessageStatus, RetryPolicy, RetryStrategy
 from app.domain.enums.kafka import KafkaTopic
-from app.events.schema.schema_registry import create_schema_registry_manager
+from app.settings import Settings
 from confluent_kafka import Producer
 
 from tests.helpers import make_execution_requested_event
-from tests.helpers.eventually import eventually
 
 # xdist_group: DLQ tests share a Kafka consumer group. When running in parallel,
 # different workers' managers consume each other's messages and apply wrong policies.
 # Serial execution ensures each test's manager processes only its own messages.
 pytestmark = [pytest.mark.integration, pytest.mark.kafka, pytest.mark.mongodb, pytest.mark.xdist_group("dlq")]
 
-_test_logger = logging.getLogger("test.dlq.discard_policy")
-
 
 @pytest.mark.asyncio
-async def test_dlq_manager_discards_with_manual_policy(db, test_settings) -> None:  # type: ignore[valid-type]
-    schema_registry = create_schema_registry_manager(test_settings, _test_logger)
-    manager = create_dlq_manager(settings=test_settings, schema_registry=schema_registry, logger=_test_logger)
-    # Use prefix from test_settings to match what the manager uses
+async def test_dlq_manager_discards_with_manual_policy(
+    db: Database,
+    test_settings: Settings,
+    dlq_manager: DLQManager,
+    unique_id: Callable[[str], str],
+) -> None:
     prefix = test_settings.KAFKA_TOPIC_PREFIX
     topic = f"{prefix}{str(KafkaTopic.EXECUTION_EVENTS)}"
-    manager.set_retry_policy(topic, RetryPolicy(topic=topic, strategy=RetryStrategy.MANUAL))
+    dlq_manager.set_retry_policy(topic, RetryPolicy(topic=topic, strategy=RetryStrategy.MANUAL))
 
     # Use unique execution_id to avoid conflicts with parallel test workers
-    ev = make_execution_requested_event(execution_id=f"exec-dlq-discard-{uuid.uuid4().hex[:8]}")
+    ev = make_execution_requested_event(execution_id=unique_id("exec-dlq-discard-"))
 
     payload = {
         "event": ev.to_dict(),
@@ -51,11 +51,12 @@ async def test_dlq_manager_discards_with_manual_policy(db, test_settings) -> Non
     )
     producer.flush(5)
 
-    async with manager:
+    async with dlq_manager:
 
-        async def _discarded() -> None:
+        @backoff.on_exception(backoff.constant, AssertionError, max_time=10.0, interval=0.2)
+        async def _wait_discarded() -> None:
             doc = await DLQMessageDocument.find_one({"event_id": ev.event_id})
             assert doc is not None
             assert doc.status == DLQMessageStatus.DISCARDED
 
-        await eventually(_discarded, timeout=10.0, interval=0.2)
+        await _wait_discarded()
diff --git a/backend/tests/integration/dlq/test_dlq_manager.py b/backend/tests/integration/dlq/test_dlq_manager.py
index b6da245e..8fcbf580 100644
--- a/backend/tests/integration/dlq/test_dlq_manager.py
+++ b/backend/tests/integration/dlq/test_dlq_manager.py
@@ -1,36 +1,35 @@
 import json
-import logging
-import uuid
+from collections.abc import Callable
 from datetime import datetime, timezone
 
+import backoff
 import pytest
+from app.core.database_context import Database
 from app.db.docs import DLQMessageDocument
-from app.dlq.manager import create_dlq_manager
+from app.dlq.manager import DLQManager
 from app.domain.enums.kafka import KafkaTopic
-from app.events.schema.schema_registry import create_schema_registry_manager
+from app.settings import Settings
 from confluent_kafka import Producer
 
 from tests.helpers import make_execution_requested_event
-from tests.helpers.eventually import eventually
 
 # xdist_group: DLQ tests share a Kafka consumer group. When running in parallel,
 # different workers' managers consume each other's messages and apply wrong policies.
 # Serial execution ensures each test's manager processes only its own messages.
 pytestmark = [pytest.mark.integration, pytest.mark.kafka, pytest.mark.mongodb, pytest.mark.xdist_group("dlq")]
 
-_test_logger = logging.getLogger("test.dlq.manager")
-
 
 @pytest.mark.asyncio
-async def test_dlq_manager_persists_in_mongo(db, test_settings) -> None:  # type: ignore[valid-type]
-    schema_registry = create_schema_registry_manager(test_settings, _test_logger)
-    manager = create_dlq_manager(settings=test_settings, schema_registry=schema_registry, logger=_test_logger)
-
-    # Use prefix from test_settings to match what the manager uses
+async def test_dlq_manager_persists_in_mongo(
+    db: Database,
+    test_settings: Settings,
+    dlq_manager: DLQManager,
+    unique_id: Callable[[str], str],
+) -> None:
     prefix = test_settings.KAFKA_TOPIC_PREFIX
 
     # Use unique execution_id to avoid conflicts with parallel test workers
-    ev = make_execution_requested_event(execution_id=f"exec-dlq-persist-{uuid.uuid4().hex[:8]}")
+    ev = make_execution_requested_event(execution_id=unique_id("exec-dlq-persist-"))
     payload = {
         "event": ev.to_dict(),
         "original_topic": f"{prefix}{str(KafkaTopic.EXECUTION_EVENTS)}",
@@ -49,12 +48,12 @@ async def test_dlq_manager_persists_in_mongo(db, test_settings) -> None:  # type
     )
     producer.flush(5)
 
-    # Run the manager briefly to consume and persist
-    async with manager:
+    async with dlq_manager:
 
-        async def _exists():
+        @backoff.on_exception(backoff.constant, AssertionError, max_time=10.0, interval=0.2)
+        async def _wait_exists() -> None:
             doc = await DLQMessageDocument.find_one({"event_id": ev.event_id})
             assert doc is not None
 
         # Poll until the document appears
-        await eventually(_exists, timeout=10.0, interval=0.2)
+        await _wait_exists()
diff --git a/backend/tests/integration/dlq/test_dlq_retry_immediate.py b/backend/tests/integration/dlq/test_dlq_retry_immediate.py
index 5c435b92..c14ea283 100644
--- a/backend/tests/integration/dlq/test_dlq_retry_immediate.py
+++ b/backend/tests/integration/dlq/test_dlq_retry_immediate.py
@@ -1,41 +1,41 @@
 import json
-import logging
-import uuid
+from collections.abc import Callable
 from datetime import datetime, timezone
 
+import backoff
 import pytest
+from app.core.database_context import Database
 from app.db.docs import DLQMessageDocument
-from app.dlq.manager import create_dlq_manager
+from app.dlq.manager import DLQManager
 from app.dlq.models import DLQMessageStatus, RetryPolicy, RetryStrategy
 from app.domain.enums.kafka import KafkaTopic
-from app.events.schema.schema_registry import create_schema_registry_manager
+from app.settings import Settings
 from confluent_kafka import Producer
 
 from tests.helpers import make_execution_requested_event
-from tests.helpers.eventually import eventually
 
 # xdist_group: DLQ tests share a Kafka consumer group. When running in parallel,
 # different workers' managers consume each other's messages and apply wrong policies.
 # Serial execution ensures each test's manager processes only its own messages.
 pytestmark = [pytest.mark.integration, pytest.mark.kafka, pytest.mark.mongodb, pytest.mark.xdist_group("dlq")]
 
-_test_logger = logging.getLogger("test.dlq.retry_immediate")
-
 
 @pytest.mark.asyncio
-async def test_dlq_manager_immediate_retry_updates_doc(db, test_settings) -> None:  # type: ignore[valid-type]
-    schema_registry = create_schema_registry_manager(test_settings, _test_logger)
-    manager = create_dlq_manager(settings=test_settings, schema_registry=schema_registry, logger=_test_logger)
-    # Use prefix from test_settings to match what the manager uses
+async def test_dlq_manager_immediate_retry_updates_doc(
+    db: Database,
+    test_settings: Settings,
+    dlq_manager: DLQManager,
+    unique_id: Callable[[str], str],
+) -> None:
     prefix = test_settings.KAFKA_TOPIC_PREFIX
     topic = f"{prefix}{str(KafkaTopic.EXECUTION_EVENTS)}"
-    manager.set_retry_policy(
+    dlq_manager.set_retry_policy(
         topic,
         RetryPolicy(topic=topic, strategy=RetryStrategy.IMMEDIATE, max_retries=1, base_delay_seconds=0.1),
     )
 
     # Use unique execution_id to avoid conflicts with parallel test workers
-    ev = make_execution_requested_event(execution_id=f"exec-dlq-retry-{uuid.uuid4().hex[:8]}")
+    ev = make_execution_requested_event(execution_id=unique_id("exec-dlq-retry-"))
 
     payload = {
         "event": ev.to_dict(),
@@ -54,13 +54,14 @@ async def test_dlq_manager_immediate_retry_updates_doc(db, test_settings) -> Non
     )
     prod.flush(5)
 
-    async with manager:
+    async with dlq_manager:
 
-        async def _retried() -> None:
+        @backoff.on_exception(backoff.constant, AssertionError, max_time=10.0, interval=0.2)
+        async def _wait_retried() -> None:
             doc = await DLQMessageDocument.find_one({"event_id": ev.event_id})
             assert doc is not None
             assert doc.status == DLQMessageStatus.RETRIED
             assert doc.retry_count == 1
             assert doc.retried_at is not None
 
-        await eventually(_retried, timeout=10.0, interval=0.2)
+        await _wait_retried()
diff --git a/backend/tests/integration/events/test_consume_roundtrip.py b/backend/tests/integration/events/test_consume_roundtrip.py
index b2ceb48b..d059c8a8 100644
--- a/backend/tests/integration/events/test_consume_roundtrip.py
+++ b/backend/tests/integration/events/test_consume_roundtrip.py
@@ -1,6 +1,7 @@
 import asyncio
 import logging
-import uuid
+from collections.abc import Callable
+from typing import Any
 
 import pytest
 from app.domain.enums.events import EventType
@@ -10,6 +11,7 @@
 from app.events.core.types import ConsumerConfig
 from app.events.schema.schema_registry import SchemaRegistryManager, initialize_event_schemas
 from app.settings import Settings
+from dishka import AsyncContainer
 
 from tests.helpers import make_execution_requested_event
 
@@ -19,7 +21,7 @@
 
 
 @pytest.mark.asyncio
-async def test_produce_consume_roundtrip(scope) -> None:  # type: ignore[valid-type]
+async def test_produce_consume_roundtrip(scope: AsyncContainer, unique_id: Callable[[str], str]) -> None:
     # Ensure schemas are registered
     registry: SchemaRegistryManager = await scope.get(SchemaRegistryManager)
     settings: Settings = await scope.get(Settings)
@@ -33,10 +35,10 @@ async def test_produce_consume_roundtrip(scope) -> None:  # type: ignore[valid-t
     received = asyncio.Event()
 
     @dispatcher.register(EventType.EXECUTION_REQUESTED)
-    async def _handle(_event) -> None:  # noqa: ANN001
+    async def _handle(_event: Any) -> None:
         received.set()
 
-    group_id = f"test-consumer.{uuid.uuid4().hex[:6]}"
+    group_id = unique_id("test-consumer-")
     config = ConsumerConfig(
         bootstrap_servers=settings.KAFKA_BOOTSTRAP_SERVERS,
         group_id=group_id,
@@ -51,15 +53,15 @@ async def _handle(_event) -> None:  # noqa: ANN001
         settings=settings,
         logger=_test_logger,
     )
-    await consumer.start([str(KafkaTopic.EXECUTION_EVENTS)])
 
-    try:
-        # Produce a request event
-        execution_id = f"exec-{uuid.uuid4().hex[:8]}"
-        evt = make_execution_requested_event(execution_id=execution_id)
-        await producer.produce(evt, key=execution_id)
+    # Produce BEFORE starting consumer - with earliest offset, consumer will read from beginning
+    execution_id = unique_id("exec-")
+    evt = make_execution_requested_event(execution_id=execution_id)
+    await producer.produce(evt, key=execution_id)
+
+    await consumer.start([KafkaTopic.EXECUTION_EVENTS])
 
-        # Wait for the handler to be called
+    try:
         await asyncio.wait_for(received.wait(), timeout=10.0)
     finally:
         await consumer.stop()
diff --git a/backend/tests/integration/events/test_consumer_group_monitor.py b/backend/tests/integration/events/test_consumer_group_monitor.py
index cfab3017..617efc42 100644
--- a/backend/tests/integration/events/test_consumer_group_monitor.py
+++ b/backend/tests/integration/events/test_consumer_group_monitor.py
@@ -9,7 +9,7 @@
 @pytest.mark.integration
 @pytest.mark.kafka
 @pytest.mark.asyncio
-async def test_list_groups_and_error_status():
+async def test_list_groups_and_error_status() -> None:
     mon = NativeConsumerGroupMonitor(logger=_test_logger)
     groups = await mon.list_consumer_groups()
     assert isinstance(groups, list)
diff --git a/backend/tests/integration/events/test_consumer_group_monitor_real.py b/backend/tests/integration/events/test_consumer_group_monitor_real.py
index a31ab4bf..a810f9df 100644
--- a/backend/tests/integration/events/test_consumer_group_monitor_real.py
+++ b/backend/tests/integration/events/test_consumer_group_monitor_real.py
@@ -1,5 +1,5 @@
 import logging
-from uuid import uuid4
+from collections.abc import Callable
 
 import pytest
 from app.events.consumer_group_monitor import (
@@ -14,10 +14,10 @@
 
 
 @pytest.mark.asyncio
-async def test_consumer_group_status_error_path_and_summary():
+async def test_consumer_group_status_error_path_and_summary(unique_id: Callable[[str], str]) -> None:
     monitor = NativeConsumerGroupMonitor(bootstrap_servers="localhost:9092", logger=_test_logger)
     # Non-existent group triggers error-handling path and returns minimal status
-    gid = f"does-not-exist-{uuid4().hex[:8]}"
+    gid = unique_id("does-not-exist-")
     status = await monitor.get_consumer_group_status(gid, timeout=5.0, include_lag=False)
     assert status.group_id == gid
     # Some clusters report non-existent groups as DEAD/UNKNOWN rather than raising
@@ -27,7 +27,7 @@ async def test_consumer_group_status_error_path_and_summary():
     assert summary["group_id"] == gid and summary["health"] == ConsumerGroupHealth.UNHEALTHY.value
 
 
-def test_assess_group_health_branches():
+def test_assess_group_health_branches() -> None:
     m = NativeConsumerGroupMonitor(logger=_test_logger)
     # Error state
     s = ConsumerGroupStatus(
@@ -81,9 +81,9 @@ def test_assess_group_health_branches():
 
 
 @pytest.mark.asyncio
-async def test_multiple_group_status_mixed_errors():
+async def test_multiple_group_status_mixed_errors(unique_id: Callable[[str], str]) -> None:
     m = NativeConsumerGroupMonitor(bootstrap_servers="localhost:9092", logger=_test_logger)
-    gids = [f"none-{uuid4().hex[:6]}", f"none-{uuid4().hex[:6]}"]
+    gids = [unique_id("none1-"), unique_id("none2-")]
     res = await m.get_multiple_group_status(gids, timeout=5.0, include_lag=False)
     assert set(res.keys()) == set(gids)
     assert all(v.health is ConsumerGroupHealth.UNHEALTHY for v in res.values())
diff --git a/backend/tests/integration/events/test_consumer_lifecycle.py b/backend/tests/integration/events/test_consumer_lifecycle.py
index eb63b770..8d80157f 100644
--- a/backend/tests/integration/events/test_consumer_lifecycle.py
+++ b/backend/tests/integration/events/test_consumer_lifecycle.py
@@ -1,11 +1,12 @@
 import logging
-from uuid import uuid4
+from collections.abc import Callable
 
 import pytest
 from app.domain.enums.kafka import KafkaTopic
 from app.events.core import ConsumerConfig, EventDispatcher, UnifiedConsumer
 from app.events.schema.schema_registry import SchemaRegistryManager
 from app.settings import Settings
+from dishka import AsyncContainer
 
 pytestmark = [pytest.mark.integration, pytest.mark.kafka]
 
@@ -13,10 +14,10 @@
 
 
 @pytest.mark.asyncio
-async def test_consumer_start_status_seek_and_stop(scope) -> None:  # type: ignore[valid-type]
+async def test_consumer_start_status_seek_and_stop(scope: AsyncContainer, unique_id: Callable[[str], str]) -> None:
     registry: SchemaRegistryManager = await scope.get(SchemaRegistryManager)
     settings: Settings = await scope.get(Settings)
-    cfg = ConsumerConfig(bootstrap_servers=settings.KAFKA_BOOTSTRAP_SERVERS, group_id=f"test-consumer-{uuid4().hex[:6]}")
+    cfg = ConsumerConfig(bootstrap_servers=settings.KAFKA_BOOTSTRAP_SERVERS, group_id=unique_id("test-consumer-"))
     disp = EventDispatcher(logger=_test_logger)
     c = UnifiedConsumer(
         cfg,
diff --git a/backend/tests/integration/events/test_dlq_handler.py b/backend/tests/integration/events/test_dlq_handler.py
index 5659529b..153bcb83 100644
--- a/backend/tests/integration/events/test_dlq_handler.py
+++ b/backend/tests/integration/events/test_dlq_handler.py
@@ -1,9 +1,12 @@
 import logging
+from collections.abc import Callable
+from typing import Any
 
 import pytest
 from app.events.core import UnifiedProducer, create_dlq_error_handler, create_immediate_dlq_handler
 from app.infrastructure.kafka.events.metadata import AvroEventMetadata
 from app.infrastructure.kafka.events.saga import SagaStartedEvent
+from dishka import AsyncContainer
 
 pytestmark = [pytest.mark.integration, pytest.mark.kafka]
 
@@ -11,20 +14,23 @@
 
 
 @pytest.mark.asyncio
-async def test_dlq_handler_with_retries(scope, monkeypatch):  # type: ignore[valid-type]
+async def test_dlq_handler_with_retries(
+    scope: AsyncContainer, monkeypatch: pytest.MonkeyPatch, unique_id: Callable[[str], str]
+) -> None:
     p: UnifiedProducer = await scope.get(UnifiedProducer)
     calls: list[tuple[str | None, str, str, int]] = []
 
-    async def _record_send_to_dlq(original_event, original_topic, error, retry_count):  # noqa: ANN001
+    async def _record_send_to_dlq(original_event: Any, original_topic: str, error: Any, retry_count: int) -> None:
         calls.append((original_event.event_id, original_topic, str(error), retry_count))
 
     monkeypatch.setattr(p, "send_to_dlq", _record_send_to_dlq)
-    h = create_dlq_error_handler(p, original_topic="t", max_retries=2, logger=_test_logger)
+    uid = unique_id("")
+    h = create_dlq_error_handler(p, original_topic=f"topic-{uid}", max_retries=2, logger=_test_logger)
     e = SagaStartedEvent(
-        saga_id="s",
+        saga_id=f"saga-{uid}",
         saga_name="n",
-        execution_id="x",
-        initial_event_id="i",
+        execution_id=f"exec-{uid}",
+        initial_event_id=f"evt-{uid}",
         metadata=AvroEventMetadata(service_name="a", service_version="1"),
     )
     # Call 1 and 2 should not send to DLQ
@@ -34,24 +40,27 @@ async def _record_send_to_dlq(original_event, original_topic, error, retry_count
     # 3rd call triggers DLQ
     await h(RuntimeError("boom"), e)
     assert len(calls) == 1
-    assert calls[0][1] == "t"
+    assert calls[0][1] == f"topic-{uid}"
 
 
 @pytest.mark.asyncio
-async def test_immediate_dlq_handler(scope, monkeypatch):  # type: ignore[valid-type]
+async def test_immediate_dlq_handler(
+    scope: AsyncContainer, monkeypatch: pytest.MonkeyPatch, unique_id: Callable[[str], str]
+) -> None:
     p: UnifiedProducer = await scope.get(UnifiedProducer)
     calls: list[tuple[str | None, str, str, int]] = []
 
-    async def _record_send_to_dlq(original_event, original_topic, error, retry_count):  # noqa: ANN001
+    async def _record_send_to_dlq(original_event: Any, original_topic: str, error: Any, retry_count: int) -> None:
         calls.append((original_event.event_id, original_topic, str(error), retry_count))
 
     monkeypatch.setattr(p, "send_to_dlq", _record_send_to_dlq)
-    h = create_immediate_dlq_handler(p, original_topic="t", logger=_test_logger)
+    uid = unique_id("")
+    h = create_immediate_dlq_handler(p, original_topic=f"topic-{uid}", logger=_test_logger)
     e = SagaStartedEvent(
-        saga_id="s2",
+        saga_id=f"saga-{uid}",
         saga_name="n",
-        execution_id="x",
-        initial_event_id="i",
+        execution_id=f"exec-{uid}",
+        initial_event_id=f"evt-{uid}",
         metadata=AvroEventMetadata(service_name="a", service_version="1"),
     )
     await h(RuntimeError("x"), e)
diff --git a/backend/tests/integration/events/test_event_dispatcher.py b/backend/tests/integration/events/test_event_dispatcher.py
index aa65d181..4eb0db33 100644
--- a/backend/tests/integration/events/test_event_dispatcher.py
+++ b/backend/tests/integration/events/test_event_dispatcher.py
@@ -1,6 +1,7 @@
 import asyncio
 import logging
-import uuid
+from collections.abc import Callable
+from typing import Any
 
 import pytest
 from app.domain.enums.events import EventType
@@ -10,6 +11,7 @@
 from app.events.core.types import ConsumerConfig
 from app.events.schema.schema_registry import SchemaRegistryManager, initialize_event_schemas
 from app.settings import Settings
+from dishka import AsyncContainer
 
 from tests.helpers import make_execution_requested_event
 
@@ -19,7 +21,7 @@
 
 
 @pytest.mark.asyncio
-async def test_dispatcher_with_multiple_handlers(scope) -> None:  # type: ignore[valid-type]
+async def test_dispatcher_with_multiple_handlers(scope: AsyncContainer, unique_id: Callable[[str], str]) -> None:
     # Ensure schema registry is ready
     registry: SchemaRegistryManager = await scope.get(SchemaRegistryManager)
     settings: Settings = await scope.get(Settings)
@@ -31,17 +33,17 @@ async def test_dispatcher_with_multiple_handlers(scope) -> None:  # type: ignore
     h2_called = asyncio.Event()
 
     @dispatcher.register(EventType.EXECUTION_REQUESTED)
-    async def h1(_e) -> None:  # noqa: ANN001
+    async def h1(_e: Any) -> None:
         h1_called.set()
 
     @dispatcher.register(EventType.EXECUTION_REQUESTED)
-    async def h2(_e) -> None:  # noqa: ANN001
+    async def h2(_e: Any) -> None:
         h2_called.set()
 
     # Real consumer against execution-events
     cfg = ConsumerConfig(
         bootstrap_servers=settings.KAFKA_BOOTSTRAP_SERVERS,
-        group_id=f"dispatcher-it.{uuid.uuid4().hex[:6]}",
+        group_id=unique_id("dispatcher-it-"),
         enable_auto_commit=True,
         auto_offset_reset="earliest",
     )
@@ -52,13 +54,14 @@ async def h2(_e) -> None:  # noqa: ANN001
         settings=settings,
         logger=_test_logger,
     )
-    await consumer.start([str(KafkaTopic.EXECUTION_EVENTS)])
 
-    # Produce a request event via DI
+    # Produce BEFORE starting consumer - with earliest offset, consumer will read from beginning
     producer: UnifiedProducer = await scope.get(UnifiedProducer)
-    evt = make_execution_requested_event(execution_id=f"exec-{uuid.uuid4().hex[:8]}")
+    evt = make_execution_requested_event(execution_id=unique_id("exec-"))
     await producer.produce(evt, key="k")
 
+    await consumer.start([KafkaTopic.EXECUTION_EVENTS])
+
     try:
         await asyncio.wait_for(asyncio.gather(h1_called.wait(), h2_called.wait()), timeout=10.0)
     finally:
diff --git a/backend/tests/integration/events/test_event_store_consumer.py b/backend/tests/integration/events/test_event_store_consumer.py
index ec35a99b..8b0b520e 100644
--- a/backend/tests/integration/events/test_event_store_consumer.py
+++ b/backend/tests/integration/events/test_event_store_consumer.py
@@ -1,17 +1,14 @@
 import logging
-import uuid
+from collections.abc import Callable
 
+import backoff
 import pytest
 from app.core.database_context import Database
-from app.domain.enums.kafka import KafkaTopic
-from app.events.core import UnifiedProducer
-from app.events.event_store import EventStore
-from app.events.event_store_consumer import EventStoreConsumer, create_event_store_consumer
-from app.events.schema.schema_registry import SchemaRegistryManager, initialize_event_schemas
 from app.domain.enums.auth import LoginMethod
+from app.events.core import UnifiedProducer
 from app.infrastructure.kafka.events.metadata import AvroEventMetadata
 from app.infrastructure.kafka.events.user import UserLoggedInEvent
-from app.settings import Settings
+from dishka import AsyncContainer
 
 pytestmark = [pytest.mark.integration, pytest.mark.kafka, pytest.mark.mongodb]
 
@@ -19,46 +16,32 @@
 
 
 @pytest.mark.asyncio
-async def test_event_store_consumer_stores_events(scope) -> None:  # type: ignore[valid-type]
-    # Ensure schemas
-    registry: SchemaRegistryManager = await scope.get(SchemaRegistryManager)
-    await initialize_event_schemas(registry)
+async def test_event_store_consumer_stores_events(scope: AsyncContainer, unique_id: Callable[[str], str]) -> None:
+    """Test that the app's EventStoreConsumer (started in lifespan) stores events to MongoDB.
 
-    # Resolve DI
+    The EventStoreConsumer is started automatically by the app lifespan and subscribes
+    to all topics. We just need to publish an event and verify it appears in MongoDB.
+    """
+    # Resolve DI - producer is already running, EventStoreConsumer is already running via app lifespan
     producer: UnifiedProducer = await scope.get(UnifiedProducer)
     db: Database = await scope.get(Database)
-    store: EventStore = await scope.get(EventStore)
-    settings: Settings = await scope.get(Settings)
 
     # Build an event
     ev = UserLoggedInEvent(
-        user_id=f"u-{uuid.uuid4().hex[:6]}",
+        user_id=unique_id("u-"),
         login_method=LoginMethod.PASSWORD,
         metadata=AvroEventMetadata(service_name="tests", service_version="1.0.0"),
     )
 
-    # Create a tuned consumer (fast batch timeout) limited to user-events
-    consumer: EventStoreConsumer = create_event_store_consumer(
-        event_store=store,
-        topics=[KafkaTopic.USER_EVENTS],
-        schema_registry_manager=registry,
-        settings=settings,
-        logger=_test_logger,
-        producer=producer,
-        batch_size=10,
-        batch_timeout_seconds=0.5,
-    )
-
-    # Start the consumer and publish
-    async with consumer:
-        await producer.produce(ev, key=ev.metadata.user_id or "u")
+    # Publish the event - the app's EventStoreConsumer will pick it up
+    await producer.produce(ev, key=ev.metadata.user_id or "u")
 
-        # Wait until the event is persisted in Mongo
-        coll = db.get_collection("events")
-        from tests.helpers.eventually import eventually
+    # Wait until the event is persisted in Mongo by the app's EventStoreConsumer
+    coll = db.get_collection("events")
 
-        async def _exists() -> None:
-            doc = await coll.find_one({"event_id": ev.event_id})
-            assert doc is not None
+    @backoff.on_exception(backoff.constant, AssertionError, max_time=30.0, interval=0.3)
+    async def _wait_exists() -> None:
+        doc = await coll.find_one({"event_id": ev.event_id})
+        assert doc is not None, f"Event {ev.event_id} not found in MongoDB"
 
-        await eventually(_exists, timeout=12.0, interval=0.2)
+    await _wait_exists()
diff --git a/backend/tests/integration/events/test_producer_roundtrip.py b/backend/tests/integration/events/test_producer_roundtrip.py
index c35364b9..baf3158f 100644
--- a/backend/tests/integration/events/test_producer_roundtrip.py
+++ b/backend/tests/integration/events/test_producer_roundtrip.py
@@ -1,10 +1,12 @@
 import json
 import logging
-from uuid import uuid4
+from collections.abc import Callable
 
 import pytest
 from app.events.core import ProducerConfig, UnifiedProducer
 from app.events.schema.schema_registry import SchemaRegistryManager
+from app.settings import Settings
+from dishka import AsyncContainer
 
 from tests.helpers import make_execution_requested_event
 
@@ -14,12 +16,20 @@
 
 
 @pytest.mark.asyncio
-async def test_unified_producer_start_produce_send_to_dlq_stop(scope):  # type: ignore[valid-type]
+async def test_unified_producer_start_produce_send_to_dlq_stop(
+    scope: AsyncContainer, unique_id: Callable[[str], str]
+) -> None:
     schema: SchemaRegistryManager = await scope.get(SchemaRegistryManager)
-    prod = UnifiedProducer(ProducerConfig(bootstrap_servers="localhost:9092"), schema, logger=_test_logger)
+    settings: Settings = await scope.get(Settings)
+    prod = UnifiedProducer(
+        ProducerConfig(bootstrap_servers=settings.KAFKA_BOOTSTRAP_SERVERS),
+        schema,
+        settings,
+        logger=_test_logger,
+    )
 
     async with prod:
-        ev = make_execution_requested_event(execution_id=f"exec-{uuid4().hex[:8]}")
+        ev = make_execution_requested_event(execution_id=unique_id("exec-"))
         await prod.produce(ev)
 
         # Exercise send_to_dlq path
@@ -29,16 +39,16 @@ async def test_unified_producer_start_produce_send_to_dlq_stop(scope):  # type:
         assert st["running"] is True and st["state"] == "running"
 
 
-def test_producer_handle_stats_path():
+def test_producer_handle_stats_path() -> None:
     # Directly run stats parsing to cover branch logic; avoid relying on timing
-    from app.events.core.producer import ProducerMetrics
-    from app.events.core.producer import UnifiedProducer as UP
+    from app.events.core import ProducerMetrics
+    from app.events.core import UnifiedProducer as UP
 
     m = ProducerMetrics()
     p = object.__new__(UP)  # bypass __init__ safely for method call
     # Inject required attributes
-    p._metrics = m  # type: ignore[attr-defined]
-    p._stats_callback = None  # type: ignore[attr-defined]
+    p._metrics = m
+    p._stats_callback = None
     payload = json.dumps({"msg_cnt": 1, "topics": {"t": {"partitions": {"0": {"msgq_cnt": 2, "rtt": {"avg": 5}}}}}})
-    UP._handle_stats(p, payload)  # type: ignore[misc]
+    UP._handle_stats(p, payload)
     assert m.queue_size == 1 and m.avg_latency_ms > 0
diff --git a/backend/tests/integration/events/test_schema_registry_real.py b/backend/tests/integration/events/test_schema_registry_real.py
index 273c7706..6d8c2aad 100644
--- a/backend/tests/integration/events/test_schema_registry_real.py
+++ b/backend/tests/integration/events/test_schema_registry_real.py
@@ -1,4 +1,5 @@
 import logging
+from collections.abc import Callable
 
 import pytest
 from app.events.schema.schema_registry import SchemaRegistryManager
@@ -11,12 +12,14 @@
 _test_logger = logging.getLogger("test.events.schema_registry_real")
 
 
-def test_serialize_and_deserialize_event_real_registry(test_settings: Settings) -> None:
+def test_serialize_and_deserialize_event_real_registry(
+    test_settings: Settings, unique_id: Callable[[str], str]
+) -> None:
     # Uses real Schema Registry configured via env (SCHEMA_REGISTRY_URL)
     m = SchemaRegistryManager(settings=test_settings, logger=_test_logger)
     ev = PodCreatedEvent(
-        execution_id="e1",
-        pod_name="p",
+        execution_id=unique_id("exec-"),
+        pod_name=unique_id("pod-"),
         namespace="n",
         metadata=AvroEventMetadata(service_name="s", service_version="1"),
     )
diff --git a/backend/tests/integration/events/test_schema_registry_roundtrip.py b/backend/tests/integration/events/test_schema_registry_roundtrip.py
index 4791c16f..c016be9c 100644
--- a/backend/tests/integration/events/test_schema_registry_roundtrip.py
+++ b/backend/tests/integration/events/test_schema_registry_roundtrip.py
@@ -2,7 +2,9 @@
 
 import pytest
 from app.events.schema.schema_registry import MAGIC_BYTE, SchemaRegistryManager
+from app.infrastructure.kafka.events.execution import ExecutionRequestedEvent
 from app.settings import Settings
+from dishka import AsyncContainer
 
 from tests.helpers import make_execution_requested_event
 
@@ -12,13 +14,14 @@
 
 
 @pytest.mark.asyncio
-async def test_schema_registry_serialize_deserialize_roundtrip(scope):  # type: ignore[valid-type]
+async def test_schema_registry_serialize_deserialize_roundtrip(scope: AsyncContainer) -> None:
     reg: SchemaRegistryManager = await scope.get(SchemaRegistryManager)
     # Schema registration happens lazily in serialize_event
     ev = make_execution_requested_event(execution_id="e-rt")
     data = reg.serialize_event(ev)
     assert data.startswith(MAGIC_BYTE)
     back = reg.deserialize_event(data, topic=str(ev.topic))
+    assert isinstance(back, ExecutionRequestedEvent)
     assert back.event_id == ev.event_id and back.execution_id == ev.execution_id
 
     # initialize_schemas should be a no-op if already initialized; call to exercise path
diff --git a/backend/tests/integration/idempotency/test_consumer_idempotent.py b/backend/tests/integration/idempotency/test_consumer_idempotent.py
deleted file mode 100644
index bdcc04d9..00000000
--- a/backend/tests/integration/idempotency/test_consumer_idempotent.py
+++ /dev/null
@@ -1,74 +0,0 @@
-import asyncio
-import logging
-import uuid
-
-import pytest
-
-from app.domain.enums.events import EventType
-from app.domain.enums.kafka import KafkaTopic
-from app.events.core import ConsumerConfig, EventDispatcher, UnifiedConsumer, UnifiedProducer
-from app.events.core.dispatcher import EventDispatcher as Disp
-from app.events.schema.schema_registry import SchemaRegistryManager
-from tests.helpers import make_execution_requested_event
-from app.services.idempotency.idempotency_manager import IdempotencyManager
-from app.services.idempotency.middleware import IdempotentConsumerWrapper
-from app.settings import Settings
-from tests.helpers.eventually import eventually
-
-pytestmark = [pytest.mark.integration, pytest.mark.kafka, pytest.mark.redis]
-
-_test_logger = logging.getLogger("test.idempotency.consumer_idempotent")
-
-
-@pytest.mark.asyncio
-async def test_consumer_idempotent_wrapper_blocks_duplicates(scope) -> None:  # type: ignore[valid-type]
-    producer: UnifiedProducer = await scope.get(UnifiedProducer)
-    idm: IdempotencyManager = await scope.get(IdempotencyManager)
-    registry: SchemaRegistryManager = await scope.get(SchemaRegistryManager)
-    settings: Settings = await scope.get(Settings)
-
-    # Build a dispatcher with a counter
-    disp: Disp = EventDispatcher(logger=_test_logger)
-    seen = {"n": 0}
-
-    @disp.register(EventType.EXECUTION_REQUESTED)
-    async def handle(_ev):  # noqa: ANN001
-        seen["n"] += 1
-
-    # Real consumer with idempotent wrapper
-    cfg = ConsumerConfig(
-        bootstrap_servers=settings.KAFKA_BOOTSTRAP_SERVERS,
-        group_id=f"test-idem-consumer.{uuid.uuid4().hex[:6]}",
-        enable_auto_commit=True,
-        auto_offset_reset="earliest",
-    )
-    base = UnifiedConsumer(
-        cfg,
-        event_dispatcher=disp,
-        schema_registry=registry,
-        settings=settings,
-        logger=_test_logger,
-    )
-    wrapper = IdempotentConsumerWrapper(
-        consumer=base,
-        idempotency_manager=idm,
-        dispatcher=disp,
-        default_key_strategy="event_based",
-        enable_for_all_handlers=True,
-        logger=_test_logger,
-    )
-
-    await wrapper.start([KafkaTopic.EXECUTION_EVENTS])
-    try:
-        # Produce the same event twice (same event_id)
-        execution_id = f"e-{uuid.uuid4().hex[:8]}"
-        ev = make_execution_requested_event(execution_id=execution_id)
-        await producer.produce(ev, key=execution_id)
-        await producer.produce(ev, key=execution_id)
-
-        async def _one():
-            assert seen["n"] >= 1
-
-        await eventually(_one, timeout=10.0, interval=0.2)
-    finally:
-        await wrapper.stop()
diff --git a/backend/tests/integration/idempotency/test_decorator_idempotent.py b/backend/tests/integration/idempotency/test_decorator_idempotent.py
index 3f4d73ce..62c9afdc 100644
--- a/backend/tests/integration/idempotency/test_decorator_idempotent.py
+++ b/backend/tests/integration/idempotency/test_decorator_idempotent.py
@@ -1,9 +1,12 @@
 import logging
-import pytest
+from typing import Any
 
-from tests.helpers import make_execution_requested_event
+import pytest
 from app.services.idempotency.idempotency_manager import IdempotencyManager
 from app.services.idempotency.middleware import idempotent_handler
+from dishka import AsyncContainer
+
+from tests.helpers import make_execution_requested_event
 
 _test_logger = logging.getLogger("test.idempotency.decorator_idempotent")
 
@@ -12,13 +15,13 @@
 
 
 @pytest.mark.asyncio
-async def test_decorator_blocks_duplicate_event(scope) -> None:  # type: ignore[valid-type]
+async def test_decorator_blocks_duplicate_event(scope: AsyncContainer) -> None:
     idm: IdempotencyManager = await scope.get(IdempotencyManager)
 
     calls = {"n": 0}
 
     @idempotent_handler(idempotency_manager=idm, key_strategy="event_based", logger=_test_logger)
-    async def h(ev):  # noqa: ANN001
+    async def h(ev: Any) -> None:
         calls["n"] += 1
 
     ev = make_execution_requested_event(execution_id="exec-deco-1")
@@ -29,16 +32,16 @@ async def h(ev):  # noqa: ANN001
 
 
 @pytest.mark.asyncio
-async def test_decorator_custom_key_blocks(scope) -> None:  # type: ignore[valid-type]
+async def test_decorator_custom_key_blocks(scope: AsyncContainer) -> None:
     idm: IdempotencyManager = await scope.get(IdempotencyManager)
 
     calls = {"n": 0}
 
-    def fixed_key(_ev):  # noqa: ANN001
+    def fixed_key(_ev: Any) -> str:
         return "fixed-key"
 
     @idempotent_handler(idempotency_manager=idm, key_strategy="custom", custom_key_func=fixed_key, logger=_test_logger)
-    async def h(ev):  # noqa: ANN001
+    async def h(ev: Any) -> None:
         calls["n"] += 1
 
     e1 = make_execution_requested_event(execution_id="exec-deco-2a")
diff --git a/backend/tests/integration/idempotency/test_idempotency.py b/backend/tests/integration/idempotency/test_idempotency.py
index 6620ef6f..bf8e99a4 100644
--- a/backend/tests/integration/idempotency/test_idempotency.py
+++ b/backend/tests/integration/idempotency/test_idempotency.py
@@ -1,17 +1,18 @@
 import asyncio
 import json
 import logging
-import uuid
+from collections.abc import AsyncGenerator, Callable
 from datetime import datetime, timedelta, timezone
-import pytest
 
-from app.domain.idempotency import IdempotencyRecord, IdempotencyStatus, IdempotencyStats
+import pytest
+import redis.asyncio as aioredis
+from app.domain.idempotency import IdempotencyRecord, IdempotencyStatus
 from app.infrastructure.kafka.events.base import BaseEvent
-from tests.helpers import make_execution_requested_event
 from app.services.idempotency.idempotency_manager import IdempotencyConfig, IdempotencyManager
 from app.services.idempotency.middleware import IdempotentEventHandler, idempotent_handler
 from app.services.idempotency.redis_repository import RedisIdempotencyRepository
 
+from tests.helpers import make_execution_requested_event
 
 pytestmark = [pytest.mark.integration, pytest.mark.redis]
 
@@ -23,8 +24,10 @@ class TestIdempotencyManager:
     """IdempotencyManager backed by real Redis repository (DI-provided client)."""
 
     @pytest.fixture
-    async def manager(self, redis_client):  # type: ignore[valid-type]
-        prefix = f"idemp_ut:{uuid.uuid4().hex[:6]}"
+    async def manager(
+        self, redis_client: aioredis.Redis, unique_id: Callable[[str], str]
+    ) -> AsyncGenerator[IdempotencyManager, None]:
+        prefix = f"idemp_ut:{unique_id('')}"
         cfg = IdempotencyConfig(
             key_prefix=prefix,
             default_ttl_seconds=3600,
@@ -42,7 +45,7 @@ async def manager(self, redis_client):  # type: ignore[valid-type]
             await m.close()
 
     @pytest.mark.asyncio
-    async def test_complete_flow_new_event(self, manager):
+    async def test_complete_flow_new_event(self, manager: IdempotencyManager) -> None:
         """Test the complete flow for a new event"""
         real_event = make_execution_requested_event(execution_id="exec-123")
         # Check and reserve
@@ -54,7 +57,7 @@ async def test_complete_flow_new_event(self, manager):
         assert result.key.startswith(f"{manager.config.key_prefix}:")
 
         # Verify it's in the repository
-        record = await manager._repo.find_by_key(result.key)  # type: ignore[attr-defined]
+        record = await manager._repo.find_by_key(result.key)
         assert record is not None
         assert record.status == IdempotencyStatus.PROCESSING
 
@@ -63,13 +66,14 @@ async def test_complete_flow_new_event(self, manager):
         assert success is True
 
         # Verify status updated
-        record = await manager._repo.find_by_key(result.key)  # type: ignore[attr-defined]
+        record = await manager._repo.find_by_key(result.key)
+        assert record is not None
         assert record.status == IdempotencyStatus.COMPLETED
         assert record.completed_at is not None
         assert record.processing_duration_ms is not None
 
     @pytest.mark.asyncio
-    async def test_duplicate_detection(self, manager):
+    async def test_duplicate_detection(self, manager: IdempotencyManager) -> None:
         """Test that duplicates are properly detected"""
         real_event = make_execution_requested_event(execution_id="exec-dupe-1")
         # First request
@@ -85,7 +89,7 @@ async def test_duplicate_detection(self, manager):
         assert result2.status == IdempotencyStatus.COMPLETED
 
     @pytest.mark.asyncio
-    async def test_concurrent_requests_race_condition(self, manager):
+    async def test_concurrent_requests_race_condition(self, manager: IdempotencyManager) -> None:
         """Test handling of concurrent requests for the same event"""
         real_event = make_execution_requested_event(execution_id="exec-race-1")
         # Simulate concurrent requests
@@ -105,7 +109,7 @@ async def test_concurrent_requests_race_condition(self, manager):
         assert duplicate_count == 4
 
     @pytest.mark.asyncio
-    async def test_processing_timeout_allows_retry(self, manager):
+    async def test_processing_timeout_allows_retry(self, manager: IdempotencyManager) -> None:
         """Test that stuck processing allows retry after timeout"""
         real_event = make_execution_requested_event(execution_id="exec-timeout-1")
         # First request
@@ -113,9 +117,10 @@ async def test_processing_timeout_allows_retry(self, manager):
         assert result1.is_duplicate is False
 
         # Manually update the created_at to simulate old processing
-        record = await manager._repo.find_by_key(result1.key)  # type: ignore[attr-defined]
+        record = await manager._repo.find_by_key(result1.key)
+        assert record is not None
         record.created_at = datetime.now(timezone.utc) - timedelta(seconds=10)
-        await manager._repo.update_record(record)  # type: ignore[attr-defined]
+        await manager._repo.update_record(record)
 
         # Second request should be allowed due to timeout
         result2 = await manager.check_and_reserve(real_event, key_strategy="event_based")
@@ -123,7 +128,7 @@ async def test_processing_timeout_allows_retry(self, manager):
         assert result2.status == IdempotencyStatus.PROCESSING
 
     @pytest.mark.asyncio
-    async def test_content_hash_strategy(self, manager):
+    async def test_content_hash_strategy(self, manager: IdempotencyManager) -> None:
         """Test content-based deduplication"""
         # Two events with same content and same execution_id
         event1 = make_execution_requested_event(
@@ -147,7 +152,7 @@ async def test_content_hash_strategy(self, manager):
         assert result2.is_duplicate is True
 
     @pytest.mark.asyncio
-    async def test_failed_event_handling(self, manager):
+    async def test_failed_event_handling(self, manager: IdempotencyManager) -> None:
         """Test marking events as failed"""
         real_event = make_execution_requested_event(execution_id="exec-failed-1")
         # Reserve
@@ -160,13 +165,14 @@ async def test_failed_event_handling(self, manager):
         assert success is True
 
         # Verify status and error
-        record = await manager._repo.find_by_key(result.key)  # type: ignore[attr-defined]
+        record = await manager._repo.find_by_key(result.key)
+        assert record is not None
         assert record.status == IdempotencyStatus.FAILED
         assert record.error == error_msg
         assert record.completed_at is not None
 
     @pytest.mark.asyncio
-    async def test_result_caching(self, manager):
+    async def test_result_caching(self, manager: IdempotencyManager) -> None:
         """Test caching of results"""
         real_event = make_execution_requested_event(execution_id="exec-cache-1")
         # Reserve
@@ -192,7 +198,7 @@ async def test_result_caching(self, manager):
         assert duplicate_result.has_cached_result is True
 
     @pytest.mark.asyncio
-    async def test_stats_aggregation(self, manager):
+    async def test_stats_aggregation(self, manager: IdempotencyManager) -> None:
         """Test statistics aggregation"""
         # Create various events with different statuses
         events = []
@@ -224,7 +230,7 @@ async def test_stats_aggregation(self, manager):
         assert stats.prefix == manager.config.key_prefix
 
     @pytest.mark.asyncio
-    async def test_remove_key(self, manager):
+    async def test_remove_key(self, manager: IdempotencyManager) -> None:
         """Test removing idempotency keys"""
         real_event = make_execution_requested_event(execution_id="exec-remove-1")
         # Add a key
@@ -236,7 +242,7 @@ async def test_remove_key(self, manager):
         assert removed is True
 
         # Verify it's gone
-        record = await manager._repo.find_by_key(result.key)  # type: ignore[attr-defined]
+        record = await manager._repo.find_by_key(result.key)
         assert record is None
 
         # Can process again
@@ -248,8 +254,10 @@ class TestIdempotentEventHandlerIntegration:
     """Test IdempotentEventHandler with real components"""
 
     @pytest.fixture
-    async def manager(self, redis_client):  # type: ignore[valid-type]
-        prefix = f"handler_test:{uuid.uuid4().hex[:6]}"
+    async def manager(
+        self, redis_client: aioredis.Redis, unique_id: Callable[[str], str]
+    ) -> AsyncGenerator[IdempotencyManager, None]:
+        prefix = f"handler_test:{unique_id('')}"
         config = IdempotencyConfig(key_prefix=prefix, enable_metrics=False)
         repo = RedisIdempotencyRepository(redis_client, key_prefix=prefix)
         m = IdempotencyManager(config, repo, _test_logger)
@@ -260,11 +268,11 @@ async def manager(self, redis_client):  # type: ignore[valid-type]
             await m.close()
 
     @pytest.mark.asyncio
-    async def test_handler_processes_new_event(self, manager):
+    async def test_handler_processes_new_event(self, manager: IdempotencyManager) -> None:
         """Test that handler processes new events"""
-        processed_events = []
+        processed_events: list[BaseEvent] = []
 
-        async def actual_handler(event: BaseEvent):
+        async def actual_handler(event: BaseEvent) -> None:
             processed_events.append(event)
 
         # Create idempotent handler
@@ -284,11 +292,11 @@ async def actual_handler(event: BaseEvent):
         assert processed_events[0] == real_event
 
     @pytest.mark.asyncio
-    async def test_handler_blocks_duplicate(self, manager):
+    async def test_handler_blocks_duplicate(self, manager: IdempotencyManager) -> None:
         """Test that handler blocks duplicate events"""
-        processed_events = []
+        processed_events: list[BaseEvent] = []
 
-        async def actual_handler(event: BaseEvent):
+        async def actual_handler(event: BaseEvent) -> None:
             processed_events.append(event)
 
         # Create idempotent handler
@@ -308,10 +316,10 @@ async def actual_handler(event: BaseEvent):
         assert len(processed_events) == 1
 
     @pytest.mark.asyncio
-    async def test_handler_with_failure(self, manager):
+    async def test_handler_with_failure(self, manager: IdempotencyManager) -> None:
         """Test handler marks failure on exception"""
 
-        async def failing_handler(event: BaseEvent):
+        async def failing_handler(event: BaseEvent) -> None:
             raise ValueError("Processing failed")
 
         handler = IdempotentEventHandler(
@@ -328,19 +336,20 @@ async def failing_handler(event: BaseEvent):
 
         # Verify marked as failed
         key = f"{manager.config.key_prefix}:{real_event.event_type}:{real_event.event_id}"
-        record = await manager._repo.find_by_key(key)  # type: ignore[attr-defined]
+        record = await manager._repo.find_by_key(key)
+        assert record is not None
         assert record.status == IdempotencyStatus.FAILED
-        assert "Processing failed" in record.error
+        assert record.error is not None and "Processing failed" in record.error
 
     @pytest.mark.asyncio
-    async def test_handler_duplicate_callback(self, manager):
+    async def test_handler_duplicate_callback(self, manager: IdempotencyManager) -> None:
         """Test duplicate callback is invoked"""
-        duplicate_events = []
+        duplicate_events: list[tuple[BaseEvent, IdempotencyRecord]] = []
 
-        async def actual_handler(event: BaseEvent):
+        async def actual_handler(event: BaseEvent) -> None:
             pass  # Do nothing
 
-        async def on_duplicate(event: BaseEvent, result):
+        async def on_duplicate(event: BaseEvent, result: IdempotencyRecord) -> None:
             duplicate_events.append((event, result))
 
         handler = IdempotentEventHandler(
@@ -359,12 +368,12 @@ async def on_duplicate(event: BaseEvent, result):
         # Verify duplicate callback was called
         assert len(duplicate_events) == 1
         assert duplicate_events[0][0] == real_event
-        assert duplicate_events[0][1].is_duplicate is True
+        assert duplicate_events[0][1].status == IdempotencyStatus.COMPLETED
 
     @pytest.mark.asyncio
-    async def test_decorator_integration(self, manager):
+    async def test_decorator_integration(self, manager: IdempotencyManager) -> None:
         """Test the @idempotent_handler decorator"""
-        processed_events = []
+        processed_events: list[BaseEvent] = []
 
         @idempotent_handler(
             idempotency_manager=manager,
@@ -372,7 +381,7 @@ async def test_decorator_integration(self, manager):
             ttl_seconds=300,
             logger=_test_logger,
         )
-        async def my_handler(event: BaseEvent):
+        async def my_handler(event: BaseEvent) -> None:
             processed_events.append(event)
 
         # Process same event twice
@@ -394,18 +403,16 @@ async def my_handler(event: BaseEvent):
         assert len(processed_events) == 1  # Still only one
 
     @pytest.mark.asyncio
-    async def test_custom_key_function(self, manager):
+    async def test_custom_key_function(self, manager: IdempotencyManager) -> None:
         """Test handler with custom key function"""
-        processed_scripts = []
+        processed_scripts: list[str] = []
 
         async def process_script(event: BaseEvent) -> None:
-            processed_scripts.append(event.script)
+            processed_scripts.append(event.script)  # type: ignore[attr-defined]
 
         def extract_script_key(event: BaseEvent) -> str:
             # Custom key based on script content only
-            if hasattr(event, 'script'):
-                return f"script:{hash(event.script)}"
-            return str(event.event_id)
+            return f"script:{hash(event.script)}"  # type: ignore[attr-defined]
 
         handler = IdempotentEventHandler(
             handler=process_script,
@@ -445,25 +452,25 @@ def extract_script_key(event: BaseEvent) -> str:
         assert processed_scripts[0] == "print('hello')"
 
     @pytest.mark.asyncio
-    async def test_invalid_key_strategy(self, manager):
+    async def test_invalid_key_strategy(self, manager: IdempotencyManager) -> None:
         """Test that invalid key strategy raises error"""
         real_event = make_execution_requested_event(execution_id="invalid-strategy-1")
         with pytest.raises(ValueError, match="Invalid key strategy"):
             await manager.check_and_reserve(real_event, key_strategy="invalid_strategy")
 
     @pytest.mark.asyncio
-    async def test_custom_key_without_custom_key_param(self, manager):
+    async def test_custom_key_without_custom_key_param(self, manager: IdempotencyManager) -> None:
         """Test that custom strategy without custom_key raises error"""
         real_event = make_execution_requested_event(execution_id="custom-key-missing-1")
         with pytest.raises(ValueError, match="Invalid key strategy"):
             await manager.check_and_reserve(real_event, key_strategy="custom")
 
     @pytest.mark.asyncio
-    async def test_get_cached_json_existing(self, manager):
+    async def test_get_cached_json_existing(self, manager: IdempotencyManager) -> None:
         """Test retrieving cached JSON result"""
         # First complete with cached result
         real_event = make_execution_requested_event(execution_id="cache-exist-1")
-        result = await manager.check_and_reserve(real_event, key_strategy="event_based")
+        await manager.check_and_reserve(real_event, key_strategy="event_based")
         cached_data = json.dumps({"output": "test", "code": 0})
         await manager.mark_completed_with_json(real_event, cached_data, "event_based")
 
@@ -472,7 +479,7 @@ async def test_get_cached_json_existing(self, manager):
         assert retrieved == cached_data
 
     @pytest.mark.asyncio
-    async def test_get_cached_json_non_existing(self, manager):
+    async def test_get_cached_json_non_existing(self, manager: IdempotencyManager) -> None:
         """Test retrieving non-existing cached result raises assertion"""
         real_event = make_execution_requested_event(execution_id="cache-miss-1")
         # Trying to get cached result for non-existent key should raise
@@ -480,7 +487,7 @@ async def test_get_cached_json_non_existing(self, manager):
             await manager.get_cached_json(real_event, "event_based", None)
 
     @pytest.mark.asyncio
-    async def test_cleanup_expired_keys(self, manager):
+    async def test_cleanup_expired_keys(self, manager: IdempotencyManager) -> None:
         """Test cleanup of expired keys"""
         # Create expired record
         expired_key = f"{manager.config.key_prefix}:expired"
@@ -493,17 +500,19 @@ async def test_cleanup_expired_keys(self, manager):
             ttl_seconds=3600,  # 1 hour TTL
             completed_at=datetime.now(timezone.utc) - timedelta(hours=2)
         )
-        await manager._repo.insert_processing(expired_record)  # type: ignore[attr-defined]
+        await manager._repo.insert_processing(expired_record)
 
         # Cleanup should detect it as expired
         # Note: actual cleanup implementation depends on repository
-        record = await manager._repo.find_by_key(expired_key)  # type: ignore[attr-defined]
+        record = await manager._repo.find_by_key(expired_key)
         assert record is not None  # Still exists until explicit cleanup
 
     @pytest.mark.asyncio
-    async def test_metrics_enabled(self, redis_client):  # type: ignore[valid-type]
+    async def test_metrics_enabled(
+        self, redis_client: aioredis.Redis, unique_id: Callable[[str], str]
+    ) -> None:
         """Test manager with metrics enabled"""
-        config = IdempotencyConfig(key_prefix=f"metrics:{uuid.uuid4().hex[:6]}", enable_metrics=True)
+        config = IdempotencyConfig(key_prefix=f"metrics:{unique_id('')}", enable_metrics=True)
         repository = RedisIdempotencyRepository(redis_client, key_prefix=config.key_prefix)
         manager = IdempotencyManager(config, repository, _test_logger)
 
@@ -515,7 +524,7 @@ async def test_metrics_enabled(self, redis_client):  # type: ignore[valid-type]
         await manager.close()
 
     @pytest.mark.asyncio
-    async def test_content_hash_with_fields(self, manager):
+    async def test_content_hash_with_fields(self, manager: IdempotencyManager) -> None:
         """Test content hash with specific fields"""
         event1 = make_execution_requested_event(
             execution_id="exec-1",
diff --git a/backend/tests/integration/idempotency/test_idempotent_handler.py b/backend/tests/integration/idempotency/test_idempotent_handler.py
index 76ea369a..a5b82a6c 100644
--- a/backend/tests/integration/idempotency/test_idempotent_handler.py
+++ b/backend/tests/integration/idempotency/test_idempotent_handler.py
@@ -1,12 +1,13 @@
 import logging
+from collections.abc import Callable
+from typing import Any
 
 import pytest
-
-from app.events.schema.schema_registry import SchemaRegistryManager
-from tests.helpers import make_execution_requested_event
 from app.services.idempotency.idempotency_manager import IdempotencyManager
 from app.services.idempotency.middleware import IdempotentEventHandler
+from dishka import AsyncContainer
 
+from tests.helpers import make_execution_requested_event
 
 pytestmark = [pytest.mark.integration]
 
@@ -14,12 +15,12 @@
 
 
 @pytest.mark.asyncio
-async def test_idempotent_handler_blocks_duplicates(scope) -> None:  # type: ignore[valid-type]
+async def test_idempotent_handler_blocks_duplicates(scope: AsyncContainer, unique_id: Callable[[str], str]) -> None:
     manager: IdempotencyManager = await scope.get(IdempotencyManager)
 
     processed: list[str] = []
 
-    async def _handler(ev) -> None:  # noqa: ANN001
+    async def _handler(ev: Any) -> None:
         processed.append(ev.event_id)
 
     handler = IdempotentEventHandler(
@@ -29,7 +30,7 @@ async def _handler(ev) -> None:  # noqa: ANN001
         logger=_test_logger,
     )
 
-    ev = make_execution_requested_event(execution_id="exec-dup-1")
+    ev = make_execution_requested_event(execution_id=unique_id("exec-"))
 
     await handler(ev)
     await handler(ev)  # duplicate
@@ -38,12 +39,14 @@ async def _handler(ev) -> None:  # noqa: ANN001
 
 
 @pytest.mark.asyncio
-async def test_idempotent_handler_content_hash_blocks_same_content(scope) -> None:  # type: ignore[valid-type]
+async def test_idempotent_handler_content_hash_blocks_same_content(
+    scope: AsyncContainer, unique_id: Callable[[str], str]
+) -> None:
     manager: IdempotencyManager = await scope.get(IdempotencyManager)
 
     processed: list[str] = []
 
-    async def _handler(ev) -> None:  # noqa: ANN001
+    async def _handler(ev: Any) -> None:
         processed.append(ev.execution_id)
 
     handler = IdempotentEventHandler(
@@ -53,8 +56,10 @@ async def _handler(ev) -> None:  # noqa: ANN001
         logger=_test_logger,
     )
 
-    e1 = make_execution_requested_event(execution_id="exec-dup-2")
-    e2 = make_execution_requested_event(execution_id="exec-dup-2")
+    # Same execution_id means same content hash
+    execution_id = unique_id("exec-")
+    e1 = make_execution_requested_event(execution_id=execution_id)
+    e2 = make_execution_requested_event(execution_id=execution_id)
 
     await handler(e1)
     await handler(e2)
diff --git a/backend/tests/integration/notifications/test_notification_sse.py b/backend/tests/integration/notifications/test_notification_sse.py
index c2fbb401..1432204e 100644
--- a/backend/tests/integration/notifications/test_notification_sse.py
+++ b/backend/tests/integration/notifications/test_notification_sse.py
@@ -1,23 +1,22 @@
-import asyncio
-import json
-from uuid import uuid4
-import pytest
+from collections.abc import Callable
 
+import backoff
+import pytest
 from app.domain.enums.notification import NotificationChannel, NotificationSeverity
 from app.schemas_pydantic.sse import RedisNotificationMessage
 from app.services.notification_service import NotificationService
 from app.services.sse.redis_bus import SSERedisBus
-from tests.helpers.eventually import eventually
+from dishka import AsyncContainer
 
 pytestmark = [pytest.mark.integration, pytest.mark.redis]
 
 
 @pytest.mark.asyncio
-async def test_in_app_notification_published_to_sse(scope) -> None:  # type: ignore[valid-type]
+async def test_in_app_notification_published_to_sse(scope: AsyncContainer, unique_id: Callable[[str], str]) -> None:
     svc: NotificationService = await scope.get(NotificationService)
     bus: SSERedisBus = await scope.get(SSERedisBus)
 
-    user_id = f"notif-user-{uuid4().hex[:8]}"
+    user_id = unique_id("notif-user-")
     # Open subscription before creating notification to catch the publish
     sub = await bus.open_notification_subscription(user_id)
 
@@ -25,7 +24,7 @@ async def test_in_app_notification_published_to_sse(scope) -> None:  # type: ign
     await svc.update_subscription(user_id, NotificationChannel.IN_APP, True)
 
     # Create notification via service (IN_APP channel triggers SSE publish)
-    n = await svc.create_notification(
+    await svc.create_notification(
         user_id=user_id,
         subject="Hello",
         body="World",
@@ -35,12 +34,17 @@ async def test_in_app_notification_published_to_sse(scope) -> None:  # type: ign
     )
 
     # Receive published SSE payload
-    async def _recv() -> RedisNotificationMessage:
+    msg: RedisNotificationMessage | None = None
+
+    @backoff.on_exception(backoff.constant, AssertionError, max_time=5.0, interval=0.1)
+    async def _wait_recv() -> None:
+        nonlocal msg
         m = await sub.get(RedisNotificationMessage)
         assert m is not None
-        return m
+        msg = m
 
-    msg = await eventually(_recv, timeout=5.0, interval=0.1)
+    await _wait_recv()
+    assert msg is not None
     # Basic shape assertions
     assert msg.subject == "Hello"
     assert msg.body == "World"
diff --git a/backend/tests/integration/result_processor/test_result_processor.py b/backend/tests/integration/result_processor/test_result_processor.py
index 5c9a98c4..c65ae109 100644
--- a/backend/tests/integration/result_processor/test_result_processor.py
+++ b/backend/tests/integration/result_processor/test_result_processor.py
@@ -1,16 +1,16 @@
 import asyncio
 import logging
-import uuid
-from tests.helpers.eventually import eventually
-import pytest
+from collections.abc import Callable
+from typing import Any
 
+import backoff
+import pytest
 from app.core.database_context import Database
-
 from app.db.repositories.execution_repository import ExecutionRepository
 from app.domain.enums.events import EventType
 from app.domain.enums.execution import ExecutionStatus
-from app.domain.execution import DomainExecutionCreate
 from app.domain.enums.kafka import KafkaTopic
+from app.domain.execution import DomainExecutionCreate
 from app.domain.execution.models import ResourceUsageDomain
 from app.events.core import UnifiedConsumer, UnifiedProducer
 from app.events.core.dispatcher import EventDispatcher
@@ -21,6 +21,7 @@
 from app.services.idempotency import IdempotencyManager
 from app.services.result_processor.processor import ResultProcessor
 from app.settings import Settings
+from dishka import AsyncContainer
 
 pytestmark = [pytest.mark.integration, pytest.mark.kafka, pytest.mark.mongodb]
 
@@ -28,7 +29,7 @@
 
 
 @pytest.mark.asyncio
-async def test_result_processor_persists_and_emits(scope) -> None:  # type: ignore[valid-type]
+async def test_result_processor_persists_and_emits(scope: AsyncContainer, unique_id: Callable[[str], str]) -> None:
     # Ensure schemas
     registry: SchemaRegistryManager = await scope.get(SchemaRegistryManager)
     settings: Settings = await scope.get(Settings)
@@ -65,10 +66,10 @@ async def test_result_processor_persists_and_emits(scope) -> None:  # type: igno
     stored_received = asyncio.Event()
 
     @dispatcher.register(EventType.RESULT_STORED)
-    async def _stored(_event) -> None:  # noqa: ANN001
+    async def _stored(_event: Any) -> None:
         stored_received.set()
 
-    group_id = f"rp-test.{uuid.uuid4().hex[:6]}"
+    group_id = unique_id("rp-test-")
     cconf = ConsumerConfig(
         bootstrap_servers=settings.KAFKA_BOOTSTRAP_SERVERS,
         group_id=group_id,
@@ -82,7 +83,7 @@ async def _stored(_event) -> None:  # noqa: ANN001
         settings=settings,
         logger=_test_logger,
     )
-    await stored_consumer.start([str(KafkaTopic.EXECUTION_RESULTS)])
+    await stored_consumer.start([KafkaTopic.EXECUTION_RESULTS])
 
     try:
         async with processor:
@@ -104,12 +105,13 @@ async def _stored(_event) -> None:  # noqa: ANN001
             await producer.produce(evt, key=execution_id)
 
             # Wait for DB persistence (event-driven polling)
-            async def _persisted() -> None:
+            @backoff.on_exception(backoff.constant, AssertionError, max_time=12.0, interval=0.2)
+            async def _wait_persisted() -> None:
                 doc = await db.get_collection("executions").find_one({"execution_id": execution_id})
                 assert doc is not None
                 assert doc.get("status") == ExecutionStatus.COMPLETED.value
 
-            await eventually(_persisted, timeout=12.0, interval=0.2)
+            await _wait_persisted()
 
             # Wait for result stored event
             await asyncio.wait_for(stored_received.wait(), timeout=10.0)
diff --git a/backend/tests/integration/services/admin/__init__.py b/backend/tests/integration/services/admin/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/backend/tests/integration/services/admin/test_admin_user_service.py b/backend/tests/integration/services/admin/test_admin_user_service.py
index a392a908..b52139cc 100644
--- a/backend/tests/integration/services/admin/test_admin_user_service.py
+++ b/backend/tests/integration/services/admin/test_admin_user_service.py
@@ -2,15 +2,15 @@
 
 import pytest
 from app.core.database_context import Database
-
 from app.domain.enums.user import UserRole
 from app.services.admin import AdminUserService
+from dishka import AsyncContainer
 
 pytestmark = [pytest.mark.integration, pytest.mark.mongodb]
 
 
 @pytest.mark.asyncio
-async def test_get_user_overview_basic(scope) -> None:  # type: ignore[valid-type]
+async def test_get_user_overview_basic(scope: AsyncContainer) -> None:
     svc: AdminUserService = await scope.get(AdminUserService)
     db: Database = await scope.get(Database)
     await db.get_collection("users").insert_one({
@@ -30,7 +30,7 @@ async def test_get_user_overview_basic(scope) -> None:  # type: ignore[valid-typ
 
 
 @pytest.mark.asyncio
-async def test_get_user_overview_user_not_found(scope) -> None:  # type: ignore[valid-type]
+async def test_get_user_overview_user_not_found(scope: AsyncContainer) -> None:
     svc: AdminUserService = await scope.get(AdminUserService)
     with pytest.raises(ValueError):
         await svc.get_user_overview("missing")
diff --git a/backend/tests/integration/services/coordinator/__init__.py b/backend/tests/integration/services/coordinator/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/backend/tests/integration/services/coordinator/test_execution_coordinator.py b/backend/tests/integration/services/coordinator/test_execution_coordinator.py
index 7131b2ab..aca20474 100644
--- a/backend/tests/integration/services/coordinator/test_execution_coordinator.py
+++ b/backend/tests/integration/services/coordinator/test_execution_coordinator.py
@@ -1,25 +1,29 @@
-import pytest
+from collections.abc import Callable
 
+import pytest
 from app.services.coordinator.coordinator import ExecutionCoordinator
+from dishka import AsyncContainer
+
 from tests.helpers import make_execution_requested_event
 
 pytestmark = pytest.mark.integration
 
 
 @pytest.mark.asyncio
-async def test_handle_requested_and_schedule(scope) -> None:  # type: ignore[valid-type]
+async def test_handle_requested_and_schedule(scope: AsyncContainer, unique_id: Callable[[str], str]) -> None:
     coord: ExecutionCoordinator = await scope.get(ExecutionCoordinator)
-    ev = make_execution_requested_event(execution_id="e-real-1")
+    execution_id = unique_id("exec-")
+    ev = make_execution_requested_event(execution_id=execution_id)
 
     # Directly route requested event (no Kafka consumer)
     await coord._handle_execution_requested(ev)  # noqa: SLF001
 
-    pos = await coord.queue_manager.get_queue_position("e-real-1")
+    pos = await coord.queue_manager.get_queue_position(execution_id)
     assert pos is not None
 
     # Schedule one execution from queue
     next_ev = await coord.queue_manager.get_next_execution()
-    assert next_ev is not None and next_ev.execution_id == "e-real-1"
+    assert next_ev is not None and next_ev.execution_id == execution_id
     await coord._schedule_execution(next_ev)  # noqa: SLF001
     # Should be tracked as active
-    assert "e-real-1" in coord._active_executions  # noqa: SLF001
+    assert execution_id in coord._active_executions  # noqa: SLF001
diff --git a/backend/tests/integration/services/events/__init__.py b/backend/tests/integration/services/events/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/backend/tests/integration/services/events/test_event_bus.py b/backend/tests/integration/services/events/test_event_bus.py
index 398300c0..f0c5a265 100644
--- a/backend/tests/integration/services/events/test_event_bus.py
+++ b/backend/tests/integration/services/events/test_event_bus.py
@@ -1,13 +1,13 @@
+import backoff
 import pytest
-
 from app.services.event_bus import EventBusEvent, EventBusManager
-from tests.helpers.eventually import eventually
+from dishka import AsyncContainer
 
 pytestmark = pytest.mark.integration
 
 
 @pytest.mark.asyncio
-async def test_event_bus_publish_subscribe(scope) -> None:  # type: ignore[valid-type]
+async def test_event_bus_publish_subscribe(scope: AsyncContainer) -> None:
     manager: EventBusManager = await scope.get(EventBusManager)
     bus = await manager.get_event_bus()
 
@@ -19,7 +19,8 @@ async def handler(event: EventBusEvent) -> None:
     await bus.subscribe("test.*", handler)
     await bus.publish("test.created", {"x": 1})
 
-    async def _received():
+    @backoff.on_exception(backoff.constant, AssertionError, max_time=2.0, interval=0.05)
+    async def _wait_received() -> None:
         assert any(e.event_type == "test.created" for e in received)
 
-    await eventually(_received, timeout=2.0, interval=0.05)
+    await _wait_received()
diff --git a/backend/tests/integration/services/events/test_kafka_event_service.py b/backend/tests/integration/services/events/test_kafka_event_service.py
index 8a13fdee..2f82be72 100644
--- a/backend/tests/integration/services/events/test_kafka_event_service.py
+++ b/backend/tests/integration/services/events/test_kafka_event_service.py
@@ -1,22 +1,25 @@
-import pytest
+from collections.abc import Callable
 
+import pytest
 from app.db.repositories import EventRepository
 from app.domain.enums.events import EventType
 from app.domain.enums.execution import ExecutionStatus
 from app.services.kafka_event_service import KafkaEventService
+from dishka import AsyncContainer
 
 pytestmark = [pytest.mark.integration, pytest.mark.kafka, pytest.mark.mongodb]
 
 
 @pytest.mark.asyncio
-async def test_publish_user_registered_event(scope) -> None:  # type: ignore[valid-type]
+async def test_publish_user_registered_event(scope: AsyncContainer, unique_id: Callable[[str], str]) -> None:
     svc: KafkaEventService = await scope.get(KafkaEventService)
     repo: EventRepository = await scope.get(EventRepository)
 
+    user_id = unique_id("user-")
     event_id = await svc.publish_event(
         event_type=EventType.USER_REGISTERED,
-        payload={"user_id": "u1", "username": "alice", "email": "alice@example.com"},
-        aggregate_id="u1",
+        payload={"user_id": user_id, "username": "alice", "email": "alice@example.com"},
+        aggregate_id=user_id,
     )
     assert isinstance(event_id, str) and event_id
     stored = await repo.get_event(event_id)
@@ -24,13 +27,14 @@ async def test_publish_user_registered_event(scope) -> None:  # type: ignore[val
 
 
 @pytest.mark.asyncio
-async def test_publish_execution_event(scope) -> None:  # type: ignore[valid-type]
+async def test_publish_execution_event(scope: AsyncContainer, unique_id: Callable[[str], str]) -> None:
     svc: KafkaEventService = await scope.get(KafkaEventService)
     repo: EventRepository = await scope.get(EventRepository)
 
+    execution_id = unique_id("exec-")
     event_id = await svc.publish_execution_event(
         event_type=EventType.EXECUTION_QUEUED,
-        execution_id="exec1",
+        execution_id=execution_id,
         status=ExecutionStatus.QUEUED,
         metadata=None,
         error_message=None,
@@ -40,15 +44,18 @@ async def test_publish_execution_event(scope) -> None:  # type: ignore[valid-typ
 
 
 @pytest.mark.asyncio
-async def test_publish_pod_event_and_without_metadata(scope) -> None:  # type: ignore[valid-type]
+async def test_publish_pod_event_and_without_metadata(scope: AsyncContainer, unique_id: Callable[[str], str]) -> None:
     svc: KafkaEventService = await scope.get(KafkaEventService)
     repo: EventRepository = await scope.get(EventRepository)
 
+    execution_id = unique_id("exec-")
+    pod_name = unique_id("executor-")
+
     # Pod event
     eid = await svc.publish_pod_event(
         event_type=EventType.POD_CREATED,
-        pod_name="executor-pod1",
-        execution_id="exec1",
+        pod_name=pod_name,
+        execution_id=execution_id,
         namespace="ns",
         status="pending",
         metadata=None,
@@ -57,10 +64,11 @@ async def test_publish_pod_event_and_without_metadata(scope) -> None:  # type: i
     assert await repo.get_event(eid) is not None
 
     # Generic event without metadata
+    user_id = unique_id("user-")
     eid2 = await svc.publish_event(
         event_type=EventType.USER_LOGGED_IN,
-        payload={"user_id": "u2", "login_method": "password"},
-        aggregate_id="u2",
+        payload={"user_id": user_id, "login_method": "password"},
+        aggregate_id=user_id,
         metadata=None,
     )
     assert isinstance(eid2, str)
diff --git a/backend/tests/integration/services/execution/__init__.py b/backend/tests/integration/services/execution/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/backend/tests/integration/services/execution/test_execution_service.py b/backend/tests/integration/services/execution/test_execution_service.py
index 184a3494..c3e689e9 100644
--- a/backend/tests/integration/services/execution/test_execution_service.py
+++ b/backend/tests/integration/services/execution/test_execution_service.py
@@ -1,13 +1,13 @@
 import pytest
-
 from app.domain.execution import ResourceLimitsDomain
 from app.services.execution_service import ExecutionService
+from dishka import AsyncContainer
 
 pytestmark = pytest.mark.integration
 
 
 @pytest.mark.asyncio
-async def test_execute_script_and_limits(scope) -> None:  # type: ignore[valid-type]
+async def test_execute_script_and_limits(scope: AsyncContainer) -> None:
     svc: ExecutionService = await scope.get(ExecutionService)
     limits = await svc.get_k8s_resource_limits()
     assert isinstance(limits, ResourceLimitsDomain)
diff --git a/backend/tests/integration/services/idempotency/__init__.py b/backend/tests/integration/services/idempotency/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/backend/tests/integration/services/idempotency/test_redis_repository.py b/backend/tests/integration/services/idempotency/test_redis_repository.py
index 7f96b783..6a15dbf3 100644
--- a/backend/tests/integration/services/idempotency/test_redis_repository.py
+++ b/backend/tests/integration/services/idempotency/test_redis_repository.py
@@ -1,8 +1,9 @@
 import json
+from collections.abc import Callable
 from datetime import datetime, timedelta, timezone
-import pytest
-from pymongo.errors import DuplicateKeyError
 
+import pytest
+import redis.asyncio as aioredis
 from app.domain.idempotency import IdempotencyRecord, IdempotencyStatus
 from app.services.idempotency.redis_repository import (
     RedisIdempotencyRepository,
@@ -10,52 +11,54 @@
     _json_default,
     _parse_iso_datetime,
 )
-
+from pymongo.errors import DuplicateKeyError
 
 pytestmark = [pytest.mark.integration, pytest.mark.redis]
 
 
 class TestHelperFunctions:
-    def test_iso_datetime(self):
+    def test_iso_datetime(self) -> None:
         dt = datetime(2025, 1, 15, 10, 30, 45, tzinfo=timezone.utc)
         result = _iso(dt)
         assert result == "2025-01-15T10:30:45+00:00"
 
-    def test_iso_datetime_with_timezone(self):
+    def test_iso_datetime_with_timezone(self) -> None:
         dt = datetime(2025, 1, 15, 10, 30, 45, tzinfo=timezone(timedelta(hours=5)))
         result = _iso(dt)
         assert result == "2025-01-15T05:30:45+00:00"
 
-    def test_json_default_datetime(self):
+    def test_json_default_datetime(self) -> None:
         dt = datetime(2025, 1, 15, 10, 30, 45, tzinfo=timezone.utc)
         result = _json_default(dt)
         assert result == "2025-01-15T10:30:45+00:00"
 
-    def test_json_default_other(self):
+    def test_json_default_other(self) -> None:
         obj = {"key": "value"}
         result = _json_default(obj)
         assert result == "{'key': 'value'}"
 
-    def test_parse_iso_datetime_variants(self):
-        assert _parse_iso_datetime("2025-01-15T10:30:45+00:00").year == 2025
-        assert _parse_iso_datetime("2025-01-15T10:30:45Z").tzinfo == timezone.utc
+    def test_parse_iso_datetime_variants(self) -> None:
+        dt1 = _parse_iso_datetime("2025-01-15T10:30:45+00:00")
+        assert dt1 is not None and dt1.year == 2025
+        dt2 = _parse_iso_datetime("2025-01-15T10:30:45Z")
+        assert dt2 is not None and dt2.tzinfo == timezone.utc
         assert _parse_iso_datetime(None) is None
         assert _parse_iso_datetime("") is None
         assert _parse_iso_datetime("not-a-date") is None
 
 
 @pytest.fixture
-def repository(redis_client):  # type: ignore[valid-type]
+def repository(redis_client: aioredis.Redis) -> RedisIdempotencyRepository:
     return RedisIdempotencyRepository(redis_client, key_prefix="idempotency")
 
 
 @pytest.fixture
-def sample_record():
+def sample_record(unique_id: Callable[[str], str]) -> IdempotencyRecord:
     return IdempotencyRecord(
-        key="test-key",
+        key=unique_id("key-"),
         status=IdempotencyStatus.PROCESSING,
         event_type="test.event",
-        event_id="event-123",
+        event_id=unique_id("event-"),
         created_at=datetime(2025, 1, 15, 10, 30, 45, tzinfo=timezone.utc),
         ttl_seconds=5,
         completed_at=None,
@@ -65,17 +68,17 @@ def sample_record():
     )
 
 
-def test_full_key_helpers(repository):
+def test_full_key_helpers(repository: RedisIdempotencyRepository) -> None:
     assert repository._full_key("my") == "idempotency:my"
     assert repository._full_key("idempotency:my") == "idempotency:my"
 
 
-def test_doc_record_roundtrip(repository):
+def test_doc_record_roundtrip(repository: RedisIdempotencyRepository, unique_id: Callable[[str], str]) -> None:
     rec = IdempotencyRecord(
-        key="k",
+        key=unique_id("k-"),
         status=IdempotencyStatus.COMPLETED,
         event_type="e.t",
-        event_id="e-1",
+        event_id=unique_id("e-"),
         created_at=datetime(2025, 1, 15, tzinfo=timezone.utc),
         ttl_seconds=60,
         completed_at=datetime(2025, 1, 15, 0, 1, tzinfo=timezone.utc),
@@ -89,7 +92,9 @@ def test_doc_record_roundtrip(repository):
 
 
 @pytest.mark.asyncio
-async def test_insert_find_update_delete_flow(repository, redis_client, sample_record):  # type: ignore[valid-type]
+async def test_insert_find_update_delete_flow(
+    repository: RedisIdempotencyRepository, redis_client: aioredis.Redis, sample_record: IdempotencyRecord
+) -> None:
     # Insert processing (NX)
     await repository.insert_processing(sample_record)
     key = repository._full_key(sample_record.key)
@@ -121,30 +126,43 @@ async def test_insert_find_update_delete_flow(repository, redis_client, sample_r
 
 
 @pytest.mark.asyncio
-async def test_update_record_when_missing(repository, sample_record):
+async def test_update_record_when_missing(
+    repository: RedisIdempotencyRepository, sample_record: IdempotencyRecord
+) -> None:
     # If key missing, update returns 0
     res = await repository.update_record(sample_record)
     assert res == 0
 
 
 @pytest.mark.asyncio
-async def test_aggregate_status_counts(repository, redis_client):  # type: ignore[valid-type]
-    # Seed few keys directly using repository
-    for i, status in enumerate((IdempotencyStatus.PROCESSING, IdempotencyStatus.PROCESSING, IdempotencyStatus.COMPLETED)):
+async def test_aggregate_status_counts(
+    redis_client: aioredis.Redis, unique_id: Callable[[str], str]
+) -> None:
+    # Use unique prefix to isolate this test from parallel runs
+    prefix = unique_id("idemp-agg-")
+    repo = RedisIdempotencyRepository(redis_client, key_prefix=prefix)
+
+    statuses = (IdempotencyStatus.PROCESSING, IdempotencyStatus.PROCESSING, IdempotencyStatus.COMPLETED)
+    for i, status in enumerate(statuses):
         rec = IdempotencyRecord(
-            key=f"k{i}", status=status, event_type="t", event_id=f"e{i}", created_at=datetime.now(timezone.utc), ttl_seconds=60
+            key=f"k-{i}",
+            status=status,
+            event_type="t",
+            event_id=f"e-{i}",
+            created_at=datetime.now(timezone.utc),
+            ttl_seconds=60,
         )
-        await repository.insert_processing(rec)
+        await repo.insert_processing(rec)
         if status != IdempotencyStatus.PROCESSING:
             rec.status = status
             rec.completed_at = datetime.now(timezone.utc)
-            await repository.update_record(rec)
+            await repo.update_record(rec)
 
-    counts = await repository.aggregate_status_counts("idempotency")
+    counts = await repo.aggregate_status_counts(prefix)
     assert counts[IdempotencyStatus.PROCESSING] == 2
     assert counts[IdempotencyStatus.COMPLETED] == 1
 
 
 @pytest.mark.asyncio
-async def test_health_check(repository):
+async def test_health_check(repository: RedisIdempotencyRepository) -> None:
     await repository.health_check()  # should not raise
diff --git a/backend/tests/integration/services/notifications/__init__.py b/backend/tests/integration/services/notifications/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/backend/tests/integration/services/notifications/test_notification_service.py b/backend/tests/integration/services/notifications/test_notification_service.py
index c1faa79a..783e12e8 100644
--- a/backend/tests/integration/services/notifications/test_notification_service.py
+++ b/backend/tests/integration/services/notifications/test_notification_service.py
@@ -1,26 +1,29 @@
 import pytest
-
 from app.db.repositories import NotificationRepository
 from app.domain.enums.notification import NotificationChannel, NotificationSeverity
-from app.domain.notification import DomainNotification
+from app.domain.notification import DomainNotificationCreate
 from app.services.notification_service import NotificationService
+from dishka import AsyncContainer
 
 pytestmark = [pytest.mark.integration, pytest.mark.mongodb]
 
 
 @pytest.mark.asyncio
-async def test_notification_service_crud_and_subscription(scope) -> None:  # type: ignore[valid-type]
+async def test_notification_service_crud_and_subscription(scope: AsyncContainer) -> None:
     svc: NotificationService = await scope.get(NotificationService)
     repo: NotificationRepository = await scope.get(NotificationRepository)
 
     # Create a notification via repository and then use service to mark/delete
-    n = DomainNotification(user_id="u1", severity=NotificationSeverity.MEDIUM, tags=["x"], channel=NotificationChannel.IN_APP, subject="s", body="b")
-    _nid = await repo.create_notification(n)
-    got = await repo.get_notification(n.notification_id, "u1")
+    n = DomainNotificationCreate(
+        user_id="u1", severity=NotificationSeverity.MEDIUM, tags=["x"],
+        channel=NotificationChannel.IN_APP, subject="s", body="b",
+    )
+    created = await repo.create_notification(n)
+    got = await repo.get_notification(created.notification_id, "u1")
     assert got is not None
 
     # Mark as read through service
-    ok = await svc.mark_as_read("u1", got.notification_id)
+    ok = await svc.mark_as_read("u1", created.notification_id)
     assert ok is True
 
     # Subscriptions via service wrapper calls the repo
diff --git a/backend/tests/integration/services/rate_limit/__init__.py b/backend/tests/integration/services/rate_limit/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/backend/tests/integration/services/rate_limit/test_rate_limit_service.py b/backend/tests/integration/services/rate_limit/test_rate_limit_service.py
index 24f11477..d307dbed 100644
--- a/backend/tests/integration/services/rate_limit/test_rate_limit_service.py
+++ b/backend/tests/integration/services/rate_limit/test_rate_limit_service.py
@@ -1,11 +1,8 @@
-import asyncio
 import json
-import time
-from datetime import datetime, timezone
-from uuid import uuid4
+from collections.abc import Callable
+from typing import Any, Self
 
 import pytest
-
 from app.domain.rate_limit import (
     EndpointGroup,
     RateLimitAlgorithm,
@@ -14,14 +11,17 @@
     UserRateLimit,
 )
 from app.services.rate_limit_service import RateLimitService
+from dishka import AsyncContainer
 
 pytestmark = [pytest.mark.integration, pytest.mark.redis]
 
 
 @pytest.mark.asyncio
-async def test_normalize_and_disabled_and_bypass_and_no_rule(scope) -> None:  # type: ignore[valid-type]
+async def test_normalize_and_disabled_and_bypass_and_no_rule(
+    scope: AsyncContainer, unique_id: Callable[[str], str]
+) -> None:
     svc: RateLimitService = await scope.get(RateLimitService)
-    svc.prefix = f"{svc.prefix}{uuid4().hex[:6]}:"
+    svc.prefix = f"{svc.prefix}{unique_id('')}:"
     # ensure disabled for first path
     await svc.update_config(RateLimitConfig(default_rules=[]))
     svc.settings.RATE_LIMIT_ENABLED = False
@@ -48,9 +48,11 @@ async def test_normalize_and_disabled_and_bypass_and_no_rule(scope) -> None:  #
 
 
 @pytest.mark.asyncio
-async def test_sliding_window_allowed_and_rejected(scope) -> None:  # type: ignore[valid-type]
+async def test_sliding_window_allowed_and_rejected(
+    scope: AsyncContainer, unique_id: Callable[[str], str]
+) -> None:
     svc: RateLimitService = await scope.get(RateLimitService)
-    svc.prefix = f"{svc.prefix}{uuid4().hex[:6]}:"
+    svc.prefix = f"{svc.prefix}{unique_id('')}:"
     svc.settings.RATE_LIMIT_ENABLED = True  # Enable rate limiting for this test
     # matching rule with window 5, limit 3
     rule = RateLimitRule(endpoint_pattern=r"^/api/v1/x", group=EndpointGroup.API, requests=3, window_seconds=5,
@@ -73,9 +75,9 @@ async def test_sliding_window_allowed_and_rejected(scope) -> None:  # type: igno
 
 
 @pytest.mark.asyncio
-async def test_token_bucket_paths(scope) -> None:  # type: ignore[valid-type]
+async def test_token_bucket_paths(scope: AsyncContainer, unique_id: Callable[[str], str]) -> None:
     svc: RateLimitService = await scope.get(RateLimitService)
-    svc.prefix = f"{svc.prefix}{uuid4().hex[:6]}:"
+    svc.prefix = f"{svc.prefix}{unique_id('')}:"
     svc.settings.RATE_LIMIT_ENABLED = True  # Enable rate limiting for this test
     rule = RateLimitRule(endpoint_pattern=r"^/api/v1/t", group=EndpointGroup.API, requests=2, window_seconds=10,
                          burst_multiplier=1.0, algorithm=RateLimitAlgorithm.TOKEN_BUCKET)
@@ -101,9 +103,11 @@ async def test_token_bucket_paths(scope) -> None:  # type: ignore[valid-type]
 
 
 @pytest.mark.asyncio
-async def test_config_update_and_user_helpers(scope) -> None:  # type: ignore[valid-type]
+async def test_config_update_and_user_helpers(
+    scope: AsyncContainer, unique_id: Callable[[str], str]
+) -> None:
     svc: RateLimitService = await scope.get(RateLimitService)
-    svc.prefix = f"{svc.prefix}{uuid4().hex[:6]}:"
+    svc.prefix = f"{svc.prefix}{unique_id('')}:"
     cfg = RateLimitConfig(
         default_rules=[RateLimitRule(endpoint_pattern=r"^/a", group=EndpointGroup.API, requests=1, window_seconds=1)])
     await svc.update_config(cfg)
@@ -124,10 +128,10 @@ async def test_config_update_and_user_helpers(scope) -> None:  # type: ignore[va
 
 
 @pytest.mark.asyncio
-async def test_ip_based_rate_limiting(scope) -> None:  # type: ignore[valid-type]
+async def test_ip_based_rate_limiting(scope: AsyncContainer, unique_id: Callable[[str], str]) -> None:
     """Test IP-based rate limiting."""
     svc: RateLimitService = await scope.get(RateLimitService)
-    svc.prefix = f"{svc.prefix}{uuid4().hex[:6]}:"
+    svc.prefix = f"{svc.prefix}{unique_id('')}:"
 
     # Test IP-based check
     cfg = RateLimitConfig(
@@ -151,22 +155,27 @@ async def test_ip_based_rate_limiting(scope) -> None:  # type: ignore[valid-type
 
 
 @pytest.mark.asyncio
-async def test_get_config_roundtrip(scope) -> None:  # type: ignore[valid-type]
+async def test_get_config_roundtrip(scope: AsyncContainer, unique_id: Callable[[str], str]) -> None:
     svc: RateLimitService = await scope.get(RateLimitService)
-    svc.prefix = f"{svc.prefix}{uuid4().hex[:6]}:"
-    cfg = RateLimitConfig(default_rules=[RateLimitRule(endpoint_pattern=r"^/z", group=EndpointGroup.API, requests=1, window_seconds=1)])
+    svc.prefix = f"{svc.prefix}{unique_id('')}:"
+    rule = RateLimitRule(endpoint_pattern=r"^/z", group=EndpointGroup.API, requests=1, window_seconds=1)
+    cfg = RateLimitConfig(default_rules=[rule])
     await svc.update_config(cfg)
     got = await svc._get_config()
     assert isinstance(got, RateLimitConfig)
 
 
 @pytest.mark.asyncio
-async def test_sliding_window_edge(scope) -> None:  # type: ignore[valid-type]
+async def test_sliding_window_edge(scope: AsyncContainer, unique_id: Callable[[str], str]) -> None:
     svc: RateLimitService = await scope.get(RateLimitService)
-    svc.prefix = f"{svc.prefix}{uuid4().hex[:6]}:"
+    svc.prefix = f"{svc.prefix}{unique_id('')}:"
     svc.settings.RATE_LIMIT_ENABLED = True  # Enable rate limiting for this test
     # Configure a tight window and ensure behavior is consistent
-    cfg = RateLimitConfig(default_rules=[RateLimitRule(endpoint_pattern=r"^/edge", group=EndpointGroup.API, requests=1, window_seconds=1, algorithm=RateLimitAlgorithm.SLIDING_WINDOW)])
+    rule = RateLimitRule(
+        endpoint_pattern=r"^/edge", group=EndpointGroup.API,
+        requests=1, window_seconds=1, algorithm=RateLimitAlgorithm.SLIDING_WINDOW,
+    )
+    cfg = RateLimitConfig(default_rules=[rule])
     await svc.update_config(cfg)
     ok = await svc.check_rate_limit("u", "/edge")
     assert ok.allowed is True
@@ -176,16 +185,18 @@ async def test_sliding_window_edge(scope) -> None:  # type: ignore[valid-type]
 
 
 @pytest.mark.asyncio
-async def test_sliding_window_pipeline_failure(scope, monkeypatch) -> None:  # type: ignore[valid-type]
+async def test_sliding_window_pipeline_failure(
+    scope: AsyncContainer, monkeypatch: pytest.MonkeyPatch, unique_id: Callable[[str], str]
+) -> None:
     svc: RateLimitService = await scope.get(RateLimitService)
-    svc.prefix = f"{svc.prefix}{uuid4().hex[:6]}:"
+    svc.prefix = f"{svc.prefix}{unique_id('')}:"
 
     class FailingPipe:
-        def zremrangebyscore(self, *a, **k): return self  # noqa: ANN001, D401
-        def zadd(self, *a, **k): return self  # noqa: ANN001, D401
-        def zcard(self, *a, **k): return self  # noqa: ANN001, D401
-        def expire(self, *a, **k): return self  # noqa: ANN001, D401
-        async def execute(self): raise ConnectionError("Pipeline failed")
+        def zremrangebyscore(self, *a: Any, **k: Any) -> Self: return self
+        def zadd(self, *a: Any, **k: Any) -> Self: return self
+        def zcard(self, *a: Any, **k: Any) -> Self: return self
+        def expire(self, *a: Any, **k: Any) -> Self: return self
+        async def execute(self) -> None: raise ConnectionError("Pipeline failed")
 
     monkeypatch.setattr(svc.redis, "pipeline", lambda: FailingPipe())
 
@@ -204,7 +215,7 @@ async def execute(self): raise ConnectionError("Pipeline failed")
 
 
 @pytest.mark.asyncio
-async def test_token_bucket_invalid_data(scope) -> None:  # type: ignore[valid-type]
+async def test_token_bucket_invalid_data(scope: AsyncContainer) -> None:
     svc: RateLimitService = await scope.get(RateLimitService)
     key = f"{svc.prefix}tb:user:/api"
     await svc.redis.set(key, "invalid-json")
@@ -224,9 +235,12 @@ async def test_token_bucket_invalid_data(scope) -> None:  # type: ignore[valid-t
 
 
 @pytest.mark.asyncio
-async def test_update_config_serialization_error(scope, monkeypatch) -> None:  # type: ignore[valid-type]
+async def test_update_config_serialization_error(
+    scope: AsyncContainer, monkeypatch: pytest.MonkeyPatch
+) -> None:
     svc: RateLimitService = await scope.get(RateLimitService)
-    async def failing_setex(key, ttl, value):  # noqa: ANN001
+
+    async def failing_setex(key: str, ttl: int, value: str) -> None:
         raise ValueError("Serialization failed")
     monkeypatch.setattr(svc.redis, "setex", failing_setex)
 
@@ -236,16 +250,17 @@ async def failing_setex(key, ttl, value):  # noqa: ANN001
 
 
 @pytest.mark.asyncio
-async def test_get_user_rate_limit_not_found(scope) -> None:  # type: ignore[valid-type]
+async def test_get_user_rate_limit_not_found(scope: AsyncContainer) -> None:
     svc: RateLimitService = await scope.get(RateLimitService)
     result = await svc.get_user_rate_limit("nonexistent")
     assert result is None
 
 
 @pytest.mark.asyncio
-async def test_reset_user_limits_error(scope, monkeypatch) -> None:  # type: ignore[valid-type]
+async def test_reset_user_limits_error(scope: AsyncContainer, monkeypatch: pytest.MonkeyPatch) -> None:
     svc: RateLimitService = await scope.get(RateLimitService)
-    async def failing_smembers(key):  # noqa: ANN001
+
+    async def failing_smembers(key: str) -> None:
         raise ConnectionError("smembers failed")
     monkeypatch.setattr(svc.redis, "smembers", failing_smembers)
     with pytest.raises(ConnectionError):
@@ -253,18 +268,17 @@ async def failing_smembers(key):  # noqa: ANN001
 
 
 @pytest.mark.asyncio
-async def test_get_usage_stats_with_keys(scope) -> None:  # type: ignore[valid-type]
+async def test_get_usage_stats_with_keys(scope: AsyncContainer) -> None:
     svc: RateLimitService = await scope.get(RateLimitService)
     user_id = "user"
-    index_key = f"{svc.prefix}index:{user_id}"
     sw_key = f"{svc.prefix}sw:{user_id}:/api:key1"
-    await svc.redis.sadd(index_key, sw_key)
+    await svc._register_user_key(user_id, sw_key)
     stats = await svc.get_usage_stats(user_id)
     assert isinstance(stats, dict)
 
 
 @pytest.mark.asyncio
-async def test_check_rate_limit_with_user_override(scope) -> None:  # type: ignore[valid-type]
+async def test_check_rate_limit_with_user_override(scope: AsyncContainer) -> None:
     svc: RateLimitService = await scope.get(RateLimitService)
     svc.settings.RATE_LIMIT_ENABLED = True  # Enable rate limiting for this test
     rule = RateLimitRule(
@@ -281,15 +295,13 @@ async def test_check_rate_limit_with_user_override(scope) -> None:  # type: igno
     endpoint = "/api/test"
     allowed_count = 0
     for _ in range(5):
-        res = await svc.check_rate_limit("normal_user", endpoint, config=cfg)
-        allowed_count += 1 if res.allowed else 0
-        await asyncio.sleep(0.05)
+        if (await svc.check_rate_limit("normal_user", endpoint, config=cfg)).allowed:
+            allowed_count += 1
     assert allowed_count == int(rule.requests)  # Should be exactly 3
 
     # Special user: higher multiplier allows more requests
     allowed_count_special = 0
     for _ in range(6):
-        res = await svc.check_rate_limit("special_user", endpoint, config=cfg)
-        allowed_count_special += 1 if res.allowed else 0
-        await asyncio.sleep(0.05)
+        if (await svc.check_rate_limit("special_user", endpoint, config=cfg)).allowed:
+            allowed_count_special += 1
     assert allowed_count_special > allowed_count
diff --git a/backend/tests/integration/services/replay/__init__.py b/backend/tests/integration/services/replay/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/backend/tests/integration/services/replay/test_replay_service.py b/backend/tests/integration/services/replay/test_replay_service.py
index de47f756..730a12c9 100644
--- a/backend/tests/integration/services/replay/test_replay_service.py
+++ b/backend/tests/integration/services/replay/test_replay_service.py
@@ -1,14 +1,14 @@
 import pytest
-
 from app.domain.enums.replay import ReplayTarget, ReplayType
 from app.services.event_replay import ReplayConfig, ReplayFilter
 from app.services.replay_service import ReplayService
+from dishka import AsyncContainer
 
 pytestmark = pytest.mark.integration
 
 
 @pytest.mark.asyncio
-async def test_replay_service_create_and_list(scope) -> None:  # type: ignore[valid-type]
+async def test_replay_service_create_and_list(scope: AsyncContainer) -> None:
     svc: ReplayService = await scope.get(ReplayService)
 
     cfg = ReplayConfig(
diff --git a/backend/tests/integration/services/saga/__init__.py b/backend/tests/integration/services/saga/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/backend/tests/integration/services/saga/test_saga_service.py b/backend/tests/integration/services/saga/test_saga_service.py
index 21d6f3b1..78f8dbe2 100644
--- a/backend/tests/integration/services/saga/test_saga_service.py
+++ b/backend/tests/integration/services/saga/test_saga_service.py
@@ -1,26 +1,26 @@
-import pytest
 from datetime import datetime, timezone
 
+import pytest
+from app.domain.enums.user import UserRole
+from app.schemas_pydantic.user import User
 from app.services.saga.saga_service import SagaService
+from dishka import AsyncContainer
 
 pytestmark = [pytest.mark.integration, pytest.mark.mongodb]
 
 
 @pytest.mark.asyncio
-async def test_saga_service_basic(scope) -> None:  # type: ignore[valid-type]
+async def test_saga_service_basic(scope: AsyncContainer) -> None:
     svc: SagaService = await scope.get(SagaService)
-    from app.domain.user import User as DomainUser
-    from app.domain.enums.user import UserRole
-    user = DomainUser(
+    user = User(
         user_id="u1",
         username="u1",
         email="u1@example.com",
         role=UserRole.USER,
         is_active=True,
         is_superuser=False,
-        hashed_password="x",
         created_at=datetime.now(timezone.utc),
         updated_at=datetime.now(timezone.utc),
     )
     res = await svc.list_user_sagas(user)
-    assert hasattr(res, "sagas") and isinstance(res.sagas, list)
+    assert res.sagas is not None and isinstance(res.sagas, list)
diff --git a/backend/tests/integration/services/saved_script/__init__.py b/backend/tests/integration/services/saved_script/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/backend/tests/integration/services/saved_script/test_saved_script_service.py b/backend/tests/integration/services/saved_script/test_saved_script_service.py
index 16d980c8..8a96e59b 100644
--- a/backend/tests/integration/services/saved_script/test_saved_script_service.py
+++ b/backend/tests/integration/services/saved_script/test_saved_script_service.py
@@ -1,7 +1,7 @@
 import pytest
-
 from app.domain.saved_script import DomainSavedScriptCreate, DomainSavedScriptUpdate, SavedScriptNotFoundError
 from app.services.saved_script_service import SavedScriptService
+from dishka import AsyncContainer
 
 pytestmark = [pytest.mark.integration, pytest.mark.mongodb]
 
@@ -11,7 +11,7 @@ def _create_payload() -> DomainSavedScriptCreate:
 
 
 @pytest.mark.asyncio
-async def test_crud_saved_script(scope) -> None:  # type: ignore[valid-type]
+async def test_crud_saved_script(scope: AsyncContainer) -> None:
     service: SavedScriptService = await scope.get(SavedScriptService)
     created = await service.create_saved_script(_create_payload(), user_id="u1")
     assert created.user_id == "u1"
@@ -19,7 +19,9 @@ async def test_crud_saved_script(scope) -> None:  # type: ignore[valid-type]
     got = await service.get_saved_script(str(created.script_id), "u1")
     assert got and got.script_id == created.script_id
 
-    out = await service.update_saved_script(str(created.script_id), "u1", DomainSavedScriptUpdate(name="new", script="p"))
+    out = await service.update_saved_script(
+        str(created.script_id), "u1", DomainSavedScriptUpdate(name="new", script="p"),
+    )
     assert out and out.name == "new"
 
     lst = await service.list_saved_scripts("u1")
diff --git a/backend/tests/integration/services/sse/test_partitioned_event_router.py b/backend/tests/integration/services/sse/test_partitioned_event_router.py
index 040a62b5..7dc42a17 100644
--- a/backend/tests/integration/services/sse/test_partitioned_event_router.py
+++ b/backend/tests/integration/services/sse/test_partitioned_event_router.py
@@ -1,6 +1,8 @@
 import logging
-from uuid import uuid4
+from collections.abc import Callable
+from typing import Any
 
+import backoff
 import pytest
 from app.core.metrics.events import EventMetrics
 from app.events.core import EventDispatcher
@@ -11,7 +13,6 @@
 from app.settings import Settings
 
 from tests.helpers import make_execution_requested_event
-from tests.helpers.eventually import eventually
 
 pytestmark = [pytest.mark.integration, pytest.mark.redis]
 
@@ -19,8 +20,10 @@
 
 
 @pytest.mark.asyncio
-async def test_router_bridges_to_redis(redis_client, test_settings: Settings) -> None:
-    suffix = uuid4().hex[:6]
+async def test_router_bridges_to_redis(
+    redis_client: Any, test_settings: Settings, unique_id: Callable[[str], str]
+) -> None:
+    suffix = unique_id("")
     bus = SSERedisBus(
         redis_client,
         exec_prefix=f"sse:exec:{suffix}:",
@@ -38,26 +41,33 @@ async def test_router_bridges_to_redis(redis_client, test_settings: Settings) ->
     router._register_routing_handlers(disp)
 
     # Open Redis subscription for our execution id
-    execution_id = f"e-{uuid4().hex[:8]}"
+    execution_id = unique_id("e-")
     subscription = await bus.open_subscription(execution_id)
 
     ev = make_execution_requested_event(execution_id=execution_id)
     handler = disp.get_handlers(ev.event_type)[0]
     await handler(ev)
 
-    async def _recv():
+    msg: RedisSSEMessage | None = None
+
+    @backoff.on_exception(backoff.constant, AssertionError, max_time=2.0, interval=0.05)
+    async def _wait_recv() -> None:
+        nonlocal msg
         m = await subscription.get(RedisSSEMessage)
         assert m is not None
-        return m
+        msg = m
 
-    msg = await eventually(_recv, timeout=2.0, interval=0.05)
+    await _wait_recv()
+    assert msg is not None
     assert str(msg.event_type) == str(ev.event_type)
 
 
 @pytest.mark.asyncio
-async def test_router_start_and_stop(redis_client, test_settings: Settings) -> None:
+async def test_router_start_and_stop(
+    redis_client: Any, test_settings: Settings, unique_id: Callable[[str], str]
+) -> None:
     test_settings.SSE_CONSUMER_POOL_SIZE = 1
-    suffix = uuid4().hex[:6]
+    suffix = unique_id("")
     router = SSEKafkaRedisBridge(
         schema_registry=SchemaRegistryManager(settings=test_settings, logger=_test_logger),
         settings=test_settings,
diff --git a/backend/tests/integration/services/sse/test_redis_bus.py b/backend/tests/integration/services/sse/test_redis_bus.py
deleted file mode 100644
index ae54a6e4..00000000
--- a/backend/tests/integration/services/sse/test_redis_bus.py
+++ /dev/null
@@ -1,121 +0,0 @@
-import asyncio
-import json
-import logging
-from typing import Any
-
-import pytest
-
-pytestmark = pytest.mark.integration
-
-from app.domain.enums.events import EventType
-from app.schemas_pydantic.sse import RedisNotificationMessage, RedisSSEMessage
-from app.services.sse.redis_bus import SSERedisBus
-
-_test_logger = logging.getLogger("test.services.sse.redis_bus")
-
-
-class _DummyEvent:
-    def __init__(self, execution_id: str, event_type: EventType, extra: dict[str, Any] | None = None) -> None:
-        self.execution_id = execution_id
-        self.event_type = event_type
-        self._extra = extra or {}
-
-    def model_dump(self, mode: str | None = None) -> dict[str, Any]:  # noqa: ARG002
-        return {"execution_id": self.execution_id, **self._extra}
-
-
-class _FakePubSub:
-    def __init__(self) -> None:
-        self.subscribed: set[str] = set()
-        self._queue: asyncio.Queue[dict[str, Any]] = asyncio.Queue()
-        self.closed = False
-
-    async def subscribe(self, channel: str) -> None:
-        self.subscribed.add(channel)
-
-    async def get_message(self, ignore_subscribe_messages: bool = True, timeout: float = 0.5):  # noqa: ARG002
-        try:
-            msg = await asyncio.wait_for(self._queue.get(), timeout=timeout)
-            return msg
-        except asyncio.TimeoutError:
-            return None
-
-    async def push(self, channel: str, payload: str | bytes) -> None:
-        self._queue.put_nowait({"type": "message", "channel": channel, "data": payload})
-
-    async def unsubscribe(self, channel: str) -> None:
-        self.subscribed.discard(channel)
-
-    async def aclose(self) -> None:
-        self.closed = True
-
-
-class _FakeRedis:
-    def __init__(self) -> None:
-        self.published: list[tuple[str, str]] = []
-        self._pubsub = _FakePubSub()
-
-    async def publish(self, channel: str, payload: str) -> None:
-        self.published.append((channel, payload))
-
-    def pubsub(self) -> _FakePubSub:
-        return self._pubsub
-
-
-@pytest.mark.asyncio
-async def test_publish_and_subscribe_round_trip() -> None:
-    r = _FakeRedis()
-    bus = SSERedisBus(r, logger=_test_logger)
-
-    # Subscribe
-    sub = await bus.open_subscription("exec-1")
-    assert isinstance(sub, object)
-    assert "sse:exec:exec-1" in r._pubsub.subscribed
-
-    # Publish event
-    evt = _DummyEvent("exec-1", EventType.EXECUTION_COMPLETED, {"status": "completed"})
-    await bus.publish_event("exec-1", evt)
-    assert r.published, "nothing published"
-    ch, payload = r.published[-1]
-    assert ch.endswith("exec-1")
-    # Push to pubsub and read from subscription
-    await r._pubsub.push(ch, payload)
-    msg = await sub.get(RedisSSEMessage)
-    assert msg and msg.event_type == EventType.EXECUTION_COMPLETED
-    assert msg.execution_id == "exec-1"
-
-    # Non-message / invalid JSON paths
-    await r._pubsub.push(ch, b"not-json")
-    assert await sub.get(RedisSSEMessage) is None
-
-    # Close
-    await sub.close()
-    assert "sse:exec:exec-1" not in r._pubsub.subscribed and r._pubsub.closed is True
-
-
-@pytest.mark.asyncio
-async def test_notifications_channels() -> None:
-    r = _FakeRedis()
-    bus = SSERedisBus(r, logger=_test_logger)
-    nsub = await bus.open_notification_subscription("user-1")
-    assert "sse:notif:user-1" in r._pubsub.subscribed
-
-    notif = RedisNotificationMessage(
-        notification_id="n1",
-        severity="low",
-        status="pending",
-        tags=[],
-        subject="test",
-        body="body",
-        action_url="",
-        created_at="2025-01-01T00:00:00Z",
-    )
-    await bus.publish_notification("user-1", notif)
-    ch, payload = r.published[-1]
-    assert ch.endswith("user-1")
-    await r._pubsub.push(ch, payload)
-    got = await nsub.get(RedisNotificationMessage)
-    assert got is not None
-    assert got.notification_id == "n1"
-
-    await nsub.close()
diff --git a/backend/tests/integration/services/user_settings/__init__.py b/backend/tests/integration/services/user_settings/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/backend/tests/integration/services/user_settings/test_user_settings_service.py b/backend/tests/integration/services/user_settings/test_user_settings_service.py
index dccc3b2b..1acb9d2e 100644
--- a/backend/tests/integration/services/user_settings/test_user_settings_service.py
+++ b/backend/tests/integration/services/user_settings/test_user_settings_service.py
@@ -1,7 +1,6 @@
 from datetime import datetime, timezone
 
 import pytest
-
 from app.domain.enums import Theme
 from app.domain.user.settings_models import (
     DomainEditorSettings,
@@ -9,12 +8,13 @@
     DomainUserSettingsUpdate,
 )
 from app.services.user_settings_service import UserSettingsService
+from dishka import AsyncContainer
 
 pytestmark = [pytest.mark.integration, pytest.mark.mongodb]
 
 
 @pytest.mark.asyncio
-async def test_get_update_and_history(scope) -> None:  # type: ignore[valid-type]
+async def test_get_update_and_history(scope: AsyncContainer) -> None:
     svc: UserSettingsService = await scope.get(UserSettingsService)
     user_id = "u1"
 
diff --git a/backend/tests/integration/test_admin_routes.py b/backend/tests/integration/test_admin_routes.py
index 03206678..4ea7c383 100644
--- a/backend/tests/integration/test_admin_routes.py
+++ b/backend/tests/integration/test_admin_routes.py
@@ -1,11 +1,7 @@
-from typing import Dict
-from uuid import uuid4
+from collections.abc import Callable
 
 import pytest
 from app.schemas_pydantic.admin_settings import (
-    ExecutionLimitsSchema,
-    MonitoringSettingsSchema,
-    SecuritySettingsSchema,
     SystemSettings,
 )
 from app.schemas_pydantic.admin_user_overview import AdminUserOverview
@@ -27,61 +23,27 @@ async def test_get_settings_requires_auth(self, client: AsyncClient) -> None:
         assert "not authenticated" in error["detail"].lower() or "unauthorized" in error["detail"].lower()
 
     @pytest.mark.asyncio
-    async def test_get_settings_with_admin_auth(self, client: AsyncClient, test_admin: Dict[str, str]) -> None:
+    async def test_get_settings_with_admin_auth(self, authenticated_admin_client: AsyncClient) -> None:
         """Test getting system settings with admin authentication."""
-        # Login and get cookies
-        login_data = {
-            "username": test_admin["username"],
-            "password": test_admin["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
-        # Now get settings with auth cookie
-        response = await client.get("/api/v1/admin/settings/")
+        response = await authenticated_admin_client.get("/api/v1/admin/settings/")
         assert response.status_code == 200
 
-        # Validate response structure
-        data = response.json()
-        settings = SystemSettings(**data)
-
-        # Verify all nested structures
-        assert settings.execution_limits is not None
-        assert isinstance(settings.execution_limits, ExecutionLimitsSchema)
-        assert settings.execution_limits.max_timeout_seconds == 300  # Default value
-        assert settings.execution_limits.max_memory_mb == 512
-        assert settings.execution_limits.max_cpu_cores == 2
-        assert settings.execution_limits.max_concurrent_executions == 10
-
-        assert settings.security_settings is not None
-        assert isinstance(settings.security_settings, SecuritySettingsSchema)
-        assert settings.security_settings.password_min_length == 8
-        assert settings.security_settings.session_timeout_minutes == 60
-        assert settings.security_settings.max_login_attempts == 5
-        assert settings.security_settings.lockout_duration_minutes == 15
-
-        assert settings.monitoring_settings is not None
-        assert isinstance(settings.monitoring_settings, MonitoringSettingsSchema)
-        assert settings.monitoring_settings.metrics_retention_days == 30
-        assert settings.monitoring_settings.log_level == "INFO"
-        assert settings.monitoring_settings.enable_tracing is True
-        assert settings.monitoring_settings.sampling_rate == 0.1
+        # Pydantic validates types, required fields, and nested structures
+        settings = SystemSettings(**response.json())
+
+        # Verify reasonable bounds (not exact values - those can change)
+        assert settings.execution_limits.max_timeout_seconds > 0
+        assert settings.execution_limits.max_memory_mb > 0
+        assert settings.security_settings.password_min_length >= 1
+        assert settings.monitoring_settings.sampling_rate >= 0
 
     @pytest.mark.asyncio
-    async def test_update_and_reset_settings(self, client: AsyncClient, test_admin: Dict[str, str]) -> None:
+    async def test_update_and_reset_settings(self, authenticated_admin_client: AsyncClient) -> None:
         """Test updating and resetting system settings."""
-        # Login as admin
-        login_data = {
-            "username": test_admin["username"],
-            "password": test_admin["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # Get original settings
-        original_response = await client.get("/api/v1/admin/settings/")
+        original_response = await authenticated_admin_client.get("/api/v1/admin/settings/")
         assert original_response.status_code == 200
-        original_settings = original_response.json()
+        # original_settings preserved for potential rollback verification
 
         # Update settings
         updated_settings = {
@@ -105,7 +67,7 @@ async def test_update_and_reset_settings(self, client: AsyncClient, test_admin:
             }
         }
 
-        update_response = await client.put("/api/v1/admin/settings/", json=updated_settings)
+        update_response = await authenticated_admin_client.put("/api/v1/admin/settings/", json=updated_settings)
         assert update_response.status_code == 200
 
         # Verify updates were applied
@@ -115,7 +77,7 @@ async def test_update_and_reset_settings(self, client: AsyncClient, test_admin:
         assert returned_settings.monitoring_settings.log_level == "WARNING"
 
         # Reset settings
-        reset_response = await client.post("/api/v1/admin/settings/reset")
+        reset_response = await authenticated_admin_client.post("/api/v1/admin/settings/reset")
         assert reset_response.status_code == 200
 
         # Verify reset to defaults
@@ -125,18 +87,10 @@ async def test_update_and_reset_settings(self, client: AsyncClient, test_admin:
         assert reset_settings.monitoring_settings.log_level == "INFO"
 
     @pytest.mark.asyncio
-    async def test_regular_user_cannot_access_settings(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_regular_user_cannot_access_settings(self, authenticated_client: AsyncClient) -> None:
         """Test that regular users cannot access admin settings."""
-        # Login as regular user
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
-        # Try to access admin settings
-        response = await client.get("/api/v1/admin/settings/")
+        # Try to access admin settings as regular user
+        response = await authenticated_client.get("/api/v1/admin/settings/")
         assert response.status_code == 403
 
         error = response.json()
@@ -149,18 +103,10 @@ class TestAdminUsers:
     """Test admin user management endpoints against real backend."""
 
     @pytest.mark.asyncio
-    async def test_list_users_with_pagination(self, client: AsyncClient, test_admin: Dict[str, str]) -> None:
+    async def test_list_users_with_pagination(self, authenticated_admin_client: AsyncClient) -> None:
         """Test listing users with pagination."""
-        # Login as admin
-        login_data = {
-            "username": test_admin["username"],
-            "password": test_admin["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # List users
-        response = await client.get("/api/v1/admin/users/?limit=10&offset=0")
+        response = await authenticated_admin_client.get("/api/v1/admin/users/?limit=10&offset=0")
         assert response.status_code == 200
 
         data = response.json()
@@ -188,25 +134,19 @@ async def test_list_users_with_pagination(self, client: AsyncClient, test_admin:
             assert "updated_at" in user
 
     @pytest.mark.asyncio
-    async def test_create_and_manage_user(self, client: AsyncClient, test_admin: Dict[str, str]) -> None:
+    async def test_create_and_manage_user(
+        self, authenticated_admin_client: AsyncClient, unique_id: Callable[[str], str]
+    ) -> None:
         """Test full user CRUD operations."""
-        # Login as admin
-        login_data = {
-            "username": test_admin["username"],
-            "password": test_admin["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # Create a new user
-        unique_id = str(uuid4())[:8]
+        uid = unique_id("")
         new_user_data = {
-            "username": f"test_managed_user_{unique_id}",
-            "email": f"managed_{unique_id}@example.com",
+            "username": f"test_managed_user_{uid}",
+            "email": f"managed_{uid}@example.com",
             "password": "SecureP@ssw0rd123"
         }
 
-        create_response = await client.post("/api/v1/admin/users/", json=new_user_data)
+        create_response = await authenticated_admin_client.post("/api/v1/admin/users/", json=new_user_data)
         assert create_response.status_code in [200, 201]
 
         created_user = create_response.json()
@@ -218,11 +158,11 @@ async def test_create_and_manage_user(self, client: AsyncClient, test_admin: Dic
         user_id = created_user["user_id"]
 
         # Get user details
-        get_response = await client.get(f"/api/v1/admin/users/{user_id}")
+        get_response = await authenticated_admin_client.get(f"/api/v1/admin/users/{user_id}")
         assert get_response.status_code == 200
 
         # Get user overview
-        overview_response = await client.get(f"/api/v1/admin/users/{user_id}/overview")
+        overview_response = await authenticated_admin_client.get(f"/api/v1/admin/users/{user_id}/overview")
         assert overview_response.status_code == 200
 
         overview_data = overview_response.json()
@@ -232,11 +172,11 @@ async def test_create_and_manage_user(self, client: AsyncClient, test_admin: Dic
 
         # Update user
         update_data = {
-            "username": f"updated_{unique_id}",
-            "email": f"updated_{unique_id}@example.com"
+            "username": f"updated_{uid}",
+            "email": f"updated_{uid}@example.com"
         }
 
-        update_response = await client.put(f"/api/v1/admin/users/{user_id}", json=update_data)
+        update_response = await authenticated_admin_client.put(f"/api/v1/admin/users/{user_id}", json=update_data)
         assert update_response.status_code == 200
 
         updated_user = update_response.json()
@@ -244,11 +184,11 @@ async def test_create_and_manage_user(self, client: AsyncClient, test_admin: Dic
         assert updated_user["email"] == update_data["email"]
 
         # Delete user
-        delete_response = await client.delete(f"/api/v1/admin/users/{user_id}")
+        delete_response = await authenticated_admin_client.delete(f"/api/v1/admin/users/{user_id}")
         assert delete_response.status_code in [200, 204]
 
         # Verify deletion
-        get_deleted_response = await client.get(f"/api/v1/admin/users/{user_id}")
+        get_deleted_response = await authenticated_admin_client.get(f"/api/v1/admin/users/{user_id}")
         assert get_deleted_response.status_code == 404
 
 
@@ -257,16 +197,8 @@ class TestAdminEvents:
     """Test admin event management endpoints against real backend."""
 
     @pytest.mark.asyncio
-    async def test_browse_events(self, client: AsyncClient, test_admin: Dict[str, str]) -> None:
+    async def test_browse_events(self, authenticated_admin_client: AsyncClient) -> None:
         """Test browsing events with filters."""
-        # Login as admin
-        login_data = {
-            "username": test_admin["username"],
-            "password": test_admin["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # Browse events
         browse_payload = {
             "filters": {
@@ -278,7 +210,7 @@ async def test_browse_events(self, client: AsyncClient, test_admin: Dict[str, st
             "sort_order": -1
         }
 
-        response = await client.post("/api/v1/admin/events/browse", json=browse_payload)
+        response = await authenticated_admin_client.post("/api/v1/admin/events/browse", json=browse_payload)
         assert response.status_code == 200
 
         data = response.json()
@@ -291,18 +223,10 @@ async def test_browse_events(self, client: AsyncClient, test_admin: Dict[str, st
         assert data["total"] >= 0
 
     @pytest.mark.asyncio
-    async def test_event_statistics(self, client: AsyncClient, test_admin: Dict[str, str]) -> None:
+    async def test_event_statistics(self, authenticated_admin_client: AsyncClient) -> None:
         """Test getting event statistics."""
-        # Login as admin
-        login_data = {
-            "username": test_admin["username"],
-            "password": test_admin["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # Get event statistics
-        response = await client.get("/api/v1/admin/events/stats?hours=24")
+        response = await authenticated_admin_client.get("/api/v1/admin/events/stats?hours=24")
         assert response.status_code == 200
 
         data = response.json()
@@ -324,15 +248,10 @@ async def test_event_statistics(self, client: AsyncClient, test_admin: Dict[str,
             assert data["error_rate"] >= 0.0
 
     @pytest.mark.asyncio
-    async def test_admin_events_export_csv_and_json(self, client: AsyncClient, test_admin: Dict[str, str]) -> None:
+    async def test_admin_events_export_csv_and_json(self, authenticated_admin_client: AsyncClient) -> None:
         """Export admin events as CSV and JSON and validate basic structure."""
-        # Login as admin
-        login_data = {"username": test_admin["username"], "password": test_admin["password"]}
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # CSV export
-        r_csv = await client.get("/api/v1/admin/events/export/csv?limit=10")
+        r_csv = await authenticated_admin_client.get("/api/v1/admin/events/export/csv?limit=10")
         assert r_csv.status_code == 200, f"CSV export failed: {r_csv.status_code} - {r_csv.text[:200]}"
         ct_csv = r_csv.headers.get("content-type", "")
         assert "text/csv" in ct_csv
@@ -341,7 +260,7 @@ async def test_admin_events_export_csv_and_json(self, client: AsyncClient, test_
         assert "Event ID" in body_csv and "Timestamp" in body_csv
 
         # JSON export
-        r_json = await client.get("/api/v1/admin/events/export/json?limit=10")
+        r_json = await authenticated_admin_client.get("/api/v1/admin/events/export/json?limit=10")
         assert r_json.status_code == 200, f"JSON export failed: {r_json.status_code} - {r_json.text[:200]}"
         ct_json = r_json.headers.get("content-type", "")
         assert "application/json" in ct_json
@@ -351,27 +270,23 @@ async def test_admin_events_export_csv_and_json(self, client: AsyncClient, test_
         assert "exported_at" in data["export_metadata"]
 
     @pytest.mark.asyncio
-    async def test_admin_user_rate_limits_and_password_reset(self, client: AsyncClient,
-                                                             test_admin: Dict[str, str]) -> None:
+    async def test_admin_user_rate_limits_and_password_reset(
+        self, authenticated_admin_client: AsyncClient, unique_id: Callable[[str], str]
+    ) -> None:
         """Create a user, manage rate limits, and reset password via admin endpoints."""
-        # Login as admin
-        login_data = {"username": test_admin["username"], "password": test_admin["password"]}
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # Create a new user to operate on
-        unique_id = str(uuid4())[:8]
+        uid = unique_id("")
         new_user = {
-            "username": f"rate_limit_user_{unique_id}",
-            "email": f"rl_{unique_id}@example.com",
+            "username": f"rate_limit_user_{uid}",
+            "email": f"rl_{uid}@example.com",
             "password": "TempP@ss1234"
         }
-        create_response = await client.post("/api/v1/admin/users/", json=new_user)
+        create_response = await authenticated_admin_client.post("/api/v1/admin/users/", json=new_user)
         assert create_response.status_code in [200, 201]
         target_user_id = create_response.json()["user_id"]
 
         # Get current rate limits (may be None for fresh user)
-        rl_get = await client.get(f"/api/v1/admin/users/{target_user_id}/rate-limits")
+        rl_get = await authenticated_admin_client.get(f"/api/v1/admin/users/{target_user_id}/rate-limits")
         assert rl_get.status_code == 200
         rl_body = rl_get.json()
         assert rl_body.get("user_id") == target_user_id
@@ -395,28 +310,30 @@ async def test_admin_user_rate_limits_and_password_reset(self, client: AsyncClie
                 }
             ]
         }
-        rl_put = await client.put(f"/api/v1/admin/users/{target_user_id}/rate-limits", json=update_payload)
+        rl_put = await authenticated_admin_client.put(
+            f"/api/v1/admin/users/{target_user_id}/rate-limits", json=update_payload
+        )
         assert rl_put.status_code == 200
         put_body = rl_put.json()
         assert put_body.get("updated") is True
         assert put_body.get("config", {}).get("user_id") == target_user_id
 
         # Reset rate limits
-        rl_reset = await client.post(f"/api/v1/admin/users/{target_user_id}/rate-limits/reset")
+        rl_reset = await authenticated_admin_client.post(f"/api/v1/admin/users/{target_user_id}/rate-limits/reset")
         assert rl_reset.status_code == 200
 
         # Reset password for the user
         new_password = "NewPassw0rd!"
-        pw_reset = await client.post(
+        pw_reset = await authenticated_admin_client.post(
             f"/api/v1/admin/users/{target_user_id}/reset-password",
             json={"new_password": new_password}
         )
         assert pw_reset.status_code == 200
 
         # Verify user can login with the new password
-        logout_resp = await client.post("/api/v1/auth/logout")
+        logout_resp = await authenticated_admin_client.post("/api/v1/auth/logout")
         assert logout_resp.status_code in [200, 204]
-        login_new = await client.post(
+        login_new = await authenticated_admin_client.post(
             "/api/v1/auth/login",
             data={"username": new_user["username"], "password": new_password}
         )
diff --git a/backend/tests/integration/test_alertmanager.py b/backend/tests/integration/test_alertmanager.py
index c61304c1..609a26af 100644
--- a/backend/tests/integration/test_alertmanager.py
+++ b/backend/tests/integration/test_alertmanager.py
@@ -1,12 +1,13 @@
-import pytest
 from datetime import datetime, timezone
 
+import pytest
+from httpx import AsyncClient
 
 pytestmark = pytest.mark.integration
 
 
 @pytest.mark.asyncio
-async def test_grafana_alert_endpoints(client):
+async def test_grafana_alert_endpoints(client: AsyncClient) -> None:
     # Test endpoint
     r_test = await client.get("/api/v1/alerts/grafana/test")
     assert r_test.status_code == 200
diff --git a/backend/tests/integration/test_auth_routes.py b/backend/tests/integration/test_auth_routes.py
index 07df6472..02a802f2 100644
--- a/backend/tests/integration/test_auth_routes.py
+++ b/backend/tests/integration/test_auth_routes.py
@@ -1,10 +1,9 @@
-from uuid import uuid4
+from typing import Callable
 
 import pytest
-from httpx import AsyncClient
-
 from app.domain.enums.user import UserRole as UserRoleEnum
 from app.schemas_pydantic.user import UserResponse
+from httpx import AsyncClient
 
 
 @pytest.mark.integration
@@ -12,12 +11,12 @@ class TestAuthentication:
     """Test authentication endpoints against real backend."""
 
     @pytest.mark.asyncio
-    async def test_user_registration_success(self, client: AsyncClient) -> None:
+    async def test_user_registration_success(self, client: AsyncClient, unique_id: Callable[[str], str]) -> None:
         """Test successful user registration with all required fields."""
-        unique_id = str(uuid4())[:8]
+        uid = unique_id("")
         registration_data = {
-            "username": f"test_auth_user_{unique_id}",
-            "email": f"test_auth_{unique_id}@example.com",
+            "username": f"test_auth_user_{uid}",
+            "email": f"test_auth_{uid}@example.com",
             "password": "SecureP@ssw0rd123"
         }
 
@@ -48,12 +47,14 @@ async def test_user_registration_success(self, client: AsyncClient) -> None:
         assert user.is_superuser is False
 
     @pytest.mark.asyncio
-    async def test_user_registration_with_weak_password(self, client: AsyncClient) -> None:
+    async def test_user_registration_with_weak_password(
+        self, client: AsyncClient, unique_id: Callable[[str], str]
+    ) -> None:
         """Test that registration fails with weak passwords."""
-        unique_id = str(uuid4())[:8]
+        uid = unique_id("")
         registration_data = {
-            "username": f"test_weak_pwd_{unique_id}",
-            "email": f"test_weak_{unique_id}@example.com",
+            "username": f"test_weak_pwd_{uid}",
+            "email": f"test_weak_{uid}@example.com",
             "password": "weak"  # Too short
         }
 
@@ -71,12 +72,12 @@ async def test_user_registration_with_weak_password(self, client: AsyncClient) -
         assert any(word in error_text for word in ["password", "length", "characters", "weak", "short"])
 
     @pytest.mark.asyncio
-    async def test_duplicate_username_registration(self, client: AsyncClient) -> None:
+    async def test_duplicate_username_registration(self, client: AsyncClient, unique_id: Callable[[str], str]) -> None:
         """Test that duplicate username registration is prevented."""
-        unique_id = str(uuid4())[:8]
+        uid = unique_id("")
         registration_data = {
-            "username": f"duplicate_user_{unique_id}",
-            "email": f"duplicate1_{unique_id}@example.com",
+            "username": f"duplicate_user_{uid}",
+            "email": f"duplicate1_{uid}@example.com",
             "password": "SecureP@ssw0rd123"
         }
 
@@ -87,7 +88,7 @@ async def test_duplicate_username_registration(self, client: AsyncClient) -> Non
         # Attempt duplicate registration with same username, different email
         duplicate_data = {
             "username": registration_data["username"],  # Same username
-            "email": f"duplicate2_{unique_id}@example.com",  # Different email
+            "email": f"duplicate2_{uid}@example.com",  # Different email
             "password": "SecureP@ssw0rd123"
         }
 
@@ -100,12 +101,12 @@ async def test_duplicate_username_registration(self, client: AsyncClient) -> Non
                    for word in ["already", "exists", "taken", "duplicate"])
 
     @pytest.mark.asyncio
-    async def test_duplicate_email_registration(self, client: AsyncClient) -> None:
+    async def test_duplicate_email_registration(self, client: AsyncClient, unique_id: Callable[[str], str]) -> None:
         """Test that duplicate email registration is prevented."""
-        unique_id = str(uuid4())[:8]
+        uid = unique_id("")
         registration_data = {
-            "username": f"user_email1_{unique_id}",
-            "email": f"duplicate_email_{unique_id}@example.com",
+            "username": f"user_email1_{uid}",
+            "email": f"duplicate_email_{uid}@example.com",
             "password": "SecureP@ssw0rd123"
         }
 
@@ -115,23 +116,25 @@ async def test_duplicate_email_registration(self, client: AsyncClient) -> None:
 
         # Attempt duplicate registration with same email, different username
         duplicate_data = {
-            "username": f"user_email2_{unique_id}",  # Different username
+            "username": f"user_email2_{uid}",  # Different username
             "email": registration_data["email"],  # Same email
             "password": "SecureP@ssw0rd123"
         }
 
         duplicate_response = await client.post("/api/v1/auth/register", json=duplicate_data)
         # Backend might allow duplicate emails but not duplicate usernames
-        # If it allows the registration, that's also valid behavior  
+        # If it allows the registration, that's also valid behavior
         assert duplicate_response.status_code in [200, 201, 400, 409]
 
     @pytest.mark.asyncio
-    async def test_login_success_with_valid_credentials(self, client: AsyncClient) -> None:
+    async def test_login_success_with_valid_credentials(
+        self, client: AsyncClient, unique_id: Callable[[str], str]
+    ) -> None:
         """Test successful login with valid credentials."""
-        unique_id = str(uuid4())[:8]
+        uid = unique_id("")
         registration_data = {
-            "username": f"login_test_{unique_id}",
-            "email": f"login_{unique_id}@example.com",
+            "username": f"login_test_{uid}",
+            "email": f"login_{uid}@example.com",
             "password": "SecureLoginP@ss123"
         }
 
@@ -166,12 +169,14 @@ async def test_login_success_with_valid_credentials(self, client: AsyncClient) -
         assert len(cookies) > 0  # Should have at least one cookie
 
     @pytest.mark.asyncio
-    async def test_login_failure_with_wrong_password(self, client: AsyncClient) -> None:
+    async def test_login_failure_with_wrong_password(
+        self, client: AsyncClient, unique_id: Callable[[str], str]
+    ) -> None:
         """Test that login fails with incorrect password."""
-        unique_id = str(uuid4())[:8]
+        uid = unique_id("")
         registration_data = {
-            "username": f"wrong_pwd_{unique_id}",
-            "email": f"wrong_pwd_{unique_id}@example.com",
+            "username": f"wrong_pwd_{uid}",
+            "email": f"wrong_pwd_{uid}@example.com",
             "password": "CorrectP@ssw0rd123"
         }
 
@@ -193,11 +198,13 @@ async def test_login_failure_with_wrong_password(self, client: AsyncClient) -> N
                    for word in ["invalid", "incorrect", "credentials", "unauthorized"])
 
     @pytest.mark.asyncio
-    async def test_login_failure_with_nonexistent_user(self, client: AsyncClient) -> None:
+    async def test_login_failure_with_nonexistent_user(
+        self, client: AsyncClient, unique_id: Callable[[str], str]
+    ) -> None:
         """Test that login fails for non-existent user."""
-        unique_id = str(uuid4())[:8]
+        uid = unique_id("")
         login_data = {
-            "username": f"nonexistent_user_{unique_id}",
+            "username": f"nonexistent_user_{uid}",
             "password": "AnyP@ssw0rd123"
         }
 
@@ -208,12 +215,12 @@ async def test_login_failure_with_nonexistent_user(self, client: AsyncClient) ->
         assert "detail" in error_data
 
     @pytest.mark.asyncio
-    async def test_get_current_user_info(self, client: AsyncClient) -> None:
+    async def test_get_current_user_info(self, client: AsyncClient, unique_id: Callable[[str], str]) -> None:
         """Test getting current user information via /me endpoint."""
-        unique_id = str(uuid4())[:8]
+        uid = unique_id("")
         registration_data = {
-            "username": f"me_test_{unique_id}",
-            "email": f"me_test_{unique_id}@example.com",
+            "username": f"me_test_{uid}",
+            "email": f"me_test_{uid}@example.com",
             "password": "SecureP@ssw0rd123"
         }
 
@@ -259,12 +266,12 @@ async def test_unauthorized_access_without_auth(self, client: AsyncClient) -> No
                    for word in ["not authenticated", "unauthorized", "login"])
 
     @pytest.mark.asyncio
-    async def test_logout_clears_session(self, client: AsyncClient) -> None:
+    async def test_logout_clears_session(self, client: AsyncClient, unique_id: Callable[[str], str]) -> None:
         """Test logout functionality clears the session."""
-        unique_id = str(uuid4())[:8]
+        uid = unique_id("")
         registration_data = {
-            "username": f"logout_test_{unique_id}",
-            "email": f"logout_{unique_id}@example.com",
+            "username": f"logout_test_{uid}",
+            "email": f"logout_{uid}@example.com",
             "password": "SecureP@ssw0rd123"
         }
 
@@ -295,12 +302,12 @@ async def test_logout_clears_session(self, client: AsyncClient) -> None:
         assert me_after_logout.status_code == 401
 
     @pytest.mark.asyncio
-    async def test_verify_token_endpoint(self, client: AsyncClient) -> None:
+    async def test_verify_token_endpoint(self, client: AsyncClient, unique_id: Callable[[str], str]) -> None:
         """Test token verification endpoint."""
-        unique_id = str(uuid4())[:8]
+        uid = unique_id("")
         registration_data = {
-            "username": f"verify_token_{unique_id}",
-            "email": f"verify_{unique_id}@example.com",
+            "username": f"verify_token_{uid}",
+            "email": f"verify_{uid}@example.com",
             "password": "SecureP@ssw0rd123"
         }
 
@@ -328,9 +335,8 @@ async def test_verify_token_endpoint(self, client: AsyncClient) -> None:
             assert verify_data["username"] == registration_data["username"]
 
     @pytest.mark.asyncio
-    async def test_invalid_email_format_rejected(self, client: AsyncClient) -> None:
+    async def test_invalid_email_format_rejected(self, client: AsyncClient, unique_id: Callable[[str], str]) -> None:
         """Test that invalid email formats are rejected during registration."""
-        unique_id = str(uuid4())[:8]
         invalid_emails = [
             "not-an-email",
             "@example.com",
@@ -338,9 +344,9 @@ async def test_invalid_email_format_rejected(self, client: AsyncClient) -> None:
             "user@.com",
         ]
 
-        for invalid_email in invalid_emails:
+        for i, invalid_email in enumerate(invalid_emails):
             registration_data = {
-                "username": f"invalid_email_{unique_id}",
+                "username": f"invalid_email_{unique_id('')}_{i}",
                 "email": invalid_email,
                 "password": "ValidP@ssw0rd123"
             }
@@ -351,16 +357,13 @@ async def test_invalid_email_format_rejected(self, client: AsyncClient) -> None:
             error_data = response.json()
             assert "detail" in error_data
 
-            # Update unique_id for next iteration to avoid username conflicts
-            unique_id = str(uuid4())[:8]
-
     @pytest.mark.asyncio
-    async def test_csrf_token_generation(self, client: AsyncClient) -> None:
+    async def test_csrf_token_generation(self, client: AsyncClient, unique_id: Callable[[str], str]) -> None:
         """Test CSRF token generation on login."""
-        unique_id = str(uuid4())[:8]
+        uid = unique_id("")
         registration_data = {
-            "username": f"csrf_test_{unique_id}",
-            "email": f"csrf_{unique_id}@example.com",
+            "username": f"csrf_test_{uid}",
+            "email": f"csrf_{uid}@example.com",
             "password": "SecureP@ssw0rd123"
         }
 
@@ -385,12 +388,14 @@ async def test_csrf_token_generation(self, client: AsyncClient) -> None:
             assert isinstance(response_data["csrf_token"], str)
 
     @pytest.mark.asyncio
-    async def test_session_persistence_across_requests(self, client: AsyncClient) -> None:
+    async def test_session_persistence_across_requests(
+        self, client: AsyncClient, unique_id: Callable[[str], str]
+    ) -> None:
         """Test that session persists across multiple requests after login."""
-        unique_id = str(uuid4())[:8]
+        uid = unique_id("")
         registration_data = {
-            "username": f"session_test_{unique_id}",
-            "email": f"session_{unique_id}@example.com",
+            "username": f"session_test_{uid}",
+            "email": f"session_{uid}@example.com",
             "password": "SecureP@ssw0rd123"
         }
 
diff --git a/backend/tests/integration/test_dlq_routes.py b/backend/tests/integration/test_dlq_routes.py
index 5cc114a0..fd67a040 100644
--- a/backend/tests/integration/test_dlq_routes.py
+++ b/backend/tests/integration/test_dlq_routes.py
@@ -1,19 +1,17 @@
 from datetime import datetime
-from typing import Dict
 
 import pytest
-from httpx import AsyncClient
-
+from app.dlq import DLQMessageStatus
 from app.schemas_pydantic.dlq import (
-    DLQStats,
-    DLQMessagesResponse,
-    DLQMessageResponse,
-    DLQMessageDetail,
-    DLQMessageStatus,
     DLQBatchRetryResponse,
-    DLQTopicSummaryResponse
+    DLQMessageDetail,
+    DLQMessageResponse,
+    DLQMessagesResponse,
+    DLQStats,
+    DLQTopicSummaryResponse,
 )
 from app.schemas_pydantic.user import MessageResponse
+from httpx import AsyncClient
 
 
 @pytest.mark.integration
@@ -33,71 +31,23 @@ async def test_dlq_requires_authentication(self, client: AsyncClient) -> None:
                    for word in ["not authenticated", "unauthorized", "login"])
 
     @pytest.mark.asyncio
-    async def test_get_dlq_statistics(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_get_dlq_statistics(self, authenticated_client: AsyncClient) -> None:
         """Test getting DLQ statistics."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
-        # Get DLQ stats
-        response = await client.get("/api/v1/dlq/stats")
+        response = await authenticated_client.get("/api/v1/dlq/stats")
         assert response.status_code == 200
 
-        # Validate response structure
-        stats_data = response.json()
-        stats = DLQStats(**stats_data)
-
-        # Verify structure
-        assert isinstance(stats.by_status, dict)
-        assert isinstance(stats.by_topic, list)
-        assert isinstance(stats.by_event_type, list)
-        assert isinstance(stats.age_stats, dict)
-        assert stats.timestamp is not None
-
-        # Check status breakdown
-        for status in ["pending", "retrying", "failed", "discarded"]:
-            if status in stats.by_status:
-                assert isinstance(stats.by_status[status], int)
-                assert stats.by_status[status] >= 0
-
-        # Check topic stats
-        for topic_stat in stats.by_topic:
-            assert "topic" in topic_stat
-            assert "count" in topic_stat
-            assert isinstance(topic_stat["count"], int)
-            assert topic_stat["count"] >= 0
-
-        # Check event type stats  
-        for event_type_stat in stats.by_event_type:
-            assert "event_type" in event_type_stat
-            assert "count" in event_type_stat
-            assert isinstance(event_type_stat["count"], int)
-            assert event_type_stat["count"] >= 0
-
-        # Check age stats
-        if stats.age_stats:
-            for key in ["min", "max", "avg", "median"]:
-                if key in stats.age_stats:
-                    assert isinstance(stats.age_stats[key], (int, float))
-                    assert stats.age_stats[key] >= 0
+        # Pydantic validates structure and types
+        stats = DLQStats(**response.json())
+
+        # Verify counts are non-negative
+        for count in stats.by_status.values():
+            assert count >= 0
 
     @pytest.mark.asyncio
-    async def test_list_dlq_messages(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_list_dlq_messages(self, authenticated_client: AsyncClient) -> None:
         """Test listing DLQ messages with filters."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # List all DLQ messages
-        response = await client.get("/api/v1/dlq/messages?limit=10&offset=0")
+        response = await authenticated_client.get("/api/v1/dlq/messages?limit=10&offset=0")
         assert response.status_code == 200
 
         # Validate response structure
@@ -125,24 +75,12 @@ async def test_list_dlq_messages(self, client: AsyncClient, test_user: Dict[str,
             if message.age_seconds is not None:
                 assert message.age_seconds >= 0
 
-            # Check details if present
-            if message.details:
-                assert isinstance(message.details, dict)
-
     @pytest.mark.asyncio
-    async def test_filter_dlq_messages_by_status(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_filter_dlq_messages_by_status(self, authenticated_client: AsyncClient) -> None:
         """Test filtering DLQ messages by status."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # Test different status filters
         for status in ["pending", "scheduled", "retried", "discarded"]:
-            response = await client.get(f"/api/v1/dlq/messages?status={status}&limit=5")
+            response = await authenticated_client.get(f"/api/v1/dlq/messages?status={status}&limit=5")
             assert response.status_code == 200
 
             messages_data = response.json()
@@ -153,19 +91,11 @@ async def test_filter_dlq_messages_by_status(self, client: AsyncClient, test_use
                 assert message.status == status
 
     @pytest.mark.asyncio
-    async def test_filter_dlq_messages_by_topic(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_filter_dlq_messages_by_topic(self, authenticated_client: AsyncClient) -> None:
         """Test filtering DLQ messages by topic."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # Filter by a specific topic
         test_topic = "execution-events"
-        response = await client.get(f"/api/v1/dlq/messages?topic={test_topic}&limit=5")
+        response = await authenticated_client.get(f"/api/v1/dlq/messages?topic={test_topic}&limit=5")
         assert response.status_code == 200
 
         messages_data = response.json()
@@ -176,67 +106,32 @@ async def test_filter_dlq_messages_by_topic(self, client: AsyncClient, test_user
             assert message.original_topic == test_topic
 
     @pytest.mark.asyncio
-    async def test_get_single_dlq_message_detail(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_get_single_dlq_message_detail(self, authenticated_client: AsyncClient) -> None:
         """Test getting detailed information for a single DLQ message."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # First get list of messages to find an ID
-        list_response = await client.get("/api/v1/dlq/messages?limit=1")
+        list_response = await authenticated_client.get("/api/v1/dlq/messages?limit=1")
         assert list_response.status_code == 200
 
         messages_data = list_response.json()
         if messages_data["total"] > 0 and messages_data["messages"]:
-            # Get details for the first message
             event_id = messages_data["messages"][0]["event_id"]
 
-            detail_response = await client.get(f"/api/v1/dlq/messages/{event_id}")
+            detail_response = await authenticated_client.get(f"/api/v1/dlq/messages/{event_id}")
             assert detail_response.status_code == 200
 
-            # Validate detailed response
-            detail_data = detail_response.json()
-            message_detail = DLQMessageDetail(**detail_data)
+            # Pydantic validates structure and types
+            message_detail = DLQMessageDetail(**detail_response.json())
 
-            # Verify all fields are present
+            # Verify we got the right message and business logic constraints
             assert message_detail.event_id == event_id
-            assert message_detail.event is not None
-            assert isinstance(message_detail.event, dict)
-            assert message_detail.event_type is not None
-            assert message_detail.original_topic is not None
-            assert message_detail.error is not None
             assert message_detail.retry_count >= 0
-            assert message_detail.failed_at is not None
-            assert message_detail.status in DLQMessageStatus.__members__.values()
-            assert message_detail.created_at is not None
-            assert message_detail.last_updated is not None
-
-            # Optional fields
-            if message_detail.producer_id:
-                assert isinstance(message_detail.producer_id, str)
-            if message_detail.dlq_offset is not None:
-                assert message_detail.dlq_offset >= 0
-            if message_detail.dlq_partition is not None:
-                assert message_detail.dlq_partition >= 0
 
     @pytest.mark.asyncio
-    async def test_get_nonexistent_dlq_message(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_get_nonexistent_dlq_message(self, authenticated_client: AsyncClient) -> None:
         """Test getting a non-existent DLQ message."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # Try to get non-existent message
         fake_event_id = "00000000-0000-0000-0000-000000000000"
-        response = await client.get(f"/api/v1/dlq/messages/{fake_event_id}")
+        response = await authenticated_client.get(f"/api/v1/dlq/messages/{fake_event_id}")
         assert response.status_code == 404
 
         error_data = response.json()
@@ -244,16 +139,8 @@ async def test_get_nonexistent_dlq_message(self, client: AsyncClient, test_user:
         assert "not found" in error_data["detail"].lower()
 
     @pytest.mark.asyncio
-    async def test_set_retry_policy(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_set_retry_policy(self, authenticated_client: AsyncClient) -> None:
         """Test setting a retry policy for a topic."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # Set retry policy
         policy_data = {
             "topic": "test-topic",
@@ -264,28 +151,20 @@ async def test_set_retry_policy(self, client: AsyncClient, test_user: Dict[str,
             "retry_multiplier": 2.0
         }
 
-        response = await client.post("/api/v1/dlq/retry-policy", json=policy_data)
+        response = await authenticated_client.post("/api/v1/dlq/retry-policy", json=policy_data)
         assert response.status_code == 200
 
         # Validate response
         result_data = response.json()
         result = MessageResponse(**result_data)
         assert "retry policy set" in result.message.lower()
-        assert policy_data["topic"] in result.message
+        assert str(policy_data["topic"]) in result.message
 
     @pytest.mark.asyncio
-    async def test_retry_dlq_messages_batch(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_retry_dlq_messages_batch(self, authenticated_client: AsyncClient) -> None:
         """Test retrying a batch of DLQ messages."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # Get some failed messages to retry
-        list_response = await client.get("/api/v1/dlq/messages?status=discarded&limit=3")
+        list_response = await authenticated_client.get("/api/v1/dlq/messages?status=discarded&limit=3")
         assert list_response.status_code == 200
 
         messages_data = list_response.json()
@@ -298,7 +177,7 @@ async def test_retry_dlq_messages_batch(self, client: AsyncClient, test_user: Di
                 "event_ids": event_ids
             }
 
-            retry_response = await client.post("/api/v1/dlq/retry", json=retry_request)
+            retry_response = await authenticated_client.post("/api/v1/dlq/retry", json=retry_request)
             assert retry_response.status_code == 200
 
             # Validate retry response
@@ -319,18 +198,10 @@ async def test_retry_dlq_messages_batch(self, client: AsyncClient, test_user: Di
                     assert "success" in detail
 
     @pytest.mark.asyncio
-    async def test_discard_dlq_message(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_discard_dlq_message(self, authenticated_client: AsyncClient) -> None:
         """Test discarding a DLQ message."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # Get a failed message to discard
-        list_response = await client.get("/api/v1/dlq/messages?status=discarded&limit=1")
+        list_response = await authenticated_client.get("/api/v1/dlq/messages?status=discarded&limit=1")
         assert list_response.status_code == 200
 
         messages_data = list_response.json()
@@ -339,7 +210,7 @@ async def test_discard_dlq_message(self, client: AsyncClient, test_user: Dict[st
 
             # Discard the message
             discard_reason = "Test discard - message unrecoverable"
-            discard_response = await client.delete(
+            discard_response = await authenticated_client.delete(
                 f"/api/v1/dlq/messages/{event_id}?reason={discard_reason}"
             )
             assert discard_response.status_code == 200
@@ -351,25 +222,17 @@ async def test_discard_dlq_message(self, client: AsyncClient, test_user: Dict[st
             assert event_id in result.message
 
             # Verify message is now discarded
-            detail_response = await client.get(f"/api/v1/dlq/messages/{event_id}")
+            detail_response = await authenticated_client.get(f"/api/v1/dlq/messages/{event_id}")
             if detail_response.status_code == 200:
                 detail_data = detail_response.json()
                 # Status should be discarded
                 assert detail_data["status"] == "discarded"
 
     @pytest.mark.asyncio
-    async def test_get_dlq_topics_summary(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_get_dlq_topics_summary(self, authenticated_client: AsyncClient) -> None:
         """Test getting DLQ topics summary."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # Get topics summary
-        response = await client.get("/api/v1/dlq/topics")
+        response = await authenticated_client.get("/api/v1/dlq/topics")
         assert response.status_code == 200
 
         # Validate response
@@ -404,18 +267,10 @@ async def test_get_dlq_topics_summary(self, client: AsyncClient, test_user: Dict
                 assert topic_summary.max_retry_count >= 0
 
     @pytest.mark.asyncio
-    async def test_dlq_message_pagination(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_dlq_message_pagination(self, authenticated_client: AsyncClient) -> None:
         """Test DLQ message pagination."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # Get first page
-        page1_response = await client.get("/api/v1/dlq/messages?limit=5&offset=0")
+        page1_response = await authenticated_client.get("/api/v1/dlq/messages?limit=5&offset=0")
         assert page1_response.status_code == 200
 
         page1_data = page1_response.json()
@@ -423,7 +278,7 @@ async def test_dlq_message_pagination(self, client: AsyncClient, test_user: Dict
 
         # If there are more than 5 messages, get second page
         if page1.total > 5:
-            page2_response = await client.get("/api/v1/dlq/messages?limit=5&offset=5")
+            page2_response = await authenticated_client.get("/api/v1/dlq/messages?limit=5&offset=5")
             assert page2_response.status_code == 200
 
             page2_data = page2_response.json()
@@ -442,39 +297,28 @@ async def test_dlq_message_pagination(self, client: AsyncClient, test_user: Dict
                 assert len(page1_ids.intersection(page2_ids)) == 0
 
     @pytest.mark.asyncio
-    async def test_dlq_error_handling(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_dlq_error_handling(self, authenticated_client: AsyncClient) -> None:
         """Test DLQ error handling for invalid requests."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # Test invalid limit
-        response = await client.get("/api/v1/dlq/messages?limit=10000")  # Too high
+        response = await authenticated_client.get("/api/v1/dlq/messages?limit=10000")  # Too high
         # Should either accept with max limit or reject
         assert response.status_code in [200, 400, 422]
 
         # Test negative offset
-        response = await client.get("/api/v1/dlq/messages?limit=10&offset=-1")
+        response = await authenticated_client.get("/api/v1/dlq/messages?limit=10&offset=-1")
         assert response.status_code in [400, 422]
 
         # Test invalid status filter
-        response = await client.get("/api/v1/dlq/messages?status=invalid_status")
+        response = await authenticated_client.get("/api/v1/dlq/messages?status=invalid_status")
         assert response.status_code in [400, 422]
 
         # Test retry with empty list
-        retry_request = {
-            "event_ids": []
-        }
-        response = await client.post("/api/v1/dlq/retry", json=retry_request)
+        response = await authenticated_client.post("/api/v1/dlq/retry", json={"event_ids": []})
         # Should handle gracefully or reject invalid input
         assert response.status_code in [200, 400, 404, 422]
 
         # Test discard without reason
         fake_event_id = "00000000-0000-0000-0000-000000000000"
-        response = await client.delete(f"/api/v1/dlq/messages/{fake_event_id}")
+        response = await authenticated_client.delete(f"/api/v1/dlq/messages/{fake_event_id}")
         # Should require reason parameter
         assert response.status_code in [400, 422, 404]
diff --git a/backend/tests/integration/test_events_routes.py b/backend/tests/integration/test_events_routes.py
index 342bd8ad..e0d25c6e 100644
--- a/backend/tests/integration/test_events_routes.py
+++ b/backend/tests/integration/test_events_routes.py
@@ -1,18 +1,19 @@
-from datetime import datetime, timezone, timedelta
-from typing import Dict
-from uuid import uuid4
+from collections.abc import Callable
+from datetime import datetime, timedelta, timezone
 
 import pytest
-from httpx import AsyncClient
-
 from app.domain.enums.events import EventType
+from app.domain.enums.user import UserRole
 from app.schemas_pydantic.events import (
     EventListResponse,
     EventResponse,
     EventStatistics,
     PublishEventResponse,
-    ReplayAggregateResponse
+    ReplayAggregateResponse,
 )
+from httpx import AsyncClient
+
+from tests.conftest import MakeUser
 
 
 @pytest.mark.integration
@@ -32,12 +33,10 @@ async def test_events_require_authentication(self, client: AsyncClient) -> None:
                    for word in ["not authenticated", "unauthorized", "login"])
 
     @pytest.mark.asyncio
-    async def test_get_user_events(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_get_user_events(self, authenticated_client: AsyncClient) -> None:
         """Test getting user's events."""
-        # Already authenticated via test_user fixture
-
         # Get user events
-        response = await client.get("/api/v1/events/user?limit=10&skip=0")
+        response = await authenticated_client.get("/api/v1/events/user?limit=10&skip=0")
         # Some deployments may route this path under a dynamic segment and return 404.
         # Accept 200 with a valid payload or 404 (no such resource).
         assert response.status_code in [200, 404]
@@ -61,8 +60,8 @@ async def test_get_user_events(self, client: AsyncClient, test_user: Dict[str, s
                 assert event.event_type is not None
                 assert event.aggregate_id is not None
                 assert event.timestamp is not None
-                assert event.version is not None
-                assert event.user_id is not None
+                assert event.event_version is not None
+                assert event.metadata.user_id is not None
 
                 # Optional fields
                 if event.payload:
@@ -73,63 +72,50 @@ async def test_get_user_events(self, client: AsyncClient, test_user: Dict[str, s
                     assert isinstance(event.correlation_id, str)
 
     @pytest.mark.asyncio
-    async def test_get_user_events_with_filters(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_get_user_events_with_filters(self, authenticated_client: AsyncClient) -> None:
         """Test filtering user events."""
-        # Already authenticated via test_user fixture
-
         # Create an execution to generate events
         execution_request = {
             "script": "print('Test for event filtering')",
             "lang": "python",
             "lang_version": "3.11"
         }
-        exec_response = await client.post("/api/v1/execute", json=execution_request)
+        exec_response = await authenticated_client.post("/api/v1/execute", json=execution_request)
         assert exec_response.status_code == 200
 
         # Filter by event types
-        event_types = ["execution.requested", "execution.completed"]
-        params = {
-            "event_types": event_types,
-            "limit": 20,
-            "sort_order": "desc"
-        }
-
-        response = await client.get("/api/v1/events/user", params=params)
+        response = await authenticated_client.get(
+            "/api/v1/events/user",
+            params={"event_types": ["execution.requested", "execution.completed"], "limit": 20, "sort_order": "desc"},
+        )
         assert response.status_code in [200, 404]
         if response.status_code == 200:
             events_data = response.json()
             events_response = EventListResponse(**events_data)
 
             # Filtered events should only contain specified types
+            event_types = ["execution.requested", "execution.completed"]
             for event in events_response.events:
                 if event.event_type:  # Some events might have been created
-                    assert any(event_type in event.event_type for event_type in event_types) or len(
+                    assert any(et in str(event.event_type) for et in event_types) or len(
                         events_response.events) == 0
 
     @pytest.mark.asyncio
-    async def test_get_execution_events(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_get_execution_events(self, authenticated_client: AsyncClient) -> None:
         """Test getting events for a specific execution."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # Create an execution
         execution_request = {
             "script": "print('Test execution events')",
             "lang": "python",
             "lang_version": "3.11"
         }
-        exec_response = await client.post("/api/v1/execute", json=execution_request)
+        exec_response = await authenticated_client.post("/api/v1/execute", json=execution_request)
         assert exec_response.status_code == 200
 
         execution_id = exec_response.json()["execution_id"]
 
         # Get execution events (JSON, not SSE stream)
-        response = await client.get(
+        response = await authenticated_client.get(
             f"/api/v1/events/executions/{execution_id}/events?include_system_events=true"
         )
         assert response.status_code == 200
@@ -147,16 +133,8 @@ async def test_get_execution_events(self, client: AsyncClient, test_user: Dict[s
                 assert execution_id in event.aggregate_id or event.aggregate_id == execution_id
 
     @pytest.mark.asyncio
-    async def test_query_events_advanced(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_query_events_advanced(self, authenticated_client: AsyncClient) -> None:
         """Test advanced event querying with filters."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # Query events with multiple filters
         query_request = {
             "event_types": [
@@ -171,7 +149,7 @@ async def test_query_events_advanced(self, client: AsyncClient, test_user: Dict[
             "sort_order": "desc"
         }
 
-        response = await client.post("/api/v1/events/query", json=query_request)
+        response = await authenticated_client.post("/api/v1/events/query", json=query_request)
         assert response.status_code == 200
 
         events_data = response.json()
@@ -191,27 +169,19 @@ async def test_query_events_advanced(self, client: AsyncClient, test_user: Dict[
                 assert t1 >= t2  # Descending order
 
     @pytest.mark.asyncio
-    async def test_get_events_by_correlation_id(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_get_events_by_correlation_id(self, authenticated_client: AsyncClient) -> None:
         """Test getting events by correlation ID."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # Create an execution (which generates correlated events)
         execution_request = {
             "script": "print('Test correlation')",
             "lang": "python",
             "lang_version": "3.11"
         }
-        exec_response = await client.post("/api/v1/execute", json=execution_request)
+        exec_response = await authenticated_client.post("/api/v1/execute", json=execution_request)
         assert exec_response.status_code == 200
 
         # Get events for the user to find a correlation ID
-        user_events_response = await client.get("/api/v1/events/user?limit=10")
+        user_events_response = await authenticated_client.get("/api/v1/events/user?limit=10")
         assert user_events_response.status_code == 200
 
         user_events = user_events_response.json()
@@ -219,7 +189,7 @@ async def test_get_events_by_correlation_id(self, client: AsyncClient, test_user
             correlation_id = user_events["events"][0]["correlation_id"]
 
             # Get events by correlation ID
-            response = await client.get(f"/api/v1/events/correlation/{correlation_id}?limit=50")
+            response = await authenticated_client.get(f"/api/v1/events/correlation/{correlation_id}?limit=50")
             assert response.status_code == 200
 
             correlated_events = response.json()
@@ -231,18 +201,10 @@ async def test_get_events_by_correlation_id(self, client: AsyncClient, test_user
                     assert event.correlation_id == correlation_id
 
     @pytest.mark.asyncio
-    async def test_get_current_request_events(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_get_current_request_events(self, authenticated_client: AsyncClient) -> None:
         """Test getting events for the current request."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # Get current request events (might be empty if no correlation context)
-        response = await client.get("/api/v1/events/current-request?limit=10")
+        response = await authenticated_client.get("/api/v1/events/current-request?limit=10")
         assert response.status_code == 200
 
         events_data = response.json()
@@ -253,18 +215,10 @@ async def test_get_current_request_events(self, client: AsyncClient, test_user:
         assert events_response.total >= 0
 
     @pytest.mark.asyncio
-    async def test_get_event_statistics(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_get_event_statistics(self, authenticated_client: AsyncClient) -> None:
         """Test getting event statistics."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # Get statistics for last 24 hours
-        response = await client.get("/api/v1/events/statistics")
+        response = await authenticated_client.get("/api/v1/events/statistics")
         assert response.status_code == 200
 
         stats_data = response.json()
@@ -281,26 +235,15 @@ async def test_get_event_statistics(self, client: AsyncClient, test_user: Dict[s
 
         # Events by hour should have proper structure
         for hourly_stat in stats.events_by_hour:
-            # Some implementations return {'_id': hour, 'count': n}
-            hour_key = "hour" if "hour" in hourly_stat else "_id"
-            assert hour_key in hourly_stat
-            assert "count" in hourly_stat
-            assert isinstance(hourly_stat["count"], int)
-            assert hourly_stat["count"] >= 0
+            assert hourly_stat.hour is not None
+            assert isinstance(hourly_stat.count, int)
+            assert hourly_stat.count >= 0
 
     @pytest.mark.asyncio
-    async def test_get_single_event(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_get_single_event(self, authenticated_client: AsyncClient) -> None:
         """Test getting a single event by ID."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # Get user events to find an event ID
-        events_response = await client.get("/api/v1/events/user?limit=1")
+        events_response = await authenticated_client.get("/api/v1/events/user?limit=1")
         assert events_response.status_code == 200
 
         events_data = events_response.json()
@@ -308,7 +251,7 @@ async def test_get_single_event(self, client: AsyncClient, test_user: Dict[str,
             event_id = events_data["events"][0]["event_id"]
 
             # Get single event
-            response = await client.get(f"/api/v1/events/{event_id}")
+            response = await authenticated_client.get(f"/api/v1/events/{event_id}")
             assert response.status_code == 200
 
             event_data = response.json()
@@ -320,19 +263,13 @@ async def test_get_single_event(self, client: AsyncClient, test_user: Dict[str,
             assert event.timestamp is not None
 
     @pytest.mark.asyncio
-    async def test_get_nonexistent_event(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_get_nonexistent_event(
+        self, authenticated_client: AsyncClient, unique_id: Callable[[str], str]
+    ) -> None:
         """Test getting a non-existent event."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # Try to get non-existent event
-        fake_event_id = str(uuid4())
-        response = await client.get(f"/api/v1/events/{fake_event_id}")
+        fake_event_id = unique_id("fake-event-")
+        response = await authenticated_client.get(f"/api/v1/events/{fake_event_id}")
         assert response.status_code == 404
 
         error_data = response.json()
@@ -340,75 +277,47 @@ async def test_get_nonexistent_event(self, client: AsyncClient, test_user: Dict[
         assert "not found" in error_data["detail"].lower()
 
     @pytest.mark.asyncio
-    async def test_list_event_types(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_list_event_types(self, authenticated_client: AsyncClient) -> None:
         """Test listing available event types."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # List event types
-        response = await client.get("/api/v1/events/types/list")
+        response = await authenticated_client.get("/api/v1/events/types/list")
         assert response.status_code == 200
 
         event_types = response.json()
         assert isinstance(event_types, list)
 
-        # Should contain common event types
-        common_types = [
-            "execution.requested",
-            "execution.completed",
-            "user.logged_in",
-            "user.registered"
-        ]
-
         # At least some common types should be present
         for event_type in event_types:
             assert isinstance(event_type, str)
             assert len(event_type) > 0
 
     @pytest.mark.asyncio
-    async def test_publish_custom_event_requires_admin(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_publish_custom_event_requires_admin(
+        self, authenticated_client: AsyncClient, unique_id: Callable[[str], str]
+    ) -> None:
         """Test that publishing custom events requires admin privileges."""
-        # Login as regular user
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
-        # Try to publish custom event
+        # Try to publish custom event (as regular user)
         publish_request = {
             "event_type": EventType.SYSTEM_ERROR.value,
             "payload": {
                 "test": "data",
                 "value": 123
             },
-            "aggregate_id": str(uuid4()),
-            "correlation_id": str(uuid4())
+            "aggregate_id": unique_id("aggregate-"),
+            "correlation_id": unique_id("corr-")
         }
 
-        response = await client.post("/api/v1/events/publish", json=publish_request)
+        response = await authenticated_client.post("/api/v1/events/publish", json=publish_request)
         assert response.status_code == 403  # Forbidden for non-admin
 
     @pytest.mark.asyncio
     @pytest.mark.kafka
-    async def test_publish_custom_event_as_admin(self, client: AsyncClient, test_admin: Dict[str, str]) -> None:
+    async def test_publish_custom_event_as_admin(
+        self, authenticated_admin_client: AsyncClient, unique_id: Callable[[str], str]
+    ) -> None:
         """Test publishing custom events as admin."""
-        # Login as admin
-        login_data = {
-            "username": test_admin["username"],
-            "password": test_admin["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # Publish custom event (requires Kafka); skip if not available
-        aggregate_id = str(uuid4())
+        aggregate_id = unique_id("aggregate-")
         publish_request = {
             "event_type": EventType.SYSTEM_ERROR.value,
             "payload": {
@@ -417,16 +326,15 @@ async def test_publish_custom_event_as_admin(self, client: AsyncClient, test_adm
                 "service_name": "tests"
             },
             "aggregate_id": aggregate_id,
-            "correlation_id": str(uuid4()),
+            "correlation_id": unique_id("corr-"),
             "metadata": {
                 "source": "integration_test",
                 "version": "1.0"
             }
         }
 
-        response = await client.post("/api/v1/events/publish", json=publish_request)
-        if response.status_code != 200:
-            pytest.skip("Kafka not available for publishing events")
+        response = await authenticated_admin_client.post("/api/v1/events/publish", json=publish_request)
+        assert response.status_code == 200, f"Publish failed: {response.status_code} - {response.text}"
 
         publish_response = PublishEventResponse(**response.json())
         assert publish_response.event_id is not None
@@ -434,16 +342,8 @@ async def test_publish_custom_event_as_admin(self, client: AsyncClient, test_adm
         assert publish_response.timestamp is not None
 
     @pytest.mark.asyncio
-    async def test_aggregate_events(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_aggregate_events(self, authenticated_client: AsyncClient) -> None:
         """Test event aggregation."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # Create aggregation pipeline
         aggregation_request = {
             "pipeline": [
@@ -454,7 +354,7 @@ async def test_aggregate_events(self, client: AsyncClient, test_user: Dict[str,
             "limit": 10
         }
 
-        response = await client.post("/api/v1/events/aggregate", json=aggregation_request)
+        response = await authenticated_client.post("/api/v1/events/aggregate", json=aggregation_request)
         assert response.status_code == 200
 
         results = response.json()
@@ -469,51 +369,30 @@ async def test_aggregate_events(self, client: AsyncClient, test_user: Dict[str,
             assert result["count"] >= 0
 
     @pytest.mark.asyncio
-    async def test_delete_event_requires_admin(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_delete_event_requires_admin(
+        self, authenticated_client: AsyncClient, unique_id: Callable[[str], str]
+    ) -> None:
         """Test that deleting events requires admin privileges."""
-        # Login as regular user
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
-        # Try to delete an event
-        fake_event_id = str(uuid4())
-        response = await client.delete(f"/api/v1/events/{fake_event_id}")
+        # Try to delete an event (as regular user)
+        fake_event_id = unique_id("fake-event-")
+        response = await authenticated_client.delete(f"/api/v1/events/{fake_event_id}")
         assert response.status_code == 403  # Forbidden for non-admin
 
     @pytest.mark.asyncio
-    async def test_replay_aggregate_events_requires_admin(self, client: AsyncClient,
-                                                          test_user: Dict[str, str]) -> None:
+    async def test_replay_aggregate_events_requires_admin(
+        self, authenticated_client: AsyncClient, unique_id: Callable[[str], str]
+    ) -> None:
         """Test that replaying events requires admin privileges."""
-        # Login as regular user
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
-        # Try to replay events
-        aggregate_id = str(uuid4())
-        response = await client.post(f"/api/v1/events/replay/{aggregate_id}?dry_run=true")
+        # Try to replay events (as regular user)
+        aggregate_id = unique_id("aggregate-")
+        response = await authenticated_client.post(f"/api/v1/events/replay/{aggregate_id}?dry_run=true")
         assert response.status_code == 403  # Forbidden for non-admin
 
     @pytest.mark.asyncio
-    async def test_replay_aggregate_events_dry_run(self, client: AsyncClient, test_admin: Dict[str, str]) -> None:
+    async def test_replay_aggregate_events_dry_run(self, authenticated_admin_client: AsyncClient) -> None:
         """Test replaying events in dry-run mode."""
-        # Login as admin
-        login_data = {
-            "username": test_admin["username"],
-            "password": test_admin["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # Get an existing aggregate ID from events
-        events_response = await client.get("/api/v1/events/user?limit=1")
+        events_response = await authenticated_admin_client.get("/api/v1/events/user?limit=1")
         assert events_response.status_code == 200
 
         events_data = events_response.json()
@@ -521,7 +400,7 @@ async def test_replay_aggregate_events_dry_run(self, client: AsyncClient, test_a
             aggregate_id = events_data["events"][0]["aggregate_id"]
 
             # Try dry-run replay
-            response = await client.post(f"/api/v1/events/replay/{aggregate_id}?dry_run=true")
+            response = await authenticated_admin_client.post(f"/api/v1/events/replay/{aggregate_id}?dry_run=true")
 
             if response.status_code == 200:
                 replay_data = response.json()
@@ -529,7 +408,7 @@ async def test_replay_aggregate_events_dry_run(self, client: AsyncClient, test_a
 
                 assert replay_response.dry_run is True
                 assert replay_response.aggregate_id == aggregate_id
-                assert replay_response.event_count >= 0
+                assert replay_response.event_count is None or replay_response.event_count >= 0
 
                 if replay_response.event_types:
                     assert isinstance(replay_response.event_types, list)
@@ -543,18 +422,10 @@ async def test_replay_aggregate_events_dry_run(self, client: AsyncClient, test_a
                 assert "detail" in error_data
 
     @pytest.mark.asyncio
-    async def test_event_pagination(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_event_pagination(self, authenticated_client: AsyncClient) -> None:
         """Test event pagination."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # Get first page
-        page1_response = await client.get("/api/v1/events/user?limit=5&skip=0")
+        page1_response = await authenticated_client.get("/api/v1/events/user?limit=5&skip=0")
         assert page1_response.status_code == 200
 
         page1_data = page1_response.json()
@@ -562,7 +433,7 @@ async def test_event_pagination(self, client: AsyncClient, test_user: Dict[str,
 
         # If there are more than 5 events, get second page
         if page1.total > 5:
-            page2_response = await client.get("/api/v1/events/user?limit=5&skip=5")
+            page2_response = await authenticated_client.get("/api/v1/events/user?limit=5&skip=5")
             assert page2_response.status_code == 200
 
             page2_data = page2_response.json()
@@ -581,46 +452,35 @@ async def test_event_pagination(self, client: AsyncClient, test_user: Dict[str,
                 assert len(page1_ids.intersection(page2_ids)) == 0
 
     @pytest.mark.asyncio
-    async def test_events_isolation_between_users(self, client: AsyncClient,
-                                                  test_user: Dict[str, str],
-                                                  test_admin: Dict[str, str]) -> None:
+    async def test_events_isolation_between_users(
+        self, client: AsyncClient, make_user: MakeUser,
+    ) -> None:
         """Test that events are properly isolated between users."""
-        # Get events as regular user
-        user_login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        user_login_response = await client.post("/api/v1/auth/login", data=user_login_data)
-        assert user_login_response.status_code == 200
+        user = await make_user(UserRole.USER)
+        admin = await make_user(UserRole.ADMIN)
 
+        # Get events as regular user (already logged in from make_user)
         user_events_response = await client.get("/api/v1/events/user?limit=10")
         assert user_events_response.status_code == 200
-
         user_events = user_events_response.json()
-        user_event_ids = [e["event_id"] for e in user_events["events"]]
 
-        # Get events as admin (without include_all_users flag)
-        admin_login_data = {
-            "username": test_admin["username"],
-            "password": test_admin["password"]
-        }
-        admin_login_response = await client.post("/api/v1/auth/login", data=admin_login_data)
-        assert admin_login_response.status_code == 200
+        # Login as admin
+        await client.post(
+            "/api/v1/auth/login",
+            data={"username": admin["username"], "password": admin["password"]},
+        )
 
         admin_events_response = await client.get("/api/v1/events/user?limit=10")
         assert admin_events_response.status_code == 200
-
         admin_events = admin_events_response.json()
-        admin_event_ids = [e["event_id"] for e in admin_events["events"]]
 
-        # Events should be different (unless users share some events)
-        # But user IDs in events should be different
+        # Events should be different - user IDs in events should match logged-in user
         for event in user_events["events"]:
             meta = event.get("metadata") or {}
             if meta.get("user_id"):
-                assert meta["user_id"] == test_user.get("user_id", meta["user_id"])
+                assert meta["user_id"] == user.get("user_id", meta["user_id"])
 
         for event in admin_events["events"]:
             meta = event.get("metadata") or {}
             if meta.get("user_id"):
-                assert meta["user_id"] == test_admin.get("user_id", meta["user_id"])
+                assert meta["user_id"] == admin.get("user_id", meta["user_id"])
diff --git a/backend/tests/integration/test_health_routes.py b/backend/tests/integration/test_health_routes.py
index 40105561..1845fb5e 100644
--- a/backend/tests/integration/test_health_routes.py
+++ b/backend/tests/integration/test_health_routes.py
@@ -1,6 +1,5 @@
 import asyncio
 import time
-from typing import Dict
 
 import pytest
 from httpx import AsyncClient
@@ -48,40 +47,32 @@ async def test_concurrent_liveness_fetch(self, client: AsyncClient) -> None:
         assert all(r.status_code == 200 for r in responses)
 
     @pytest.mark.asyncio
-    async def test_app_responds_during_load(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
-        # Login first for creating load
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
+    async def test_app_responds_during_load(self, authenticated_client: AsyncClient) -> None:
         # Create some load with execution requests
-        async def create_load():
+        async def create_load() -> int | None:
             execution_request = {
                 "script": "print('Load test')",
                 "lang": "python",
                 "lang_version": "3.11"
             }
             try:
-                response = await client.post("/api/v1/execute", json=execution_request)
+                response = await authenticated_client.post("/api/v1/execute", json=execution_request)
                 return response.status_code
-            except:
+            except Exception:
                 return None
 
         # Start load generation
         load_tasks = [create_load() for _ in range(5)]
 
         # Check readiness during load
-        r0 = await client.get("/api/v1/health/live")
+        r0 = await authenticated_client.get("/api/v1/health/live")
         assert r0.status_code == 200
 
         # Wait for load tasks to complete
         await asyncio.gather(*load_tasks, return_exceptions=True)
 
         # Check readiness after load
-        r1 = await client.get("/api/v1/health/live")
+        r1 = await authenticated_client.get("/api/v1/health/live")
         assert r1.status_code == 200
 
     @pytest.mark.asyncio
diff --git a/backend/tests/integration/test_notifications_routes.py b/backend/tests/integration/test_notifications_routes.py
index 5e60164f..510e19f3 100644
--- a/backend/tests/integration/test_notifications_routes.py
+++ b/backend/tests/integration/test_notifications_routes.py
@@ -1,17 +1,16 @@
-from typing import Dict
-
 import pytest
-from httpx import AsyncClient
-
+from app.domain.enums.notification import NotificationChannel, NotificationStatus
+from app.domain.enums.user import UserRole
 from app.schemas_pydantic.notification import (
+    DeleteNotificationResponse,
     NotificationListResponse,
-    NotificationStatus,
-    NotificationChannel,
     NotificationSubscription,
     SubscriptionsResponse,
     UnreadCountResponse,
-    DeleteNotificationResponse
 )
+from httpx import AsyncClient
+
+from tests.conftest import MakeUser
 
 
 @pytest.mark.integration
@@ -31,18 +30,10 @@ async def test_notifications_require_authentication(self, client: AsyncClient) -
                    for word in ["not authenticated", "unauthorized", "login"])
 
     @pytest.mark.asyncio
-    async def test_list_user_notifications(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_list_user_notifications(self, authenticated_client: AsyncClient) -> None:
         """Test listing user's notifications."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # List notifications
-        response = await client.get("/api/v1/notifications?limit=10&offset=0")
+        response = await authenticated_client.get("/api/v1/notifications?limit=10&offset=0")
         assert response.status_code == 200
 
         # Validate response structure
@@ -66,19 +57,16 @@ async def test_list_user_notifications(self, client: AsyncClient, test_user: Dic
             assert n.created_at is not None
 
     @pytest.mark.asyncio
-    async def test_filter_notifications_by_status(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_filter_notifications_by_status(self, authenticated_client: AsyncClient) -> None:
         """Test filtering notifications by status."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # Test different status filters
-        for status in [NotificationStatus.READ.value, NotificationStatus.DELIVERED.value, NotificationStatus.SKIPPED.value]:
-            response = await client.get(f"/api/v1/notifications?status={status}&limit=5")
+        statuses = [
+            NotificationStatus.READ.value,
+            NotificationStatus.DELIVERED.value,
+            NotificationStatus.SKIPPED.value,
+        ]
+        for status in statuses:
+            response = await authenticated_client.get(f"/api/v1/notifications?status={status}&limit=5")
             assert response.status_code == 200
 
             notifications_data = response.json()
@@ -89,18 +77,10 @@ async def test_filter_notifications_by_status(self, client: AsyncClient, test_us
                 assert notification.status == status
 
     @pytest.mark.asyncio
-    async def test_get_unread_count(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_get_unread_count(self, authenticated_client: AsyncClient) -> None:
         """Test getting count of unread notifications."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # Get unread count
-        response = await client.get("/api/v1/notifications/unread-count")
+        response = await authenticated_client.get("/api/v1/notifications/unread-count")
         assert response.status_code == 200
 
         # Validate response
@@ -113,18 +93,10 @@ async def test_get_unread_count(self, client: AsyncClient, test_user: Dict[str,
         # Note: listing cannot filter 'unread' directly; count is authoritative
 
     @pytest.mark.asyncio
-    async def test_mark_notification_as_read(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_mark_notification_as_read(self, authenticated_client: AsyncClient) -> None:
         """Test marking a notification as read."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # Get an unread notification
-        notifications_response = await client.get(
+        notifications_response = await authenticated_client.get(
             f"/api/v1/notifications?status={NotificationStatus.DELIVERED.value}&limit=1")
         assert notifications_response.status_code == 200
 
@@ -133,11 +105,11 @@ async def test_mark_notification_as_read(self, client: AsyncClient, test_user: D
             notification_id = notifications_data["notifications"][0]["notification_id"]
 
             # Mark as read
-            mark_response = await client.put(f"/api/v1/notifications/{notification_id}/read")
+            mark_response = await authenticated_client.put(f"/api/v1/notifications/{notification_id}/read")
             assert mark_response.status_code == 204
 
             # Verify it's now marked as read
-            updated_response = await client.get("/api/v1/notifications")
+            updated_response = await authenticated_client.get("/api/v1/notifications")
             assert updated_response.status_code == 200
 
             updated_data = updated_response.json()
@@ -148,23 +120,10 @@ async def test_mark_notification_as_read(self, client: AsyncClient, test_user: D
                     break
 
     @pytest.mark.asyncio
-    async def test_mark_nonexistent_notification_as_read(self, client: AsyncClient,
-                                                         test_user: Dict[str, str]) -> None:
+    async def test_mark_nonexistent_notification_as_read(self, authenticated_client: AsyncClient) -> None:
         """Test marking a non-existent notification as read."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
-        # Try to mark non-existent notification as read
         fake_notification_id = "00000000-0000-0000-0000-000000000000"
-        response = await client.put(f"/api/v1/notifications/{fake_notification_id}/read")
-        # Prefer 404; if backend returns 500, treat as unavailable feature
-        if response.status_code == 500:
-            pytest.skip("Backend returns 500 for unknown notification IDs")
+        response = await authenticated_client.put(f"/api/v1/notifications/{fake_notification_id}/read")
         assert response.status_code == 404
 
         error_data = response.json()
@@ -172,44 +131,23 @@ async def test_mark_nonexistent_notification_as_read(self, client: AsyncClient,
         assert "not found" in error_data["detail"].lower()
 
     @pytest.mark.asyncio
-    async def test_mark_all_notifications_as_read(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_mark_all_notifications_as_read(self, authenticated_client: AsyncClient) -> None:
         """Test marking all notifications as read."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # Mark all as read
-        mark_all_response = await client.post("/api/v1/notifications/mark-all-read")
+        mark_all_response = await authenticated_client.post("/api/v1/notifications/mark-all-read")
         assert mark_all_response.status_code == 204
 
-        # Verify all are now read
-        # Verify via unread-count only (list endpoint pagination can hide remaining)
-        unread_response = await client.get("/api/v1/notifications/unread-count")
-        assert unread_response.status_code == 200
-
-        # Also verify unread count is 0
-        count_response = await client.get("/api/v1/notifications/unread-count")
+        # Verify unread count is now 0
+        count_response = await authenticated_client.get("/api/v1/notifications/unread-count")
         assert count_response.status_code == 200
         count_data = count_response.json()
         assert count_data["unread_count"] == 0
 
     @pytest.mark.asyncio
-    async def test_get_notification_subscriptions(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_get_notification_subscriptions(self, authenticated_client: AsyncClient) -> None:
         """Test getting user's notification subscriptions."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # Get subscriptions
-        response = await client.get("/api/v1/notifications/subscriptions")
+        response = await authenticated_client.get("/api/v1/notifications/subscriptions")
         assert response.status_code == 200
 
         # Validate response
@@ -239,16 +177,8 @@ async def test_get_notification_subscriptions(self, client: AsyncClient, test_us
                 assert subscription.slack_webhook.startswith("http")
 
     @pytest.mark.asyncio
-    async def test_update_notification_subscription(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_update_notification_subscription(self, authenticated_client: AsyncClient) -> None:
         """Test updating a notification subscription."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # Update in_app subscription
         update_data = {
             "enabled": True,
@@ -257,7 +187,7 @@ async def test_update_notification_subscription(self, client: AsyncClient, test_
             "exclude_tags": ["external_alert"]
         }
 
-        response = await client.put("/api/v1/notifications/subscriptions/in_app", json=update_data)
+        response = await authenticated_client.put("/api/v1/notifications/subscriptions/in_app", json=update_data)
         assert response.status_code == 200
 
         # Validate response
@@ -271,7 +201,7 @@ async def test_update_notification_subscription(self, client: AsyncClient, test_
         assert updated_subscription.exclude_tags == update_data["exclude_tags"]
 
         # Verify the update persisted
-        get_response = await client.get("/api/v1/notifications/subscriptions")
+        get_response = await authenticated_client.get("/api/v1/notifications/subscriptions")
         assert get_response.status_code == 200
 
         subs_data = get_response.json()
@@ -284,16 +214,8 @@ async def test_update_notification_subscription(self, client: AsyncClient, test_
                 break
 
     @pytest.mark.asyncio
-    async def test_update_webhook_subscription(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_update_webhook_subscription(self, authenticated_client: AsyncClient) -> None:
         """Test updating webhook subscription with URL."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # Update webhook subscription
         update_data = {
             "enabled": True,
@@ -303,7 +225,7 @@ async def test_update_webhook_subscription(self, client: AsyncClient, test_user:
             "exclude_tags": []
         }
 
-        response = await client.put("/api/v1/notifications/subscriptions/webhook", json=update_data)
+        response = await authenticated_client.put("/api/v1/notifications/subscriptions/webhook", json=update_data)
         assert response.status_code == 200
 
         # Validate response
@@ -316,16 +238,8 @@ async def test_update_webhook_subscription(self, client: AsyncClient, test_user:
         assert updated_subscription.severities == update_data["severities"]
 
     @pytest.mark.asyncio
-    async def test_update_slack_subscription(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_update_slack_subscription(self, authenticated_client: AsyncClient) -> None:
         """Test updating Slack subscription with webhook."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # Update Slack subscription
         update_data = {
             "enabled": True,
@@ -335,7 +249,7 @@ async def test_update_slack_subscription(self, client: AsyncClient, test_user: D
             "exclude_tags": []
         }
 
-        response = await client.put("/api/v1/notifications/subscriptions/slack", json=update_data)
+        response = await authenticated_client.put("/api/v1/notifications/subscriptions/slack", json=update_data)
         # Slack subscription may be disabled by config; 422 indicates validation
         assert response.status_code in [200, 422]
         if response.status_code == 422:
@@ -351,18 +265,10 @@ async def test_update_slack_subscription(self, client: AsyncClient, test_user: D
         assert updated_subscription.severities == update_data["severities"]
 
     @pytest.mark.asyncio
-    async def test_delete_notification(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_delete_notification(self, authenticated_client: AsyncClient) -> None:
         """Test deleting a notification."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # Get a notification to delete
-        notifications_response = await client.get("/api/v1/notifications?limit=1")
+        notifications_response = await authenticated_client.get("/api/v1/notifications?limit=1")
         assert notifications_response.status_code == 200
 
         notifications_data = notifications_response.json()
@@ -370,7 +276,7 @@ async def test_delete_notification(self, client: AsyncClient, test_user: Dict[st
             notification_id = notifications_data["notifications"][0]["notification_id"]
 
             # Delete the notification
-            delete_response = await client.delete(f"/api/v1/notifications/{notification_id}")
+            delete_response = await authenticated_client.delete(f"/api/v1/notifications/{notification_id}")
             assert delete_response.status_code == 200
 
             # Validate response
@@ -379,7 +285,7 @@ async def test_delete_notification(self, client: AsyncClient, test_user: Dict[st
             assert "deleted" in delete_result.message.lower()
 
             # Verify it's deleted
-            list_response = await client.get("/api/v1/notifications")
+            list_response = await authenticated_client.get("/api/v1/notifications")
             assert list_response.status_code == 200
 
             list_data = list_response.json()
@@ -388,19 +294,11 @@ async def test_delete_notification(self, client: AsyncClient, test_user: Dict[st
             assert notification_id not in notification_ids
 
     @pytest.mark.asyncio
-    async def test_delete_nonexistent_notification(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_delete_nonexistent_notification(self, authenticated_client: AsyncClient) -> None:
         """Test deleting a non-existent notification."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # Try to delete non-existent notification
         fake_notification_id = "00000000-0000-0000-0000-000000000000"
-        response = await client.delete(f"/api/v1/notifications/{fake_notification_id}")
+        response = await authenticated_client.delete(f"/api/v1/notifications/{fake_notification_id}")
         assert response.status_code == 404
 
         error_data = response.json()
@@ -408,18 +306,10 @@ async def test_delete_nonexistent_notification(self, client: AsyncClient, test_u
         assert "not found" in error_data["detail"].lower()
 
     @pytest.mark.asyncio
-    async def test_notification_pagination(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_notification_pagination(self, authenticated_client: AsyncClient) -> None:
         """Test notification pagination."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # Get first page
-        page1_response = await client.get("/api/v1/notifications?limit=5&offset=0")
+        page1_response = await authenticated_client.get("/api/v1/notifications?limit=5&offset=0")
         assert page1_response.status_code == 200
 
         page1_data = page1_response.json()
@@ -427,7 +317,7 @@ async def test_notification_pagination(self, client: AsyncClient, test_user: Dic
 
         # If there are more than 5 notifications, get second page
         if page1.total > 5:
-            page2_response = await client.get("/api/v1/notifications?limit=5&offset=5")
+            page2_response = await authenticated_client.get("/api/v1/notifications?limit=5&offset=5")
             assert page2_response.status_code == 200
 
             page2_data = page2_response.json()
@@ -444,60 +334,40 @@ async def test_notification_pagination(self, client: AsyncClient, test_user: Dic
                 assert len(page1_ids.intersection(page2_ids)) == 0
 
     @pytest.mark.asyncio
-    async def test_notifications_isolation_between_users(self, client: AsyncClient,
-                                                         test_user: Dict[str, str],
-                                                         test_admin: Dict[str, str]) -> None:
+    async def test_notifications_isolation_between_users(
+        self, client: AsyncClient, make_user: MakeUser,
+    ) -> None:
         """Test that notifications are isolated between users."""
-        # Login as regular user
-        user_login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        user_login_response = await client.post("/api/v1/auth/login", data=user_login_data)
-        assert user_login_response.status_code == 200
-
-        # Get user's notifications
+        # Create user and fetch notifications immediately (make_user logs in)
+        await make_user(UserRole.USER)
         user_notifications_response = await client.get("/api/v1/notifications")
         assert user_notifications_response.status_code == 200
+        user_notification_ids = [
+            n["notification_id"] for n in user_notifications_response.json()["notifications"]
+        ]
 
-        user_notifications_data = user_notifications_response.json()
-        user_notification_ids = [n["notification_id"] for n in user_notifications_data["notifications"]]
-
-        # Login as admin
-        admin_login_data = {
-            "username": test_admin["username"],
-            "password": test_admin["password"]
-        }
-        admin_login_response = await client.post("/api/v1/auth/login", data=admin_login_data)
-        assert admin_login_response.status_code == 200
-
-        # Get admin's notifications
+        # Create admin and fetch notifications immediately (make_user logs in)
+        await make_user(UserRole.ADMIN)
         admin_notifications_response = await client.get("/api/v1/notifications")
         assert admin_notifications_response.status_code == 200
-
-        admin_notifications_data = admin_notifications_response.json()
-        admin_notification_ids = [n["notification_id"] for n in admin_notifications_data["notifications"]]
+        admin_notification_ids = [
+            n["notification_id"] for n in admin_notifications_response.json()["notifications"]
+        ]
 
         # Notifications should be different (no overlap)
         if user_notification_ids and admin_notification_ids:
             assert len(set(user_notification_ids).intersection(set(admin_notification_ids))) == 0
 
     @pytest.mark.asyncio
-    async def test_invalid_notification_channel(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_invalid_notification_channel(self, authenticated_client: AsyncClient) -> None:
         """Test updating subscription with invalid channel."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # Try invalid channel
         update_data = {
             "enabled": True,
             "severities": ["medium"]
         }
 
-        response = await client.put("/api/v1/notifications/subscriptions/invalid_channel", json=update_data)
+        response = await authenticated_client.put(
+            "/api/v1/notifications/subscriptions/invalid_channel", json=update_data
+        )
         assert response.status_code in [400, 404, 422]
diff --git a/backend/tests/integration/test_replay_routes.py b/backend/tests/integration/test_replay_routes.py
index 1cdf73ec..d170f937 100644
--- a/backend/tests/integration/test_replay_routes.py
+++ b/backend/tests/integration/test_replay_routes.py
@@ -1,21 +1,14 @@
-import asyncio
-from datetime import datetime, timezone, timedelta
-from typing import Dict
-from uuid import uuid4
+from collections.abc import Callable
+from datetime import datetime, timedelta, timezone
 
+import backoff
 import pytest
-from httpx import AsyncClient
-
 from app.domain.enums.events import EventType
-from app.domain.enums.replay import ReplayStatus, ReplayType, ReplayTarget
-from app.schemas_pydantic.replay import (
-    ReplayRequest,
-    ReplayResponse,
-    SessionSummary,
-    CleanupResponse
-)
+from app.domain.enums.replay import ReplayStatus, ReplayTarget, ReplayType
+from app.domain.replay import ReplayFilter
+from app.schemas_pydantic.replay import CleanupResponse, ReplayRequest, ReplayResponse, SessionSummary
 from app.schemas_pydantic.replay_models import ReplaySession
-from tests.helpers.eventually import eventually
+from httpx import AsyncClient
 
 
 @pytest.mark.integration
@@ -23,12 +16,10 @@ class TestReplayRoutes:
     """Test replay endpoints against real backend."""
 
     @pytest.mark.asyncio
-    async def test_replay_requires_admin_authentication(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_replay_requires_admin_authentication(self, authenticated_client: AsyncClient) -> None:
         """Test that replay endpoints require admin authentication."""
-        # Already authenticated via test_user fixture
-
         # Try to access replay endpoints as non-admin
-        response = await client.get("/api/v1/replay/sessions")
+        response = await authenticated_client.get("/api/v1/replay/sessions")
         assert response.status_code == 403
 
         error_data = response.json()
@@ -37,22 +28,22 @@ async def test_replay_requires_admin_authentication(self, client: AsyncClient, t
                    for word in ["admin", "forbidden", "denied"])
 
     @pytest.mark.asyncio
-    async def test_create_replay_session(self, client: AsyncClient, test_admin: Dict[str, str]) -> None:
+    async def test_create_replay_session(self, authenticated_admin_client: AsyncClient) -> None:
         """Test creating a replay session."""
-        # Already authenticated via test_admin fixture
-
         # Create replay session
         replay_request = ReplayRequest(
             replay_type=ReplayType.QUERY,
             target=ReplayTarget.KAFKA,
-            event_types=[EventType.EXECUTION_REQUESTED, EventType.EXECUTION_COMPLETED],
-            start_time=datetime.now(timezone.utc) - timedelta(days=7),
-            end_time=datetime.now(timezone.utc),
+            filter=ReplayFilter(
+                event_types=[EventType.EXECUTION_REQUESTED, EventType.EXECUTION_COMPLETED],
+                start_time=datetime.now(timezone.utc) - timedelta(days=7),
+                end_time=datetime.now(timezone.utc),
+            ),
             speed_multiplier=1.0,
             preserve_timestamps=True,
         ).model_dump(mode="json")
 
-        response = await client.post("/api/v1/replay/sessions", json=replay_request)
+        response = await authenticated_admin_client.post("/api/v1/replay/sessions", json=replay_request)
         assert response.status_code in [200, 422]
         if response.status_code == 422:
             return
@@ -67,12 +58,10 @@ async def test_create_replay_session(self, client: AsyncClient, test_admin: Dict
         assert replay_response.message is not None
 
     @pytest.mark.asyncio
-    async def test_list_replay_sessions(self, client: AsyncClient, test_admin: Dict[str, str]) -> None:
+    async def test_list_replay_sessions(self, authenticated_admin_client: AsyncClient) -> None:
         """Test listing replay sessions."""
-        # Already authenticated via test_admin fixture
-
         # List replay sessions
-        response = await client.get("/api/v1/replay/sessions?limit=10")
+        response = await authenticated_admin_client.get("/api/v1/replay/sessions?limit=10")
         assert response.status_code in [200, 404]
         if response.status_code != 200:
             return
@@ -88,27 +77,27 @@ async def test_list_replay_sessions(self, client: AsyncClient, test_admin: Dict[
             assert session_summary.created_at is not None
 
     @pytest.mark.asyncio
-    async def test_get_replay_session_details(self, client: AsyncClient, test_admin: Dict[str, str]) -> None:
+    async def test_get_replay_session_details(self, authenticated_admin_client: AsyncClient) -> None:
         """Test getting detailed information about a replay session."""
-        # Already authenticated via test_admin fixture
-
         # Create a session first
         replay_request = ReplayRequest(
             replay_type=ReplayType.QUERY,
             target=ReplayTarget.KAFKA,
-            event_types=[EventType.USER_LOGGED_IN],
-            start_time=datetime.now(timezone.utc) - timedelta(hours=24),
-            end_time=datetime.now(timezone.utc),
+            filter=ReplayFilter(
+                event_types=[EventType.USER_LOGGED_IN],
+                start_time=datetime.now(timezone.utc) - timedelta(hours=24),
+                end_time=datetime.now(timezone.utc),
+            ),
             speed_multiplier=2.0,
         ).model_dump(mode="json")
 
-        create_response = await client.post("/api/v1/replay/sessions", json=replay_request)
+        create_response = await authenticated_admin_client.post("/api/v1/replay/sessions", json=replay_request)
         assert create_response.status_code == 200
 
         session_id = create_response.json()["session_id"]
 
         # Get session details
-        detail_response = await client.get(f"/api/v1/replay/sessions/{session_id}")
+        detail_response = await authenticated_admin_client.get(f"/api/v1/replay/sessions/{session_id}")
         assert detail_response.status_code in [200, 404]
         if detail_response.status_code != 200:
             return
@@ -121,33 +110,27 @@ async def test_get_replay_session_details(self, client: AsyncClient, test_admin:
         assert session.created_at is not None
 
     @pytest.mark.asyncio
-    async def test_start_replay_session(self, client: AsyncClient, test_admin: Dict[str, str]) -> None:
+    async def test_start_replay_session(self, authenticated_admin_client: AsyncClient) -> None:
         """Test starting a replay session."""
-        # Login as admin
-        login_data = {
-            "username": test_admin["username"],
-            "password": test_admin["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # Create a session
         replay_request = ReplayRequest(
             replay_type=ReplayType.QUERY,
             target=ReplayTarget.KAFKA,
-            event_types=[EventType.SYSTEM_ERROR],
-            start_time=datetime.now(timezone.utc) - timedelta(hours=1),
-            end_time=datetime.now(timezone.utc),
+            filter=ReplayFilter(
+                event_types=[EventType.SYSTEM_ERROR],
+                start_time=datetime.now(timezone.utc) - timedelta(hours=1),
+                end_time=datetime.now(timezone.utc),
+            ),
             speed_multiplier=1.0,
         ).model_dump(mode="json")
 
-        create_response = await client.post("/api/v1/replay/sessions", json=replay_request)
+        create_response = await authenticated_admin_client.post("/api/v1/replay/sessions", json=replay_request)
         assert create_response.status_code == 200
 
         session_id = create_response.json()["session_id"]
 
         # Start the session
-        start_response = await client.post(f"/api/v1/replay/sessions/{session_id}/start")
+        start_response = await authenticated_admin_client.post(f"/api/v1/replay/sessions/{session_id}/start")
         assert start_response.status_code in [200, 404]
         if start_response.status_code != 200:
             return
@@ -160,39 +143,33 @@ async def test_start_replay_session(self, client: AsyncClient, test_admin: Dict[
         assert start_result.message is not None
 
     @pytest.mark.asyncio
-    async def test_pause_and_resume_replay_session(self, client: AsyncClient, test_admin: Dict[str, str]) -> None:
+    async def test_pause_and_resume_replay_session(self, authenticated_admin_client: AsyncClient) -> None:
         """Test pausing and resuming a replay session."""
-        # Login as admin
-        login_data = {
-            "username": test_admin["username"],
-            "password": test_admin["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # Create and start a session
         replay_request = ReplayRequest(
             replay_type=ReplayType.QUERY,
             target=ReplayTarget.KAFKA,
-            event_types=[EventType.SYSTEM_ERROR],
-            start_time=datetime.now(timezone.utc) - timedelta(hours=2),
-            end_time=datetime.now(timezone.utc),
+            filter=ReplayFilter(
+                event_types=[EventType.SYSTEM_ERROR],
+                start_time=datetime.now(timezone.utc) - timedelta(hours=2),
+                end_time=datetime.now(timezone.utc),
+            ),
             speed_multiplier=0.5,
         ).model_dump(mode="json")
 
-        create_response = await client.post("/api/v1/replay/sessions", json=replay_request)
+        create_response = await authenticated_admin_client.post("/api/v1/replay/sessions", json=replay_request)
         assert create_response.status_code == 200
 
         session_id = create_response.json()["session_id"]
 
         # Start the session
-        start_response = await client.post(f"/api/v1/replay/sessions/{session_id}/start")
+        start_response = await authenticated_admin_client.post(f"/api/v1/replay/sessions/{session_id}/start")
         assert start_response.status_code in [200, 404]
         if start_response.status_code != 200:
             return
 
         # Pause the session
-        pause_response = await client.post(f"/api/v1/replay/sessions/{session_id}/pause")
+        pause_response = await authenticated_admin_client.post(f"/api/v1/replay/sessions/{session_id}/pause")
         # Could succeed or fail if session already completed or not found
         assert pause_response.status_code in [200, 400, 404]
 
@@ -205,7 +182,7 @@ async def test_pause_and_resume_replay_session(self, client: AsyncClient, test_a
 
             # If paused, try to resume
             if pause_result.status == "paused":
-                resume_response = await client.post(f"/api/v1/replay/sessions/{session_id}/resume")
+                resume_response = await authenticated_admin_client.post(f"/api/v1/replay/sessions/{session_id}/resume")
                 assert resume_response.status_code == 200
 
                 resume_data = resume_response.json()
@@ -215,33 +192,27 @@ async def test_pause_and_resume_replay_session(self, client: AsyncClient, test_a
                 assert resume_result.status in [ReplayStatus.RUNNING, ReplayStatus.COMPLETED]
 
     @pytest.mark.asyncio
-    async def test_cancel_replay_session(self, client: AsyncClient, test_admin: Dict[str, str]) -> None:
+    async def test_cancel_replay_session(self, authenticated_admin_client: AsyncClient) -> None:
         """Test cancelling a replay session."""
-        # Login as admin
-        login_data = {
-            "username": test_admin["username"],
-            "password": test_admin["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # Create a session
         replay_request = ReplayRequest(
             replay_type=ReplayType.QUERY,
             target=ReplayTarget.KAFKA,
-            event_types=[EventType.SYSTEM_ERROR],
-            start_time=datetime.now(timezone.utc) - timedelta(hours=1),
-            end_time=datetime.now(timezone.utc),
+            filter=ReplayFilter(
+                event_types=[EventType.SYSTEM_ERROR],
+                start_time=datetime.now(timezone.utc) - timedelta(hours=1),
+                end_time=datetime.now(timezone.utc),
+            ),
             speed_multiplier=1.0,
         ).model_dump(mode="json")
 
-        create_response = await client.post("/api/v1/replay/sessions", json=replay_request)
+        create_response = await authenticated_admin_client.post("/api/v1/replay/sessions", json=replay_request)
         assert create_response.status_code == 200
 
         session_id = create_response.json()["session_id"]
 
         # Cancel the session
-        cancel_response = await client.post(f"/api/v1/replay/sessions/{session_id}/cancel")
+        cancel_response = await authenticated_admin_client.post(f"/api/v1/replay/sessions/{session_id}/cancel")
         assert cancel_response.status_code in [200, 404]
         if cancel_response.status_code != 200:
             return
@@ -254,16 +225,8 @@ async def test_cancel_replay_session(self, client: AsyncClient, test_admin: Dict
         assert cancel_result.message is not None
 
     @pytest.mark.asyncio
-    async def test_filter_sessions_by_status(self, client: AsyncClient, test_admin: Dict[str, str]) -> None:
+    async def test_filter_sessions_by_status(self, authenticated_admin_client: AsyncClient) -> None:
         """Test filtering replay sessions by status."""
-        # Login as admin
-        login_data = {
-            "username": test_admin["username"],
-            "password": test_admin["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # Test different status filters
         for status in [
             ReplayStatus.CREATED.value,
@@ -272,7 +235,7 @@ async def test_filter_sessions_by_status(self, client: AsyncClient, test_admin:
             ReplayStatus.FAILED.value,
             ReplayStatus.CANCELLED.value,
         ]:
-            response = await client.get(f"/api/v1/replay/sessions?status={status}&limit=5")
+            response = await authenticated_admin_client.get(f"/api/v1/replay/sessions?status={status}&limit=5")
             assert response.status_code in [200, 404]
             if response.status_code != 200:
                 continue
@@ -286,18 +249,10 @@ async def test_filter_sessions_by_status(self, client: AsyncClient, test_admin:
                 assert session.status == status
 
     @pytest.mark.asyncio
-    async def test_cleanup_old_sessions(self, client: AsyncClient, test_admin: Dict[str, str]) -> None:
+    async def test_cleanup_old_sessions(self, authenticated_admin_client: AsyncClient) -> None:
         """Test cleanup of old replay sessions."""
-        # Login as admin
-        login_data = {
-            "username": test_admin["username"],
-            "password": test_admin["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # Cleanup sessions older than 24 hours
-        cleanup_response = await client.post("/api/v1/replay/cleanup?older_than_hours=24")
+        cleanup_response = await authenticated_admin_client.post("/api/v1/replay/cleanup?older_than_hours=24")
         assert cleanup_response.status_code == 200
 
         cleanup_data = cleanup_response.json()
@@ -308,19 +263,13 @@ async def test_cleanup_old_sessions(self, client: AsyncClient, test_admin: Dict[
         assert cleanup_result.message is not None
 
     @pytest.mark.asyncio
-    async def test_get_nonexistent_session(self, client: AsyncClient, test_admin: Dict[str, str]) -> None:
+    async def test_get_nonexistent_session(
+        self, authenticated_admin_client: AsyncClient, unique_id: Callable[[str], str]
+    ) -> None:
         """Test getting a non-existent replay session."""
-        # Login as admin
-        login_data = {
-            "username": test_admin["username"],
-            "password": test_admin["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # Try to get non-existent session
-        fake_session_id = str(uuid4())
-        response = await client.get(f"/api/v1/replay/sessions/{fake_session_id}")
+        fake_session_id = unique_id("session-")
+        response = await authenticated_admin_client.get(f"/api/v1/replay/sessions/{fake_session_id}")
         # Could return 404 or empty result
         assert response.status_code in [200, 404]
 
@@ -329,36 +278,24 @@ async def test_get_nonexistent_session(self, client: AsyncClient, test_admin: Di
             assert "detail" in error_data
 
     @pytest.mark.asyncio
-    async def test_start_nonexistent_session(self, client: AsyncClient, test_admin: Dict[str, str]) -> None:
+    async def test_start_nonexistent_session(
+        self, authenticated_admin_client: AsyncClient, unique_id: Callable[[str], str]
+    ) -> None:
         """Test starting a non-existent replay session."""
-        # Login as admin
-        login_data = {
-            "username": test_admin["username"],
-            "password": test_admin["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # Try to start non-existent session
-        fake_session_id = str(uuid4())
-        response = await client.post(f"/api/v1/replay/sessions/{fake_session_id}/start")
+        fake_session_id = unique_id("session-")
+        response = await authenticated_admin_client.post(f"/api/v1/replay/sessions/{fake_session_id}/start")
         # Should fail
         assert response.status_code in [400, 404]
 
     @pytest.mark.asyncio
-    async def test_replay_session_state_transitions(self, client: AsyncClient, test_admin: Dict[str, str]) -> None:
+    async def test_replay_session_state_transitions(
+        self, authenticated_admin_client: AsyncClient, unique_id: Callable[[str], str],
+    ) -> None:
         """Test valid state transitions for replay sessions."""
-        # Login as admin
-        login_data = {
-            "username": test_admin["username"],
-            "password": test_admin["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # Create a session
         replay_request = {
-            "name": f"State Test Session {uuid4().hex[:8]}",
+            "name": f"State Test Session {unique_id('')}",
             "description": "Testing state transitions",
             "filters": {
                 "event_types": ["state.test.event"],
@@ -369,7 +306,7 @@ async def test_replay_session_state_transitions(self, client: AsyncClient, test_
             "speed_multiplier": 1.0
         }
 
-        create_response = await client.post("/api/v1/replay/sessions", json=replay_request)
+        create_response = await authenticated_admin_client.post("/api/v1/replay/sessions", json=replay_request)
         assert create_response.status_code in [200, 422]
         if create_response.status_code != 200:
             return
@@ -379,31 +316,25 @@ async def test_replay_session_state_transitions(self, client: AsyncClient, test_
         assert initial_status == ReplayStatus.CREATED
 
         # Can't pause a session that hasn't started
-        pause_response = await client.post(f"/api/v1/replay/sessions/{session_id}/pause")
+        pause_response = await authenticated_admin_client.post(f"/api/v1/replay/sessions/{session_id}/pause")
         assert pause_response.status_code in [400, 409]  # Invalid state transition
 
         # Can start from pending
-        start_response = await client.post(f"/api/v1/replay/sessions/{session_id}/start")
+        start_response = await authenticated_admin_client.post(f"/api/v1/replay/sessions/{session_id}/start")
         assert start_response.status_code == 200
 
         # Can't start again if already running
-        start_again_response = await client.post(f"/api/v1/replay/sessions/{session_id}/start")
+        start_again_response = await authenticated_admin_client.post(f"/api/v1/replay/sessions/{session_id}/start")
         assert start_again_response.status_code in [200, 400, 409]  # Might be idempotent or error
 
     @pytest.mark.asyncio
-    async def test_replay_with_complex_filters(self, client: AsyncClient, test_admin: Dict[str, str]) -> None:
+    async def test_replay_with_complex_filters(
+        self, authenticated_admin_client: AsyncClient, unique_id: Callable[[str], str],
+    ) -> None:
         """Test creating replay session with complex filters."""
-        # Login as admin
-        login_data = {
-            "username": test_admin["username"],
-            "password": test_admin["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # Create session with complex filters
         replay_request = {
-            "name": f"Complex Filter Session {uuid4().hex[:8]}",
+            "name": f"Complex Filter Session {unique_id('')}",
             "description": "Testing complex event filters",
             "filters": {
                 "event_types": [
@@ -414,9 +345,9 @@ async def test_replay_with_complex_filters(self, client: AsyncClient, test_admin
                 ],
                 "start_time": (datetime.now(timezone.utc) - timedelta(days=30)).isoformat(),
                 "end_time": datetime.now(timezone.utc).isoformat(),
-                "aggregate_id": str(uuid4()),
-                "correlation_id": str(uuid4()),
-                "user_id": test_admin.get("user_id"),
+                "aggregate_id": unique_id("aggregate-"),
+                "correlation_id": unique_id("corr-"),
+                "user_id": unique_id("user-"),
                 "service_name": "execution-service"
             },
             "target_topic": "complex-filter-topic",
@@ -425,7 +356,7 @@ async def test_replay_with_complex_filters(self, client: AsyncClient, test_admin
             "batch_size": 100
         }
 
-        response = await client.post("/api/v1/replay/sessions", json=replay_request)
+        response = await authenticated_admin_client.post("/api/v1/replay/sessions", json=replay_request)
         assert response.status_code in [200, 422]
         if response.status_code != 200:
             return
@@ -437,19 +368,13 @@ async def test_replay_with_complex_filters(self, client: AsyncClient, test_admin
         assert replay_response.status in ["created", "pending"]
 
     @pytest.mark.asyncio
-    async def test_replay_session_progress_tracking(self, client: AsyncClient, test_admin: Dict[str, str]) -> None:
+    async def test_replay_session_progress_tracking(
+        self, authenticated_admin_client: AsyncClient, unique_id: Callable[[str], str],
+    ) -> None:
         """Test tracking progress of replay sessions."""
-        # Login as admin
-        login_data = {
-            "username": test_admin["username"],
-            "password": test_admin["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # Create and start a session
         replay_request = {
-            "name": f"Progress Test Session {uuid4().hex[:8]}",
+            "name": f"Progress Test Session {unique_id('')}",
             "description": "Testing progress tracking",
             "filters": {
                 "event_types": ["progress.test.event"],
@@ -460,7 +385,7 @@ async def test_replay_session_progress_tracking(self, client: AsyncClient, test_
             "speed_multiplier": 10.0  # Fast replay
         }
 
-        create_response = await client.post("/api/v1/replay/sessions", json=replay_request)
+        create_response = await authenticated_admin_client.post("/api/v1/replay/sessions", json=replay_request)
         assert create_response.status_code in [200, 422]
         if create_response.status_code != 200:
             return
@@ -468,18 +393,18 @@ async def test_replay_session_progress_tracking(self, client: AsyncClient, test_
         session_id = create_response.json()["session_id"]
 
         # Start the session
-        await client.post(f"/api/v1/replay/sessions/{session_id}/start")
+        await authenticated_admin_client.post(f"/api/v1/replay/sessions/{session_id}/start")
 
         # Poll progress without fixed sleeps
-        async def _check_progress_once() -> None:
-            detail_response = await client.get(f"/api/v1/replay/sessions/{session_id}")
+        @backoff.on_exception(backoff.constant, AssertionError, max_time=5.0, interval=0.5)
+        async def _wait_progress() -> None:
+            detail_response = await authenticated_admin_client.get(f"/api/v1/replay/sessions/{session_id}")
             assert detail_response.status_code == 200
             session_data = detail_response.json()
             session = ReplaySession(**session_data)
-            if session.events_replayed is not None and session.events_total is not None:
-                assert 0 <= session.events_replayed <= session.events_total
-                if session.events_total > 0:
-                    progress = (session.events_replayed / session.events_total) * 100
-                    assert 0.0 <= progress <= 100.0
+            assert 0 <= session.replayed_events <= session.total_events
+            if session.total_events > 0:
+                progress = (session.replayed_events / session.total_events) * 100
+                assert 0.0 <= progress <= 100.0
 
-        await eventually(_check_progress_once, timeout=5.0, interval=0.5)
+        await _wait_progress()
diff --git a/backend/tests/integration/test_saga_routes.py b/backend/tests/integration/test_saga_routes.py
index b26d7b90..00b08ada 100644
--- a/backend/tests/integration/test_saga_routes.py
+++ b/backend/tests/integration/test_saga_routes.py
@@ -1,72 +1,68 @@
 import asyncio
-import uuid
-from typing import Dict
+from collections.abc import Callable
 
 import pytest
 from app.domain.enums.saga import SagaState
+from app.domain.enums.user import UserRole
 from app.schemas_pydantic.saga import (
     SagaListResponse,
     SagaStatusResponse,
 )
 from httpx import AsyncClient
 
+from tests.conftest import MakeUser
+
 
 class TestSagaRoutes:
     """Test saga routes against the real backend."""
 
     @pytest.mark.asyncio
-    async def test_get_saga_requires_auth(self, client: AsyncClient) -> None:
+    async def test_get_saga_requires_auth(self, client: AsyncClient, unique_id: Callable[[str], str]) -> None:
         """Test that getting saga status requires authentication."""
-        saga_id = str(uuid.uuid4())
+        saga_id = unique_id("saga-")
         response = await client.get(f"/api/v1/sagas/{saga_id}")
         assert response.status_code == 401
         assert "Not authenticated" in response.json()["detail"]
 
     @pytest.mark.asyncio
     async def test_get_saga_not_found(
-            self, client: AsyncClient, test_user: Dict[str, str]
+            self, authenticated_client: AsyncClient, unique_id: Callable[[str], str]
     ) -> None:
         """Test getting non-existent saga returns 404."""
-        # Already authenticated via test_user fixture
-
         # Try to get non-existent saga
-        saga_id = str(uuid.uuid4())
-        response = await client.get(f"/api/v1/sagas/{saga_id}")
+        saga_id = unique_id("saga-")
+        response = await authenticated_client.get(f"/api/v1/sagas/{saga_id}")
         assert response.status_code == 404
         assert "not found" in response.json()["detail"]
 
     @pytest.mark.asyncio
     async def test_get_execution_sagas_requires_auth(
-            self, client: AsyncClient
+            self, client: AsyncClient, unique_id: Callable[[str], str]
     ) -> None:
         """Test that getting execution sagas requires authentication."""
-        execution_id = str(uuid.uuid4())
+        execution_id = unique_id("exec-")
         response = await client.get(f"/api/v1/sagas/execution/{execution_id}")
         assert response.status_code == 401
 
     @pytest.mark.asyncio
     async def test_get_execution_sagas_empty(
-            self, client: AsyncClient, test_user: Dict[str, str]
+            self, authenticated_client: AsyncClient, unique_id: Callable[[str], str]
     ) -> None:
         """Test getting sagas for execution with no sagas."""
-        # Already authenticated via test_user fixture
-
         # Get sagas for non-existent execution
-        execution_id = str(uuid.uuid4())
-        response = await client.get(f"/api/v1/sagas/execution/{execution_id}")
+        execution_id = unique_id("exec-")
+        response = await authenticated_client.get(f"/api/v1/sagas/execution/{execution_id}")
         # Access to a random execution (non-owned) must be forbidden
         assert response.status_code == 403
 
     @pytest.mark.asyncio
     async def test_get_execution_sagas_with_state_filter(
-            self, client: AsyncClient, test_user: Dict[str, str]
+            self, authenticated_client: AsyncClient, unique_id: Callable[[str], str]
     ) -> None:
         """Test getting execution sagas filtered by state."""
-        # Already authenticated via test_user fixture
-
         # Get sagas filtered by running state
-        execution_id = str(uuid.uuid4())
-        response = await client.get(
+        execution_id = unique_id("exec-")
+        response = await authenticated_client.get(
             f"/api/v1/sagas/execution/{execution_id}",
             params={"state": SagaState.RUNNING.value}
         )
@@ -85,13 +81,11 @@ async def test_list_sagas_requires_auth(self, client: AsyncClient) -> None:
 
     @pytest.mark.asyncio
     async def test_list_sagas_paginated(
-            self, client: AsyncClient, test_user: Dict[str, str]
+            self, authenticated_client: AsyncClient
     ) -> None:
         """Test listing sagas with pagination."""
-        # Already authenticated via test_user fixture
-
         # List sagas with pagination
-        response = await client.get(
+        response = await authenticated_client.get(
             "/api/v1/sagas/",
             params={"limit": 10, "offset": 0}
         )
@@ -104,19 +98,11 @@ async def test_list_sagas_paginated(
 
     @pytest.mark.asyncio
     async def test_list_sagas_with_state_filter(
-            self, client: AsyncClient, test_user: Dict[str, str]
+            self, authenticated_client: AsyncClient
     ) -> None:
         """Test listing sagas filtered by state."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # List completed sagas
-        response = await client.get(
+        response = await authenticated_client.get(
             "/api/v1/sagas/",
             params={"state": SagaState.COMPLETED.value, "limit": 5}
         )
@@ -130,19 +116,11 @@ async def test_list_sagas_with_state_filter(
 
     @pytest.mark.asyncio
     async def test_list_sagas_large_limit(
-            self, client: AsyncClient, test_user: Dict[str, str]
+            self, authenticated_client: AsyncClient
     ) -> None:
         """Test listing sagas with maximum limit."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # List with max limit
-        response = await client.get(
+        response = await authenticated_client.get(
             "/api/v1/sagas/",
             params={"limit": 1000}
         )
@@ -153,113 +131,69 @@ async def test_list_sagas_large_limit(
 
     @pytest.mark.asyncio
     async def test_list_sagas_invalid_limit(
-            self, client: AsyncClient, test_user: Dict[str, str]
+            self, authenticated_client: AsyncClient
     ) -> None:
         """Test listing sagas with invalid limit."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # Try with limit too large
-        response = await client.get(
+        response = await authenticated_client.get(
             "/api/v1/sagas/",
             params={"limit": 10000}
         )
         assert response.status_code == 422  # Validation error
 
     @pytest.mark.asyncio
-    async def test_cancel_saga_requires_auth(self, client: AsyncClient) -> None:
+    async def test_cancel_saga_requires_auth(self, client: AsyncClient, unique_id: Callable[[str], str]) -> None:
         """Test that cancelling saga requires authentication."""
-        saga_id = str(uuid.uuid4())
+        saga_id = unique_id("saga-")
         response = await client.post(f"/api/v1/sagas/{saga_id}/cancel")
         assert response.status_code == 401
 
     @pytest.mark.asyncio
     async def test_cancel_saga_not_found(
-            self, client: AsyncClient, test_user: Dict[str, str]
+            self, authenticated_client: AsyncClient, unique_id: Callable[[str], str]
     ) -> None:
         """Test cancelling non-existent saga returns 404."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # Try to cancel non-existent saga
-        saga_id = str(uuid.uuid4())
-        response = await client.post(f"/api/v1/sagas/{saga_id}/cancel")
+        saga_id = unique_id("saga-")
+        response = await authenticated_client.post(f"/api/v1/sagas/{saga_id}/cancel")
         assert response.status_code == 404
         assert "not found" in response.json()["detail"]
 
     @pytest.mark.asyncio
     async def test_saga_access_control(
-            self,
-            client: AsyncClient,
-            test_user: Dict[str, str],
-            another_user: Dict[str, str]
+        self, client: AsyncClient, make_user: MakeUser,
     ) -> None:
         """Test that users can only access their own sagas."""
-        # User 1 lists their sagas
-        login_data1 = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response1 = await client.post("/api/v1/auth/login", data=login_data1)
-        assert login_response1.status_code == 200
-
+        # Create user1 and fetch their sagas immediately (make_user logs in)
+        await make_user(UserRole.USER)
         response1 = await client.get("/api/v1/sagas/")
         assert response1.status_code == 200
         user1_sagas = SagaListResponse(**response1.json())
 
-        # Logout
-        await client.post("/api/v1/auth/logout")
-
-        # User 2 lists their sagas
-        login_data2 = {
-            "username": another_user["username"],
-            "password": another_user["password"]
-        }
-        login_response2 = await client.post("/api/v1/auth/login", data=login_data2)
-        assert login_response2.status_code == 200
-
+        # Create user2 and fetch their sagas immediately (make_user logs in)
+        await make_user(UserRole.USER)
         response2 = await client.get("/api/v1/sagas/")
         assert response2.status_code == 200
         user2_sagas = SagaListResponse(**response2.json())
 
         # Each user should see only their own sagas
-        # (we can't verify the exact content without creating sagas,
-        # but we can verify the endpoint works correctly)
         assert isinstance(user1_sagas.sagas, list)
         assert isinstance(user2_sagas.sagas, list)
 
     @pytest.mark.asyncio
     async def test_get_saga_with_details(
-            self, client: AsyncClient, test_user: Dict[str, str]
+            self, authenticated_client: AsyncClient
     ) -> None:
         """Test getting saga with all details when it exists."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # First list sagas to potentially find one
-        list_response = await client.get("/api/v1/sagas/", params={"limit": 1})
+        list_response = await authenticated_client.get("/api/v1/sagas/", params={"limit": 1})
         assert list_response.status_code == 200
         saga_list = SagaListResponse(**list_response.json())
 
         if saga_list.sagas and len(saga_list.sagas) > 0:
             # Get details of the first saga
             saga_id = saga_list.sagas[0].saga_id
-            response = await client.get(f"/api/v1/sagas/{saga_id}")
+            response = await authenticated_client.get(f"/api/v1/sagas/{saga_id}")
 
             # Could be 200 if accessible or 403 if not owned by user
             assert response.status_code in [200, 403, 404]
@@ -271,19 +205,11 @@ async def test_get_saga_with_details(
 
     @pytest.mark.asyncio
     async def test_list_sagas_with_offset(
-            self, client: AsyncClient, test_user: Dict[str, str]
+            self, authenticated_client: AsyncClient
     ) -> None:
         """Test listing sagas with offset for pagination."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # Get first page
-        response1 = await client.get(
+        response1 = await authenticated_client.get(
             "/api/v1/sagas/",
             params={"limit": 5, "offset": 0}
         )
@@ -291,7 +217,7 @@ async def test_list_sagas_with_offset(
         page1 = SagaListResponse(**response1.json())
 
         # Get second page
-        response2 = await client.get(
+        response2 = await authenticated_client.get(
             "/api/v1/sagas/",
             params={"limit": 5, "offset": 5}
         )
@@ -307,19 +233,11 @@ async def test_list_sagas_with_offset(
 
     @pytest.mark.asyncio
     async def test_cancel_saga_invalid_state(
-            self, client: AsyncClient, test_user: Dict[str, str]
+            self, authenticated_client: AsyncClient
     ) -> None:
         """Test cancelling a saga in invalid state (if one exists)."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # Try to find a completed saga to cancel
-        response = await client.get(
+        response = await authenticated_client.get(
             "/api/v1/sagas/",
             params={"state": SagaState.COMPLETED.value, "limit": 1}
         )
@@ -329,29 +247,21 @@ async def test_cancel_saga_invalid_state(
         if saga_list.sagas and len(saga_list.sagas) > 0:
             # Try to cancel completed saga (should fail)
             saga_id = saga_list.sagas[0].saga_id
-            cancel_response = await client.post(f"/api/v1/sagas/{saga_id}/cancel")
+            cancel_response = await authenticated_client.post(f"/api/v1/sagas/{saga_id}/cancel")
             # Should get 400 (invalid state) or 403 (access denied) or 404
             assert cancel_response.status_code in [400, 403, 404]
 
     @pytest.mark.asyncio
     async def test_get_execution_sagas_multiple_states(
-            self, client: AsyncClient, test_user: Dict[str, str]
+            self, authenticated_client: AsyncClient, unique_id: Callable[[str], str]
     ) -> None:
         """Test getting execution sagas across different states."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
-        execution_id = str(uuid.uuid4())
+        execution_id = unique_id("exec-")
 
         # Test each state filter
         for state in [SagaState.CREATED, SagaState.RUNNING, SagaState.COMPLETED,
                       SagaState.FAILED, SagaState.CANCELLED]:
-            response = await client.get(
+            response = await authenticated_client.get(
                 f"/api/v1/sagas/execution/{execution_id}",
                 params={"state": state.value}
             )
@@ -367,19 +277,11 @@ async def test_get_execution_sagas_multiple_states(
 
     @pytest.mark.asyncio
     async def test_saga_response_structure(
-            self, client: AsyncClient, test_user: Dict[str, str]
+            self, authenticated_client: AsyncClient
     ) -> None:
         """Test that saga responses have correct structure."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # List sagas to verify response structure
-        response = await client.get("/api/v1/sagas/", params={"limit": 1})
+        response = await authenticated_client.get("/api/v1/sagas/", params={"limit": 1})
         assert response.status_code == 200
 
         saga_list = SagaListResponse(**response.json())
@@ -398,21 +300,13 @@ async def test_saga_response_structure(
 
     @pytest.mark.asyncio
     async def test_concurrent_saga_access(
-            self, client: AsyncClient, test_user: Dict[str, str]
+            self, authenticated_client: AsyncClient
     ) -> None:
         """Test concurrent access to saga endpoints."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # Make multiple concurrent requests
         tasks = []
         for i in range(5):
-            tasks.append(client.get(
+            tasks.append(authenticated_client.get(
                 "/api/v1/sagas/",
                 params={"limit": 10, "offset": i * 10}
             ))
diff --git a/backend/tests/integration/test_saved_scripts_routes.py b/backend/tests/integration/test_saved_scripts_routes.py
index cc42b39c..2b5d086e 100644
--- a/backend/tests/integration/test_saved_scripts_routes.py
+++ b/backend/tests/integration/test_saved_scripts_routes.py
@@ -1,13 +1,13 @@
+from collections.abc import Callable
 from datetime import datetime, timezone
-from typing import Dict
-from uuid import UUID, uuid4
+from uuid import UUID
 
 import pytest
+from app.domain.enums.user import UserRole
+from app.schemas_pydantic.saved_script import SavedScriptResponse
 from httpx import AsyncClient
 
-from app.schemas_pydantic.saved_script import (
-    SavedScriptResponse
-)
+from tests.conftest import MakeUser
 
 
 @pytest.mark.integration
@@ -33,22 +33,22 @@ async def test_create_script_requires_authentication(self, client: AsyncClient)
                    for word in ["not authenticated", "unauthorized", "login"])
 
     @pytest.mark.asyncio
-    async def test_create_and_retrieve_saved_script(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_create_and_retrieve_saved_script(
+        self, authenticated_client: AsyncClient, unique_id: Callable[[str], str],
+    ) -> None:
         """Test creating and retrieving a saved script."""
-        # Already authenticated via test_user fixture
-
         # Create a unique script
-        unique_id = str(uuid4())[:8]
+        uid = unique_id("")
         script_data = {
-            "name": f"Test Script {unique_id}",
-            "script": f"# Script {unique_id}\nprint('Hello from saved script {unique_id}')",
+            "name": f"Test Script {uid}",
+            "script": f"# Script {uid}\nprint('Hello from saved script {uid}')",
             "lang": "python",
             "lang_version": "3.11",
             "description": f"Test script created at {datetime.now(timezone.utc).isoformat()}"
         }
 
         # Create the script
-        create_response = await client.post("/api/v1/scripts", json=script_data)
+        create_response = await authenticated_client.post("/api/v1/scripts", json=script_data)
         assert create_response.status_code in [200, 201]
 
         # Validate response structure
@@ -77,7 +77,7 @@ async def test_create_and_retrieve_saved_script(self, client: AsyncClient, test_
         assert saved_script.updated_at is not None
 
         # Now retrieve the script by ID
-        get_response = await client.get(f"/api/v1/scripts/{saved_script.script_id}")
+        get_response = await authenticated_client.get(f"/api/v1/scripts/{saved_script.script_id}")
         assert get_response.status_code == 200
 
         retrieved_data = get_response.json()
@@ -89,29 +89,29 @@ async def test_create_and_retrieve_saved_script(self, client: AsyncClient, test_
         assert retrieved_script.script == script_data["script"]
 
     @pytest.mark.asyncio
-    async def test_list_user_scripts(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_list_user_scripts(
+        self, authenticated_client: AsyncClient, unique_id: Callable[[str], str]
+    ) -> None:
         """Test listing user's saved scripts."""
-        # Already authenticated via test_user fixture
-
         # Create a few scripts
-        unique_id = str(uuid4())[:8]
+        uid = unique_id("")
         scripts_to_create = [
             {
-                "name": f"List Test Script 1 {unique_id}",
+                "name": f"List Test Script 1 {uid}",
                 "script": "print('Script 1')",
                 "lang": "python",
                 "lang_version": "3.11",
                 "description": "First script"
             },
             {
-                "name": f"List Test Script 2 {unique_id}",
+                "name": f"List Test Script 2 {uid}",
                 "script": "console.log('Script 2');",
                 "lang": "javascript",
                 "lang_version": "18",
                 "description": "Second script"
             },
             {
-                "name": f"List Test Script 3 {unique_id}",
+                "name": f"List Test Script 3 {uid}",
                 "script": "print('Script 3')",
                 "lang": "python",
                 "lang_version": "3.10"
@@ -120,12 +120,12 @@ async def test_list_user_scripts(self, client: AsyncClient, test_user: Dict[str,
 
         created_ids = []
         for script_data in scripts_to_create:
-            create_response = await client.post("/api/v1/scripts", json=script_data)
+            create_response = await authenticated_client.post("/api/v1/scripts", json=script_data)
             if create_response.status_code in [200, 201]:
                 created_ids.append(create_response.json()["script_id"])
 
         # List all scripts
-        list_response = await client.get("/api/v1/scripts")
+        list_response = await authenticated_client.get("/api/v1/scripts")
         assert list_response.status_code == 200
 
         scripts_list = list_response.json()
@@ -149,21 +149,21 @@ async def test_list_user_scripts(self, client: AsyncClient, test_user: Dict[str,
             assert created_id in returned_ids
 
     @pytest.mark.asyncio
-    async def test_update_saved_script(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_update_saved_script(
+        self, authenticated_client: AsyncClient, unique_id: Callable[[str], str]
+    ) -> None:
         """Test updating a saved script."""
-        # Already authenticated via test_user fixture
-
         # Create a script
-        unique_id = str(uuid4())[:8]
+        uid = unique_id("")
         original_data = {
-            "name": f"Original Script {unique_id}",
+            "name": f"Original Script {uid}",
             "script": "print('Original content')",
             "lang": "python",
             "lang_version": "3.11",
             "description": "Original description"
         }
 
-        create_response = await client.post("/api/v1/scripts", json=original_data)
+        create_response = await authenticated_client.post("/api/v1/scripts", json=original_data)
         assert create_response.status_code in [200, 201]
 
         created_script = create_response.json()
@@ -172,14 +172,14 @@ async def test_update_saved_script(self, client: AsyncClient, test_user: Dict[st
 
         # Update the script
         updated_data = {
-            "name": f"Updated Script {unique_id}",
+            "name": f"Updated Script {uid}",
             "script": "print('Updated content with more features')",
             "lang": "python",
             "lang_version": "3.12",
             "description": "Updated description with more details"
         }
 
-        update_response = await client.put(f"/api/v1/scripts/{script_id}", json=updated_data)
+        update_response = await authenticated_client.put(f"/api/v1/scripts/{script_id}", json=updated_data)
         assert update_response.status_code == 200
 
         updated_script_data = update_response.json()
@@ -202,31 +202,31 @@ async def test_update_saved_script(self, client: AsyncClient, test_user: Dict[st
         assert updated_script.updated_at > updated_script.created_at
 
     @pytest.mark.asyncio
-    async def test_delete_saved_script(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_delete_saved_script(
+        self, authenticated_client: AsyncClient, unique_id: Callable[[str], str]
+    ) -> None:
         """Test deleting a saved script."""
-        # Already authenticated via test_user fixture
-
         # Create a script to delete
-        unique_id = str(uuid4())[:8]
+        uid = unique_id("")
         script_data = {
-            "name": f"Script to Delete {unique_id}",
+            "name": f"Script to Delete {uid}",
             "script": "print('Delete me')",
             "lang": "python",
             "lang_version": "3.11",
             "description": "This script will be deleted"
         }
 
-        create_response = await client.post("/api/v1/scripts", json=script_data)
+        create_response = await authenticated_client.post("/api/v1/scripts", json=script_data)
         assert create_response.status_code in [200, 201]
 
         script_id = create_response.json()["script_id"]
 
         # Delete the script
-        delete_response = await client.delete(f"/api/v1/scripts/{script_id}")
+        delete_response = await authenticated_client.delete(f"/api/v1/scripts/{script_id}")
         assert delete_response.status_code in [200, 204]
 
         # Verify it's deleted by trying to get it
-        get_response = await client.get(f"/api/v1/scripts/{script_id}")
+        get_response = await authenticated_client.get(f"/api/v1/scripts/{script_id}")
         assert get_response.status_code in [404, 403]
 
         if get_response.status_code == 404:
@@ -234,102 +234,74 @@ async def test_delete_saved_script(self, client: AsyncClient, test_user: Dict[st
             assert "detail" in error_data
 
     @pytest.mark.asyncio
-    async def test_cannot_access_other_users_scripts(self, client: AsyncClient, test_user: Dict[str, str],
-                                                     test_admin: Dict[str, str]) -> None:
+    async def test_cannot_access_other_users_scripts(
+        self,
+        client: AsyncClient,
+        make_user: MakeUser,
+        unique_id: Callable[[str], str],
+    ) -> None:
         """Test that users cannot access scripts created by other users."""
-        # Create a script as regular user
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
-        unique_id = str(uuid4())[:8]
+        # Create user and immediately create their script (make_user logs in with proper headers)
+        await make_user(UserRole.USER)
+        uid = unique_id("")
         user_script_data = {
-            "name": f"User Private Script {unique_id}",
+            "name": f"User Private Script {uid}",
             "script": "print('Private to user')",
             "lang": "python",
             "lang_version": "3.11",
-            "description": "Should only be visible to creating user"
+            "description": "Should only be visible to creating user",
         }
-
         create_response = await client.post("/api/v1/scripts", json=user_script_data)
         assert create_response.status_code in [200, 201]
-
         user_script_id = create_response.json()["script_id"]
 
-        # Now login as admin
-        admin_login_data = {
-            "username": test_admin["username"],
-            "password": test_admin["password"]
-        }
-        admin_login_response = await client.post("/api/v1/auth/login", data=admin_login_data)
-        assert admin_login_response.status_code == 200
+        # Create admin and immediately try to access user's script (make_user logs in with proper headers)
+        await make_user(UserRole.ADMIN)
 
-        # Try to access the user's script as admin
-        # This should fail unless admin has special permissions
+        # Try to access the user's script as admin - should fail
         get_response = await client.get(f"/api/v1/scripts/{user_script_id}")
-        # Should be forbidden or not found
         assert get_response.status_code in [403, 404]
 
         # List scripts as admin - should not include user's script
         list_response = await client.get("/api/v1/scripts")
         assert list_response.status_code == 200
-
-        admin_scripts = list_response.json()
-        admin_script_ids = [s["script_id"] for s in admin_scripts]
-        # User's script should not be in admin's list
+        admin_script_ids = [s["script_id"] for s in list_response.json()]
         assert user_script_id not in admin_script_ids
 
     @pytest.mark.asyncio
-    async def test_script_with_invalid_language(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_script_with_invalid_language(
+        self, authenticated_client: AsyncClient, unique_id: Callable[[str], str],
+    ) -> None:
         """Test that invalid language/version combinations are handled."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
-        unique_id = str(uuid4())[:8]
+        uid = unique_id("")
 
         # Try invalid language
         invalid_lang_data = {
-            "name": f"Invalid Language Script {unique_id}",
+            "name": f"Invalid Language Script {uid}",
             "script": "print('test')",
             "lang": "invalid_language",
             "lang_version": "1.0"
         }
 
-        response = await client.post("/api/v1/scripts", json=invalid_lang_data)
+        response = await authenticated_client.post("/api/v1/scripts", json=invalid_lang_data)
         # Backend may accept arbitrary lang values; accept any outcome
         assert response.status_code in [200, 201, 400, 422]
 
         # Try unsupported version
         unsupported_version_data = {
-            "name": f"Unsupported Version Script {unique_id}",
+            "name": f"Unsupported Version Script {uid}",
             "script": "print('test')",
             "lang": "python",
             "lang_version": "2.7"  # Python 2 likely not supported
         }
 
-        response = await client.post("/api/v1/scripts", json=unsupported_version_data)
+        response = await authenticated_client.post("/api/v1/scripts", json=unsupported_version_data)
         # Might accept but warn, or reject
         assert response.status_code in [200, 201, 400, 422]
 
     @pytest.mark.asyncio
-    async def test_script_name_constraints(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_script_name_constraints(self, authenticated_client: AsyncClient) -> None:
         """Test script name validation and constraints."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         # Test empty name
         empty_name_data = {
             "name": "",
@@ -338,7 +310,7 @@ async def test_script_name_constraints(self, client: AsyncClient, test_user: Dic
             "lang_version": "3.11"
         }
 
-        response = await client.post("/api/v1/scripts", json=empty_name_data)
+        response = await authenticated_client.post("/api/v1/scripts", json=empty_name_data)
         assert response.status_code in [200, 201, 400, 422]
 
         # Test very long name
@@ -349,63 +321,48 @@ async def test_script_name_constraints(self, client: AsyncClient, test_user: Dic
             "lang_version": "3.11"
         }
 
-        response = await client.post("/api/v1/scripts", json=long_name_data)
+        response = await authenticated_client.post("/api/v1/scripts", json=long_name_data)
         # Should either accept or reject based on max length
         if response.status_code in [400, 422]:
             error_data = response.json()
             assert "detail" in error_data
 
     @pytest.mark.asyncio
-    async def test_script_content_size_limits(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_script_content_size_limits(
+        self, authenticated_client: AsyncClient, unique_id: Callable[[str], str]
+    ) -> None:
         """Test script content size limits."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
-        unique_id = str(uuid4())[:8]
+        uid = unique_id("")
 
         # Test reasonably large script (should succeed)
         large_content = "# Large script\n" + "\n".join([f"print('Line {i}')" for i in range(1000)])
         large_script_data = {
-            "name": f"Large Script {unique_id}",
+            "name": f"Large Script {uid}",
             "script": large_content,
             "lang": "python",
             "lang_version": "3.11"
         }
 
-        response = await client.post("/api/v1/scripts", json=large_script_data)
+        response = await authenticated_client.post("/api/v1/scripts", json=large_script_data)
         assert response.status_code in [200, 201]
 
-        # Test excessively large script (should fail)
+        # Test excessively large script (should fail with 413 from RequestSizeLimitMiddleware)
+        # Middleware default is 10MB; 10MB script + JSON overhead exceeds this
         huge_content = "x" * (1024 * 1024 * 10)  # 10MB
         huge_script_data = {
-            "name": f"Huge Script {unique_id}",
+            "name": f"Huge Script {uid}",
             "script": huge_content,
             "lang": "python",
-            "lang_version": "3.11"
+            "lang_version": "3.11",
         }
 
-        response = await client.post("/api/v1/scripts", json=huge_script_data)
-        # If backend returns 500 for oversized payload, skip as environment-specific
-        if response.status_code >= 500:
-            pytest.skip("Backend returned 5xx for oversized script upload")
-        assert response.status_code in [200, 201, 400, 413, 422]
+        response = await authenticated_client.post("/api/v1/scripts", json=huge_script_data)
+        assert response.status_code == 413, f"Expected 413 Payload Too Large, got {response.status_code}"
+        assert "too large" in response.json().get("detail", "").lower()
 
     @pytest.mark.asyncio
-    async def test_update_nonexistent_script(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_update_nonexistent_script(self, authenticated_client: AsyncClient) -> None:
         """Test updating a non-existent script."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         fake_script_id = "00000000-0000-0000-0000-000000000000"
 
         update_data = {
@@ -415,7 +372,7 @@ async def test_update_nonexistent_script(self, client: AsyncClient, test_user: D
             "lang_version": "3.11"
         }
 
-        response = await client.put(f"/api/v1/scripts/{fake_script_id}", json=update_data)
+        response = await authenticated_client.put(f"/api/v1/scripts/{fake_script_id}", json=update_data)
         # Non-existent script must return 404/403 (no server error)
         assert response.status_code in [404, 403]
 
@@ -423,54 +380,43 @@ async def test_update_nonexistent_script(self, client: AsyncClient, test_user: D
         assert "detail" in error_data
 
     @pytest.mark.asyncio
-    async def test_delete_nonexistent_script(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_delete_nonexistent_script(self, authenticated_client: AsyncClient) -> None:
         """Test deleting a non-existent script."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
-
         fake_script_id = "00000000-0000-0000-0000-000000000000"
 
-        response = await client.delete(f"/api/v1/scripts/{fake_script_id}")
+        response = await authenticated_client.delete(f"/api/v1/scripts/{fake_script_id}")
         # Could be 404 (not found) or 204 (idempotent delete)
         assert response.status_code in [404, 403, 204]
 
     @pytest.mark.asyncio
-    async def test_scripts_persist_across_sessions(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_scripts_persist_across_sessions(
+        self, client: AsyncClient, make_user: MakeUser, unique_id: Callable[[str], str],
+    ) -> None:
         """Test that scripts persist across login sessions."""
-        # First session - create script
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_response = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response.status_code == 200
+        user = await make_user(UserRole.USER)
 
-        unique_id = str(uuid4())[:8]
+        uid = unique_id("")
         script_data = {
-            "name": f"Persistent Script {unique_id}",
+            "name": f"Persistent Script {uid}",
             "script": "print('Should persist')",
             "lang": "python",
             "lang_version": "3.11",
-            "description": "Testing persistence"
+            "description": "Testing persistence",
         }
 
         create_response = await client.post("/api/v1/scripts", json=script_data)
         assert create_response.status_code in [200, 201]
-
         script_id = create_response.json()["script_id"]
 
         # Logout
-        logout_response = await client.post("/api/v1/auth/logout")
-        assert logout_response.status_code == 200
-
-        # Second session - retrieve script
-        login_response2 = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_response2.status_code == 200
+        await client.post("/api/v1/auth/logout")
+
+        # Second session - login again and retrieve script
+        login_resp = await client.post(
+            "/api/v1/auth/login",
+            data={"username": user["username"], "password": user["password"]},
+        )
+        assert login_resp.status_code == 200
 
         # Script should still exist
         get_response = await client.get(f"/api/v1/scripts/{script_id}")
diff --git a/backend/tests/integration/test_sse_routes.py b/backend/tests/integration/test_sse_routes.py
index ace4bc48..4809e968 100644
--- a/backend/tests/integration/test_sse_routes.py
+++ b/backend/tests/integration/test_sse_routes.py
@@ -1,19 +1,17 @@
 import asyncio
 import json
-from typing import Dict
-from uuid import uuid4
+from collections.abc import Callable
 
+import backoff
 import pytest
-from httpx import AsyncClient
-
 from app.domain.enums.notification import NotificationSeverity, NotificationStatus
-from app.schemas_pydantic.sse import RedisNotificationMessage, SSEHealthResponse
-from app.infrastructure.kafka.events.pod import PodCreatedEvent
 from app.infrastructure.kafka.events.metadata import AvroEventMetadata
+from app.infrastructure.kafka.events.pod import PodCreatedEvent
+from app.schemas_pydantic.sse import RedisNotificationMessage, SSEHealthResponse
 from app.services.sse.redis_bus import SSERedisBus
 from app.services.sse.sse_service import SSEService
-from tests.helpers.eventually import eventually
-
+from dishka import AsyncContainer
+from httpx import AsyncClient
 
 # Note: httpx with ASGITransport doesn't support SSE streaming
 # We test SSE functionality directly through the service, not HTTP
@@ -24,13 +22,13 @@ class TestSSERoutes:
     """SSE routes tested with deterministic event-driven reads (no polling)."""
 
     @pytest.mark.asyncio
-    async def test_sse_requires_authentication(self, client: AsyncClient) -> None:
+    async def test_sse_requires_authentication(self, client: AsyncClient, unique_id: Callable[[str], str]) -> None:
         r = await client.get("/api/v1/events/notifications/stream")
         assert r.status_code == 401
         detail = r.json().get("detail", "").lower()
         assert any(x in detail for x in ("not authenticated", "unauthorized", "login"))
 
-        exec_id = str(uuid4())
+        exec_id = unique_id("exec-")
         r = await client.get(f"/api/v1/events/executions/{exec_id}")
         assert r.status_code == 401
 
@@ -38,26 +36,29 @@ async def test_sse_requires_authentication(self, client: AsyncClient) -> None:
         assert r.status_code == 401
 
     @pytest.mark.asyncio
-    async def test_sse_health_status(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
-        r = await client.get("/api/v1/events/health")
+    async def test_sse_health_status(self, authenticated_client: AsyncClient) -> None:
+        r = await authenticated_client.get("/api/v1/events/health")
         assert r.status_code == 200
         health = SSEHealthResponse(**r.json())
         assert health.status in ("healthy", "degraded", "unhealthy", "draining")
         assert isinstance(health.active_connections, int) and health.active_connections >= 0
 
     @pytest.mark.asyncio
-    async def test_notification_stream_service(self, scope, test_user: Dict[str, str]) -> None:  # type: ignore[valid-type]
+    async def test_notification_stream_service(
+        self, scope: AsyncContainer, unique_id: Callable[[str], str]
+    ) -> None:
         """Test SSE notification stream directly through service (httpx doesn't support SSE streaming)."""
         sse_service: SSEService = await scope.get(SSEService)
         bus: SSERedisBus = await scope.get(SSERedisBus)
-        user_id = f"user-{uuid4().hex[:8]}"
+        user_id = unique_id("user-")
         
         # Create notification stream generator
         stream_gen = sse_service.create_notification_stream(user_id)
         
         # Collect events with timeout
-        events = []
-        async def collect_events():
+        events: list[dict[str, object]] = []
+
+        async def collect_events() -> None:
             async for event in stream_gen:
                 if "data" in event:
                     data = json.loads(event["data"])
@@ -67,15 +68,17 @@ async def collect_events():
         
         # Start collecting events
         collect_task = asyncio.create_task(collect_events())
-        
+
         # Wait until the initial 'connected' event is received
-        async def _connected() -> None:
+        @backoff.on_exception(backoff.constant, AssertionError, max_time=5.0, interval=0.1)
+        async def _wait_connected() -> None:
             assert len(events) > 0 and events[0].get("event_type") == "connected"
-        await eventually(_connected, timeout=2.0, interval=0.05)
-        
+
+        await _wait_connected()
+
         # Publish a notification
         notification = RedisNotificationMessage(
-            notification_id=f"notif-{uuid4().hex[:8]}",
+            notification_id=unique_id("notif-"),
             severity=NotificationSeverity.MEDIUM,
             status=NotificationStatus.PENDING,
             tags=[],
@@ -88,7 +91,7 @@ async def _connected() -> None:
         
         # Wait for collection to complete
         try:
-            await asyncio.wait_for(collect_task, timeout=2.0)
+            await asyncio.wait_for(collect_task, timeout=5.0)
         except asyncio.TimeoutError:
             collect_task.cancel()
         
@@ -97,9 +100,11 @@ async def _connected() -> None:
         assert len(notif_events) > 0
 
     @pytest.mark.asyncio
-    async def test_execution_event_stream_service(self, scope, test_user: Dict[str, str]) -> None:  # type: ignore[valid-type]
+    async def test_execution_event_stream_service(
+        self, scope: AsyncContainer, unique_id: Callable[[str], str]
+    ) -> None:
         """Test SSE execution stream directly through service (httpx doesn't support SSE streaming)."""
-        exec_id = f"e-{uuid4().hex[:8]}"
+        exec_id = unique_id("e-")
         user_id = "test-user-id"
         
         sse_service: SSEService = await scope.get(SSEService)
@@ -109,8 +114,9 @@ async def test_execution_event_stream_service(self, scope, test_user: Dict[str,
         stream_gen = sse_service.create_execution_stream(exec_id, user_id)
         
         # Collect events
-        events = []
-        async def collect_events():
+        events: list[dict[str, object]] = []
+
+        async def collect_events() -> None:
             async for event in stream_gen:
                 if "data" in event:
                     data = json.loads(event["data"])
@@ -120,12 +126,14 @@ async def collect_events():
         
         # Start collecting
         collect_task = asyncio.create_task(collect_events())
-        
+
         # Wait until the initial 'connected' event is received
-        async def _connected() -> None:
+        @backoff.on_exception(backoff.constant, AssertionError, max_time=5.0, interval=0.1)
+        async def _wait_connected() -> None:
             assert len(events) > 0 and events[0].get("event_type") == "connected"
-        await eventually(_connected, timeout=2.0, interval=0.05)
-        
+
+        await _wait_connected()
+
         # Publish pod event
         ev = PodCreatedEvent(
             execution_id=exec_id,
@@ -137,7 +145,7 @@ async def _connected() -> None:
         
         # Wait for collection
         try:
-            await asyncio.wait_for(collect_task, timeout=2.0)
+            await asyncio.wait_for(collect_task, timeout=5.0)
         except asyncio.TimeoutError:
             collect_task.cancel()
         
@@ -146,37 +154,35 @@ async def _connected() -> None:
         assert len(pod_events) > 0
 
     @pytest.mark.asyncio
-    async def test_sse_route_requires_auth(self, client: AsyncClient) -> None:
+    async def test_sse_route_requires_auth(self, client: AsyncClient, unique_id: Callable[[str], str]) -> None:
         """Test that SSE routes require authentication (HTTP-level test only)."""
         # Test notification stream requires auth
         r = await client.get("/api/v1/events/notifications/stream")
         assert r.status_code == 401
-        
+
         # Test execution stream requires auth
-        exec_id = str(uuid4())
+        exec_id = unique_id("exec-")
         r = await client.get(f"/api/v1/events/executions/{exec_id}")
         assert r.status_code == 401
 
     @pytest.mark.asyncio
-    async def test_sse_endpoint_returns_correct_headers(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
-        task = asyncio.create_task(client.get("/api/v1/events/notifications/stream"))
-        
-        async def _tick() -> None:
-            return None
-        await eventually(_tick, timeout=0.1, interval=0.01)
-        
+    async def test_sse_cancellation_doesnt_break_client(self, authenticated_client: AsyncClient) -> None:
+        """Test that cancelling an SSE stream doesn't break the client."""
+        task = asyncio.create_task(authenticated_client.get("/api/v1/events/notifications/stream"))
+
+        # Cancel immediately - no sleep needed, cancellation valid at any point
         task.cancel()
-        try:
+        with pytest.raises(asyncio.CancelledError):
             await task
-        except asyncio.CancelledError:
-            pass
-        
-        r = await client.get("/api/v1/events/health")
+
+        # Verify client still works after cancellation
+        r = await authenticated_client.get("/api/v1/events/health")
         assert r.status_code == 200
-        assert isinstance(r.json(), dict)
 
     @pytest.mark.asyncio
-    async def test_multiple_concurrent_sse_service_connections(self, scope, test_user: Dict[str, str]) -> None:  # type: ignore[valid-type]
+    async def test_multiple_concurrent_sse_service_connections(
+        self, scope: AsyncContainer
+    ) -> None:
         """Test multiple concurrent SSE connections through the service."""
         sse_service: SSEService = await scope.get(SSEService)
         
diff --git a/backend/tests/integration/test_user_settings_routes.py b/backend/tests/integration/test_user_settings_routes.py
index c6378351..972d5f88 100644
--- a/backend/tests/integration/test_user_settings_routes.py
+++ b/backend/tests/integration/test_user_settings_routes.py
@@ -1,86 +1,16 @@
-import asyncio
 from datetime import datetime, timezone
-from typing import Dict
-from uuid import uuid4
 
 import pytest
-import pytest_asyncio
+from app.domain.enums.user import UserRole
+from app.schemas_pydantic.user_settings import SettingsHistoryResponse, UserSettings
 from httpx import AsyncClient
 
-from app.schemas_pydantic.user_settings import (
-    UserSettings,
-    SettingsHistoryResponse
-)
-from tests.helpers.eventually import eventually
+from tests.conftest import MakeUser
 
 # Force these tests to run sequentially on a single worker to avoid state conflicts
 pytestmark = pytest.mark.xdist_group(name="user_settings")
 
 
-@pytest_asyncio.fixture
-async def test_user(client: AsyncClient) -> Dict[str, str]:
-    """Create a fresh user for each test."""
-    uid = uuid4().hex[:8]
-    username = f"test_user_{uid}"
-    email = f"{username}@example.com"
-    password = "TestPass123!"
-
-    # Register the user
-    await client.post("/api/v1/auth/register", json={
-        "username": username,
-        "email": email,
-        "password": password,
-        "role": "user"
-    })
-
-    # Login to get CSRF token
-    login_resp = await client.post("/api/v1/auth/login", data={
-        "username": username,
-        "password": password
-    })
-    csrf = login_resp.json().get("csrf_token", "")
-
-    return {
-        "username": username,
-        "email": email,
-        "password": password,
-        "csrf_token": csrf,
-        "headers": {"X-CSRF-Token": csrf}
-    }
-
-
-@pytest_asyncio.fixture
-async def test_user2(client: AsyncClient) -> Dict[str, str]:
-    """Create a second fresh user for isolation tests."""
-    uid = uuid4().hex[:8]
-    username = f"test_user2_{uid}"
-    email = f"{username}@example.com"
-    password = "TestPass123!"
-
-    # Register the user
-    await client.post("/api/v1/auth/register", json={
-        "username": username,
-        "email": email,
-        "password": password,
-        "role": "user"
-    })
-
-    # Login to get CSRF token
-    login_resp = await client.post("/api/v1/auth/login", data={
-        "username": username,
-        "password": password
-    })
-    csrf = login_resp.json().get("csrf_token", "")
-
-    return {
-        "username": username,
-        "email": email,
-        "password": password,
-        "csrf_token": csrf,
-        "headers": {"X-CSRF-Token": csrf}
-    }
-
-
 @pytest.mark.integration
 class TestUserSettingsRoutes:
     """Test user settings endpoints against real backend."""
@@ -98,85 +28,53 @@ async def test_user_settings_require_authentication(self, client: AsyncClient) -
                    for word in ["not authenticated", "unauthorized", "login"])
 
     @pytest.mark.asyncio
-    async def test_get_user_settings(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_get_user_settings(self, authenticated_client: AsyncClient) -> None:
         """Test getting user settings."""
-        # Already authenticated via test_user fixture
-
-        # Get user settings
-        response = await client.get("/api/v1/user/settings/")
+        response = await authenticated_client.get("/api/v1/user/settings/")
         assert response.status_code == 200
 
-        # Validate response structure
-        settings_data = response.json()
-        settings = UserSettings(**settings_data)
+        # Pydantic validates types and required fields
+        settings = UserSettings(**response.json())
 
-        # Verify required fields
-        assert settings.user_id is not None
+        # Verify business logic constraints (not type checks)
         assert settings.theme in ["light", "dark", "auto", "system"]
-        # Language field may not be present in all deployments
-        if hasattr(settings, "language"):
-            assert isinstance(settings.language, str)
-        assert isinstance(settings.timezone, str)
-
-        # Verify notification settings (API uses execution_* and security_alerts fields)
-        assert settings.notifications is not None
-        assert isinstance(settings.notifications.execution_completed, bool)
-        assert isinstance(settings.notifications.execution_failed, bool)
-        assert isinstance(settings.notifications.system_updates, bool)
-        assert isinstance(settings.notifications.security_alerts, bool)
-
-        # Verify editor settings  
-        assert settings.editor is not None
-        assert isinstance(settings.editor.font_size, int)
         assert 8 <= settings.editor.font_size <= 32
-        assert settings.editor.theme in ["auto", "one-dark", "monokai", "github", "dracula", "solarized", "vs", "vscode"]
-        assert isinstance(settings.editor.tab_size, int)
         assert settings.editor.tab_size in [2, 4, 8]
-        assert isinstance(settings.editor.word_wrap, bool)
-        assert isinstance(settings.editor.show_line_numbers, bool)
-
-        # Verify timestamp fields
-        assert settings.created_at is not None
-        assert settings.updated_at is not None
-
-        # Custom settings might be empty or contain user preferences
-        if settings.custom_settings:
-            assert isinstance(settings.custom_settings, dict)
 
     @pytest.mark.asyncio
-    async def test_update_user_settings(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_update_user_settings(self, authenticated_client: AsyncClient) -> None:
         """Test updating user settings."""
-        # Already authenticated via test_user fixture
-
         # Get current settings to preserve original values
-        original_response = await client.get("/api/v1/user/settings/")
+        original_response = await authenticated_client.get("/api/v1/user/settings/")
         assert original_response.status_code == 200
         original_settings = original_response.json()
 
         # Update settings
+        notifications = {
+            "execution_completed": False,
+            "execution_failed": True,
+            "system_updates": True,
+            "security_alerts": True,
+            "channels": ["in_app", "webhook"],
+        }
+        editor = {
+            "theme": "monokai",
+            "font_size": 14,
+            "tab_size": 4,
+            "use_tabs": False,
+            "word_wrap": True,
+            "show_line_numbers": True,
+        }
         update_data = {
             "theme": "dark" if original_settings["theme"] == "light" else "light",
             "timezone": "America/New_York" if original_settings["timezone"] != "America/New_York" else "UTC",
             "date_format": "MM/DD/YYYY",
             "time_format": "12h",
-            "notifications": {
-                "execution_completed": False,
-                "execution_failed": True,
-                "system_updates": True,
-                "security_alerts": True,
-                "channels": ["in_app", "webhook"]
-            },
-            "editor": {
-                "theme": "monokai",
-                "font_size": 14,
-                "tab_size": 4,
-                "use_tabs": False,
-                "word_wrap": True,
-                "show_line_numbers": True
-            }
+            "notifications": notifications,
+            "editor": editor,
         }
 
-        response = await client.put("/api/v1/user/settings/", json=update_data)
+        response = await authenticated_client.put("/api/v1/user/settings/", json=update_data)
         if response.status_code != 200:
             pytest.fail(f"Status: {response.status_code}, Body: {response.json()}, Data: {update_data}")
         assert response.status_code == 200
@@ -189,27 +87,24 @@ async def test_update_user_settings(self, client: AsyncClient, test_user: Dict[s
         assert updated_settings.time_format == update_data["time_format"]
 
         # Verify notification settings were updated
-        assert updated_settings.notifications.execution_completed == update_data["notifications"][
-            "execution_completed"]
-        assert updated_settings.notifications.execution_failed == update_data["notifications"]["execution_failed"]
-        assert updated_settings.notifications.system_updates == update_data["notifications"]["system_updates"]
-        assert updated_settings.notifications.security_alerts == update_data["notifications"]["security_alerts"]
+        assert updated_settings.notifications.execution_completed == notifications["execution_completed"]
+        assert updated_settings.notifications.execution_failed == notifications["execution_failed"]
+        assert updated_settings.notifications.system_updates == notifications["system_updates"]
+        assert updated_settings.notifications.security_alerts == notifications["security_alerts"]
         assert "in_app" in [str(c) for c in updated_settings.notifications.channels]
 
         # Verify editor settings were updated
-        assert updated_settings.editor.theme == update_data["editor"]["theme"]
-        assert updated_settings.editor.font_size == update_data["editor"]["font_size"]
-        assert updated_settings.editor.tab_size == update_data["editor"]["tab_size"]
-        assert updated_settings.editor.word_wrap == update_data["editor"]["word_wrap"]
-        assert updated_settings.editor.show_line_numbers == update_data["editor"]["show_line_numbers"]
+        assert updated_settings.editor.theme == editor["theme"]
+        assert updated_settings.editor.font_size == editor["font_size"]
+        assert updated_settings.editor.tab_size == editor["tab_size"]
+        assert updated_settings.editor.word_wrap == editor["word_wrap"]
+        assert updated_settings.editor.show_line_numbers == editor["show_line_numbers"]
 
     @pytest.mark.asyncio
-    async def test_update_theme_only(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_update_theme_only(self, authenticated_client: AsyncClient) -> None:
         """Test updating only the theme setting."""
-        # Already authenticated via test_user fixture
-
         # Get current theme
-        original_response = await client.get("/api/v1/user/settings/")
+        original_response = await authenticated_client.get("/api/v1/user/settings/")
         assert original_response.status_code == 200
         original_theme = original_response.json()["theme"]
 
@@ -219,7 +114,7 @@ async def test_update_theme_only(self, client: AsyncClient, test_user: Dict[str,
             "theme": new_theme
         }
 
-        response = await client.put("/api/v1/user/settings/theme", json=theme_update)
+        response = await authenticated_client.put("/api/v1/user/settings/theme", json=theme_update)
         assert response.status_code == 200
 
         # Validate updated settings
@@ -233,10 +128,8 @@ async def test_update_theme_only(self, client: AsyncClient, test_user: Dict[str,
         assert updated_settings.timezone == original_response.json()["timezone"]
 
     @pytest.mark.asyncio
-    async def test_update_notification_settings_only(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_update_notification_settings_only(self, authenticated_client: AsyncClient) -> None:
         """Test updating only notification settings."""
-        # Already authenticated via test_user fixture
-
         # Update notification settings
         notification_update = {
             "execution_completed": True,
@@ -246,9 +139,7 @@ async def test_update_notification_settings_only(self, client: AsyncClient, test
             "channels": ["in_app"]
         }
 
-        response = await client.put("/api/v1/user/settings/notifications", json=notification_update)
-        if response.status_code >= 500:
-            pytest.skip("Notification settings update not available in this environment")
+        response = await authenticated_client.put("/api/v1/user/settings/notifications", json=notification_update)
         assert response.status_code == 200
 
         # Validate updated settings
@@ -260,10 +151,8 @@ async def test_update_notification_settings_only(self, client: AsyncClient, test
         assert "in_app" in [str(c) for c in updated_settings.notifications.channels]
 
     @pytest.mark.asyncio
-    async def test_update_editor_settings_only(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_update_editor_settings_only(self, authenticated_client: AsyncClient) -> None:
         """Test updating only editor settings."""
-        # Already authenticated via test_user fixture
-
         # Update editor settings
         editor_update = {
             "theme": "dracula",
@@ -274,9 +163,7 @@ async def test_update_editor_settings_only(self, client: AsyncClient, test_user:
             "show_line_numbers": True
         }
 
-        response = await client.put("/api/v1/user/settings/editor", json=editor_update)
-        if response.status_code >= 500:
-            pytest.skip("Editor settings update not available in this environment")
+        response = await authenticated_client.put("/api/v1/user/settings/editor", json=editor_update)
         assert response.status_code == 200
 
         # Validate updated settings
@@ -288,7 +175,7 @@ async def test_update_editor_settings_only(self, client: AsyncClient, test_user:
         assert updated_settings.editor.show_line_numbers == editor_update["show_line_numbers"]
 
     @pytest.mark.asyncio
-    async def test_update_custom_setting(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_update_custom_setting(self, authenticated_client: AsyncClient) -> None:
         """Test updating a custom setting."""
         # Update custom settings via main settings endpoint
         custom_key = "custom_preference"
@@ -299,7 +186,7 @@ async def test_update_custom_setting(self, client: AsyncClient, test_user: Dict[
             }
         }
 
-        response = await client.put("/api/v1/user/settings/", json=update_data)
+        response = await authenticated_client.put("/api/v1/user/settings/", json=update_data)
         assert response.status_code == 200
 
         # Validate updated settings
@@ -308,26 +195,15 @@ async def test_update_custom_setting(self, client: AsyncClient, test_user: Dict[
         assert updated_settings.custom_settings[custom_key] == custom_value
 
     @pytest.mark.asyncio
-    async def test_get_settings_history(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_get_settings_history(self, authenticated_client: AsyncClient) -> None:
         """Test getting settings change history."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_resp = await client.post("/api/v1/auth/login", data=login_data)
-        assert login_resp.status_code == 200
-
         # Make some changes to build history (theme change)
         theme_update = {"theme": "dark"}
-        response = await client.put("/api/v1/user/settings/theme", json=theme_update)
-        if response.status_code >= 500:
-            pytest.skip("Settings history not available in this environment")
+        response = await authenticated_client.put("/api/v1/user/settings/theme", json=theme_update)
+        assert response.status_code == 200
 
         # Get history
-        history_response = await client.get("/api/v1/user/settings/history")
-        if history_response.status_code >= 500:
-            pytest.skip("Settings history endpoint not available in this environment")
+        history_response = await authenticated_client.get("/api/v1/user/settings/history")
         assert history_response.status_code == 200
 
         # Validate history structure
@@ -339,73 +215,43 @@ async def test_get_settings_history(self, client: AsyncClient, test_user: Dict[s
             assert entry.timestamp is not None
 
     @pytest.mark.asyncio
-    async def test_restore_settings_to_previous_point(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_restore_settings_to_previous_point(self, authenticated_client: AsyncClient) -> None:
         """Test restoring settings to a previous point in time."""
-        # Login first
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        await client.post("/api/v1/auth/login", data=login_data)
-
         # Get original settings
-        original_resp = await client.get("/api/v1/user/settings/")
+        original_resp = await authenticated_client.get("/api/v1/user/settings/")
         assert original_resp.status_code == 200
         original_theme = original_resp.json()["theme"]
 
         # Make a change
         new_theme = "dark" if original_theme != "dark" else "light"
-        await client.put("/api/v1/user/settings/theme", json={"theme": new_theme})
-
-        # Ensure restore point is distinct by checking time monotonicity
-        prev = datetime.now(timezone.utc)
+        await authenticated_client.put("/api/v1/user/settings/theme", json={"theme": new_theme})
 
-        async def _tick():
-            now = datetime.now(timezone.utc)
-            assert (now - prev).total_seconds() >= 0
-
-        await eventually(_tick, timeout=0.5, interval=0.05)
-
-        # Get restore point (before the change)
+        # Get restore point (after the change)
         restore_point = datetime.now(timezone.utc).isoformat()
 
         # Make another change
         second_theme = "auto" if new_theme != "auto" else "system"
-        await client.put("/api/v1/user/settings/theme", json={"theme": second_theme})
+        await authenticated_client.put("/api/v1/user/settings/theme", json={"theme": second_theme})
 
         # Try to restore to the restore point
         restore_data = {"timestamp": restore_point}
-        restore_resp = await client.post("/api/v1/user/settings/restore", json=restore_data)
-
-        # Skip if restore functionality not available
-        if restore_resp.status_code >= 500:
-            pytest.skip("Settings restore not available in this environment")
+        restore_resp = await authenticated_client.post("/api/v1/user/settings/restore", json=restore_data)
+        assert restore_resp.status_code == 200
 
-        # If successful, verify the theme was restored
-        if restore_resp.status_code == 200:
-            current_resp = await client.get("/api/v1/user/settings/")
-            # Since restore might not work exactly as expected in test environment,
-            # just verify we get valid settings back
-            assert current_resp.status_code == 200
+        # Verify we get valid settings back
+        current_resp = await authenticated_client.get("/api/v1/user/settings/")
+        assert current_resp.status_code == 200
 
     @pytest.mark.asyncio
-    async def test_invalid_theme_value(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_invalid_theme_value(self, authenticated_client: AsyncClient) -> None:
         """Test that invalid theme values are rejected."""
-        # Already authenticated via test_user fixture
-
-        # Try to update with invalid theme
         invalid_theme = {"theme": "invalid_theme"}
-
-        response = await client.put("/api/v1/user/settings/theme", json=invalid_theme)
-        if response.status_code >= 500:
-            pytest.skip("Theme validation not available in this environment")
+        response = await authenticated_client.put("/api/v1/user/settings/theme", json=invalid_theme)
         assert response.status_code in [400, 422]
 
     @pytest.mark.asyncio
-    async def test_invalid_editor_settings(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_invalid_editor_settings(self, authenticated_client: AsyncClient) -> None:
         """Test that invalid editor settings are rejected."""
-        # Already authenticated via test_user fixture
-
         # Try to update with invalid editor settings
         invalid_editor = {
             "theme": "dracula",
@@ -416,41 +262,34 @@ async def test_invalid_editor_settings(self, client: AsyncClient, test_user: Dic
             "show_line_numbers": True
         }
 
-        response = await client.put("/api/v1/user/settings/editor", json=invalid_editor)
-        if response.status_code >= 500:
-            pytest.skip("Editor validation not available in this environment")
+        response = await authenticated_client.put("/api/v1/user/settings/editor", json=invalid_editor)
         assert response.status_code in [400, 422]
 
     @pytest.mark.asyncio
-    async def test_settings_isolation_between_users(self, client: AsyncClient,
-                                                    test_user: Dict[str, str],
-                                                    test_user2: Dict[str, str]) -> None:
+    async def test_settings_isolation_between_users(
+        self, client: AsyncClient, make_user: MakeUser,
+    ) -> None:
         """Test that settings are isolated between users."""
+        user1 = await make_user(UserRole.USER)
+        user2 = await make_user(UserRole.USER)
 
         # Login as first user
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        await client.post("/api/v1/auth/login", data=login_data)
+        await client.post(
+            "/api/v1/auth/login",
+            data={"username": user1["username"], "password": user1["password"]},
+        )
 
         # Update first user's settings
-        user1_update = {
-            "theme": "dark",
-            "timezone": "America/New_York"
-        }
+        user1_update = {"theme": "dark", "timezone": "America/New_York"}
         response = await client.put("/api/v1/user/settings/", json=user1_update)
         assert response.status_code == 200
 
-        # Log out
+        # Log out and login as second user
         await client.post("/api/v1/auth/logout")
-
-        # Login as second user
-        login_data = {
-            "username": test_user2["username"],
-            "password": test_user2["password"]
-        }
-        await client.post("/api/v1/auth/login", data=login_data)
+        await client.post(
+            "/api/v1/auth/login",
+            data={"username": user2["username"], "password": user2["password"]},
+        )
 
         # Get second user's settings
         response = await client.get("/api/v1/user/settings/")
@@ -458,28 +297,26 @@ async def test_settings_isolation_between_users(self, client: AsyncClient,
         user2_settings = response.json()
 
         # Verify second user's settings are not affected by first user's changes
-        # Second user should have default settings, not the first user's custom settings
-        assert user2_settings["theme"] != user1_update["theme"] or user2_settings["timezone"] != user1_update[
-            "timezone"]
+        assert (
+            user2_settings["theme"] != user1_update["theme"]
+            or user2_settings["timezone"] != user1_update["timezone"]
+        )
 
     @pytest.mark.asyncio
-    async def test_settings_persistence(self, client: AsyncClient, test_user: Dict[str, str]) -> None:
+    async def test_settings_persistence(self, client: AsyncClient, make_user: MakeUser) -> None:
         """Test that settings persist across login sessions."""
-        # Already authenticated via test_user fixture
+        user = await make_user(UserRole.USER)
 
         # Update settings
-        update_data = {
-            "theme": "dark",
-            "timezone": "Europe/London",
-            "editor": {
-                "theme": "github",
-                "font_size": 18,
-                "tab_size": 8,
-                "use_tabs": True,
-                "word_wrap": False,
-                "show_line_numbers": False
-            }
+        editor = {
+            "theme": "github",
+            "font_size": 18,
+            "tab_size": 8,
+            "use_tabs": True,
+            "word_wrap": False,
+            "show_line_numbers": False,
         }
+        update_data = {"theme": "dark", "timezone": "Europe/London", "editor": editor}
 
         response = await client.put("/api/v1/user/settings/", json=update_data)
         assert response.status_code == 200
@@ -488,11 +325,10 @@ async def test_settings_persistence(self, client: AsyncClient, test_user: Dict[s
         await client.post("/api/v1/auth/logout")
 
         # Log back in as same user
-        login_data = {
-            "username": test_user["username"],
-            "password": test_user["password"]
-        }
-        login_resp = await client.post("/api/v1/auth/login", data=login_data)
+        login_resp = await client.post(
+            "/api/v1/auth/login",
+            data={"username": user["username"], "password": user["password"]},
+        )
         assert login_resp.status_code == 200
 
         # Get settings again
@@ -503,8 +339,8 @@ async def test_settings_persistence(self, client: AsyncClient, test_user: Dict[s
         # Verify settings persisted
         assert persisted_settings.theme == update_data["theme"]
         assert persisted_settings.timezone == update_data["timezone"]
-        assert persisted_settings.editor.theme == update_data["editor"]["theme"]
-        assert persisted_settings.editor.font_size == update_data["editor"]["font_size"]
-        assert persisted_settings.editor.tab_size == update_data["editor"]["tab_size"]
-        assert persisted_settings.editor.word_wrap == update_data["editor"]["word_wrap"]
-        assert persisted_settings.editor.show_line_numbers == update_data["editor"]["show_line_numbers"]
+        assert persisted_settings.editor.theme == editor["theme"]
+        assert persisted_settings.editor.font_size == editor["font_size"]
+        assert persisted_settings.editor.tab_size == editor["tab_size"]
+        assert persisted_settings.editor.word_wrap == editor["word_wrap"]
+        assert persisted_settings.editor.show_line_numbers == editor["show_line_numbers"]
diff --git a/backend/tests/load/cli.py b/backend/tests/load/cli.py
index e672617d..b6228859 100644
--- a/backend/tests/load/cli.py
+++ b/backend/tests/load/cli.py
@@ -18,7 +18,8 @@ async def _run(cfg: LoadConfig) -> int:
     # Brief run configuration summary to stdout for easier troubleshooting
     print(
         f"Load config: base_url={cfg.base_url} api_prefix={cfg.api_prefix} "
-        f"mode={cfg.mode} clients={cfg.clients} concurrency={cfg.concurrency} duration={cfg.duration_seconds}s verify_tls={cfg.verify_tls}"
+        f"mode={cfg.mode} clients={cfg.clients} concurrency={cfg.concurrency} "
+        f"duration={cfg.duration_seconds}s verify_tls={cfg.verify_tls}"
     )
     # Quick preflight to catch prefix/port mistakes early
     pre_stats = StatsCollector()
@@ -46,7 +47,10 @@ async def _run(cfg: LoadConfig) -> int:
     stats.save(stats_path)
     # Print concise summary
     summary = stats.finalize()
-    print(f"Load run complete: mode={cfg.mode} requests={summary['total_requests']} errors={summary['total_errors']} runtime={summary['runtime_seconds']}s")
+    print(
+        f"Load run complete: mode={cfg.mode} requests={summary['total_requests']} "
+        f"errors={summary['total_errors']} runtime={summary['runtime_seconds']}s"
+    )
     print(f"Report saved to: {stats_path}")
     # Optional plots
     if getattr(cfg, "generate_plots", False):
@@ -75,7 +79,7 @@ def main(argv: list[str] | None = None) -> int:
     if args.base_url:
         cfg.base_url = args.base_url
     if args.mode:
-        cfg.mode = args.mode  # type: ignore[assignment]
+        cfg.mode = args.mode
     if args.clients is not None:
         cfg.clients = args.clients
     if args.concurrency is not None:
@@ -86,7 +90,7 @@ def main(argv: list[str] | None = None) -> int:
         cfg.duration_seconds = args.duration
 
     # Pass plots flag through cfg (without changing dataclass fields)
-    setattr(cfg, "generate_plots", bool(args.plots))
+    cfg.generate_plots = bool(args.plots)
     return asyncio.run(_run(cfg))
 
 
diff --git a/backend/tests/load/config.py b/backend/tests/load/config.py
index a5cf208a..bf1c3d7e 100644
--- a/backend/tests/load/config.py
+++ b/backend/tests/load/config.py
@@ -4,19 +4,30 @@
 from dataclasses import dataclass, field
 from typing import Literal
 
-
 Mode = Literal["monkey", "user", "both"]
 
 
+def _get_mode() -> Mode:
+    """Get mode from env with proper Literal type."""
+    env_val = os.getenv("LOAD_MODE", "both")
+    if env_val == "monkey":
+        return "monkey"
+    if env_val == "user":
+        return "user"
+    return "both"
+
+
 @dataclass(slots=True)
 class LoadConfig:
     base_url: str = field(default_factory=lambda: os.getenv("LOAD_BASE_URL", "https://[::1]:443"))
     api_prefix: str = field(default_factory=lambda: os.getenv("LOAD_API_PREFIX", "/api/v1"))
-    verify_tls: bool = field(default_factory=lambda: os.getenv("LOAD_VERIFY_TLS", "false").lower() in ("1", "true", "yes"))
+    verify_tls: bool = field(
+        default_factory=lambda: os.getenv("LOAD_VERIFY_TLS", "false").lower() in ("1", "true", "yes")
+    )
     generate_plots: bool = field(default=False)
 
     # Clients and workload
-    mode: Mode = field(default_factory=lambda: os.getenv("LOAD_MODE", "both"))
+    mode: Mode = field(default_factory=_get_mode)
     clients: int = int(os.getenv("LOAD_CLIENTS", "25"))
     concurrency: int = int(os.getenv("LOAD_CONCURRENCY", "10"))
     # Default run duration ~3 minutes
@@ -24,16 +35,26 @@ class LoadConfig:
     ramp_up_seconds: int = int(os.getenv("LOAD_RAMP", "5"))
 
     # User pool (for user-mode)
-    auto_register_users: bool = field(default_factory=lambda: os.getenv("LOAD_AUTO_REGISTER", "true").lower() in ("1","true","yes"))
+    auto_register_users: bool = field(
+        default_factory=lambda: os.getenv("LOAD_AUTO_REGISTER", "true").lower() in ("1", "true", "yes")
+    )
     user_prefix: str = os.getenv("LOAD_USER_PREFIX", "loaduser")
     user_domain: str = os.getenv("LOAD_USER_DOMAIN", "example.com")
     user_password: str = os.getenv("LOAD_USER_PASSWORD", "testpass123!")
 
     # Endpoint toggles
-    enable_sse: bool = field(default_factory=lambda: os.getenv("LOAD_ENABLE_SSE", "true").lower() in ("1","true","yes"))
-    enable_saved_scripts: bool = field(default_factory=lambda: os.getenv("LOAD_ENABLE_SCRIPTS", "true").lower() in ("1","true","yes"))
-    enable_user_settings: bool = field(default_factory=lambda: os.getenv("LOAD_ENABLE_SETTINGS", "true").lower() in ("1","true","yes"))
-    enable_notifications: bool = field(default_factory=lambda: os.getenv("LOAD_ENABLE_NOTIFICATIONS", "true").lower() in ("1","true","yes"))
+    enable_sse: bool = field(
+        default_factory=lambda: os.getenv("LOAD_ENABLE_SSE", "true").lower() in ("1", "true", "yes")
+    )
+    enable_saved_scripts: bool = field(
+        default_factory=lambda: os.getenv("LOAD_ENABLE_SCRIPTS", "true").lower() in ("1", "true", "yes")
+    )
+    enable_user_settings: bool = field(
+        default_factory=lambda: os.getenv("LOAD_ENABLE_SETTINGS", "true").lower() in ("1", "true", "yes")
+    )
+    enable_notifications: bool = field(
+        default_factory=lambda: os.getenv("LOAD_ENABLE_NOTIFICATIONS", "true").lower() in ("1", "true", "yes")
+    )
 
     # Reporting
     # Default to tests/load/out relative to current working directory
diff --git a/backend/tests/load/http_client.py b/backend/tests/load/http_client.py
index 94d3d4c4..87c53b8a 100644
--- a/backend/tests/load/http_client.py
+++ b/backend/tests/load/http_client.py
@@ -67,7 +67,7 @@ async def login(self, username: str, password: str) -> bool:
         r = await self._request("POST", url, data=httpx.QueryParams(data), headers=headers)
         if r.status_code == 200:
             # Extract csrf cookie (not httpOnly) for subsequent writes
-            for cookie in self.client.cookies.jar:  # type: ignore[attr-defined]
+            for cookie in self.client.cookies.jar:
                 if cookie.name == "csrf_token":
                     self.csrf_token = cookie.value
                     break
@@ -107,7 +107,7 @@ async def sse_execution(self, execution_id: str, max_seconds: float = 10.0) -> T
         # Use a separate streaming client to avoid interfering with normal client timeouts
         async with httpx.AsyncClient(verify=self.cfg.verify_tls, timeout=None) as s:
             # Reuse cookies for auth
-            s.cookies = self.client.cookies.copy()
+            s.cookies.update(self.client.cookies)
             t0 = time.perf_counter()
             try:
                 async with s.stream("GET", url) as resp:
diff --git a/backend/tests/load/monkey_runner.py b/backend/tests/load/monkey_runner.py
index ece0b9f6..2d07db50 100644
--- a/backend/tests/load/monkey_runner.py
+++ b/backend/tests/load/monkey_runner.py
@@ -3,12 +3,13 @@
 import asyncio
 import json
 import random
-import string
 import secrets
+import string
+import time
 from typing import Any
 
 from .config import LoadConfig
-from .http_client import APIClient
+from .http_client import APIClient, APIUser
 from .stats import StatsCollector
 from .strategies import json_value
 
@@ -63,9 +64,6 @@ def build_monkey_catalog(cfg: LoadConfig) -> list[tuple[str, str]]:
     return out
 
 
-import time
-
-
 async def run_monkey_swarm(cfg: LoadConfig, stats: StatsCollector, clients: int) -> None:
     catalog = build_monkey_catalog(cfg)
     sem = asyncio.Semaphore(cfg.concurrency)
@@ -77,11 +75,12 @@ async def one_client(i: int) -> None:
             # Half of clients attempt to login/register first
             if random.random() < 0.5:
                 uname = f"monkey_{_rand(6)}"
-                await c.register(user := type("U", (), {
-                    "username": uname,
-                    "email": f"{uname}@{cfg.user_domain}",
-                    "password": cfg.user_password
-                }))
+                user = APIUser(
+                    username=uname,
+                    email=f"{uname}@{cfg.user_domain}",
+                    password=cfg.user_password,
+                )
+                await c.register(user)
                 await c.login(uname, cfg.user_password)
 
             # Run until deadline
diff --git a/backend/tests/load/plot_report.py b/backend/tests/load/plot_report.py
index 54c5c365..25048057 100644
--- a/backend/tests/load/plot_report.py
+++ b/backend/tests/load/plot_report.py
@@ -10,7 +10,8 @@
 
 def _load_report(path: str | Path) -> Dict[str, Any]:
     with open(path, "r", encoding="utf-8") as f:
-        return json.load(f)
+        result: Dict[str, Any] = json.load(f)
+        return result
 
 
 def _ensure_out_dir(path: str | Path) -> Path:
@@ -89,7 +90,7 @@ def plot_endpoint_throughput(report: Dict[str, Any], out_dir: Path, top_n: int =
     labels = [k for k, _ in data]
     total = [v.get("count", 0) for _, v in data]
     errors = [v.get("errors", 0) for _, v in data]
-    successes = [t - e for t, e in zip(total, errors)]
+    successes = [t - e for t, e in zip(total, errors, strict=True)]
 
     x = range(len(labels))
     width = 0.45
diff --git a/backend/tests/load/strategies.py b/backend/tests/load/strategies.py
index 283473bf..d3eabbea 100644
--- a/backend/tests/load/strategies.py
+++ b/backend/tests/load/strategies.py
@@ -5,7 +5,6 @@
 
 from hypothesis import strategies as st
 
-
 # Generic JSON strategies (bounded sizes to keep payloads realistic)
 json_scalar = st.one_of(
     st.none(),
@@ -48,8 +47,8 @@
 severity = st.sampled_from(["info", "warning", "error", "critical"])  # common values
 label_key = st.text(min_size=1, max_size=24)
 label_val = st.text(min_size=0, max_size=64)
-labels = st.dictionaries(label_key, label_val, max_size=8)
-annotations = st.dictionaries(label_key, label_val, max_size=8)
+labels: st.SearchStrategy[dict[str, str]] = st.dictionaries(label_key, label_val, max_size=8)
+annot_strat: st.SearchStrategy[dict[str, str]] = st.dictionaries(label_key, label_val, max_size=8)
 
 def _iso_time() -> st.SearchStrategy[str]:
     base = datetime(2024, 1, 1)
@@ -61,7 +60,7 @@ def _iso_time() -> st.SearchStrategy[str]:
     {
         "status": st.sampled_from(["firing", "resolved"]),
         "labels": labels,
-        "annotations": annotations,
+        "annotations": annot_strat,
         "startsAt": _iso_time(),
         "endsAt": _iso_time(),
         "generatorURL": st.text(min_size=0, max_size=120),
@@ -77,7 +76,7 @@ def _iso_time() -> st.SearchStrategy[str]:
         "groupKey": st.text(min_size=0, max_size=64),
         "groupLabels": labels,
         "commonLabels": labels,
-        "commonAnnotations": annotations,
+        "commonAnnotations": annot_strat,
         "externalURL": st.text(min_size=0, max_size=120),
         "version": st.text(min_size=1, max_size=16),
     }
diff --git a/backend/tests/load/user_runner.py b/backend/tests/load/user_runner.py
index 1c441bd2..9d0b269d 100644
--- a/backend/tests/load/user_runner.py
+++ b/backend/tests/load/user_runner.py
@@ -2,8 +2,9 @@
 
 import asyncio
 import random
+import time
+from collections.abc import Awaitable, Callable
 from dataclasses import dataclass
-from typing import Callable
 
 from .config import LoadConfig
 from .http_client import APIClient, APIUser
@@ -14,7 +15,7 @@
 class UserTask:
     name: str
     weight: int
-    fn: Callable[[APIClient], asyncio.Future]
+    fn: Callable[[APIClient], Awaitable[None]]
 
 
 async def _flow_execute_and_get_result(c: APIClient) -> None:
@@ -81,11 +82,8 @@ async def _flow_settings_and_notifications(c: APIClient) -> None:
         await c.mark_all_read()
 
 
-import time
-
-
 async def run_user_swarm(cfg: LoadConfig, stats: StatsCollector, clients: int) -> None:
-    tasks: list[asyncio.Task] = []
+    tasks: list[asyncio.Task[None]] = []
     sem = asyncio.Semaphore(cfg.concurrency)
     deadline = time.time() + max(1, cfg.duration_seconds)
 
diff --git a/backend/tests/unit/conftest.py b/backend/tests/unit/conftest.py
index e89e4163..b6d843fe 100644
--- a/backend/tests/unit/conftest.py
+++ b/backend/tests/unit/conftest.py
@@ -3,24 +3,26 @@
 Unit tests should NOT access real infrastructure (DB, Redis, HTTP).
 These fixtures raise errors to catch accidental usage.
 """
+from typing import NoReturn
+
 import pytest
 
 
 @pytest.fixture
-def db():
+def db() -> NoReturn:
     raise RuntimeError("Unit tests should not access DB - use mocks or move to integration/")
 
 
 @pytest.fixture
-def redis_client():
+def redis_client() -> NoReturn:
     raise RuntimeError("Unit tests should not access Redis - use mocks or move to integration/")
 
 
 @pytest.fixture
-def client():
+def client() -> NoReturn:
     raise RuntimeError("Unit tests should not use HTTP client - use mocks or move to integration/")
 
 
 @pytest.fixture
-def app():
+def app() -> NoReturn:
     raise RuntimeError("Unit tests should not use full app - use mocks or move to integration/")
diff --git a/backend/tests/unit/core/metrics/test_base_metrics.py b/backend/tests/unit/core/metrics/test_base_metrics.py
index f8a6ab3e..6fe2ad2a 100644
--- a/backend/tests/unit/core/metrics/test_base_metrics.py
+++ b/backend/tests/unit/core/metrics/test_base_metrics.py
@@ -1,8 +1,6 @@
 import pytest
-
 from app.core.metrics.base import BaseMetrics
 
-
 pytestmark = pytest.mark.unit
 
 
diff --git a/backend/tests/unit/core/metrics/test_connections_and_coordinator_metrics.py b/backend/tests/unit/core/metrics/test_connections_and_coordinator_metrics.py
index 1103bb98..36d0fdbc 100644
--- a/backend/tests/unit/core/metrics/test_connections_and_coordinator_metrics.py
+++ b/backend/tests/unit/core/metrics/test_connections_and_coordinator_metrics.py
@@ -1,5 +1,4 @@
 import pytest
-
 from app.core.metrics.connections import ConnectionMetrics
 from app.core.metrics.coordinator import CoordinatorMetrics
 
diff --git a/backend/tests/unit/core/metrics/test_database_and_dlq_metrics.py b/backend/tests/unit/core/metrics/test_database_and_dlq_metrics.py
index 1b8d8072..11ee21cd 100644
--- a/backend/tests/unit/core/metrics/test_database_and_dlq_metrics.py
+++ b/backend/tests/unit/core/metrics/test_database_and_dlq_metrics.py
@@ -1,5 +1,4 @@
 import pytest
-
 from app.core.metrics.database import DatabaseMetrics
 from app.core.metrics.dlq import DLQMetrics
 
diff --git a/backend/tests/unit/core/metrics/test_execution_and_events_metrics.py b/backend/tests/unit/core/metrics/test_execution_and_events_metrics.py
index 9f008a66..b295af48 100644
--- a/backend/tests/unit/core/metrics/test_execution_and_events_metrics.py
+++ b/backend/tests/unit/core/metrics/test_execution_and_events_metrics.py
@@ -1,43 +1,47 @@
 
 
 import pytest
-
-from app.core.metrics.execution import ExecutionMetrics
 from app.core.metrics.events import EventMetrics
+from app.core.metrics.execution import ExecutionMetrics
 from app.domain.enums.execution import ExecutionStatus
 
-
 pytestmark = pytest.mark.unit
 
 
 def test_execution_metrics_methods() -> None:
     """Test with no-op metrics."""
-    
     m = ExecutionMetrics()
     m.record_script_execution(ExecutionStatus.QUEUED, "python-3.11")
     m.record_execution_duration(0.5, "python-3.11")
-    m.increment_active_executions(); m.decrement_active_executions()
+    m.increment_active_executions()
+    m.decrement_active_executions()
     m.record_memory_usage(123.4, "python-3.11")
     m.record_error("timeout")
-    m.update_queue_depth(1); m.update_queue_depth(-1)
+    m.update_queue_depth(1)
+    m.update_queue_depth(-1)
     m.record_queue_wait_time(0.1, "python-3.11")
-    m.record_execution_assigned(); m.record_execution_queued(); m.record_execution_scheduled("ok")
-    m.update_cpu_available(100.0); m.update_memory_available(512.0); m.update_gpu_available(1)
+    m.record_execution_assigned()
+    m.record_execution_queued()
+    m.record_execution_scheduled("ok")
+    m.update_cpu_available(100.0)
+    m.update_memory_available(512.0)
+    m.update_gpu_available(1)
     m.update_allocations_active(2)
 
 
 def test_event_metrics_methods() -> None:
     """Test with no-op metrics."""
-    
     m = EventMetrics()
     m.record_event_published("execution.requested", None)
     m.record_event_processing_duration(0.05, "execution.requested")
     m.record_pod_event_published("pod.running")
     m.record_event_replay_operation("prepare", "success")
     m.update_event_buffer_size(3)
-    m.record_event_buffer_dropped(); m.record_event_buffer_processed()
+    m.record_event_buffer_dropped()
+    m.record_event_buffer_processed()
     m.record_event_buffer_latency(0.2)
-    m.set_event_buffer_backpressure(True); m.set_event_buffer_backpressure(False)
+    m.set_event_buffer_backpressure(True)
+    m.set_event_buffer_backpressure(False)
     m.record_event_buffer_memory_usage(12.3)
     m.record_event_stored("execution.requested", "events")
     m.record_events_processing_failed("topic", "etype", "group", "error")
@@ -45,8 +49,12 @@ def test_event_metrics_methods() -> None:
     m.record_event_store_failed("etype", "fail")
     m.record_event_query_duration(0.2, "by_type", "events")
     m.record_processing_duration(0.3, "topic", "etype", "group")
-    m.record_kafka_message_produced("t"); m.record_kafka_message_consumed("t", "g")
+    m.record_kafka_message_produced("t")
+    m.record_kafka_message_consumed("t", "g")
     m.record_kafka_consumer_lag(10, "t", "g", 0)
-    m.record_kafka_production_error("t", "e"); m.record_kafka_consumption_error("t", "g", "e")
-    m.update_event_bus_queue_size(1, "default"); m.set_event_bus_queue_size(5, "default"); m.set_event_bus_queue_size(2, "default")
+    m.record_kafka_production_error("t", "e")
+    m.record_kafka_consumption_error("t", "g", "e")
+    m.update_event_bus_queue_size(1, "default")
+    m.set_event_bus_queue_size(5, "default")
+    m.set_event_bus_queue_size(2, "default")
 
diff --git a/backend/tests/unit/core/metrics/test_health_and_rate_limit_metrics.py b/backend/tests/unit/core/metrics/test_health_and_rate_limit_metrics.py
index ff97c429..2399801d 100644
--- a/backend/tests/unit/core/metrics/test_health_and_rate_limit_metrics.py
+++ b/backend/tests/unit/core/metrics/test_health_and_rate_limit_metrics.py
@@ -1,5 +1,4 @@
 import pytest
-
 from app.core.metrics.health import HealthMetrics
 
 pytestmark = pytest.mark.unit
@@ -14,7 +13,7 @@ def test_health_metrics_methods() -> None:
     m.update_health_check_status(1, "liveness", "basic")
     m.record_health_status("svc", "healthy")
     m.record_service_health_score("svc", 95.0)
-    m.update_liveness_status(True, "app");
+    m.update_liveness_status(True, "app")
     m.update_readiness_status(False, "app")
     m.record_dependency_health("mongo", True, 0.2)
     m.record_health_check_timeout("readiness", "db")
diff --git a/backend/tests/unit/core/metrics/test_kubernetes_and_notifications_metrics.py b/backend/tests/unit/core/metrics/test_kubernetes_and_notifications_metrics.py
index 5fbdcc73..95461cd3 100644
--- a/backend/tests/unit/core/metrics/test_kubernetes_and_notifications_metrics.py
+++ b/backend/tests/unit/core/metrics/test_kubernetes_and_notifications_metrics.py
@@ -1,24 +1,26 @@
 
 
 import pytest
-
 from app.core.metrics.kubernetes import KubernetesMetrics
 from app.core.metrics.notifications import NotificationMetrics
 
-
 pytestmark = pytest.mark.unit
 
 
 def test_kubernetes_metrics_methods() -> None:
     """Test with no-op metrics."""
-    
     m = KubernetesMetrics()
     m.record_pod_creation_failure("quota")
-    m.record_pod_created("success", "python"); m.record_pod_creation_duration(0.4, "python")
-    m.update_active_pod_creations(2); m.increment_active_pod_creations(); m.decrement_active_pod_creations()
+    m.record_pod_created("success", "python")
+    m.record_pod_creation_duration(0.4, "python")
+    m.update_active_pod_creations(2)
+    m.increment_active_pod_creations()
+    m.decrement_active_pod_creations()
     m.record_config_map_created("ok")
-    m.record_k8s_pod_created("success", "python"); m.record_k8s_pod_creation_duration(0.3, "python")
-    m.record_k8s_config_map_created("ok"); m.record_k8s_network_policy_created("ok")
+    m.record_k8s_pod_created("success", "python")
+    m.record_k8s_pod_creation_duration(0.3, "python")
+    m.record_k8s_config_map_created("ok")
+    m.record_k8s_network_policy_created("ok")
     m.update_k8s_active_creations(1)
     m.increment_pod_monitor_watch_reconnects()
     m.record_pod_monitor_event_processing_duration(0.2, "ADDED")
@@ -36,7 +38,6 @@ def test_kubernetes_metrics_methods() -> None:
 
 def test_notification_metrics_methods() -> None:
     """Test with no-op metrics."""
-    
     m = NotificationMetrics()
     m.record_notification_sent("welcome", channel="email", severity="high")
     m.record_notification_failed("welcome", "smtp_error", channel="email")
@@ -44,14 +45,21 @@ def test_notification_metrics_methods() -> None:
     m.record_notification_status_change("n1", "pending", "queued")
     m.record_notification_read("welcome", 2.0)
     m.record_notification_clicked("welcome")
-    m.update_unread_count("u1", 5); m.update_unread_count("u1", 2)
-    m.record_notification_throttled("welcome", "u1"); m.record_throttle_window_hit("u1")
-    m.record_notification_retry("welcome", 1, False); m.record_notification_retry("welcome", 2, True)
+    m.update_unread_count("u1", 5)
+    m.update_unread_count("u1", 2)
+    m.record_notification_throttled("welcome", "u1")
+    m.record_throttle_window_hit("u1")
+    m.record_notification_retry("welcome", 1, False)
+    m.record_notification_retry("welcome", 2, True)
     m.record_batch_processed(10, 1.2, notification_type="welcome")
-    m.record_template_render(0.2, "tmpl", success=True); m.record_template_render(0.1, "tmpl", success=False)
+    m.record_template_render(0.2, "tmpl", success=True)
+    m.record_template_render(0.1, "tmpl", success=False)
     m.record_webhook_delivery(0.3, 200, "/hooks/*")
     m.record_slack_delivery(0.4, "#general", False, error_type="rate_limited")
-    m.update_active_subscriptions("u1", 3); m.update_active_subscriptions("u1", 1)
+    m.update_active_subscriptions("u1", 3)
+    m.update_active_subscriptions("u1", 1)
     m.record_subscription_change("u1", "welcome", "subscribe")
-    m.increment_pending_notifications(); m.decrement_pending_notifications()
-    m.increment_queued_notifications(); m.decrement_queued_notifications()
+    m.increment_pending_notifications()
+    m.decrement_pending_notifications()
+    m.increment_queued_notifications()
+    m.decrement_queued_notifications()
diff --git a/backend/tests/unit/core/metrics/test_metrics_classes.py b/backend/tests/unit/core/metrics/test_metrics_classes.py
index e0e02ef3..70c1ac7b 100644
--- a/backend/tests/unit/core/metrics/test_metrics_classes.py
+++ b/backend/tests/unit/core/metrics/test_metrics_classes.py
@@ -10,9 +10,10 @@
 from app.core.metrics.rate_limit import RateLimitMetrics
 from app.core.metrics.replay import ReplayMetrics
 from app.core.metrics.security import SecurityMetrics
+from app.domain.enums.execution import ExecutionStatus
 
 
-def test_connection_metrics_smoke():
+def test_connection_metrics_smoke() -> None:
     """Test ConnectionMetrics smoke test with no-op metrics."""
     # Create ConnectionMetrics instance - will use NoOpMeterProvider automatically
     m = ConnectionMetrics()
@@ -25,7 +26,7 @@ def test_connection_metrics_smoke():
     m.update_event_bus_subscribers(3, "*")
 
 
-def test_event_metrics_smoke():
+def test_event_metrics_smoke() -> None:
     """Test EventMetrics smoke test with no-op metrics."""
     # Create EventMetrics instance - will use NoOpMeterProvider automatically
     m = EventMetrics()
@@ -54,13 +55,13 @@ def test_event_metrics_smoke():
     m.set_event_bus_queue_size(5)
 
 
-def test_other_metrics_classes_smoke():
+def test_other_metrics_classes_smoke() -> None:
     """Test other metrics classes smoke test with no-op metrics."""
     # Create metrics instances - will use NoOpMeterProvider automatically
     CoordinatorMetrics().record_coordinator_processing_time(0.01)
     DatabaseMetrics().record_mongodb_operation("read", "ok")
     DLQMetrics().record_dlq_message_received("topic", "type")
-    ExecutionMetrics().record_script_execution("QUEUED", "python")
+    ExecutionMetrics().record_script_execution(ExecutionStatus.QUEUED, "python")
     HealthMetrics().record_health_check_duration(0.001, "liveness", "basic")
     KubernetesMetrics().record_k8s_pod_created("success", "python")
     NotificationMetrics().record_notification_sent("welcome", channel="email")
diff --git a/backend/tests/unit/core/metrics/test_replay_and_security_metrics.py b/backend/tests/unit/core/metrics/test_replay_and_security_metrics.py
index 6e03f057..03fd393b 100644
--- a/backend/tests/unit/core/metrics/test_replay_and_security_metrics.py
+++ b/backend/tests/unit/core/metrics/test_replay_and_security_metrics.py
@@ -1,10 +1,8 @@
 
 import pytest
-
 from app.core.metrics.replay import ReplayMetrics
 from app.core.metrics.security import SecurityMetrics
 
-
 pytestmark = pytest.mark.unit
 
 
@@ -13,7 +11,9 @@ def test_replay_metrics_methods() -> None:
     # Create ReplayMetrics instance - will use NoOpMeterProvider automatically
     m = ReplayMetrics()
     m.record_session_created("by_id", "kafka")
-    m.update_active_replays(2); m.increment_active_replays(); m.decrement_active_replays()
+    m.update_active_replays(2)
+    m.increment_active_replays()
+    m.decrement_active_replays()
     m.record_events_replayed("by_id", "etype", "success", 3)
     m.record_event_replayed("by_id", "etype", "failed")
     m.record_replay_duration(2.0, "by_id", total_events=4)
@@ -21,14 +21,16 @@ def test_replay_metrics_methods() -> None:
     m.record_replay_error("timeout", "by_id")
     m.record_status_change("s1", "running", "completed")
     m.update_sessions_by_status("running", -1)
-    m.record_replay_by_target("kafka", True); m.record_replay_by_target("kafka", False)
+    m.record_replay_by_target("kafka", True)
+    m.record_replay_by_target("kafka", False)
     m.record_speed_multiplier(2.0, "by_id")
     m.record_delay_applied(0.05)
     m.record_batch_size(10, "by_id")
     m.record_events_filtered("type", 5)
     m.record_filter_effectiveness(5, 10, "type")
     m.record_replay_memory_usage(123.0, "s1")
-    m.update_replay_queue_size("s1", 10); m.update_replay_queue_size("s1", 4)
+    m.update_replay_queue_size("s1", 10)
+    m.update_replay_queue_size("s1", 4)
 
 
 def test_security_metrics_methods() -> None:
@@ -38,19 +40,27 @@ def test_security_metrics_methods() -> None:
     m.record_security_event("scan_started", severity="high", source="scanner")
     m.record_security_violation("csrf", user_id="u1", ip_address="127.0.0.1")
     m.record_authentication_attempt("password", False, user_id="u1", duration_seconds=0.2)
-    m.update_active_sessions(2); m.increment_active_sessions(); m.decrement_active_sessions()
-    m.record_token_generated("access", 3600); m.record_token_refreshed("access"); m.record_token_revoked("access", "logout")
+    m.update_active_sessions(2)
+    m.increment_active_sessions()
+    m.decrement_active_sessions()
+    m.record_token_generated("access", 3600)
+    m.record_token_refreshed("access")
+    m.record_token_revoked("access", "logout")
     m.record_token_validation_failure("access", "expired")
     m.record_authorization_check("/admin", "GET", False, user_role="user")
     m.record_permission_check("write", True, user_id="u1")
-    m.record_csrf_token_generated(); m.record_csrf_validation_failure("missing")
+    m.record_csrf_token_generated()
+    m.record_csrf_validation_failure("missing")
     m.record_network_policy_violation("np1", "pod1", violation_type="egress")
     m.record_privilege_escalation_attempt("u1", "admin", True)
-    m.record_rate_limit_hit("/api"); m.record_rate_limit_violation("/api", limit=100)
-    m.record_api_key_created("kid"); m.record_api_key_revoked("kid", "compromised"); m.record_api_key_usage("kid", "/api")
-    m.record_audit_event("config_change", "u1", resource="system"); m.record_password_change("u1", True)
+    m.record_rate_limit_hit("/api")
+    m.record_rate_limit_violation("/api", limit=100)
+    m.record_api_key_created("kid")
+    m.record_api_key_revoked("kid", "compromised")
+    m.record_api_key_usage("kid", "/api")
+    m.record_audit_event("config_change", "u1", resource="system")
+    m.record_password_change("u1", True)
     m.record_password_reset_request("u1", method="email")
     m.record_weak_password_attempt("u1", "common_password")
     m.record_brute_force_attempt("1.2.3.4", target_user="u1", action_taken="blocked")
     m.record_account_locked("u1", "brute_force", duration_seconds=600)
-
diff --git a/backend/tests/unit/core/test_adaptive_sampling.py b/backend/tests/unit/core/test_adaptive_sampling.py
index 1929de38..1effd85e 100644
--- a/backend/tests/unit/core/test_adaptive_sampling.py
+++ b/backend/tests/unit/core/test_adaptive_sampling.py
@@ -2,7 +2,6 @@
 from unittest.mock import patch
 
 import pytest
-
 from app.core.adaptive_sampling import AdaptiveSampler, create_adaptive_sampler
 
 
diff --git a/backend/tests/unit/core/test_csrf.py b/backend/tests/unit/core/test_csrf.py
index 9ef0b506..df88cac0 100644
--- a/backend/tests/unit/core/test_csrf.py
+++ b/backend/tests/unit/core/test_csrf.py
@@ -1,10 +1,15 @@
 import pytest
+from app.core.security import security_service, validate_csrf_token
+from app.domain.user import CSRFValidationError
 from starlette.requests import Request
 
-from app.core.security import validate_csrf_token, security_service
 
-
-def make_request(method: str, path: str, headers: dict[str, str] | None = None, cookies: dict[str, str] | None = None) -> Request:
+def make_request(
+    method: str,
+    path: str,
+    headers: dict[str, str] | None = None,
+    cookies: dict[str, str] | None = None,
+) -> Request:
     headers = headers or {}
     if cookies:
         cookie_header = "; ".join(f"{k}={v}" for k, v in cookies.items())
@@ -25,7 +30,7 @@ def test_csrf_skips_on_get() -> None:
 
 def test_csrf_missing_header_raises_when_authenticated() -> None:
     req = make_request("POST", "/api/v1/items", cookies={"access_token": "tok", "csrf_token": "abc"})
-    with pytest.raises(Exception):
+    with pytest.raises(CSRFValidationError):
         validate_csrf_token(req)
 
 
diff --git a/backend/tests/unit/core/test_logging_and_correlation.py b/backend/tests/unit/core/test_logging_and_correlation.py
index bad1385f..f535ab9f 100644
--- a/backend/tests/unit/core/test_logging_and_correlation.py
+++ b/backend/tests/unit/core/test_logging_and_correlation.py
@@ -38,14 +38,16 @@ def capture_log(formatter: logging.Formatter, msg: str, extra: dict[str, Any] |
     string_io.close()
 
     if output:
-        return json.loads(output)
+        result: dict[str, Any] = json.loads(output)
+        return result
 
     # Fallback: create and format record manually
     lr = logging.LogRecord("t", logging.INFO, __file__, 1, msg, (), None, None)
     # Apply the filter manually
     correlation_filter.filter(lr)
     s = formatter.format(lr)
-    return json.loads(s)
+    fallback_result: dict[str, Any] = json.loads(s)
+    return fallback_result
 
 
 def test_json_formatter_sanitizes_tokens(monkeypatch: pytest.MonkeyPatch) -> None:
@@ -83,6 +85,6 @@ async def ping(request: Request) -> JSONResponse:
         assert "X-Correlation-ID" in r.headers
 
 
-def test_setup_logger_returns_logger():
+def test_setup_logger_returns_logger() -> None:
     lg = setup_logger(log_level="INFO")
     assert hasattr(lg, "info")
diff --git a/backend/tests/unit/core/test_security.py b/backend/tests/unit/core/test_security.py
index a3c475c3..1188de39 100644
--- a/backend/tests/unit/core/test_security.py
+++ b/backend/tests/unit/core/test_security.py
@@ -4,11 +4,9 @@
 
 import jwt
 import pytest
-from jwt.exceptions import InvalidTokenError
-
 from app.core.security import SecurityService
 from app.domain.enums.user import UserRole
- 
+from jwt.exceptions import InvalidTokenError
 
 
 class TestPasswordHashing:
@@ -222,11 +220,9 @@ def test_decode_token_missing_username(
     ) -> None:
         """Test decoding token without username."""
         # Create token without 'sub' field
-        data = {"user_id": str(uuid4())}
-
+        user_id = str(uuid4())
         expire = datetime.now(timezone.utc) + timedelta(minutes=15)
-        to_encode = data.copy()
-        to_encode.update({"exp": expire})
+        to_encode: dict[str, str | datetime] = {"user_id": user_id, "exp": expire}
 
         token = jwt.encode(
             to_encode,
@@ -239,7 +235,7 @@ def test_decode_token_missing_username(
             token, security_service.settings.SECRET_KEY, algorithms=[security_service.settings.ALGORITHM]
         )
         assert "sub" not in decoded
-        assert decoded["user_id"] == data["user_id"]
+        assert decoded["user_id"] == user_id
 
     async def test_concurrent_token_creation(
             self,
diff --git a/backend/tests/unit/core/test_utils.py b/backend/tests/unit/core/test_utils.py
index ee386718..feefc04d 100644
--- a/backend/tests/unit/core/test_utils.py
+++ b/backend/tests/unit/core/test_utils.py
@@ -1,6 +1,5 @@
-from starlette.requests import Request
-
 from app.core.utils import StringEnum, get_client_ip
+from starlette.requests import Request
 
 
 class E(StringEnum):
diff --git a/backend/tests/unit/events/core/__init__.py b/backend/tests/unit/events/core/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/backend/tests/unit/events/core/test_consumer_config.py b/backend/tests/unit/events/core/test_consumer_config.py
index 455cef0f..99e1a6bf 100644
--- a/backend/tests/unit/events/core/test_consumer_config.py
+++ b/backend/tests/unit/events/core/test_consumer_config.py
@@ -1,4 +1,3 @@
-import pytest
 
 from app.events.core.types import ConsumerConfig, ProducerConfig
 
diff --git a/backend/tests/unit/events/test_event_dispatcher.py b/backend/tests/unit/events/test_event_dispatcher.py
index 28f7c92d..27d640b8 100644
--- a/backend/tests/unit/events/test_event_dispatcher.py
+++ b/backend/tests/unit/events/test_event_dispatcher.py
@@ -1,17 +1,31 @@
 import logging
 
 from app.domain.enums.events import EventType
+from app.domain.enums.storage import ExecutionErrorType
 from app.events.core import EventDispatcher
 from app.infrastructure.kafka.events.base import BaseEvent
+from app.infrastructure.kafka.events.execution import ExecutionFailedEvent, ExecutionRequestedEvent
+from app.infrastructure.kafka.events.metadata import AvroEventMetadata
+
 from tests.helpers import make_execution_requested_event
 
 _test_logger = logging.getLogger("test.events.event_dispatcher")
 
 
-def make_event():
+def make_requested_event() -> ExecutionRequestedEvent:
     return make_execution_requested_event(execution_id="e1")
 
 
+def make_failed_event() -> ExecutionFailedEvent:
+    return ExecutionFailedEvent(
+        execution_id="e1",
+        exit_code=1,
+        error_type=ExecutionErrorType.SCRIPT_ERROR,
+        error_message="Test failure",
+        metadata=AvroEventMetadata(service_name="test", service_version="1.0"),
+    )
+
+
 async def _async_noop(_: BaseEvent) -> None:
     return None
 
@@ -47,12 +61,9 @@ async def test_dispatch_metrics_processed_and_skipped() -> None:
     async def handler(_: BaseEvent) -> None:
         called["n"] += 1
 
-    await disp.dispatch(make_event())
+    await disp.dispatch(make_requested_event())
     # Dispatch event with no handlers (different type)
-    # Reuse base event but fake type by replacing value
-    e = make_event()
-    e.event_type = EventType.EXECUTION_FAILED  # type: ignore[attr-defined]
-    await disp.dispatch(e)
+    await disp.dispatch(make_failed_event())
 
     metrics = disp.get_metrics()
     assert called["n"] == 1
diff --git a/backend/tests/unit/events/test_kafka_events.py b/backend/tests/unit/events/test_kafka_events.py
new file mode 100644
index 00000000..6424e8ed
--- /dev/null
+++ b/backend/tests/unit/events/test_kafka_events.py
@@ -0,0 +1,717 @@
+import json
+from datetime import datetime, timezone
+from typing import Any
+from uuid import UUID
+
+import pytest
+from app.domain.enums.auth import LoginMethod
+from app.domain.enums.events import EventType
+from app.domain.enums.execution import ExecutionStatus
+from app.domain.enums.kafka import KafkaTopic
+from app.domain.enums.notification import NotificationChannel, NotificationSeverity
+from app.domain.enums.storage import ExecutionErrorType
+from app.domain.execution import ResourceUsageDomain
+from app.infrastructure.kafka.events.base import BaseEvent
+from app.infrastructure.kafka.events.execution import (
+    ExecutionAcceptedEvent,
+    ExecutionCancelledEvent,
+    ExecutionCompletedEvent,
+    ExecutionFailedEvent,
+    ExecutionQueuedEvent,
+    ExecutionRequestedEvent,
+    ExecutionRunningEvent,
+    ExecutionStartedEvent,
+    ExecutionTimeoutEvent,
+)
+from app.infrastructure.kafka.events.metadata import AvroEventMetadata
+from app.infrastructure.kafka.events.notification import (
+    NotificationClickedEvent,
+    NotificationCreatedEvent,
+    NotificationDeliveredEvent,
+    NotificationFailedEvent,
+    NotificationReadEvent,
+    NotificationSentEvent,
+)
+from app.infrastructure.kafka.events.saga import (
+    AllocateResourcesCommandEvent,
+    CreatePodCommandEvent,
+    DeletePodCommandEvent,
+    ReleaseResourcesCommandEvent,
+    SagaCancelledEvent,
+    SagaCompensatedEvent,
+    SagaCompensatingEvent,
+    SagaCompletedEvent,
+    SagaFailedEvent,
+    SagaStartedEvent,
+)
+from app.infrastructure.kafka.events.user import (
+    UserDeletedEvent,
+    UserLoggedInEvent,
+    UserLoggedOutEvent,
+    UserRegisteredEvent,
+    UserSettingsUpdatedEvent,
+    UserUpdatedEvent,
+)
+
+pytestmark = pytest.mark.unit
+
+
+@pytest.fixture
+def metadata() -> AvroEventMetadata:
+    """Create standard metadata for tests."""
+    return AvroEventMetadata(
+        service_name="test-service",
+        service_version="1.0.0",
+        user_id="user-123",
+        correlation_id="corr-456",
+    )
+
+
+@pytest.fixture
+def resource_usage() -> ResourceUsageDomain:
+    """Create standard resource usage for tests."""
+    return ResourceUsageDomain(
+        execution_time_wall_seconds=1.5,
+        cpu_time_jiffies=100,
+        clk_tck_hertz=100,
+        peak_memory_kb=1024,
+    )
+
+
+class TestAvroEventMetadata:
+    """Tests for AvroEventMetadata."""
+
+    def test_default_correlation_id_generated(self) -> None:
+        """Generates UUID correlation_id by default."""
+        metadata = AvroEventMetadata(service_name="svc", service_version="1.0")
+        UUID(metadata.correlation_id)  # Validates it's a valid UUID
+
+    def test_with_correlation_returns_new_instance(self, metadata: AvroEventMetadata) -> None:
+        """with_correlation returns new instance with updated correlation_id."""
+        new_metadata = metadata.with_correlation("new-corr")
+        assert new_metadata.correlation_id == "new-corr"
+        assert metadata.correlation_id == "corr-456"  # Original unchanged
+
+    def test_with_user_returns_new_instance(self, metadata: AvroEventMetadata) -> None:
+        """with_user returns new instance with updated user_id."""
+        new_metadata = metadata.with_user("new-user")
+        assert new_metadata.user_id == "new-user"
+        assert metadata.user_id == "user-123"  # Original unchanged
+
+    def test_ensure_correlation_id_preserves_existing(self, metadata: AvroEventMetadata) -> None:
+        """ensure_correlation_id keeps existing correlation_id."""
+        result = metadata.ensure_correlation_id()
+        assert result.correlation_id == "corr-456"
+
+    def test_ensure_correlation_id_generates_when_empty(self) -> None:
+        """ensure_correlation_id generates new id when empty."""
+        metadata = AvroEventMetadata(
+            service_name="svc",
+            service_version="1.0",
+            correlation_id="",
+        )
+        result = metadata.ensure_correlation_id()
+        # Empty string is falsy, should generate new UUID
+        assert result.correlation_id != ""
+        assert result.correlation_id != metadata.correlation_id
+        UUID(result.correlation_id)  # Raises ValueError if invalid
+        assert result is not metadata  # Returns new instance
+        assert metadata.correlation_id == ""  # Original unchanged
+
+
+class TestBaseEvent:
+    """Tests for BaseEvent base class behavior."""
+
+    def test_event_id_auto_generated(self, metadata: AvroEventMetadata) -> None:
+        """Event ID is auto-generated as valid UUID."""
+        event = ExecutionRequestedEvent(
+            execution_id="exec-1",
+            script="print('test')",
+            language="python",
+            language_version="3.11",
+            runtime_image="python:3.11-slim",
+            runtime_command=["python"],
+            runtime_filename="main.py",
+            timeout_seconds=30,
+            cpu_limit="100m",
+            memory_limit="128Mi",
+            cpu_request="50m",
+            memory_request="64Mi",
+            metadata=metadata,
+        )
+        UUID(event.event_id)  # Validates UUID format
+
+    def test_timestamp_auto_generated(self, metadata: AvroEventMetadata) -> None:
+        """Timestamp is auto-generated as UTC datetime."""
+        before = datetime.now(timezone.utc)
+        event = ExecutionRequestedEvent(
+            execution_id="exec-1",
+            script="print('test')",
+            language="python",
+            language_version="3.11",
+            runtime_image="python:3.11-slim",
+            runtime_command=["python"],
+            runtime_filename="main.py",
+            timeout_seconds=30,
+            cpu_limit="100m",
+            memory_limit="128Mi",
+            cpu_request="50m",
+            memory_request="64Mi",
+            metadata=metadata,
+        )
+        after = datetime.now(timezone.utc)
+        assert before <= event.timestamp <= after
+
+    def test_to_dict_serializes_properly(self, metadata: AvroEventMetadata) -> None:
+        """to_dict produces JSON-serializable dict."""
+        event = ExecutionRequestedEvent(
+            execution_id="exec-1",
+            script="print('test')",
+            language="python",
+            language_version="3.11",
+            runtime_image="python:3.11-slim",
+            runtime_command=["python"],
+            runtime_filename="main.py",
+            timeout_seconds=30,
+            cpu_limit="100m",
+            memory_limit="128Mi",
+            cpu_request="50m",
+            memory_request="64Mi",
+            metadata=metadata,
+        )
+        d = event.to_dict()
+        json_str = json.dumps(d)  # Should not raise
+        parsed = json.loads(json_str)
+        assert parsed["execution_id"] == "exec-1"
+        assert isinstance(parsed["timestamp"], str)  # Serialized to ISO string
+
+    def test_to_json_produces_valid_json(self, metadata: AvroEventMetadata) -> None:
+        """to_json produces valid JSON string."""
+        event = ExecutionRequestedEvent(
+            execution_id="exec-1",
+            script="print('test')",
+            language="python",
+            language_version="3.11",
+            runtime_image="python:3.11-slim",
+            runtime_command=["python"],
+            runtime_filename="main.py",
+            timeout_seconds=30,
+            cpu_limit="100m",
+            memory_limit="128Mi",
+            cpu_request="50m",
+            memory_request="64Mi",
+            metadata=metadata,
+        )
+        json_str = event.to_json()
+        parsed = json.loads(json_str)
+        assert parsed["script"] == "print('test')"
+
+
+class TestExecutionEvents:
+    """Tests for execution event types."""
+
+    @pytest.mark.parametrize(
+        "event_cls,event_type,topic,extra_fields",
+        [
+            (
+                ExecutionRequestedEvent,
+                EventType.EXECUTION_REQUESTED,
+                KafkaTopic.EXECUTION_EVENTS,
+                {
+                    "execution_id": "exec-1",
+                    "script": "print(1)",
+                    "language": "python",
+                    "language_version": "3.11",
+                    "runtime_image": "python:3.11-slim",
+                    "runtime_command": ["python"],
+                    "runtime_filename": "main.py",
+                    "timeout_seconds": 30,
+                    "cpu_limit": "100m",
+                    "memory_limit": "128Mi",
+                    "cpu_request": "50m",
+                    "memory_request": "64Mi",
+                },
+            ),
+            (
+                ExecutionAcceptedEvent,
+                EventType.EXECUTION_ACCEPTED,
+                KafkaTopic.EXECUTION_EVENTS,
+                {"execution_id": "exec-1", "queue_position": 5},
+            ),
+            (
+                ExecutionQueuedEvent,
+                EventType.EXECUTION_QUEUED,
+                KafkaTopic.EXECUTION_EVENTS,
+                {"execution_id": "exec-1"},
+            ),
+            (
+                ExecutionRunningEvent,
+                EventType.EXECUTION_RUNNING,
+                KafkaTopic.EXECUTION_EVENTS,
+                {"execution_id": "exec-1", "pod_name": "exec-1-pod"},
+            ),
+            (
+                ExecutionStartedEvent,
+                EventType.EXECUTION_STARTED,
+                KafkaTopic.EXECUTION_EVENTS,
+                {"execution_id": "exec-1", "pod_name": "exec-1-pod"},
+            ),
+            (
+                ExecutionCancelledEvent,
+                EventType.EXECUTION_CANCELLED,
+                KafkaTopic.EXECUTION_EVENTS,
+                {"execution_id": "exec-1", "reason": "user_requested"},
+            ),
+        ],
+        ids=[
+            "requested",
+            "accepted",
+            "queued",
+            "running",
+            "started",
+            "cancelled",
+        ],
+    )
+    def test_execution_event_types_and_topics(
+        self,
+        metadata: AvroEventMetadata,
+        event_cls: type[BaseEvent],
+        event_type: EventType,
+        topic: KafkaTopic,
+        extra_fields: dict[str, Any],
+    ) -> None:
+        """Execution events have correct event_type and topic."""
+        event = event_cls(metadata=metadata, **extra_fields)
+        assert event.event_type == event_type
+        assert event_cls.topic == topic
+
+    def test_execution_completed_event(
+        self, metadata: AvroEventMetadata, resource_usage: ResourceUsageDomain
+    ) -> None:
+        """ExecutionCompletedEvent has all required fields."""
+        event = ExecutionCompletedEvent(
+            execution_id="exec-1",
+            exit_code=0,
+            resource_usage=resource_usage,
+            stdout="Hello\n",
+            stderr="",
+            metadata=metadata,
+        )
+        assert event.event_type == EventType.EXECUTION_COMPLETED
+        assert event.topic == KafkaTopic.EXECUTION_COMPLETED
+        assert event.exit_code == 0
+        assert event.resource_usage.execution_time_wall_seconds == 1.5
+
+    def test_execution_failed_event(
+        self, metadata: AvroEventMetadata, resource_usage: ResourceUsageDomain
+    ) -> None:
+        """ExecutionFailedEvent captures error details."""
+        event = ExecutionFailedEvent(
+            execution_id="exec-1",
+            exit_code=1,
+            error_type=ExecutionErrorType.SCRIPT_ERROR,
+            error_message="NameError: undefined",
+            stdout="",
+            stderr="Traceback...",
+            resource_usage=resource_usage,
+            metadata=metadata,
+        )
+        assert event.event_type == EventType.EXECUTION_FAILED
+        assert event.topic == KafkaTopic.EXECUTION_FAILED
+        assert event.error_type == ExecutionErrorType.SCRIPT_ERROR
+
+    def test_execution_timeout_event(
+        self, metadata: AvroEventMetadata, resource_usage: ResourceUsageDomain
+    ) -> None:
+        """ExecutionTimeoutEvent records timeout details."""
+        event = ExecutionTimeoutEvent(
+            execution_id="exec-1",
+            timeout_seconds=30,
+            resource_usage=resource_usage,
+            stdout="partial output",
+            stderr="",
+            metadata=metadata,
+        )
+        assert event.event_type == EventType.EXECUTION_TIMEOUT
+        assert event.topic == KafkaTopic.EXECUTION_TIMEOUT
+        assert event.timeout_seconds == 30
+
+
+class TestSagaEvents:
+    """Tests for saga event types."""
+
+    @pytest.mark.parametrize(
+        "event_cls,event_type,extra_fields",
+        [
+            (
+                SagaStartedEvent,
+                EventType.SAGA_STARTED,
+                {
+                    "saga_id": "saga-1",
+                    "saga_name": "execution_saga",
+                    "execution_id": "exec-1",
+                    "initial_event_id": "evt-1",
+                },
+            ),
+            (
+                SagaCompletedEvent,
+                EventType.SAGA_COMPLETED,
+                {
+                    "saga_id": "saga-1",
+                    "saga_name": "execution_saga",
+                    "execution_id": "exec-1",
+                    "completed_steps": ["validate", "allocate", "create_pod"],
+                },
+            ),
+            (
+                SagaFailedEvent,
+                EventType.SAGA_FAILED,
+                {
+                    "saga_id": "saga-1",
+                    "saga_name": "execution_saga",
+                    "execution_id": "exec-1",
+                    "failed_step": "create_pod",
+                    "error": "Pod creation timeout",
+                },
+            ),
+            (
+                SagaCompensatingEvent,
+                EventType.SAGA_COMPENSATING,
+                {
+                    "saga_id": "saga-1",
+                    "saga_name": "execution_saga",
+                    "execution_id": "exec-1",
+                    "compensating_step": "release_resources",
+                },
+            ),
+            (
+                SagaCompensatedEvent,
+                EventType.SAGA_COMPENSATED,
+                {
+                    "saga_id": "saga-1",
+                    "saga_name": "execution_saga",
+                    "execution_id": "exec-1",
+                    "compensated_steps": ["allocate", "validate"],
+                },
+            ),
+        ],
+        ids=["started", "completed", "failed", "compensating", "compensated"],
+    )
+    def test_saga_event_types(
+        self,
+        metadata: AvroEventMetadata,
+        event_cls: type[BaseEvent],
+        event_type: EventType,
+        extra_fields: dict[str, Any],
+    ) -> None:
+        """Saga events have correct event_type and topic."""
+        event = event_cls(metadata=metadata, **extra_fields)
+        assert event.event_type == event_type
+        assert event_cls.topic == KafkaTopic.SAGA_EVENTS
+
+    def test_saga_cancelled_event(self, metadata: AvroEventMetadata) -> None:
+        """SagaCancelledEvent captures cancellation details."""
+        event = SagaCancelledEvent(
+            saga_id="saga-1",
+            saga_name="execution_saga",
+            execution_id="exec-1",
+            reason="user_cancelled",
+            completed_steps=["validate", "allocate"],
+            compensated_steps=["allocate"],
+            cancelled_by="user-123",
+            metadata=metadata,
+        )
+        assert event.event_type == EventType.SAGA_CANCELLED
+        assert len(event.completed_steps) == 2
+        assert len(event.compensated_steps) == 1
+
+    @pytest.mark.parametrize(
+        "event_cls,event_type,extra_fields",
+        [
+            (
+                CreatePodCommandEvent,
+                EventType.CREATE_POD_COMMAND,
+                {
+                    "saga_id": "saga-1",
+                    "execution_id": "exec-1",
+                    "script": "print(1)",
+                    "language": "python",
+                    "language_version": "3.11",
+                    "runtime_image": "python:3.11-slim",
+                    "runtime_command": ["python"],
+                    "runtime_filename": "main.py",
+                    "timeout_seconds": 30,
+                    "cpu_limit": "100m",
+                    "memory_limit": "128Mi",
+                    "cpu_request": "50m",
+                    "memory_request": "64Mi",
+                    "priority": 5,
+                },
+            ),
+            (
+                DeletePodCommandEvent,
+                EventType.DELETE_POD_COMMAND,
+                {
+                    "saga_id": "saga-1",
+                    "execution_id": "exec-1",
+                    "reason": "cleanup",
+                    "pod_name": "exec-1-pod",
+                },
+            ),
+            (
+                AllocateResourcesCommandEvent,
+                EventType.ALLOCATE_RESOURCES_COMMAND,
+                {
+                    "execution_id": "exec-1",
+                    "cpu_request": "100m",
+                    "memory_request": "128Mi",
+                },
+            ),
+            (
+                ReleaseResourcesCommandEvent,
+                EventType.RELEASE_RESOURCES_COMMAND,
+                {
+                    "execution_id": "exec-1",
+                    "cpu_request": "100m",
+                    "memory_request": "128Mi",
+                },
+            ),
+        ],
+        ids=["create-pod", "delete-pod", "allocate-resources", "release-resources"],
+    )
+    def test_saga_command_events(
+        self,
+        metadata: AvroEventMetadata,
+        event_cls: type[BaseEvent],
+        event_type: EventType,
+        extra_fields: dict[str, Any],
+    ) -> None:
+        """Saga command events have correct types and topic."""
+        event = event_cls(metadata=metadata, **extra_fields)
+        assert event.event_type == event_type
+        assert event_cls.topic == KafkaTopic.SAGA_COMMANDS
+
+
+class TestNotificationEvents:
+    """Tests for notification event types."""
+
+    @pytest.mark.parametrize(
+        "event_cls,event_type,extra_fields",
+        [
+            (
+                NotificationCreatedEvent,
+                EventType.NOTIFICATION_CREATED,
+                {
+                    "notification_id": "notif-1",
+                    "user_id": "user-1",
+                    "subject": "Test Subject",
+                    "body": "Test body",
+                    "severity": NotificationSeverity.MEDIUM,
+                    "tags": ["test"],
+                    "channels": [NotificationChannel.IN_APP],
+                },
+            ),
+            (
+                NotificationSentEvent,
+                EventType.NOTIFICATION_SENT,
+                {
+                    "notification_id": "notif-1",
+                    "user_id": "user-1",
+                    "channel": NotificationChannel.IN_APP,
+                    "sent_at": "2024-01-01T12:00:00Z",
+                },
+            ),
+            (
+                NotificationDeliveredEvent,
+                EventType.NOTIFICATION_DELIVERED,
+                {
+                    "notification_id": "notif-1",
+                    "user_id": "user-1",
+                    "channel": NotificationChannel.IN_APP,
+                    "delivered_at": "2024-01-01T12:00:01Z",
+                },
+            ),
+            (
+                NotificationFailedEvent,
+                EventType.NOTIFICATION_FAILED,
+                {
+                    "notification_id": "notif-1",
+                    "user_id": "user-1",
+                    "channel": NotificationChannel.WEBHOOK,
+                    "error": "Connection refused",
+                    "retry_count": 3,
+                },
+            ),
+            (
+                NotificationReadEvent,
+                EventType.NOTIFICATION_READ,
+                {
+                    "notification_id": "notif-1",
+                    "user_id": "user-1",
+                    "read_at": "2024-01-01T12:05:00Z",
+                },
+            ),
+            (
+                NotificationClickedEvent,
+                EventType.NOTIFICATION_CLICKED,
+                {
+                    "notification_id": "notif-1",
+                    "user_id": "user-1",
+                    "clicked_at": "2024-01-01T12:06:00Z",
+                    "action": "view_execution",
+                },
+            ),
+        ],
+        ids=["created", "sent", "delivered", "failed", "read", "clicked"],
+    )
+    def test_notification_event_types(
+        self,
+        metadata: AvroEventMetadata,
+        event_cls: type[BaseEvent],
+        event_type: EventType,
+        extra_fields: dict[str, Any],
+    ) -> None:
+        """Notification events have correct types and topic."""
+        event = event_cls(metadata=metadata, **extra_fields)
+        assert event.event_type == event_type
+        assert event_cls.topic == KafkaTopic.NOTIFICATION_EVENTS
+
+
+class TestUserEvents:
+    """Tests for user event types."""
+
+    @pytest.mark.parametrize(
+        "event_cls,event_type,extra_fields",
+        [
+            (
+                UserRegisteredEvent,
+                EventType.USER_REGISTERED,
+                {
+                    "user_id": "user-1",
+                    "username": "testuser",
+                    "email": "test@example.com",
+                },
+            ),
+            (
+                UserLoggedInEvent,
+                EventType.USER_LOGGED_IN,
+                {
+                    "user_id": "user-1",
+                    "login_method": LoginMethod.PASSWORD,
+                    "ip_address": "192.168.1.1",
+                },
+            ),
+            (
+                UserLoggedOutEvent,
+                EventType.USER_LOGGED_OUT,
+                {"user_id": "user-1", "logout_reason": "user_initiated"},
+            ),
+            (
+                UserUpdatedEvent,
+                EventType.USER_UPDATED,
+                {
+                    "user_id": "user-1",
+                    "updated_fields": ["email", "username"],
+                    "updated_by": "admin-1",
+                },
+            ),
+            (
+                UserDeletedEvent,
+                EventType.USER_DELETED,
+                {
+                    "user_id": "user-1",
+                    "deleted_by": "admin-1",
+                    "reason": "account_closure",
+                },
+            ),
+        ],
+        ids=["registered", "logged-in", "logged-out", "updated", "deleted"],
+    )
+    def test_user_event_types(
+        self,
+        metadata: AvroEventMetadata,
+        event_cls: type[BaseEvent],
+        event_type: EventType,
+        extra_fields: dict[str, Any],
+    ) -> None:
+        """User events have correct types and topic."""
+        event = event_cls(metadata=metadata, **extra_fields)
+        assert event.event_type == event_type
+        assert event_cls.topic == KafkaTopic.USER_EVENTS
+
+    def test_user_settings_updated_event(self, metadata: AvroEventMetadata) -> None:
+        """UserSettingsUpdatedEvent captures settings changes."""
+        event = UserSettingsUpdatedEvent(
+            user_id="user-1",
+            changed_fields=["theme", "timezone"],
+            theme="dark",
+            timezone="UTC",
+            metadata=metadata,
+        )
+        assert event.event_type == EventType.USER_SETTINGS_UPDATED
+        assert event.topic == KafkaTopic.USER_SETTINGS_EVENTS
+        assert "theme" in event.changed_fields
+
+
+class TestEventSerialization:
+    """Tests for event serialization edge cases."""
+
+    def test_complex_nested_payload(self, metadata: AvroEventMetadata) -> None:
+        """Events with nested structures serialize correctly."""
+        event = CreatePodCommandEvent(
+            saga_id="saga-1",
+            execution_id="exec-1",
+            script="import os\nprint(os.getcwd())",
+            language="python",
+            language_version="3.11",
+            runtime_image="python:3.11-slim",
+            runtime_command=["python", "-u"],
+            runtime_filename="script.py",
+            timeout_seconds=60,
+            cpu_limit="200m",
+            memory_limit="256Mi",
+            cpu_request="100m",
+            memory_request="128Mi",
+            priority=3,
+            pod_spec={"nodeSelector": "worker"},
+            metadata=metadata,
+        )
+        d = event.to_dict()
+        assert d["pod_spec"] == {"nodeSelector": "worker"}
+        assert d["runtime_command"] == ["python", "-u"]
+
+    def test_unicode_in_script(self, metadata: AvroEventMetadata) -> None:
+        """Events with unicode in script serialize correctly."""
+        script = "print('Hello 世界 🌍')"
+        event = ExecutionRequestedEvent(
+            execution_id="exec-unicode",
+            script=script,
+            language="python",
+            language_version="3.11",
+            runtime_image="python:3.11-slim",
+            runtime_command=["python"],
+            runtime_filename="main.py",
+            timeout_seconds=30,
+            cpu_limit="100m",
+            memory_limit="128Mi",
+            cpu_request="50m",
+            memory_request="64Mi",
+            metadata=metadata,
+        )
+        json_str = event.to_json()
+        parsed = json.loads(json_str)
+        assert "世界" in parsed["script"]
+        assert "🌍" in parsed["script"]
+
+    def test_empty_optional_fields(self, metadata: AvroEventMetadata) -> None:
+        """Events with None optional fields serialize without errors."""
+        event = ExecutionStartedEvent(
+            execution_id="exec-1",
+            pod_name="pod-1",
+            node_name=None,
+            container_id=None,
+            metadata=metadata,
+        )
+        d = event.to_dict()
+        assert d["node_name"] is None
+        assert d["container_id"] is None
diff --git a/backend/tests/unit/events/test_mappings_and_types.py b/backend/tests/unit/events/test_mappings_and_types.py
index 6a2dedc4..62477f63 100644
--- a/backend/tests/unit/events/test_mappings_and_types.py
+++ b/backend/tests/unit/events/test_mappings_and_types.py
@@ -9,7 +9,18 @@
 
 
 def test_producer_config_mapping() -> None:
-    cfg = ProducerConfig(bootstrap_servers="kafka:29092", client_id="cid", batch_size=123, linger_ms=7, compression_type="gzip", request_timeout_ms=1111, retries=2, enable_idempotence=True, acks="all", max_in_flight_requests_per_connection=3)
+    cfg = ProducerConfig(
+        bootstrap_servers="kafka:29092",
+        client_id="cid",
+        batch_size=123,
+        linger_ms=7,
+        compression_type="gzip",
+        request_timeout_ms=1111,
+        retries=2,
+        enable_idempotence=True,
+        acks="all",
+        max_in_flight_requests_per_connection=3,
+    )
     conf = cfg.to_producer_config()
     assert conf["bootstrap.servers"] == "kafka:29092"
     assert conf["client.id"] == "cid"
@@ -20,7 +31,19 @@ def test_producer_config_mapping() -> None:
 
 
 def test_consumer_config_mapping() -> None:
-    cfg = ConsumerConfig(bootstrap_servers="kafka:29092", group_id="g", client_id="c", auto_offset_reset="latest", enable_auto_commit=False, session_timeout_ms=12345, heartbeat_interval_ms=999, max_poll_interval_ms=555000, fetch_min_bytes=10, fetch_max_wait_ms=777, statistics_interval_ms=60000)
+    cfg = ConsumerConfig(
+        bootstrap_servers="kafka:29092",
+        group_id="g",
+        client_id="c",
+        auto_offset_reset="latest",
+        enable_auto_commit=False,
+        session_timeout_ms=12345,
+        heartbeat_interval_ms=999,
+        max_poll_interval_ms=555000,
+        fetch_min_bytes=10,
+        fetch_max_wait_ms=777,
+        statistics_interval_ms=60000,
+    )
     conf = cfg.to_consumer_config()
     assert conf["bootstrap.servers"] == "kafka:29092"
     assert conf["group.id"] == "g"
diff --git a/backend/tests/unit/events/test_schema_registry_manager.py b/backend/tests/unit/events/test_schema_registry_manager.py
index 77562a2e..9a867511 100644
--- a/backend/tests/unit/events/test_schema_registry_manager.py
+++ b/backend/tests/unit/events/test_schema_registry_manager.py
@@ -3,11 +3,12 @@
 import pytest
 from app.events.schema.schema_registry import SchemaRegistryManager
 from app.infrastructure.kafka.events.execution import ExecutionRequestedEvent
+from app.settings import Settings
 
 _test_logger = logging.getLogger("test.events.schema_registry_manager")
 
 
-def test_deserialize_json_execution_requested(test_settings) -> None:  # type: ignore[valid-type]
+def test_deserialize_json_execution_requested(test_settings: Settings, caplog: pytest.LogCaptureFixture) -> None:
     m = SchemaRegistryManager(test_settings, logger=_test_logger)
     data = {
         "event_type": "execution_requested",
@@ -32,7 +33,7 @@ def test_deserialize_json_execution_requested(test_settings) -> None:  # type: i
     assert ev.language == "python"
 
 
-def test_deserialize_json_missing_type_raises(test_settings) -> None:  # type: ignore[valid-type]
+def test_deserialize_json_missing_type_raises(test_settings: Settings, caplog: pytest.LogCaptureFixture) -> None:
     m = SchemaRegistryManager(test_settings, logger=_test_logger)
     with pytest.raises(ValueError):
         m.deserialize_json({})
diff --git a/backend/tests/unit/schemas_pydantic/test_events_schemas.py b/backend/tests/unit/schemas_pydantic/test_events_schemas.py
index 30ef50c2..38d17179 100644
--- a/backend/tests/unit/schemas_pydantic/test_events_schemas.py
+++ b/backend/tests/unit/schemas_pydantic/test_events_schemas.py
@@ -1,10 +1,9 @@
 import pytest
-
-from app.schemas_pydantic.events import EventFilterRequest
 from app.domain.enums.common import SortOrder
+from app.schemas_pydantic.events import EventFilterRequest
 
 
-def test_event_filter_request_sort_validator_accepts_allowed_fields():
+def test_event_filter_request_sort_validator_accepts_allowed_fields() -> None:
     req = EventFilterRequest(sort_by="timestamp", sort_order=SortOrder.DESC)
     assert req.sort_by == "timestamp"
 
@@ -13,6 +12,6 @@ def test_event_filter_request_sort_validator_accepts_allowed_fields():
         assert req2.sort_by == field
 
 
-def test_event_filter_request_sort_validator_rejects_invalid():
+def test_event_filter_request_sort_validator_rejects_invalid() -> None:
     with pytest.raises(ValueError):
         EventFilterRequest(sort_by="not-a-field")
diff --git a/backend/tests/unit/schemas_pydantic/test_execution_schemas.py b/backend/tests/unit/schemas_pydantic/test_execution_schemas.py
index 38e59401..3d219e38 100644
--- a/backend/tests/unit/schemas_pydantic/test_execution_schemas.py
+++ b/backend/tests/unit/schemas_pydantic/test_execution_schemas.py
@@ -1,22 +1,20 @@
-from datetime import datetime, timezone
 
 import pytest
-
 from app.schemas_pydantic.execution import ExecutionRequest
 
 
-def test_execution_request_valid_supported_runtime():
+def test_execution_request_valid_supported_runtime() -> None:
     req = ExecutionRequest(script="print('ok')", lang="python", lang_version="3.11")
     assert req.lang == "python" and req.lang_version == "3.11"
 
 
-def test_execution_request_unsupported_language_raises():
+def test_execution_request_unsupported_language_raises() -> None:
     with pytest.raises(ValueError) as e:
         ExecutionRequest(script="print(1)", lang="rust", lang_version="1.0")
     assert "Language 'rust' not supported" in str(e.value)
 
 
-def test_execution_request_unsupported_version_raises():
+def test_execution_request_unsupported_version_raises() -> None:
     with pytest.raises(ValueError) as e:
         ExecutionRequest(script="print(1)", lang="python", lang_version="9.9")
     assert "Version '9.9' not supported for python" in str(e.value)
diff --git a/backend/tests/unit/schemas_pydantic/test_notification_schemas.py b/backend/tests/unit/schemas_pydantic/test_notification_schemas.py
index 14b304bc..b50603f1 100644
--- a/backend/tests/unit/schemas_pydantic/test_notification_schemas.py
+++ b/backend/tests/unit/schemas_pydantic/test_notification_schemas.py
@@ -1,12 +1,11 @@
 from datetime import UTC, datetime, timedelta
 
 import pytest
-
 from app.domain.enums.notification import NotificationChannel, NotificationSeverity, NotificationStatus
 from app.schemas_pydantic.notification import Notification, NotificationBatch
 
 
-def test_notification_scheduled_for_must_be_future():
+def test_notification_scheduled_for_must_be_future() -> None:
     n = Notification(
         user_id="u1",
         channel=NotificationChannel.IN_APP,
@@ -28,7 +27,7 @@ def test_notification_scheduled_for_must_be_future():
         )
 
 
-def test_notification_batch_validation_limits():
+def test_notification_batch_validation_limits() -> None:
     n1 = Notification(user_id="u1", channel=NotificationChannel.IN_APP, subject="a", body="b")
     ok = NotificationBatch(notifications=[n1])
     assert ok.processed_count == 0
diff --git a/backend/tests/unit/services/auth/__init__.py b/backend/tests/unit/services/auth/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/backend/tests/unit/services/auth/test_auth_service.py b/backend/tests/unit/services/auth/test_auth_service.py
new file mode 100644
index 00000000..91e307d9
--- /dev/null
+++ b/backend/tests/unit/services/auth/test_auth_service.py
@@ -0,0 +1,226 @@
+import logging
+from datetime import datetime, timezone
+from typing import Any
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+from app.domain.enums.user import UserRole
+from app.domain.user import AdminAccessRequiredError, AuthenticationRequiredError
+from app.services.auth_service import AuthService
+
+pytestmark = pytest.mark.unit
+
+
+class FakeUser:
+    """Minimal user mock for testing."""
+
+    def __init__(
+        self,
+        user_id: str = "user-123",
+        username: str = "testuser",
+        email: str = "test@example.com",
+        role: UserRole = UserRole.USER,
+        is_superuser: bool = False,
+    ) -> None:
+        self.user_id = user_id
+        self.username = username
+        self.email = email
+        self.role = role
+        self.is_superuser = is_superuser
+        self.created_at = datetime.now(timezone.utc)
+        self.updated_at = datetime.now(timezone.utc)
+
+
+class FakeRequest:
+    """Minimal request mock for testing."""
+
+    def __init__(self, cookies: dict[str, str] | None = None) -> None:
+        self.cookies = cookies or {}
+
+
+@pytest.fixture
+def mock_user_repo() -> AsyncMock:
+    """Mock user repository."""
+    return AsyncMock()
+
+
+@pytest.fixture
+def mock_logger() -> MagicMock:
+    """Mock logger."""
+    return MagicMock(spec=logging.Logger)
+
+
+@pytest.fixture
+def auth_service(mock_user_repo: AsyncMock, mock_logger: MagicMock) -> AuthService:
+    """Create AuthService with mocked dependencies."""
+    return AuthService(user_repo=mock_user_repo, logger=mock_logger)
+
+
+class TestGetCurrentUser:
+    """Tests for get_current_user method."""
+
+    async def test_raises_when_no_token_cookie(
+        self, auth_service: AuthService
+    ) -> None:
+        """Raises AuthenticationRequiredError when access_token cookie is missing."""
+        request = FakeRequest(cookies={})
+
+        with pytest.raises(AuthenticationRequiredError):
+            await auth_service.get_current_user(request)  # type: ignore
+
+    async def test_raises_when_token_empty(
+        self, auth_service: AuthService
+    ) -> None:
+        """Raises AuthenticationRequiredError when token is empty string."""
+        request = FakeRequest(cookies={"access_token": ""})
+
+        with pytest.raises(AuthenticationRequiredError):
+            await auth_service.get_current_user(request)  # type: ignore
+
+    @pytest.mark.parametrize(
+        "role,is_superuser",
+        [
+            (UserRole.USER, False),
+            (UserRole.ADMIN, False),
+            (UserRole.ADMIN, True),
+        ],
+        ids=["regular-user", "admin-not-superuser", "admin-superuser"],
+    )
+    async def test_returns_user_response_for_valid_token(
+        self,
+        auth_service: AuthService,
+        role: UserRole,
+        is_superuser: bool,
+    ) -> None:
+        """Returns UserResponse with correct fields for valid tokens."""
+        fake_user = FakeUser(
+            user_id="uid-456",
+            username="validuser",
+            email="valid@example.com",
+            role=role,
+            is_superuser=is_superuser,
+        )
+
+        with patch("app.services.auth_service.security_service") as mock_security:
+            mock_security.get_current_user = AsyncMock(return_value=fake_user)
+            request = FakeRequest(cookies={"access_token": "valid-jwt-token"})
+
+            result = await auth_service.get_current_user(request)  # type: ignore
+
+            assert result.user_id == "uid-456"
+            assert result.username == "validuser"
+            assert result.email == "valid@example.com"
+            assert result.role == role
+            assert result.is_superuser == is_superuser
+            mock_security.get_current_user.assert_called_once_with(
+                "valid-jwt-token", auth_service.user_repo
+            )
+
+    async def test_propagates_security_service_exception(
+        self, auth_service: AuthService
+    ) -> None:
+        """Propagates exceptions from security_service.get_current_user."""
+        with patch("app.services.auth_service.security_service") as mock_security:
+            mock_security.get_current_user = AsyncMock(
+                side_effect=AuthenticationRequiredError("Invalid token")
+            )
+            request = FakeRequest(cookies={"access_token": "invalid-token"})
+
+            with pytest.raises(AuthenticationRequiredError):
+                await auth_service.get_current_user(request)  # type: ignore
+
+
+class TestGetAdmin:
+    """Tests for get_admin method."""
+
+    async def test_returns_admin_user(
+        self, auth_service: AuthService
+    ) -> None:
+        """Returns user when they have ADMIN role."""
+        fake_admin = FakeUser(
+            user_id="admin-789",
+            username="adminuser",
+            email="admin@example.com",
+            role=UserRole.ADMIN,
+        )
+
+        with patch("app.services.auth_service.security_service") as mock_security:
+            mock_security.get_current_user = AsyncMock(return_value=fake_admin)
+            request = FakeRequest(cookies={"access_token": "admin-token"})
+
+            result = await auth_service.get_admin(request)  # type: ignore
+
+            assert result.user_id == "admin-789"
+            assert result.role == UserRole.ADMIN
+
+    @pytest.mark.parametrize(
+        "role",
+        [UserRole.USER],
+        ids=["regular-user"],
+    )
+    async def test_raises_for_non_admin_role(
+        self,
+        auth_service: AuthService,
+        mock_logger: MagicMock,
+        role: UserRole,
+    ) -> None:
+        """Raises AdminAccessRequiredError for non-admin roles."""
+        fake_user = FakeUser(
+            user_id="user-123",
+            username="normaluser",
+            email="user@example.com",
+            role=role,
+        )
+
+        with patch("app.services.auth_service.security_service") as mock_security:
+            mock_security.get_current_user = AsyncMock(return_value=fake_user)
+            request = FakeRequest(cookies={"access_token": "user-token"})
+
+            with pytest.raises(AdminAccessRequiredError) as exc_info:
+                await auth_service.get_admin(request)  # type: ignore
+
+            assert "normaluser" in str(exc_info.value)
+            mock_logger.warning.assert_called_once()
+            assert "normaluser" in mock_logger.warning.call_args[0][0]
+
+    async def test_propagates_auth_error_from_get_current_user(
+        self, auth_service: AuthService
+    ) -> None:
+        """Propagates AuthenticationRequiredError from get_current_user."""
+        request = FakeRequest(cookies={})
+
+        with pytest.raises(AuthenticationRequiredError):
+            await auth_service.get_admin(request)  # type: ignore
+
+
+class TestAuthServiceEdgeCases:
+    """Edge case tests for AuthService."""
+
+    async def test_handles_none_in_cookies(
+        self, auth_service: AuthService
+    ) -> None:
+        """Handles request.cookies returning None-like values gracefully."""
+        request = MagicMock()
+        request.cookies.get.return_value = None
+
+        with pytest.raises(AuthenticationRequiredError):
+            await auth_service.get_current_user(request)
+
+    async def test_user_response_preserves_timestamps(
+        self, auth_service: AuthService
+    ) -> None:
+        """UserResponse includes created_at and updated_at from domain user."""
+        created = datetime(2024, 1, 1, 12, 0, 0, tzinfo=timezone.utc)
+        updated = datetime(2024, 6, 15, 18, 30, 0, tzinfo=timezone.utc)
+        fake_user = FakeUser()
+        fake_user.created_at = created
+        fake_user.updated_at = updated
+
+        with patch("app.services.auth_service.security_service") as mock_security:
+            mock_security.get_current_user = AsyncMock(return_value=fake_user)
+            request = FakeRequest(cookies={"access_token": "token"})
+
+            result = await auth_service.get_current_user(request)  # type: ignore
+
+            assert result.created_at == created
+            assert result.updated_at == updated
diff --git a/backend/tests/unit/services/coordinator/test_queue_manager.py b/backend/tests/unit/services/coordinator/test_queue_manager.py
index e3151a16..f62b2fb3 100644
--- a/backend/tests/unit/services/coordinator/test_queue_manager.py
+++ b/backend/tests/unit/services/coordinator/test_queue_manager.py
@@ -1,19 +1,20 @@
 import logging
+from typing import Any
 
 import pytest
-
 from app.services.coordinator.queue_manager import QueueManager, QueuePriority
+
 from tests.helpers import make_execution_requested_event
 
 _test_logger = logging.getLogger("test.services.coordinator.queue_manager")
 
 
-def ev(execution_id: str, priority: int = QueuePriority.NORMAL.value):
+def ev(execution_id: str, priority: int = QueuePriority.NORMAL.value) -> Any:
     return make_execution_requested_event(execution_id=execution_id, priority=priority)
 
 
 @pytest.mark.asyncio
-async def test_requeue_execution_increments_priority():
+async def test_requeue_execution_increments_priority() -> None:
     qm = QueueManager(max_queue_size=10, logger=_test_logger)
     await qm.start()
     # Use NORMAL priority which can be incremented to LOW
@@ -26,7 +27,7 @@ async def test_requeue_execution_increments_priority():
 
 
 @pytest.mark.asyncio
-async def test_queue_stats_empty_and_after_add():
+async def test_queue_stats_empty_and_after_add() -> None:
     qm = QueueManager(max_queue_size=5, logger=_test_logger)
     await qm.start()
     stats0 = await qm.get_queue_stats()
diff --git a/backend/tests/unit/services/coordinator/test_resource_manager.py b/backend/tests/unit/services/coordinator/test_resource_manager.py
index 5e1df687..1cea9f82 100644
--- a/backend/tests/unit/services/coordinator/test_resource_manager.py
+++ b/backend/tests/unit/services/coordinator/test_resource_manager.py
@@ -1,7 +1,6 @@
 import logging
 
 import pytest
-
 from app.services.coordinator.resource_manager import ResourceManager
 
 _test_logger = logging.getLogger("test.services.coordinator.resource_manager")
diff --git a/backend/tests/unit/services/grafana/__init__.py b/backend/tests/unit/services/grafana/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/backend/tests/unit/services/grafana/test_grafana_alert_processor.py b/backend/tests/unit/services/grafana/test_grafana_alert_processor.py
new file mode 100644
index 00000000..5b745e9a
--- /dev/null
+++ b/backend/tests/unit/services/grafana/test_grafana_alert_processor.py
@@ -0,0 +1,402 @@
+import logging
+from typing import Any
+from unittest.mock import AsyncMock, MagicMock
+
+import pytest
+from app.domain.enums.notification import NotificationSeverity
+from app.schemas_pydantic.grafana import GrafanaAlertItem, GrafanaWebhook
+from app.services.grafana_alert_processor import GrafanaAlertProcessor
+
+pytestmark = pytest.mark.unit
+
+
+@pytest.fixture
+def mock_notification_service() -> AsyncMock:
+    """Mock notification service."""
+    service = AsyncMock()
+    service.create_system_notification = AsyncMock()
+    return service
+
+
+@pytest.fixture
+def mock_logger() -> MagicMock:
+    """Mock logger."""
+    return MagicMock(spec=logging.Logger)
+
+
+@pytest.fixture
+def processor(
+    mock_notification_service: AsyncMock, mock_logger: MagicMock
+) -> GrafanaAlertProcessor:
+    """Create processor with mocked dependencies."""
+    return GrafanaAlertProcessor(
+        notification_service=mock_notification_service, logger=mock_logger
+    )
+
+
+class TestExtractSeverity:
+    """Tests for extract_severity class method."""
+
+    @pytest.mark.parametrize(
+        "alert_labels,webhook_labels,expected",
+        [
+            # Alert label takes precedence
+            ({"severity": "critical"}, {"severity": "warning"}, "critical"),
+            ({"severity": "ERROR"}, {}, "error"),  # Case insensitivity
+            # Webhook label used when alert has none
+            ({}, {"severity": "warning"}, "warning"),
+            ({"other": "value"}, {"severity": "info"}, "info"),
+            # Default when both empty
+            ({}, {}, "warning"),
+            (None, None, "warning"),
+        ],
+        ids=[
+            "alert-precedence",
+            "case-insensitive",
+            "webhook-fallback",
+            "webhook-with-other-labels",
+            "default-empty",
+            "default-none",
+        ],
+    )
+    def test_extract_severity_combinations(
+        self,
+        alert_labels: dict[str, str] | None,
+        webhook_labels: dict[str, str] | None,
+        expected: str,
+    ) -> None:
+        """Extracts severity from alert/webhook labels with correct precedence."""
+        alert = GrafanaAlertItem(labels=alert_labels or {})
+        webhook = GrafanaWebhook(commonLabels=webhook_labels or {})
+
+        result = GrafanaAlertProcessor.extract_severity(alert, webhook)
+
+        assert result == expected
+
+
+class TestMapSeverity:
+    """Tests for map_severity class method."""
+
+    @pytest.mark.parametrize(
+        "severity_str,alert_status,expected",
+        [
+            # Standard severity mapping
+            ("critical", None, NotificationSeverity.HIGH),
+            ("error", None, NotificationSeverity.HIGH),
+            ("warning", None, NotificationSeverity.MEDIUM),
+            ("info", None, NotificationSeverity.LOW),
+            # Unknown severity defaults to MEDIUM
+            ("unknown", None, NotificationSeverity.MEDIUM),
+            ("", None, NotificationSeverity.MEDIUM),
+            # Resolved statuses override to LOW
+            ("critical", "ok", NotificationSeverity.LOW),
+            ("critical", "resolved", NotificationSeverity.LOW),
+            ("error", "OK", NotificationSeverity.LOW),  # Case insensitivity
+            ("error", "RESOLVED", NotificationSeverity.LOW),
+            # Non-resolved statuses preserve severity
+            ("critical", "firing", NotificationSeverity.HIGH),
+            ("warning", "pending", NotificationSeverity.MEDIUM),
+        ],
+        ids=[
+            "critical-no-status",
+            "error-no-status",
+            "warning-no-status",
+            "info-no-status",
+            "unknown-default",
+            "empty-default",
+            "critical-ok-resolved",
+            "critical-resolved",
+            "error-OK-case",
+            "error-RESOLVED-case",
+            "critical-firing",
+            "warning-pending",
+        ],
+    )
+    def test_map_severity_combinations(
+        self,
+        severity_str: str,
+        alert_status: str | None,
+        expected: NotificationSeverity,
+    ) -> None:
+        """Maps string severity to enum with status consideration."""
+        result = GrafanaAlertProcessor.map_severity(severity_str, alert_status)
+        assert result == expected
+
+
+class TestExtractTitle:
+    """Tests for extract_title class method."""
+
+    @pytest.mark.parametrize(
+        "labels,annotations,expected",
+        [
+            # alertname in labels takes precedence
+            ({"alertname": "HighCPU"}, {"title": "CPU Alert"}, "HighCPU"),
+            ({"alertname": "DiskFull"}, {}, "DiskFull"),
+            # Title annotation as fallback
+            ({}, {"title": "Memory Warning"}, "Memory Warning"),
+            ({"other": "label"}, {"title": "Network Issue"}, "Network Issue"),
+            # Default when nothing found
+            ({}, {}, "Grafana Alert"),
+            (None, None, "Grafana Alert"),
+        ],
+        ids=[
+            "alertname-precedence",
+            "alertname-only",
+            "title-annotation",
+            "title-with-other-labels",
+            "default-empty",
+            "default-none",
+        ],
+    )
+    def test_extract_title_combinations(
+        self,
+        labels: dict[str, str] | None,
+        annotations: dict[str, str] | None,
+        expected: str,
+    ) -> None:
+        """Extracts title from labels/annotations with correct precedence."""
+        alert = GrafanaAlertItem(
+            labels=labels or {}, annotations=annotations or {}
+        )
+
+        result = GrafanaAlertProcessor.extract_title(alert)
+
+        assert result == expected
+
+
+class TestBuildMessage:
+    """Tests for build_message class method."""
+
+    @pytest.mark.parametrize(
+        "annotations,expected",
+        [
+            # Summary and description combined
+            (
+                {"summary": "High CPU usage", "description": "CPU at 95%"},
+                "High CPU usage\n\nCPU at 95%",
+            ),
+            # Summary only
+            ({"summary": "Disk space low"}, "Disk space low"),
+            # Description only
+            ({"description": "Memory threshold exceeded"}, "Memory threshold exceeded"),
+            # Empty annotations
+            ({}, "Alert triggered"),
+            (None, "Alert triggered"),
+            # Other annotations ignored
+            ({"other": "value"}, "Alert triggered"),
+        ],
+        ids=[
+            "summary-and-description",
+            "summary-only",
+            "description-only",
+            "empty-default",
+            "none-default",
+            "other-annotations-ignored",
+        ],
+    )
+    def test_build_message_combinations(
+        self,
+        annotations: dict[str, str] | None,
+        expected: str,
+    ) -> None:
+        """Builds message from annotations with correct formatting."""
+        alert = GrafanaAlertItem(annotations=annotations or {})
+
+        result = GrafanaAlertProcessor.build_message(alert)
+
+        assert result == expected
+
+
+class TestBuildMetadata:
+    """Tests for build_metadata class method."""
+
+    def test_includes_grafana_status_from_alert(self) -> None:
+        """Metadata includes status from alert when available."""
+        alert = GrafanaAlertItem(status="firing", labels={"env": "prod"})
+        webhook = GrafanaWebhook(status="alerting")
+
+        result = GrafanaAlertProcessor.build_metadata(alert, webhook, "critical")
+
+        assert result["grafana_status"] == "firing"
+        assert result["severity"] == "critical"
+
+    def test_falls_back_to_webhook_status(self) -> None:
+        """Metadata uses webhook status when alert status is None."""
+        alert = GrafanaAlertItem(status=None)
+        webhook = GrafanaWebhook(status="resolved")
+
+        result = GrafanaAlertProcessor.build_metadata(alert, webhook, "info")
+
+        assert result["grafana_status"] == "resolved"
+
+    def test_merges_labels_with_alert_precedence(self) -> None:
+        """Alert labels override webhook commonLabels."""
+        alert = GrafanaAlertItem(labels={"env": "staging", "team": "platform"})
+        webhook = GrafanaWebhook(commonLabels={"env": "prod", "region": "us-east"})
+
+        result = GrafanaAlertProcessor.build_metadata(alert, webhook, "warning")
+
+        assert result["env"] == "staging"  # Alert overrides webhook
+        assert result["team"] == "platform"  # Alert-only
+        assert result["region"] == "us-east"  # Webhook-only
+
+
+class TestProcessSingleAlert:
+    """Tests for process_single_alert method."""
+
+    async def test_successful_alert_processing(
+        self,
+        processor: GrafanaAlertProcessor,
+        mock_notification_service: AsyncMock,
+    ) -> None:
+        """Successfully processes alert and creates notification."""
+        alert = GrafanaAlertItem(
+            status="firing",
+            labels={"alertname": "TestAlert", "severity": "critical"},
+            annotations={"summary": "Test summary"},
+        )
+        webhook = GrafanaWebhook(status="alerting")
+
+        success, error = await processor.process_single_alert(
+            alert, webhook, "corr-123"
+        )
+
+        assert success is True
+        assert error is None
+        mock_notification_service.create_system_notification.assert_called_once()
+        call_kwargs = mock_notification_service.create_system_notification.call_args.kwargs
+        assert call_kwargs["title"] == "TestAlert"
+        assert call_kwargs["message"] == "Test summary"
+        assert call_kwargs["severity"] == NotificationSeverity.HIGH
+        assert "grafana" in call_kwargs["tags"]
+
+    async def test_handles_notification_service_error(
+        self,
+        processor: GrafanaAlertProcessor,
+        mock_notification_service: AsyncMock,
+        mock_logger: MagicMock,
+    ) -> None:
+        """Returns error tuple when notification service fails."""
+        mock_notification_service.create_system_notification.side_effect = Exception(
+            "DB connection failed"
+        )
+        alert = GrafanaAlertItem(labels={"alertname": "FailAlert"})
+        webhook = GrafanaWebhook()
+
+        success, error = await processor.process_single_alert(
+            alert, webhook, "corr-456"
+        )
+
+        assert success is False
+        assert error is not None
+        assert "Failed to process Grafana alert" in error
+        mock_logger.error.assert_called_once()
+
+
+class TestProcessWebhook:
+    """Tests for process_webhook method."""
+
+    async def test_processes_all_alerts_in_webhook(
+        self,
+        processor: GrafanaAlertProcessor,
+        mock_notification_service: AsyncMock,
+    ) -> None:
+        """Processes all alerts and returns correct count."""
+        webhook = GrafanaWebhook(
+            status="alerting",
+            alerts=[
+                GrafanaAlertItem(labels={"alertname": "Alert1"}),
+                GrafanaAlertItem(labels={"alertname": "Alert2"}),
+                GrafanaAlertItem(labels={"alertname": "Alert3"}),
+            ],
+        )
+
+        processed, errors = await processor.process_webhook(webhook, "corr-789")
+
+        assert processed == 3
+        assert errors == []
+        assert mock_notification_service.create_system_notification.call_count == 3
+
+    async def test_handles_empty_alerts_list(
+        self,
+        processor: GrafanaAlertProcessor,
+        mock_notification_service: AsyncMock,
+    ) -> None:
+        """Handles webhook with no alerts gracefully."""
+        webhook = GrafanaWebhook(status="resolved", alerts=[])
+
+        processed, errors = await processor.process_webhook(webhook, "corr-empty")
+
+        assert processed == 0
+        assert errors == []
+        mock_notification_service.create_system_notification.assert_not_called()
+
+    async def test_continues_on_individual_alert_failure(
+        self,
+        processor: GrafanaAlertProcessor,
+        mock_notification_service: AsyncMock,
+    ) -> None:
+        """Processes remaining alerts when one fails."""
+        call_count = 0
+
+        async def side_effect(*args: Any, **kwargs: Any) -> None:
+            nonlocal call_count
+            call_count += 1
+            if call_count == 2:
+                raise Exception("Second alert failed")
+
+        mock_notification_service.create_system_notification.side_effect = side_effect
+
+        webhook = GrafanaWebhook(
+            alerts=[
+                GrafanaAlertItem(labels={"alertname": "Alert1"}),
+                GrafanaAlertItem(labels={"alertname": "Alert2"}),
+                GrafanaAlertItem(labels={"alertname": "Alert3"}),
+            ]
+        )
+
+        processed, errors = await processor.process_webhook(webhook, "corr-partial")
+
+        assert processed == 2  # 1 and 3 succeeded
+        assert len(errors) == 1
+        assert "Second alert failed" in errors[0]
+
+    async def test_logs_webhook_processing_info(
+        self,
+        processor: GrafanaAlertProcessor,
+        mock_logger: MagicMock,
+    ) -> None:
+        """Logs processing start and completion."""
+        webhook = GrafanaWebhook(
+            status="firing",
+            alerts=[GrafanaAlertItem(labels={"alertname": "LogTest"})],
+        )
+
+        await processor.process_webhook(webhook, "corr-log")
+
+        # Should have at least 2 info logs: start and completion
+        assert mock_logger.info.call_count >= 2
+
+
+class TestClassConstants:
+    """Tests for class-level constants."""
+
+    def test_severity_mapping_completeness(self) -> None:
+        """SEVERITY_MAPPING covers expected severity strings."""
+        mapping = GrafanaAlertProcessor.SEVERITY_MAPPING
+        assert "critical" in mapping
+        assert "error" in mapping
+        assert "warning" in mapping
+        assert "info" in mapping
+
+    def test_resolved_statuses(self) -> None:
+        """RESOLVED_STATUSES contains expected values."""
+        statuses = GrafanaAlertProcessor.RESOLVED_STATUSES
+        assert "ok" in statuses
+        assert "resolved" in statuses
+
+    def test_default_values(self) -> None:
+        """Default constants have sensible values."""
+        assert GrafanaAlertProcessor.DEFAULT_SEVERITY == "warning"
+        assert GrafanaAlertProcessor.DEFAULT_TITLE == "Grafana Alert"
+        assert GrafanaAlertProcessor.DEFAULT_MESSAGE == "Alert triggered"
diff --git a/backend/tests/unit/services/idempotency/__init__.py b/backend/tests/unit/services/idempotency/__init__.py
index 05dd5682..62789346 100644
--- a/backend/tests/unit/services/idempotency/__init__.py
+++ b/backend/tests/unit/services/idempotency/__init__.py
@@ -1 +1 @@
-# Idempotency service unit tests
\ No newline at end of file
+# Idempotency service unit tests
diff --git a/backend/tests/unit/services/idempotency/test_idempotency_manager.py b/backend/tests/unit/services/idempotency/test_idempotency_manager.py
index df1b2092..62227363 100644
--- a/backend/tests/unit/services/idempotency/test_idempotency_manager.py
+++ b/backend/tests/unit/services/idempotency/test_idempotency_manager.py
@@ -1,15 +1,14 @@
 import logging
 from unittest.mock import MagicMock
-import pytest
 
+import pytest
 from app.infrastructure.kafka.events.base import BaseEvent
 from app.services.idempotency.idempotency_manager import (
     IdempotencyConfig,
-    IdempotencyManager,
     IdempotencyKeyStrategy,
+    IdempotencyManager,
 )
 
-
 pytestmark = pytest.mark.unit
 
 # Test logger
diff --git a/backend/tests/unit/services/idempotency/test_middleware.py b/backend/tests/unit/services/idempotency/test_middleware.py
index c4b19acf..4b1125e0 100644
--- a/backend/tests/unit/services/idempotency/test_middleware.py
+++ b/backend/tests/unit/services/idempotency/test_middleware.py
@@ -1,18 +1,13 @@
-import asyncio
 import logging
-from unittest.mock import AsyncMock, MagicMock, patch
-import pytest
+from unittest.mock import AsyncMock, MagicMock
 
+import pytest
+from app.domain.idempotency import IdempotencyStatus
 from app.infrastructure.kafka.events.base import BaseEvent
 from app.services.idempotency.idempotency_manager import IdempotencyManager, IdempotencyResult
 from app.services.idempotency.middleware import (
     IdempotentEventHandler,
-    idempotent_handler,
-    IdempotentConsumerWrapper,
 )
-from app.domain.idempotency import IdempotencyStatus
-from app.domain.enums.events import EventType
-from app.domain.enums.kafka import KafkaTopic
 
 _test_logger = logging.getLogger("test.services.idempotency.middleware")
 
@@ -22,24 +17,26 @@
 
 class TestIdempotentEventHandler:
     @pytest.fixture
-    def mock_idempotency_manager(self):
+    def mock_idempotency_manager(self) -> AsyncMock:
         return AsyncMock(spec=IdempotencyManager)
 
     @pytest.fixture
-    def mock_handler(self):
+    def mock_handler(self) -> AsyncMock:
         handler = AsyncMock()
         handler.__name__ = "test_handler"
         return handler
 
     @pytest.fixture
-    def event(self):
+    def event(self) -> MagicMock:
         event = MagicMock(spec=BaseEvent)
         event.event_type = "test.event"
         event.event_id = "event-123"
         return event
 
     @pytest.fixture
-    def idempotent_event_handler(self, mock_handler, mock_idempotency_manager):
+    def idempotent_event_handler(
+        self, mock_handler: AsyncMock, mock_idempotency_manager: AsyncMock
+    ) -> IdempotentEventHandler:
         return IdempotentEventHandler(
             handler=mock_handler,
             idempotency_manager=mock_idempotency_manager,
@@ -50,7 +47,9 @@ def idempotent_event_handler(self, mock_handler, mock_idempotency_manager):
         )
 
     @pytest.mark.asyncio
-    async def test_call_with_fields(self, mock_handler, mock_idempotency_manager, event):
+    async def test_call_with_fields(
+        self, mock_handler: AsyncMock, mock_idempotency_manager: AsyncMock, event: MagicMock
+    ) -> None:
         # Setup with specific fields
         fields = {"field1", "field2"}
 
@@ -83,7 +82,13 @@ async def test_call_with_fields(self, mock_handler, mock_idempotency_manager, ev
         )
 
     @pytest.mark.asyncio
-    async def test_call_handler_exception(self, idempotent_event_handler, mock_idempotency_manager, mock_handler, event):
+    async def test_call_handler_exception(
+        self,
+        idempotent_event_handler: IdempotentEventHandler,
+        mock_idempotency_manager: AsyncMock,
+        mock_handler: AsyncMock,
+        event: MagicMock,
+    ) -> None:
         # Setup: Handler raises exception
         idempotency_result = IdempotencyResult(
             is_duplicate=False,
diff --git a/backend/tests/unit/services/pod_monitor/test_config_and_init.py b/backend/tests/unit/services/pod_monitor/test_config_and_init.py
index 75723aea..66e8a89b 100644
--- a/backend/tests/unit/services/pod_monitor/test_config_and_init.py
+++ b/backend/tests/unit/services/pod_monitor/test_config_and_init.py
@@ -1,11 +1,8 @@
 import importlib
-import types
 
 import pytest
-
 from app.services.pod_monitor.config import PodMonitorConfig
 
-
 pytestmark = pytest.mark.unit
 
 
diff --git a/backend/tests/unit/services/pod_monitor/test_event_mapper.py b/backend/tests/unit/services/pod_monitor/test_event_mapper.py
index 48a36d4b..8fbd1bc7 100644
--- a/backend/tests/unit/services/pod_monitor/test_event_mapper.py
+++ b/backend/tests/unit/services/pod_monitor/test_event_mapper.py
@@ -1,91 +1,148 @@
 import json
 import logging
-import pytest
 
+import pytest
 from app.domain.enums.storage import ExecutionErrorType
 from app.infrastructure.kafka.events.metadata import AvroEventMetadata
 from app.services.pod_monitor.event_mapper import PodContext, PodEventMapper
+
 from tests.helpers.k8s_fakes import (
-    Meta,
-    Terminated,
-    Waiting,
-    State,
     ContainerStatus,
-    Spec,
-    Status,
-    Pod,
     FakeApi,
+    Pod,
+    State,
+    Terminated,
+    Waiting,
 )
 
-
 pytestmark = pytest.mark.unit
 
 _test_logger = logging.getLogger("test.services.pod_monitor.event_mapper")
 
 
-def _ctx(pod: Pod, event_type: str = "ADDED") -> PodContext:
-    return PodContext(pod=pod, execution_id="e1", metadata=AvroEventMetadata(service_name="t", service_version="1"), phase=pod.status.phase or "", event_type=event_type)
+# ===== Reusable test stubs =====
+
 
+class _Cond:
+    """Fake Kubernetes pod condition."""
 
-def test_pending_running_and_succeeded_mapping() -> None:
-    pem = PodEventMapper(k8s_api=FakeApi(json.dumps({"stdout": "ok", "stderr": "", "exit_code": 0, "resource_usage": {"execution_time_wall_seconds": 0, "cpu_time_jiffies": 0, "clk_tck_hertz": 0, "peak_memory_kb": 0}})), logger=_test_logger)
+    def __init__(self, condition_type: str, status: str) -> None:
+        self.type = condition_type
+        self.status = status
+
+
+class _API404(FakeApi):
+    """FakeApi that raises 404 on log read."""
+
+    async def read_namespaced_pod_log(
+        self, name: str, namespace: str, tail_lines: int = 10000  # noqa: ARG002
+    ) -> str:
+        raise Exception("404 Not Found")
+
+
+class _API400(FakeApi):
+    """FakeApi that raises 400 on log read."""
+
+    async def read_namespaced_pod_log(
+        self, name: str, namespace: str, tail_lines: int = 10000  # noqa: ARG002
+    ) -> str:
+        raise Exception("400 Bad Request")
+
+
+class _APIGenericError(FakeApi):
+    """FakeApi that raises generic error on log read."""
+
+    async def read_namespaced_pod_log(
+        self, name: str, namespace: str, tail_lines: int = 10000  # noqa: ARG002
+    ) -> str:
+        raise Exception("boom")
+
+
+def _ctx(pod: Pod, event_type: str = "ADDED") -> PodContext:
+    return PodContext(
+        pod=pod,
+        execution_id="e1",
+        metadata=AvroEventMetadata(service_name="t", service_version="1"),
+        phase=pod.status.phase or "",
+        event_type=event_type,
+    )
+
+
+@pytest.mark.asyncio
+async def test_pending_running_and_succeeded_mapping() -> None:
+    logs_json = json.dumps({
+        "stdout": "ok",
+        "stderr": "",
+        "exit_code": 0,
+        "resource_usage": {
+            "execution_time_wall_seconds": 0,
+            "cpu_time_jiffies": 0,
+            "clk_tck_hertz": 0,
+            "peak_memory_kb": 0,
+        },
+    })
+    pem = PodEventMapper(k8s_api=FakeApi(logs_json), logger=_test_logger)
 
     # Pending -> scheduled (set execution-id label and PodScheduled condition)
     pend = Pod("p", "Pending")
     pend.metadata.labels = {"execution-id": "e1"}
-    class Cond:
-        def __init__(self, t, s): self.type=t; self.status=s
-    pend.status.conditions = [Cond("PodScheduled", "True")]
+    pend.status.conditions = [_Cond("PodScheduled", "True")]
     pend.spec.node_name = "n"
-    evts = pem.map_pod_event(pend, "ADDED")
+    evts = await pem.map_pod_event(pend, "ADDED")
     assert any(e.event_type.value == "pod_scheduled" for e in evts)
 
     # Running -> running, includes container statuses JSON
     cs = [ContainerStatus(State(waiting=Waiting("Init"))), ContainerStatus(State(terminated=Terminated(2)))]
     run = Pod("p", "Running", cs=cs)
     run.metadata.labels = {"execution-id": "e1"}
-    evts = pem.map_pod_event(run, "MODIFIED")
+    evts = await pem.map_pod_event(run, "MODIFIED")
     # Print for debugging if test fails
     if not any(e.event_type.value == "pod_running" for e in evts):
         print(f"Events returned: {[e.event_type.value for e in evts]}")
     assert any(e.event_type.value == "pod_running" for e in evts)
     pr = [e for e in evts if e.event_type.value == "pod_running"][0]
-    statuses = json.loads(pr.container_statuses)
+    statuses = json.loads(pr.container_statuses)  # type: ignore[attr-defined]
     assert any("waiting" in s["state"] for s in statuses) and any("terminated" in s["state"] for s in statuses)
 
     # Succeeded -> completed; logs parsed JSON used
     term = ContainerStatus(State(terminated=Terminated(0)))
     suc = Pod("p", "Succeeded", cs=[term])
     suc.metadata.labels = {"execution-id": "e1"}
-    evts = pem.map_pod_event(suc, "MODIFIED")
+    evts = await pem.map_pod_event(suc, "MODIFIED")
     comp = [e for e in evts if e.event_type.value == "execution_completed"][0]
-    assert comp.exit_code == 0 and comp.stdout == "ok"
+    assert comp.exit_code == 0 and comp.stdout == "ok"  # type: ignore[attr-defined]
 
 
-def test_failed_timeout_and_deleted() -> None:
+@pytest.mark.asyncio
+async def test_failed_timeout_and_deleted() -> None:
     valid_logs = json.dumps({"stdout": "", "stderr": "", "exit_code": 137, "resource_usage": {}})
     pem = PodEventMapper(k8s_api=FakeApi(valid_logs), logger=_test_logger)
 
     # Timeout via DeadlineExceeded
-    pod_to = Pod("p", "Failed", cs=[ContainerStatus(State(terminated=Terminated(137)))], reason="DeadlineExceeded", adl=5)
+    pod_to = Pod(
+        "p", "Failed",
+        cs=[ContainerStatus(State(terminated=Terminated(137)))],
+        reason="DeadlineExceeded",
+        adl=5,
+    )
     pod_to.metadata.labels = {"execution-id": "e1"}
-    ev = pem.map_pod_event(pod_to, "MODIFIED")[0]
-    assert ev.event_type.value == "execution_timeout" and ev.timeout_seconds == 5
+    ev = (await pem.map_pod_event(pod_to, "MODIFIED"))[0]
+    assert ev.event_type.value == "execution_timeout" and ev.timeout_seconds == 5  # type: ignore[attr-defined]
 
     # Failed: terminated exit_code nonzero, message used as stderr, error type defaults to SCRIPT_ERROR
     # Note: ExecutionFailedEvent can have None resource_usage when logs extraction fails
     pem_no_logs = PodEventMapper(k8s_api=FakeApi(""), logger=_test_logger)
     pod_fail = Pod("p2", "Failed", cs=[ContainerStatus(State(terminated=Terminated(2, message="boom")))])
     pod_fail.metadata.labels = {"execution-id": "e2"}
-    evf = pem_no_logs.map_pod_event(pod_fail, "MODIFIED")[0]
-    assert evf.event_type.value == "execution_failed" and evf.error_type in {ExecutionErrorType.SCRIPT_ERROR}
+    evf = (await pem_no_logs.map_pod_event(pod_fail, "MODIFIED"))[0]
+    assert evf.event_type.value == "execution_failed" and evf.error_type in {ExecutionErrorType.SCRIPT_ERROR}  # type: ignore[attr-defined]
 
     # Deleted -> terminated when container terminated present (exit code 0 returns completed for DELETED)
     valid_logs_0 = json.dumps({"stdout": "", "stderr": "", "exit_code": 0, "resource_usage": {}})
     pem_completed = PodEventMapper(k8s_api=FakeApi(valid_logs_0), logger=_test_logger)
     pod_del = Pod("p3", "Failed", cs=[ContainerStatus(State(terminated=Terminated(0, reason="Completed")))])
     pod_del.metadata.labels = {"execution-id": "e3"}
-    evd = pem_completed.map_pod_event(pod_del, "DELETED")[0]
+    evd = (await pem_completed.map_pod_event(pod_del, "DELETED"))[0]
     # For DELETED event with exit code 0, it returns execution_completed, not pod_terminated
     assert evd.event_type.value == "execution_completed"
 
@@ -96,7 +153,7 @@ def test_extract_id_and_metadata_priority_and_duplicates() -> None:
     # From label
     p = Pod("any", "Pending")
     p.metadata.labels = {"execution-id": "L1", "user-id": "u", "correlation-id": "corrL"}
-    ctx = _ctx(p)
+    _ctx(p)  # validate context creation works
     md = pem._create_metadata(p)
     assert pem._extract_execution_id(p) == "L1" and md.user_id == "u" and md.correlation_id == "corrL"
 
@@ -117,49 +174,41 @@ def test_extract_id_and_metadata_priority_and_duplicates() -> None:
     assert pem._is_duplicate("n1", "Running") is True
 
 
-def test_scheduled_requires_condition() -> None:
-    class Cond:
-        def __init__(self, t, s): self.type=t; self.status=s
-
+@pytest.mark.asyncio
+async def test_scheduled_requires_condition() -> None:
     pem = PodEventMapper(k8s_api=FakeApi(""), logger=_test_logger)
     pod = Pod("p", "Pending")
     # No conditions -> None
-    assert pem._map_scheduled(_ctx(pod)) is None
+    assert await pem._map_scheduled(_ctx(pod)) is None
     # Wrong condition -> None
-    pod.status.conditions = [Cond("Ready", "True")]
-    assert pem._map_scheduled(_ctx(pod)) is None
+    pod.status.conditions = [_Cond("Ready", "True")]
+    assert await pem._map_scheduled(_ctx(pod)) is None
     # Correct -> event
-    pod.status.conditions = [Cond("PodScheduled", "True")]
+    pod.status.conditions = [_Cond("PodScheduled", "True")]
     pod.spec.node_name = "n"
-    assert pem._map_scheduled(_ctx(pod)) is not None
+    assert await pem._map_scheduled(_ctx(pod)) is not None
 
 
-def test_parse_and_log_paths_and_analyze_failure_variants(caplog) -> None:
+@pytest.mark.asyncio
+async def test_parse_and_log_paths_and_analyze_failure_variants(caplog: pytest.LogCaptureFixture) -> None:
     # _parse_executor_output line-by-line
     line_json = '{"stdout":"x","stderr":"","exit_code":3,"resource_usage":{}}'
     pem = PodEventMapper(k8s_api=FakeApi("junk\n" + line_json), logger=_test_logger)
     pod = Pod("p", "Succeeded", cs=[ContainerStatus(State(terminated=Terminated(0)))])
-    logs = pem._extract_logs(pod)
-    assert logs.exit_code == 3 and logs.stdout == "x"
+    logs = await pem._extract_logs(pod)
+    assert logs is not None and logs.exit_code == 3 and logs.stdout == "x"
 
     # _extract_logs: no api -> returns None
     pem2 = PodEventMapper(k8s_api=None, logger=_test_logger)
-    assert pem2._extract_logs(pod) is None
+    assert await pem2._extract_logs(pod) is None
 
     # _extract_logs exceptions -> 404/400/generic branches, all return None
-    class _API404(FakeApi):
-        def read_namespaced_pod_log(self, *a, **k): raise Exception("404 Not Found")
-    class _API400(FakeApi):
-        def read_namespaced_pod_log(self, *a, **k): raise Exception("400 Bad Request")
-    class _APIGen(FakeApi):
-        def read_namespaced_pod_log(self, *a, **k): raise Exception("boom")
-
     pem404 = PodEventMapper(k8s_api=_API404(""), logger=_test_logger)
-    assert pem404._extract_logs(pod) is None
+    assert await pem404._extract_logs(pod) is None
     pem400 = PodEventMapper(k8s_api=_API400(""), logger=_test_logger)
-    assert pem400._extract_logs(pod) is None
-    pemg = PodEventMapper(k8s_api=_APIGen(""), logger=_test_logger)
-    assert pemg._extract_logs(pod) is None
+    assert await pem400._extract_logs(pod) is None
+    pemg = PodEventMapper(k8s_api=_APIGenericError(""), logger=_test_logger)
+    assert await pemg._extract_logs(pod) is None
 
     # _analyze_failure: Evicted
     pod_e = Pod("p", "Failed")
@@ -180,7 +229,8 @@ def read_namespaced_pod_log(self, *a, **k): raise Exception("boom")
     assert pem._analyze_failure(pod_oom).error_type == ExecutionErrorType.RESOURCE_LIMIT
 
 
-def test_all_containers_succeeded_and_cache_behavior() -> None:
+@pytest.mark.asyncio
+async def test_all_containers_succeeded_and_cache_behavior() -> None:
     valid_logs = json.dumps({"stdout": "", "stderr": "", "exit_code": 0, "resource_usage": {}})
     pem = PodEventMapper(k8s_api=FakeApi(valid_logs), logger=_test_logger)
     term0 = ContainerStatus(State(terminated=Terminated(0)))
@@ -188,13 +238,13 @@ def test_all_containers_succeeded_and_cache_behavior() -> None:
     pod = Pod("p", "Failed", cs=[term0, term0b])
     pod.metadata.labels = {"execution-id": "e1"}
     # When all succeeded, failed mapping returns completed instead of failed
-    ev = pem.map_pod_event(pod, "MODIFIED")[0]
+    ev = (await pem.map_pod_event(pod, "MODIFIED"))[0]
     assert ev.event_type.value == "execution_completed"
 
     # Cache prevents duplicate for same phase unless event type changes
     p2 = Pod("p2", "Running")
-    a = pem.map_pod_event(p2, "ADDED")
-    b = pem.map_pod_event(p2, "MODIFIED")
+    a = await pem.map_pod_event(p2, "ADDED")
+    b = await pem.map_pod_event(p2, "MODIFIED")
     # First ADD should map; second MODIFIED with same phase might be filtered by cache → allow either empty or same
     assert a == [] or all(x.event_type for x in a)
     assert b == [] or all(x.event_type for x in b)
diff --git a/backend/tests/unit/services/pod_monitor/test_monitor.py b/backend/tests/unit/services/pod_monitor/test_monitor.py
index 1e6d5081..84d4f4cb 100644
--- a/backend/tests/unit/services/pod_monitor/test_monitor.py
+++ b/backend/tests/unit/services/pod_monitor/test_monitor.py
@@ -1,12 +1,14 @@
 import asyncio
 import logging
 import types
-from unittest.mock import MagicMock
+from typing import Any
+from unittest.mock import AsyncMock, MagicMock
 
 import pytest
 from app.core.k8s_clients import K8sClients
+from app.services.kafka_event_service import KafkaEventService
 from app.services.pod_monitor.config import PodMonitorConfig
-from app.services.pod_monitor.monitor import PodMonitor, create_pod_monitor
+from app.services.pod_monitor.monitor import PodMonitor, ReconciliationResult, create_pod_monitor
 
 from tests.helpers.k8s_fakes import FakeApi, make_pod, make_watch
 
@@ -16,48 +18,73 @@
 _test_logger = logging.getLogger("test.pod_monitor")
 
 
+def _make_kafka_service_mock() -> MagicMock:
+    """Create a properly typed mock for KafkaEventService."""
+    mock = MagicMock(spec=KafkaEventService)
+    mock.published_events = []
+
+    async def _publish(event: Any, key: Any = None) -> str:
+        mock.published_events.append((event, key))
+        return getattr(event, "event_id", "fake-id")
+
+    mock.publish_base_event = AsyncMock(side_effect=_publish)
+    return mock
+
+
 # ===== Shared stubs for k8s mocking =====
 
 
 class _Cfg:
     host = "https://k8s"
-    ssl_ca_cert = None
+    ssl_ca_cert: str | None = None
 
 
 class _K8sConfig:
-    def load_incluster_config(self):
+    def load_incluster_config(self) -> None:
         pass
 
-    def load_kube_config(self, config_file=None):
-        pass  # noqa: ARG002
+    async def load_kube_config(self, config_file: str | None = None) -> None:
+        pass
 
 
 class _Conf:
     @staticmethod
-    def get_default_copy():
+    def get_default_copy() -> _Cfg:
         return _Cfg()
 
 
-class _ApiClient:
-    def __init__(self, cfg):
-        pass  # noqa: ARG002
+class _FakeConfiguration:
+    """Fake configuration for kubernetes_asyncio."""
 
+    host = "https://k8s"
+    ssl_ca_cert: str | None = None
 
-class _Core:
-    def __init__(self, api):
-        pass  # noqa: ARG002
 
-    def get_api_resources(self):
-        return None
+class _ApiClient:
+    """Fake ApiClient for kubernetes_asyncio (used as context manager)."""
 
+    def __init__(self, cfg: Any = None) -> None:
+        self.configuration = _FakeConfiguration()
 
-class _Watch:
-    def __init__(self):
+    async def close(self) -> None:
         pass
 
-    def stop(self):
+
+class _Core:
+    """Fake CoreV1Api for kubernetes_asyncio with async methods."""
+
+    def __init__(self, api: Any = None) -> None:
         pass
 
+    async def get_api_resources(self) -> None:
+        return None
+
+    async def list_namespaced_pod(self, namespace: str, **kwargs: Any) -> Any:  # noqa: ARG002
+        class _PodList:
+            items: list[Any] = []
+
+        return _PodList()
+
 
 class _SpyMapper:
     def __init__(self) -> None:
@@ -68,54 +95,52 @@ def clear_cache(self) -> None:
 
 
 class _StubV1:
-    def get_api_resources(self):
-        return None
-
+    """Stub V1 API with async methods for kubernetes_asyncio."""
 
-class _StubWatch:
-    def stop(self):
+    async def get_api_resources(self) -> None:
         return None
 
+    async def list_namespaced_pod(self, namespace: str, **kwargs: Any) -> Any:  # noqa: ARG002
+        class _PodList:
+            items: list[Any] = []
 
-class _FakeKafkaEventService:
-    """Fake KafkaEventService for testing."""
+        return _PodList()
 
-    def __init__(self):
-        self.published_events = []
 
-    async def publish_base_event(self, event, key=None):
-        self.published_events.append((event, key))
-        return event.event_id if hasattr(event, "event_id") else "fake-id"
-
-
-def _patch_k8s(monkeypatch, k8s_config=None, conf=None, api_client=None, core=None, watch=None):
-    """Helper to patch k8s modules with defaults or custom stubs."""
+def _patch_k8s(
+    monkeypatch: pytest.MonkeyPatch,
+    k8s_config: Any = None,
+    api_client: Any = None,
+    core: Any = None,
+) -> None:
+    """Helper to patch k8s modules with defaults or custom stubs for kubernetes_asyncio."""
     monkeypatch.setattr("app.services.pod_monitor.monitor.k8s_config", k8s_config or _K8sConfig())
-    monkeypatch.setattr("app.services.pod_monitor.monitor.k8s_client.Configuration", conf or _Conf)
     monkeypatch.setattr("app.services.pod_monitor.monitor.k8s_client.ApiClient", api_client or _ApiClient)
     monkeypatch.setattr("app.services.pod_monitor.monitor.k8s_client.CoreV1Api", core or _Core)
-    monkeypatch.setattr("app.services.pod_monitor.monitor.watch", types.SimpleNamespace(Watch=watch or _Watch))
 
 
 # ===== Tests =====
 
 
 @pytest.mark.asyncio
-async def test_start_and_stop_lifecycle(monkeypatch) -> None:
+async def test_start_and_stop_lifecycle(monkeypatch: pytest.MonkeyPatch) -> None:
     cfg = PodMonitorConfig()
     cfg.enable_state_reconciliation = False
 
-    pm = PodMonitor(cfg, kafka_event_service=_FakeKafkaEventService(), logger=_test_logger)
-    pm._initialize_kubernetes_client = lambda: None
+    pm = PodMonitor(cfg, kafka_event_service=_make_kafka_service_mock(), logger=_test_logger)
+
+    async def _mock_init() -> None:
+        pm._api_client = _ApiClient()
+        pm._v1 = _StubV1()
+
+    pm._initialize_kubernetes_client = _mock_init  # type: ignore[method-assign]
     spy = _SpyMapper()
-    pm._event_mapper = spy
-    pm._v1 = _StubV1()
-    pm._watch = _StubWatch()
+    pm._event_mapper = spy  # type: ignore[assignment]
 
-    async def _quick_watch():
+    async def _quick_watch() -> None:
         return None
 
-    pm._watch_pods = _quick_watch
+    pm._watch_pods = _quick_watch  # type: ignore[method-assign]
 
     await pm.__aenter__()
     assert pm.state.name == "RUNNING"
@@ -124,33 +149,35 @@ async def _quick_watch():
     assert pm.state.name == "STOPPED" and spy.cleared is True
 
 
-def test_initialize_kubernetes_client_paths(monkeypatch) -> None:
+@pytest.mark.asyncio
+async def test_initialize_kubernetes_client_paths(monkeypatch: pytest.MonkeyPatch) -> None:
     cfg = PodMonitorConfig()
     _patch_k8s(monkeypatch)
 
-    pm = PodMonitor(cfg, kafka_event_service=_FakeKafkaEventService(), logger=_test_logger)
-    pm._initialize_kubernetes_client()
-    assert pm._v1 is not None and pm._watch is not None
+    pm = PodMonitor(cfg, kafka_event_service=_make_kafka_service_mock(), logger=_test_logger)
+    await pm._initialize_kubernetes_client()
+    assert pm._v1 is not None and pm._api_client is not None
 
 
 @pytest.mark.asyncio
-async def test_watch_pod_events_flow_and_publish(monkeypatch) -> None:
+async def test_watch_pod_events_flow_and_publish(monkeypatch: pytest.MonkeyPatch) -> None:
     cfg = PodMonitorConfig()
     cfg.enable_state_reconciliation = False
 
-    pm = PodMonitor(cfg, kafka_event_service=_FakeKafkaEventService(), logger=_test_logger)
+    pm = PodMonitor(cfg, kafka_event_service=_make_kafka_service_mock(), logger=_test_logger)
 
     from app.services.pod_monitor.event_mapper import PodEventMapper as PEM
 
     pm._event_mapper = PEM(k8s_api=FakeApi("{}"), logger=_test_logger)
 
-    class V1:
-        def list_namespaced_pod(self, **kwargs):  # noqa: ARG002
-            return None
-
-    pm._v1 = V1()
     pod = make_pod(name="p", phase="Succeeded", labels={"execution-id": "e1"}, term_exit=0, resource_version="rv1")
-    pm._watch = make_watch([{"type": "MODIFIED", "object": pod}], resource_version="rv2")
+    fake_watch = make_watch([{"type": "MODIFIED", "object": pod}], resource_version="rv2")
+
+    # Mock watch.Watch to return our fake async watch
+    monkeypatch.setattr("app.services.pod_monitor.monitor.watch.Watch", lambda: fake_watch)
+
+    # Set up a fake V1 API (won't be called since the watch returns events directly)
+    pm._v1 = _StubV1()
 
     pm._state = pm.state.__class__.RUNNING
     await pm._watch_pod_events()
@@ -158,9 +185,9 @@ def list_namespaced_pod(self, **kwargs):  # noqa: ARG002
 
 
 @pytest.mark.asyncio
-async def test_process_raw_event_invalid_and_handle_watch_error(monkeypatch) -> None:
+async def test_process_raw_event_invalid_and_handle_watch_error(monkeypatch: pytest.MonkeyPatch) -> None:
     cfg = PodMonitorConfig()
-    pm = PodMonitor(cfg, kafka_event_service=_FakeKafkaEventService(), logger=_test_logger)
+    pm = PodMonitor(cfg, kafka_event_service=_make_kafka_service_mock(), logger=_test_logger)
 
     await pm._process_raw_event({})
 
@@ -178,7 +205,7 @@ async def test_get_status() -> None:
     cfg.label_selector = "app=test"
     cfg.enable_state_reconciliation = True
 
-    pm = PodMonitor(cfg, kafka_event_service=_FakeKafkaEventService(), logger=_test_logger)
+    pm = PodMonitor(cfg, kafka_event_service=_make_kafka_service_mock(), logger=_test_logger)
     pm._tracked_pods = {"pod1", "pod2"}
     pm._reconnect_attempts = 3
     pm._last_resource_version = "v123"
@@ -194,32 +221,30 @@ async def test_get_status() -> None:
 
 
 @pytest.mark.asyncio
-async def test_reconciliation_loop_and_state(monkeypatch) -> None:
+async def test_reconciliation_loop_and_state(monkeypatch: pytest.MonkeyPatch) -> None:
     cfg = PodMonitorConfig()
     cfg.enable_state_reconciliation = True
-    cfg.reconcile_interval_seconds = 0.01
+    cfg.reconcile_interval_seconds = 0.01  # type: ignore[assignment]
 
-    pm = PodMonitor(cfg, kafka_event_service=_FakeKafkaEventService(), logger=_test_logger)
+    pm = PodMonitor(cfg, kafka_event_service=_make_kafka_service_mock(), logger=_test_logger)
     pm._state = pm.state.__class__.RUNNING
 
     reconcile_called = []
 
-    async def mock_reconcile():
+    async def mock_reconcile() -> ReconciliationResult:
         reconcile_called.append(True)
-        from app.services.pod_monitor.monitor import ReconciliationResult
-
         return ReconciliationResult(missing_pods={"p1"}, extra_pods={"p2"}, duration_seconds=0.1, success=True)
 
-    pm._reconcile_state = mock_reconcile
+    pm._reconcile_state = mock_reconcile  # type: ignore[method-assign]
 
     evt = asyncio.Event()
 
-    async def wrapped_reconcile():
+    async def wrapped_reconcile() -> ReconciliationResult:
         res = await mock_reconcile()
         evt.set()
         return res
 
-    pm._reconcile_state = wrapped_reconcile
+    pm._reconcile_state = wrapped_reconcile  # type: ignore[method-assign]
 
     task = asyncio.create_task(pm._reconciliation_loop())
     await asyncio.wait_for(evt.wait(), timeout=1.0)
@@ -232,14 +257,14 @@ async def wrapped_reconcile():
 
 
 @pytest.mark.asyncio
-async def test_reconcile_state_success(monkeypatch) -> None:
+async def test_reconcile_state_success(monkeypatch: pytest.MonkeyPatch) -> None:
     cfg = PodMonitorConfig()
     cfg.namespace = "test"
     cfg.label_selector = "app=test"
 
-    pm = PodMonitor(cfg, kafka_event_service=_FakeKafkaEventService(), logger=_test_logger)
+    pm = PodMonitor(cfg, kafka_event_service=_make_kafka_service_mock(), logger=_test_logger)
 
-    def sync_list(namespace, label_selector):  # noqa: ARG002
+    async def async_list(namespace: str, label_selector: str) -> types.SimpleNamespace:  # noqa: ARG002
         return types.SimpleNamespace(
             items=[
                 make_pod(name="pod1", phase="Running", resource_version="v1"),
@@ -247,15 +272,15 @@ def sync_list(namespace, label_selector):  # noqa: ARG002
             ]
         )
 
-    pm._v1 = types.SimpleNamespace(list_namespaced_pod=sync_list)
+    pm._v1 = types.SimpleNamespace(list_namespaced_pod=async_list)
     pm._tracked_pods = {"pod2", "pod3"}
 
     processed = []
 
-    async def mock_process(event):
+    async def mock_process(event: Any) -> None:
         processed.append(event.pod.metadata.name)
 
-    pm._process_pod_event = mock_process
+    pm._process_pod_event = mock_process  # type: ignore[method-assign]
 
     result = await pm._reconcile_state()
 
@@ -269,7 +294,7 @@ async def mock_process(event):
 @pytest.mark.asyncio
 async def test_reconcile_state_no_v1_api() -> None:
     cfg = PodMonitorConfig()
-    pm = PodMonitor(cfg, kafka_event_service=_FakeKafkaEventService(), logger=_test_logger)
+    pm = PodMonitor(cfg, kafka_event_service=_make_kafka_service_mock(), logger=_test_logger)
     pm._v1 = None
 
     result = await pm._reconcile_state()
@@ -280,17 +305,17 @@ async def test_reconcile_state_no_v1_api() -> None:
 @pytest.mark.asyncio
 async def test_reconcile_state_exception() -> None:
     cfg = PodMonitorConfig()
-    pm = PodMonitor(cfg, kafka_event_service=_FakeKafkaEventService(), logger=_test_logger)
+    pm = PodMonitor(cfg, kafka_event_service=_make_kafka_service_mock(), logger=_test_logger)
 
     class FailV1:
-        def list_namespaced_pod(self, *a, **k):
+        async def list_namespaced_pod(self, *a: Any, **k: Any) -> None:
             raise RuntimeError("API error")
 
     pm._v1 = FailV1()
 
     result = await pm._reconcile_state()
     assert result.success is False
-    assert "API error" in result.error
+    assert result.error is not None and "API error" in result.error
 
 
 @pytest.mark.asyncio
@@ -300,10 +325,10 @@ async def test_process_pod_event_full_flow() -> None:
     cfg = PodMonitorConfig()
     cfg.ignored_pod_phases = ["Unknown"]
 
-    pm = PodMonitor(cfg, kafka_event_service=_FakeKafkaEventService(), logger=_test_logger)
+    pm = PodMonitor(cfg, kafka_event_service=_make_kafka_service_mock(), logger=_test_logger)
 
     class MockMapper:
-        def map_pod_event(self, pod, event_type):
+        async def map_pod_event(self, pod: Any, event_type: Any) -> list[Any]:
             class Event:
                 event_type = types.SimpleNamespace(value="test_event")
                 metadata = types.SimpleNamespace(correlation_id=None)
@@ -311,14 +336,14 @@ class Event:
 
             return [Event()]
 
-    pm._event_mapper = MockMapper()
+    pm._event_mapper = MockMapper()  # type: ignore[assignment]
 
     published = []
 
-    async def mock_publish(event, pod):
+    async def mock_publish(event: Any, pod: Any) -> None:
         published.append(event)
 
-    pm._publish_event = mock_publish
+    pm._publish_event = mock_publish  # type: ignore[method-assign]
 
     event = PodEvent(
         event_type=WatchEventType.ADDED,
@@ -357,13 +382,13 @@ async def test_process_pod_event_exception_handling() -> None:
     from app.services.pod_monitor.monitor import PodEvent, WatchEventType
 
     cfg = PodMonitorConfig()
-    pm = PodMonitor(cfg, kafka_event_service=_FakeKafkaEventService(), logger=_test_logger)
+    pm = PodMonitor(cfg, kafka_event_service=_make_kafka_service_mock(), logger=_test_logger)
 
     class FailMapper:
-        def map_pod_event(self, pod, event_type):
+        async def map_pod_event(self, pod: Any, event_type: Any) -> list[Any]:
             raise RuntimeError("Mapping failed")
 
-    pm._event_mapper = FailMapper()
+    pm._event_mapper = FailMapper()  # type: ignore[assignment]
 
     event = PodEvent(
         event_type=WatchEventType.ADDED, pod=make_pod(name="fail-pod", phase="Pending"), resource_version=None
@@ -377,7 +402,7 @@ async def test_publish_event_full_flow() -> None:
     from app.domain.enums.events import EventType
 
     cfg = PodMonitorConfig()
-    fake_service = _FakeKafkaEventService()
+    fake_service = _make_kafka_service_mock()
     pm = PodMonitor(cfg, kafka_event_service=fake_service, logger=_test_logger)
 
     class Event:
@@ -388,7 +413,7 @@ class Event:
         event_id = "evt-123"
 
     pod = make_pod(name="test-pod", phase="Succeeded", labels={"execution-id": "exec1"})
-    await pm._publish_event(Event(), pod)
+    await pm._publish_event(Event(), pod)  # type: ignore[arg-type]
 
     assert len(fake_service.published_events) == 1
     assert fake_service.published_events[0][1] == "exec1"
@@ -401,10 +426,10 @@ async def test_publish_event_exception_handling() -> None:
     cfg = PodMonitorConfig()
 
     class FailingKafkaEventService:
-        async def publish_base_event(self, event, key=None):
+        async def publish_base_event(self, event: Any, key: Any = None) -> None:
             raise RuntimeError("Publish failed")
 
-    pm = PodMonitor(cfg, kafka_event_service=FailingKafkaEventService(), logger=_test_logger)
+    pm = PodMonitor(cfg, kafka_event_service=FailingKafkaEventService(), logger=_test_logger)  # type: ignore[arg-type]
 
     class Event:
         event_type = EventType.EXECUTION_STARTED
@@ -417,7 +442,7 @@ class Pod:
         status = None
 
     # Should not raise - errors are caught and logged
-    await pm._publish_event(Event(), Pod())
+    await pm._publish_event(Event(), Pod())  # type: ignore[arg-type]
 
 
 @pytest.mark.asyncio
@@ -425,7 +450,7 @@ async def test_handle_watch_error_max_attempts() -> None:
     cfg = PodMonitorConfig()
     cfg.max_reconnect_attempts = 2
 
-    pm = PodMonitor(cfg, kafka_event_service=_FakeKafkaEventService(), logger=_test_logger)
+    pm = PodMonitor(cfg, kafka_event_service=_make_kafka_service_mock(), logger=_test_logger)
     pm._state = pm.state.__class__.RUNNING
     pm._reconnect_attempts = 2
 
@@ -435,47 +460,47 @@ async def test_handle_watch_error_max_attempts() -> None:
 
 
 @pytest.mark.asyncio
-async def test_watch_pods_main_loop(monkeypatch) -> None:
+async def test_watch_pods_main_loop(monkeypatch: pytest.MonkeyPatch) -> None:
     cfg = PodMonitorConfig()
-    pm = PodMonitor(cfg, kafka_event_service=_FakeKafkaEventService(), logger=_test_logger)
+    pm = PodMonitor(cfg, kafka_event_service=_make_kafka_service_mock(), logger=_test_logger)
     pm._state = pm.state.__class__.RUNNING
 
     watch_count = []
 
-    async def mock_watch():
+    async def mock_watch() -> None:
         watch_count.append(1)
         if len(watch_count) > 2:
             pm._state = pm.state.__class__.STOPPED
 
-    async def mock_handle_error():
+    async def mock_handle_error() -> None:
         pass
 
-    pm._watch_pod_events = mock_watch
-    pm._handle_watch_error = mock_handle_error
+    pm._watch_pod_events = mock_watch  # type: ignore[method-assign]
+    pm._handle_watch_error = mock_handle_error  # type: ignore[method-assign]
 
     await pm._watch_pods()
     assert len(watch_count) > 2
 
 
 @pytest.mark.asyncio
-async def test_watch_pods_api_exception(monkeypatch) -> None:
-    from kubernetes.client.rest import ApiException
+async def test_watch_pods_api_exception(monkeypatch: pytest.MonkeyPatch) -> None:
+    from kubernetes_asyncio.client.exceptions import ApiException
 
     cfg = PodMonitorConfig()
-    pm = PodMonitor(cfg, kafka_event_service=_FakeKafkaEventService(), logger=_test_logger)
+    pm = PodMonitor(cfg, kafka_event_service=_make_kafka_service_mock(), logger=_test_logger)
     pm._state = pm.state.__class__.RUNNING
 
-    async def mock_watch():
+    async def mock_watch() -> None:
         raise ApiException(status=410)
 
     error_handled = []
 
-    async def mock_handle():
+    async def mock_handle() -> None:
         error_handled.append(True)
         pm._state = pm.state.__class__.STOPPED
 
-    pm._watch_pod_events = mock_watch
-    pm._handle_watch_error = mock_handle
+    pm._watch_pod_events = mock_watch  # type: ignore[method-assign]
+    pm._handle_watch_error = mock_handle  # type: ignore[method-assign]
 
     await pm._watch_pods()
 
@@ -484,74 +509,76 @@ async def mock_handle():
 
 
 @pytest.mark.asyncio
-async def test_watch_pods_generic_exception(monkeypatch) -> None:
+async def test_watch_pods_generic_exception(monkeypatch: pytest.MonkeyPatch) -> None:
     cfg = PodMonitorConfig()
-    pm = PodMonitor(cfg, kafka_event_service=_FakeKafkaEventService(), logger=_test_logger)
+    pm = PodMonitor(cfg, kafka_event_service=_make_kafka_service_mock(), logger=_test_logger)
     pm._state = pm.state.__class__.RUNNING
 
-    async def mock_watch():
+    async def mock_watch() -> None:
         raise RuntimeError("Unexpected error")
 
     error_handled = []
 
-    async def mock_handle():
+    async def mock_handle() -> None:
         error_handled.append(True)
         pm._state = pm.state.__class__.STOPPED
 
-    pm._watch_pod_events = mock_watch
-    pm._handle_watch_error = mock_handle
+    pm._watch_pod_events = mock_watch  # type: ignore[method-assign]
+    pm._handle_watch_error = mock_handle  # type: ignore[method-assign]
 
     await pm._watch_pods()
     assert len(error_handled) > 0
 
 
 @pytest.mark.asyncio
-async def test_create_pod_monitor_context_manager(monkeypatch) -> None:
+async def test_create_pod_monitor_context_manager(monkeypatch: pytest.MonkeyPatch) -> None:
     _patch_k8s(monkeypatch)
 
     cfg = PodMonitorConfig()
     cfg.enable_state_reconciliation = False
 
-    fake_service = _FakeKafkaEventService()
+    fake_service = _make_kafka_service_mock()
 
     async with create_pod_monitor(cfg, fake_service, _test_logger) as monitor:
-        assert monitor.state == monitor.state.__class__.RUNNING
+        assert monitor.state.name == "RUNNING"
 
-    assert monitor.state == monitor.state.__class__.STOPPED
+    assert monitor.state.name == "STOPPED"
 
 
 @pytest.mark.asyncio
-async def test_create_pod_monitor_with_injected_k8s_clients(monkeypatch) -> None:
+async def test_create_pod_monitor_with_injected_k8s_clients(monkeypatch: pytest.MonkeyPatch) -> None:
     """Test create_pod_monitor with injected K8sClients (DI path)."""
     _patch_k8s(monkeypatch)
 
     cfg = PodMonitorConfig()
     cfg.enable_state_reconciliation = False
 
-    fake_service = _FakeKafkaEventService()
+    fake_service = _make_kafka_service_mock()
 
     mock_v1 = MagicMock()
-    mock_v1.get_api_resources.return_value = None
+    mock_v1.get_api_resources = AsyncMock(return_value=None)  # Must be async for kubernetes_asyncio
+    mock_api_client = MagicMock()
+    mock_api_client.close = AsyncMock()  # Must be async for kubernetes_asyncio
     mock_k8s_clients = K8sClients(
-        api_client=MagicMock(),
+        api_client=mock_api_client,
         v1=mock_v1,
         apps_v1=MagicMock(),
         networking_v1=MagicMock(),
     )
 
     async with create_pod_monitor(cfg, fake_service, _test_logger, k8s_clients=mock_k8s_clients) as monitor:
-        assert monitor.state == monitor.state.__class__.RUNNING
+        assert monitor.state.name == "RUNNING"
         assert monitor._clients is mock_k8s_clients
         assert monitor._v1 is mock_v1
 
-    assert monitor.state == monitor.state.__class__.STOPPED
+    assert monitor.state.name == "STOPPED"
 
 
 @pytest.mark.asyncio
 async def test_start_already_running() -> None:
     """Test idempotent start via __aenter__."""
     cfg = PodMonitorConfig()
-    pm = PodMonitor(cfg, kafka_event_service=_FakeKafkaEventService(), logger=_test_logger)
+    pm = PodMonitor(cfg, kafka_event_service=_make_kafka_service_mock(), logger=_test_logger)
     # Simulate already started state
     pm._lifecycle_started = True
     pm._state = pm.state.__class__.RUNNING
@@ -564,7 +591,7 @@ async def test_start_already_running() -> None:
 async def test_stop_already_stopped() -> None:
     """Test idempotent stop via aclose()."""
     cfg = PodMonitorConfig()
-    pm = PodMonitor(cfg, kafka_event_service=_FakeKafkaEventService(), logger=_test_logger)
+    pm = PodMonitor(cfg, kafka_event_service=_make_kafka_service_mock(), logger=_test_logger)
     pm._state = pm.state.__class__.STOPPED
     # Not started, so aclose should be a no-op
 
@@ -575,16 +602,16 @@ async def test_stop_already_stopped() -> None:
 async def test_stop_with_tasks() -> None:
     """Test cleanup of tasks on aclose()."""
     cfg = PodMonitorConfig()
-    pm = PodMonitor(cfg, kafka_event_service=_FakeKafkaEventService(), logger=_test_logger)
+    pm = PodMonitor(cfg, kafka_event_service=_make_kafka_service_mock(), logger=_test_logger)
     pm._state = pm.state.__class__.RUNNING
     pm._lifecycle_started = True  # Simulate started state
 
-    async def dummy_task():
+    async def dummy_task() -> None:
         await asyncio.Event().wait()
 
     pm._watch_task = asyncio.create_task(dummy_task())
     pm._reconcile_task = asyncio.create_task(dummy_task())
-    pm._watch = _StubWatch()
+    pm._api_client = _ApiClient()
     pm._tracked_pods = {"pod1"}
 
     await pm.aclose()
@@ -593,33 +620,21 @@ async def dummy_task():
     assert len(pm._tracked_pods) == 0
 
 
-def test_update_resource_version() -> None:
-    cfg = PodMonitorConfig()
-    pm = PodMonitor(cfg, kafka_event_service=_FakeKafkaEventService(), logger=_test_logger)
-
-    class Stream:
-        _stop_event = types.SimpleNamespace(resource_version="v123")
-
-    pm._update_resource_version(Stream())
-    assert pm._last_resource_version == "v123"
-
-    class BadStream:
-        pass
-
-    pm._update_resource_version(BadStream())
+# NOTE: test_update_resource_version removed - method no longer exists
+# Resource version is now tracked internally by kubernetes_asyncio Watch
 
 
 @pytest.mark.asyncio
 async def test_process_raw_event_with_metadata() -> None:
     cfg = PodMonitorConfig()
-    pm = PodMonitor(cfg, kafka_event_service=_FakeKafkaEventService(), logger=_test_logger)
+    pm = PodMonitor(cfg, kafka_event_service=_make_kafka_service_mock(), logger=_test_logger)
 
     processed = []
 
-    async def mock_process(event):
+    async def mock_process(event: Any) -> None:
         processed.append(event)
 
-    pm._process_pod_event = mock_process
+    pm._process_pod_event = mock_process  # type: ignore[method-assign]
 
     raw_event = {
         "type": "ADDED",
@@ -637,162 +652,175 @@ async def mock_process(event):
     assert processed[1].resource_version is None
 
 
-def test_initialize_kubernetes_client_in_cluster(monkeypatch) -> None:
+@pytest.mark.asyncio
+async def test_initialize_kubernetes_client_in_cluster(monkeypatch: pytest.MonkeyPatch) -> None:
     cfg = PodMonitorConfig()
     cfg.in_cluster = True
 
-    load_incluster_called = []
+    load_incluster_called: list[bool] = []
 
     class TrackingK8sConfig:
-        def load_incluster_config(self):
+        def load_incluster_config(self) -> None:
             load_incluster_called.append(True)
 
-        def load_kube_config(self, config_file=None):
-            pass  # noqa: ARG002
+        async def load_kube_config(self, config_file: str | None = None) -> None:  # noqa: ARG002
+            pass
 
     _patch_k8s(monkeypatch, k8s_config=TrackingK8sConfig())
 
-    pm = PodMonitor(cfg, kafka_event_service=_FakeKafkaEventService(), logger=_test_logger)
-    pm._initialize_kubernetes_client()
+    pm = PodMonitor(cfg, kafka_event_service=_make_kafka_service_mock(), logger=_test_logger)
+    await pm._initialize_kubernetes_client()
 
     assert len(load_incluster_called) == 1
 
 
-def test_initialize_kubernetes_client_with_kubeconfig_path(monkeypatch) -> None:
+@pytest.mark.asyncio
+async def test_initialize_kubernetes_client_with_kubeconfig_path(monkeypatch: pytest.MonkeyPatch) -> None:
     cfg = PodMonitorConfig()
     cfg.in_cluster = False
     cfg.kubeconfig_path = "/custom/kubeconfig"
 
-    load_kube_called_with = []
+    load_kube_called_with: list[str | None] = []
 
     class TrackingK8sConfig:
-        def load_incluster_config(self):
+        def load_incluster_config(self) -> None:
             pass
 
-        def load_kube_config(self, config_file=None):
+        async def load_kube_config(self, config_file: str | None = None) -> None:
             load_kube_called_with.append(config_file)
 
     class ConfWithCert:
         @staticmethod
-        def get_default_copy():
+        def get_default_copy() -> types.SimpleNamespace:
             return types.SimpleNamespace(host="https://k8s", ssl_ca_cert="cert")
 
-    _patch_k8s(monkeypatch, k8s_config=TrackingK8sConfig(), conf=ConfWithCert)
+    _patch_k8s(monkeypatch, k8s_config=TrackingK8sConfig())
 
-    pm = PodMonitor(cfg, kafka_event_service=_FakeKafkaEventService(), logger=_test_logger)
-    pm._initialize_kubernetes_client()
+    pm = PodMonitor(cfg, kafka_event_service=_make_kafka_service_mock(), logger=_test_logger)
+    await pm._initialize_kubernetes_client()
 
     assert load_kube_called_with == ["/custom/kubeconfig"]
 
 
-def test_initialize_kubernetes_client_exception(monkeypatch) -> None:
+@pytest.mark.asyncio
+async def test_initialize_kubernetes_client_exception(monkeypatch: pytest.MonkeyPatch) -> None:
     cfg = PodMonitorConfig()
 
     class FailingK8sConfig:
-        def load_kube_config(self, config_file=None):
+        async def load_kube_config(self, config_file: str | None = None) -> None:  # noqa: ARG002
             raise Exception("K8s config error")
 
     monkeypatch.setattr("app.services.pod_monitor.monitor.k8s_config", FailingK8sConfig())
 
-    pm = PodMonitor(cfg, kafka_event_service=_FakeKafkaEventService(), logger=_test_logger)
+    pm = PodMonitor(cfg, kafka_event_service=_make_kafka_service_mock(), logger=_test_logger)
 
     with pytest.raises(Exception) as exc_info:
-        pm._initialize_kubernetes_client()
+        await pm._initialize_kubernetes_client()
 
     assert "K8s config error" in str(exc_info.value)
 
 
 @pytest.mark.asyncio
-async def test_watch_pods_api_exception_other_status(monkeypatch) -> None:
-    from kubernetes.client.rest import ApiException
+async def test_watch_pods_api_exception_other_status(monkeypatch: pytest.MonkeyPatch) -> None:
+    from kubernetes_asyncio.client.exceptions import ApiException
 
     cfg = PodMonitorConfig()
-    pm = PodMonitor(cfg, kafka_event_service=_FakeKafkaEventService(), logger=_test_logger)
+    pm = PodMonitor(cfg, kafka_event_service=_make_kafka_service_mock(), logger=_test_logger)
     pm._state = pm.state.__class__.RUNNING
 
-    async def mock_watch():
+    async def mock_watch() -> None:
         raise ApiException(status=500)
 
-    error_handled = []
+    error_handled: list[bool] = []
 
-    async def mock_handle():
+    async def mock_handle() -> None:
         error_handled.append(True)
         pm._state = pm.state.__class__.STOPPED
 
-    pm._watch_pod_events = mock_watch
-    pm._handle_watch_error = mock_handle
+    pm._watch_pod_events = mock_watch  # type: ignore[method-assign]
+    pm._handle_watch_error = mock_handle  # type: ignore[method-assign]
 
     await pm._watch_pods()
     assert len(error_handled) > 0
 
 
 @pytest.mark.asyncio
-async def test_watch_pod_events_no_watch_or_v1() -> None:
+async def test_watch_pod_events_no_v1_api() -> None:
+    """Test that _watch_pod_events raises RuntimeError when _v1 is None."""
     cfg = PodMonitorConfig()
-    pm = PodMonitor(cfg, kafka_event_service=_FakeKafkaEventService(), logger=_test_logger)
+    pm = PodMonitor(cfg, kafka_event_service=_make_kafka_service_mock(), logger=_test_logger)
 
-    pm._watch = None
-    pm._v1 = _StubV1()
-
-    with pytest.raises(RuntimeError) as exc_info:
-        await pm._watch_pod_events()
-
-    assert "Watch or API not initialized" in str(exc_info.value)
-
-    pm._watch = _StubWatch()
+    # _v1 is None by default after construction
     pm._v1 = None
 
     with pytest.raises(RuntimeError) as exc_info:
         await pm._watch_pod_events()
 
-    assert "Watch or API not initialized" in str(exc_info.value)
+    assert "API not initialized" in str(exc_info.value)
 
 
 @pytest.mark.asyncio
-async def test_watch_pod_events_with_field_selector() -> None:
+async def test_watch_pod_events_with_field_selector(monkeypatch: pytest.MonkeyPatch) -> None:
+    """Test that field_selector is passed to the watch stream."""
     cfg = PodMonitorConfig()
     cfg.field_selector = "status.phase=Running"
     cfg.enable_state_reconciliation = False
 
-    pm = PodMonitor(cfg, kafka_event_service=_FakeKafkaEventService(), logger=_test_logger)
+    pm = PodMonitor(cfg, kafka_event_service=_make_kafka_service_mock(), logger=_test_logger)
+
+    stream_kwargs: list[dict[str, Any]] = []
+
+    class _CapturingWatch:
+        """Async watch that captures kwargs passed to stream."""
+
+        resource_version: str | None = None
+        _stopped = False
+
+        def stream(self, func: Any, **kwargs: Any) -> "_CapturingWatch":
+            stream_kwargs.append(kwargs)
+            return self
 
-    watch_kwargs = []
+        def __aiter__(self) -> "_CapturingWatch":
+            return self
 
-    class V1:
-        def list_namespaced_pod(self, **kwargs):
-            watch_kwargs.append(kwargs)
-            return None
+        async def __anext__(self) -> dict[str, Any]:
+            # Stop after capturing kwargs
+            raise StopAsyncIteration
 
-    class Watch:
-        def stream(self, func, **kwargs):
-            watch_kwargs.append(kwargs)
-            return []
+        def stop(self) -> None:
+            self._stopped = True
 
-    pm._v1 = V1()
-    pm._watch = Watch()
+        async def close(self) -> None:
+            pass
+
+    monkeypatch.setattr("app.services.pod_monitor.monitor.watch.Watch", _CapturingWatch)
+
+    pm._v1 = _StubV1()
     pm._state = pm.state.__class__.RUNNING
 
     await pm._watch_pod_events()
 
-    assert any("field_selector" in kw for kw in watch_kwargs)
+    assert len(stream_kwargs) > 0
+    assert any("field_selector" in kw for kw in stream_kwargs)
+    assert stream_kwargs[0].get("field_selector") == "status.phase=Running"
 
 
 @pytest.mark.asyncio
 async def test_reconciliation_loop_exception() -> None:
     cfg = PodMonitorConfig()
     cfg.enable_state_reconciliation = True
-    cfg.reconcile_interval_seconds = 0.01
+    cfg.reconcile_interval_seconds = 0.01  # type: ignore[assignment]
 
-    pm = PodMonitor(cfg, kafka_event_service=_FakeKafkaEventService(), logger=_test_logger)
+    pm = PodMonitor(cfg, kafka_event_service=_make_kafka_service_mock(), logger=_test_logger)
     pm._state = pm.state.__class__.RUNNING
 
     hit = asyncio.Event()
 
-    async def raising():
+    async def raising() -> ReconciliationResult:
         hit.set()
         raise RuntimeError("Reconcile error")
 
-    pm._reconcile_state = raising
+    pm._reconcile_state = raising  # type: ignore[method-assign]
 
     task = asyncio.create_task(pm._reconciliation_loop())
     await asyncio.wait_for(hit.wait(), timeout=1.0)
@@ -804,22 +832,26 @@ async def raising():
 
 @pytest.mark.asyncio
 async def test_start_with_reconciliation() -> None:
+    """Test that reconciliation task is started when enabled."""
     cfg = PodMonitorConfig()
     cfg.enable_state_reconciliation = True
 
-    pm = PodMonitor(cfg, kafka_event_service=_FakeKafkaEventService(), logger=_test_logger)
-    pm._initialize_kubernetes_client = lambda: None
-    pm._v1 = _StubV1()
-    pm._watch = _StubWatch()
+    pm = PodMonitor(cfg, kafka_event_service=_make_kafka_service_mock(), logger=_test_logger)
+
+    async def mock_init() -> None:
+        pm._api_client = _ApiClient()
+        pm._v1 = _StubV1()
+
+    pm._initialize_kubernetes_client = mock_init  # type: ignore[method-assign]
 
-    async def mock_watch():
+    async def mock_watch() -> None:
         return None
 
-    async def mock_reconcile():
+    async def mock_reconcile() -> None:
         return None
 
-    pm._watch_pods = mock_watch
-    pm._reconciliation_loop = mock_reconcile
+    pm._watch_pods = mock_watch  # type: ignore[method-assign]
+    pm._reconciliation_loop = mock_reconcile  # type: ignore[method-assign]
 
     await pm.__aenter__()
     assert pm._watch_task is not None
diff --git a/backend/tests/unit/services/result_processor/__init__.py b/backend/tests/unit/services/result_processor/__init__.py
index 27a3238d..07d245cf 100644
--- a/backend/tests/unit/services/result_processor/__init__.py
+++ b/backend/tests/unit/services/result_processor/__init__.py
@@ -1 +1 @@
-# Result processor unit tests
\ No newline at end of file
+# Result processor unit tests
diff --git a/backend/tests/unit/services/result_processor/test_processor.py b/backend/tests/unit/services/result_processor/test_processor.py
index 26ef9fdd..79410f7e 100644
--- a/backend/tests/unit/services/result_processor/test_processor.py
+++ b/backend/tests/unit/services/result_processor/test_processor.py
@@ -12,7 +12,7 @@
 
 
 class TestResultProcessorConfig:
-    def test_default_values(self):
+    def test_default_values(self) -> None:
         config = ResultProcessorConfig()
         assert config.consumer_group == GroupId.RESULT_PROCESSOR
         assert KafkaTopic.EXECUTION_COMPLETED in config.topics
@@ -22,13 +22,13 @@ def test_default_values(self):
         assert config.batch_size == 10
         assert config.processing_timeout == 300
 
-    def test_custom_values(self):
+    def test_custom_values(self) -> None:
         config = ResultProcessorConfig(batch_size=20, processing_timeout=600)
         assert config.batch_size == 20
         assert config.processing_timeout == 600
 
 
-def test_create_dispatcher_registers_handlers():
+def test_create_dispatcher_registers_handlers() -> None:
     rp = ResultProcessor(
         execution_repo=MagicMock(),
         producer=MagicMock(),
diff --git a/backend/tests/unit/services/saga/test_execution_saga_steps.py b/backend/tests/unit/services/saga/test_execution_saga_steps.py
index ee57f431..02ceae88 100644
--- a/backend/tests/unit/services/saga/test_execution_saga_steps.py
+++ b/backend/tests/unit/services/saga/test_execution_saga_steps.py
@@ -1,23 +1,25 @@
-import pytest
+from typing import Any
 
+import pytest
 from app.domain.saga import DomainResourceAllocation
 from app.services.saga.execution_saga import (
-    ValidateExecutionStep,
     AllocateResourcesStep,
-    QueueExecutionStep,
     CreatePodStep,
+    DeletePodCompensation,
     MonitorExecutionStep,
+    QueueExecutionStep,
     ReleaseResourcesCompensation,
-    DeletePodCompensation,
+    RemoveFromQueueCompensation,
+    ValidateExecutionStep,
 )
 from app.services.saga.saga_step import SagaContext
-from tests.helpers import make_execution_requested_event
 
+from tests.helpers import make_execution_requested_event
 
 pytestmark = pytest.mark.unit
 
 
-def _req(timeout: int = 30, script: str = "print('x')"):
+def _req(timeout: int = 30, script: str = "print('x')") -> Any:
     return make_execution_requested_event(execution_id="e1", script=script, timeout_seconds=timeout)
 
 
@@ -48,7 +50,7 @@ def __init__(self, active: int = 0, alloc_id: str = "alloc-1") -> None:
     async def count_active(self, language: str) -> int:  # noqa: ARG002
         return self.active
 
-    async def create_allocation(self, create_data) -> DomainResourceAllocation:  # noqa: ARG002
+    async def create_allocation(self, create_data: Any) -> DomainResourceAllocation:  # noqa: ARG002
         return DomainResourceAllocation(
             allocation_id=self.alloc_id,
             execution_id=create_data.execution_id,
@@ -67,13 +69,13 @@ async def release_allocation(self, allocation_id: str) -> None:
 async def test_allocate_resources_step_paths() -> None:
     ctx = SagaContext("s1", "e1")
     ctx.set("execution_id", "e1")
-    ok = await AllocateResourcesStep(alloc_repo=_FakeAllocRepo(active=0, alloc_id="alloc-1")).execute(ctx, _req())
+    ok = await AllocateResourcesStep(alloc_repo=_FakeAllocRepo(active=0, alloc_id="alloc-1")).execute(ctx, _req())  # type: ignore[arg-type]
     assert ok is True and ctx.get("resources_allocated") is True and ctx.get("allocation_id") == "alloc-1"
 
     # Limit exceeded
     ctx2 = SagaContext("s2", "e2")
     ctx2.set("execution_id", "e2")
-    ok2 = await AllocateResourcesStep(alloc_repo=_FakeAllocRepo(active=100)).execute(ctx2, _req())
+    ok2 = await AllocateResourcesStep(alloc_repo=_FakeAllocRepo(active=100)).execute(ctx2, _req())  # type: ignore[arg-type]
     assert ok2 is False
 
     # Missing repo
@@ -95,7 +97,7 @@ async def test_queue_and_monitor_steps() -> None:
 
     # Force exceptions to exercise except paths
     class _Ctx(SagaContext):
-        def set(self, key, value):  # type: ignore[override]
+        def set(self, key: str, value: Any) -> None:
             raise RuntimeError("boom")
     bad = _Ctx("s", "e")
     assert await QueueExecutionStep().execute(bad, _req()) is False
@@ -106,7 +108,7 @@ class _FakeProducer:
     def __init__(self) -> None:
         self.events: list[object] = []
 
-    async def produce(self, event_to_produce, key: str | None = None):  # noqa: ARG002
+    async def produce(self, event_to_produce: Any, key: str | None = None) -> None:  # noqa: ARG002
         self.events.append(event_to_produce)
 
 
@@ -123,7 +125,7 @@ async def test_create_pod_step_publish_flag_and_compensation() -> None:
     ctx2 = SagaContext("s2", "e2")
     ctx2.set("execution_id", "e2")
     prod = _FakeProducer()
-    s2 = CreatePodStep(producer=prod, publish_commands=True)
+    s2 = CreatePodStep(producer=prod, publish_commands=True)  # type: ignore[arg-type]
     ok2 = await s2.execute(ctx2, _req())
     assert ok2 is True and ctx2.get("pod_creation_triggered") is True and prod.events
 
@@ -135,7 +137,7 @@ async def test_create_pod_step_publish_flag_and_compensation() -> None:
     assert ok3 is False and ctx3.error is not None
 
     # DeletePod compensation triggers only when flagged and producer exists
-    comp = DeletePodCompensation(producer=prod)
+    comp = DeletePodCompensation(producer=prod)  # type: ignore[arg-type]
     ctx2.set("pod_creation_triggered", True)
     assert await comp.compensate(ctx2) is True
 
@@ -143,7 +145,7 @@ async def test_create_pod_step_publish_flag_and_compensation() -> None:
 @pytest.mark.asyncio
 async def test_release_resources_compensation() -> None:
     repo = _FakeAllocRepo()
-    comp = ReleaseResourcesCompensation(alloc_repo=repo)
+    comp = ReleaseResourcesCompensation(alloc_repo=repo)  # type: ignore[arg-type]
     ctx = SagaContext("s1", "e1")
     ctx.set("allocation_id", "alloc-1")
     assert await comp.compensate(ctx) is True and repo.released == ["alloc-1"]
@@ -153,7 +155,7 @@ async def test_release_resources_compensation() -> None:
     assert await comp2.compensate(ctx) is False
     # Missing allocation_id -> True short-circuit
     ctx2 = SagaContext("sX", "eX")
-    assert await ReleaseResourcesCompensation(alloc_repo=repo).compensate(ctx2) is True
+    assert await ReleaseResourcesCompensation(alloc_repo=repo).compensate(ctx2) is True  # type: ignore[arg-type]
 
 
 @pytest.mark.asyncio
@@ -172,27 +174,27 @@ async def test_delete_pod_compensation_variants() -> None:
 
     # Exercise get_compensation methods return types (coverage for lines returning comps/None)
     assert ValidateExecutionStep().get_compensation() is None
-    assert isinstance(AllocateResourcesStep(_FakeAllocRepo()).get_compensation(), ReleaseResourcesCompensation)
-    assert isinstance(QueueExecutionStep().get_compensation(), type(DeletePodCompensation(None)).__bases__[0]) or True
-    assert CreatePodStep(None, publish_commands=False).get_compensation() is not None
+    assert isinstance(AllocateResourcesStep(_FakeAllocRepo()).get_compensation(), ReleaseResourcesCompensation)  # type: ignore[arg-type]
+    assert isinstance(QueueExecutionStep().get_compensation(), RemoveFromQueueCompensation)
+    assert isinstance(CreatePodStep(None, publish_commands=False).get_compensation(), DeletePodCompensation)
     assert MonitorExecutionStep().get_compensation() is None
 
 
 def test_execution_saga_bind_and_get_steps_sets_flags_and_types() -> None:
     # Dummy subclasses to satisfy isinstance checks without real deps
-    from app.events.core import UnifiedProducer
     from app.db.repositories.resource_allocation_repository import ResourceAllocationRepository
+    from app.events.core import UnifiedProducer
 
     class DummyProd(UnifiedProducer):
-        def __init__(self): pass  # type: ignore[no-untyped-def]
+        def __init__(self) -> None: pass
 
     class DummyAlloc(ResourceAllocationRepository):
-        def __init__(self): pass  # type: ignore[no-untyped-def]
+        def __init__(self) -> None: pass
 
-    from app.services.saga.execution_saga import ExecutionSaga, CreatePodStep
+    from app.services.saga.execution_saga import CreatePodStep, ExecutionSaga
     s = ExecutionSaga()
     s.bind_dependencies(producer=DummyProd(), alloc_repo=DummyAlloc(), publish_commands=True)
     steps = s.get_steps()
     # CreatePod step should be configured and present
     cps = [st for st in steps if isinstance(st, CreatePodStep)][0]
-    assert getattr(cps, "publish_commands") is True
+    assert cps.publish_commands is True
diff --git a/backend/tests/unit/services/saga/test_saga_comprehensive.py b/backend/tests/unit/services/saga/test_saga_comprehensive.py
index e746164b..e6acc083 100644
--- a/backend/tests/unit/services/saga/test_saga_comprehensive.py
+++ b/backend/tests/unit/services/saga/test_saga_comprehensive.py
@@ -4,16 +4,16 @@
 require heavy mocking or external services. Full end‑to‑end behavior is covered
 by integration tests under tests/integration/saga/.
 """
+from typing import Any
 
 import pytest
-
 from app.domain.enums.events import EventType
 from app.domain.enums.saga import SagaState
 from app.domain.saga.models import Saga
-from tests.helpers import make_execution_requested_event
 from app.services.saga.execution_saga import ExecutionSaga
 from app.services.saga.saga_step import CompensationStep, SagaContext, SagaStep
 
+from tests.helpers import make_execution_requested_event
 
 pytestmark = pytest.mark.unit
 
@@ -23,19 +23,19 @@ async def compensate(self, context: SagaContext) -> bool:  # noqa: ARG002
         return True
 
 
-class _Step(SagaStep):
+class _Step(SagaStep[Any]):
     def __init__(self, name: str, ok: bool = True):
         super().__init__(name)
         self._ok = ok
 
-    async def execute(self, context: SagaContext, event) -> bool:  # noqa: ARG002
+    async def execute(self, context: SagaContext, event: Any) -> bool:  # noqa: ARG002
         return self._ok
 
-    def get_compensation(self):
+    def get_compensation(self) -> CompensationStep:
         return _NoopComp(f"{self.name}-comp")
 
 
-def _req_event():
+def _req_event() -> Any:
     return make_execution_requested_event(execution_id="e1", script="print('x')")
 
 
diff --git a/backend/tests/unit/services/saga/test_saga_orchestrator_unit.py b/backend/tests/unit/services/saga/test_saga_orchestrator_unit.py
index 75fb2e25..8c7bfd3d 100644
--- a/backend/tests/unit/services/saga/test_saga_orchestrator_unit.py
+++ b/backend/tests/unit/services/saga/test_saga_orchestrator_unit.py
@@ -1,4 +1,5 @@
 import logging
+from typing import Any
 
 import pytest
 from app.domain.enums.events import EventType
@@ -6,7 +7,7 @@
 from app.domain.saga.models import Saga, SagaConfig
 from app.services.saga.base_saga import BaseSaga
 from app.services.saga.saga_orchestrator import SagaOrchestrator
-from app.services.saga.saga_step import SagaStep
+from app.services.saga.saga_step import CompensationStep, SagaContext, SagaStep
 
 pytestmark = pytest.mark.unit
 
@@ -25,7 +26,7 @@ def __init__(self) -> None:
         self.saved: list[Saga] = []
         self.existing: dict[tuple[str, str], Saga] = {}
 
-    async def get_saga_by_execution_and_name(self, execution_id: str, saga_name: str):  # noqa: ARG002
+    async def get_saga_by_execution_and_name(self, execution_id: str, saga_name: str) -> Saga | None:  # noqa: ARG002
         return self.existing.get((execution_id, saga_name))
 
     async def upsert_saga(self, saga: Saga) -> bool:
@@ -34,12 +35,12 @@ async def upsert_saga(self, saga: Saga) -> bool:
 
 
 class _Prod:
-    async def produce(self, event_to_produce, key=None):  # noqa: ARG002
+    async def produce(self, event_to_produce: Any, key: str | None = None) -> None:  # noqa: ARG002
         return None
 
 
 class _Idem:
-    async def close(self):
+    async def close(self) -> None:
         return None
 
 
@@ -49,34 +50,48 @@ class _SchemaRegistry: ...
 class _Settings: ...
 
 
-class _StepOK(SagaStep[_Evt]):
+class _NoOpCompensation(CompensationStep):
+    """No-op compensation step for testing."""
+
+    def __init__(self) -> None:
+        super().__init__("noop_compensation")
+
+    async def compensate(self, context: SagaContext) -> bool:  # noqa: ARG002
+        return True
+
+
+class _StepOK(SagaStep[Any]):
     def __init__(self) -> None:
         super().__init__("ok")
-    async def execute(self, context, event) -> bool:  # noqa: ARG002
+
+    async def execute(self, context: Any, event: Any) -> bool:  # noqa: ARG002
         return True
 
+    def get_compensation(self) -> CompensationStep:
+        return _NoOpCompensation()
+
 
 class _Saga(BaseSaga):
     @classmethod
     def get_name(cls) -> str:
         return "s"
     @classmethod
-    def get_trigger_events(cls):
+    def get_trigger_events(cls) -> list[EventType]:
         return [EventType.EXECUTION_REQUESTED]
-    def get_steps(self):
+    def get_steps(self) -> list[SagaStep[Any]]:
         return [_StepOK()]
 
 
 def _orch() -> SagaOrchestrator:
     return SagaOrchestrator(
         config=SagaConfig(name="t", enable_compensation=True, store_events=True, publish_commands=False),
-        saga_repository=_Repo(),
-        producer=_Prod(),
+        saga_repository=_Repo(),  # type: ignore[arg-type]
+        producer=_Prod(),  # type: ignore[arg-type]
         schema_registry_manager=_SchemaRegistry(),  # type: ignore[arg-type]
         settings=_Settings(),  # type: ignore[arg-type]
-        event_store=_Store(),
-        idempotency_manager=_Idem(),
-        resource_allocation_repository=_Alloc(),
+        event_store=_Store(),  # type: ignore[arg-type]
+        idempotency_manager=_Idem(),  # type: ignore[arg-type]
+        resource_allocation_repository=_Alloc(),  # type: ignore[arg-type]
         logger=_test_logger,
     )
 
@@ -84,20 +99,20 @@ def _orch() -> SagaOrchestrator:
 @pytest.mark.asyncio
 async def test_min_success_flow() -> None:
     orch = _orch()
-    orch.register_saga(_Saga)  # type: ignore[arg-type]
-    orch._running = True
-    await orch._handle_event(_Evt(EventType.EXECUTION_REQUESTED, "e"))
-    assert orch._running is True  # basic sanity; deep behavior covered by integration
+    orch.register_saga(_Saga)
+    orch._running = True  # type: ignore[attr-defined]
+    await orch._handle_event(_Evt(EventType.EXECUTION_REQUESTED, "e"))  # type: ignore[arg-type]
+    assert orch._running is True  # type: ignore[attr-defined]
 
 
 @pytest.mark.asyncio
 async def test_should_trigger_and_existing_short_circuit() -> None:
     orch = _orch()
-    orch.register_saga(_Saga)  # type: ignore[arg-type]
-    assert orch._should_trigger_saga(_Saga, _Evt(EventType.EXECUTION_REQUESTED, "e")) is True
+    orch.register_saga(_Saga)
+    assert orch._should_trigger_saga(_Saga, _Evt(EventType.EXECUTION_REQUESTED, "e")) is True  # type: ignore[arg-type]
     # Existing short-circuit returns existing ID
-    repo = orch._repo  # type: ignore[attr-defined]
+    repo = orch._repo
     s = Saga(saga_id="sX", saga_name="s", execution_id="e", state=SagaState.RUNNING)
-    repo.existing[("e", "s")] = s
-    sid = await orch._start_saga("s", _Evt(EventType.EXECUTION_REQUESTED, "e"))
+    repo.existing[("e", "s")] = s  # type: ignore[attr-defined]
+    sid = await orch._start_saga("s", _Evt(EventType.EXECUTION_REQUESTED, "e"))  # type: ignore[arg-type]
     assert sid == "sX"
diff --git a/backend/tests/unit/services/saga/test_saga_step_and_base.py b/backend/tests/unit/services/saga/test_saga_step_and_base.py
index a8ab93bd..267e7f9c 100644
--- a/backend/tests/unit/services/saga/test_saga_step_and_base.py
+++ b/backend/tests/unit/services/saga/test_saga_step_and_base.py
@@ -1,8 +1,8 @@
-import pytest
+from typing import Any
 
-from app.services.saga.saga_step import SagaContext, CompensationStep
+import pytest
 from app.services.saga.base_saga import BaseSaga
-
+from app.services.saga.saga_step import CompensationStep, SagaContext
 
 pytestmark = pytest.mark.unit
 
@@ -37,9 +37,9 @@ async def compensate(self, context: SagaContext) -> bool:  # noqa: ARG002
 
 @pytest.mark.asyncio
 async def test_context_adders() -> None:
-    from app.infrastructure.kafka.events.metadata import AvroEventMetadata
-    from app.infrastructure.kafka.events.base import BaseEvent
     from app.domain.enums.events import EventType
+    from app.infrastructure.kafka.events.base import BaseEvent
+    from app.infrastructure.kafka.events.metadata import AvroEventMetadata
 
     class E(BaseEvent):
         event_type: EventType = EventType.SYSTEM_ERROR
@@ -63,18 +63,18 @@ def test_base_saga_abstract_calls_cover_pass_lines() -> None:
     # And the default bind hook returns None when called
     class Dummy(BaseSaga):
         @classmethod
-        def get_name(cls): return "d"
+        def get_name(cls) -> str: return "d"
         @classmethod
-        def get_trigger_events(cls): return []
-        def get_steps(self): return []
-    assert Dummy().bind_dependencies() is None
+        def get_trigger_events(cls) -> list[Any]: return []
+        def get_steps(self) -> list[Any]: return []
+    Dummy().bind_dependencies()  # Returns None by design
 
 
 def test_saga_step_str_and_can_execute() -> None:
     from app.services.saga.saga_step import SagaStep
-    class S(SagaStep):
-        async def execute(self, context, event): return True
-        def get_compensation(self): return None
+    class S(SagaStep[Any]):
+        async def execute(self, context: Any, event: Any) -> bool: return True
+        def get_compensation(self) -> None: return None
     s = S("nm")
     assert str(s) == "SagaStep(nm)"
     # can_execute default True
diff --git a/backend/tests/unit/services/sse/test_kafka_redis_bridge.py b/backend/tests/unit/services/sse/test_kafka_redis_bridge.py
index e4b0cded..461df7ae 100644
--- a/backend/tests/unit/services/sse/test_kafka_redis_bridge.py
+++ b/backend/tests/unit/services/sse/test_kafka_redis_bridge.py
@@ -1,12 +1,11 @@
-import asyncio
 import logging
-import pytest
-
-pytestmark = pytest.mark.unit
 
+import pytest
 from app.domain.enums.events import EventType
 from app.services.sse.kafka_redis_bridge import SSEKafkaRedisBridge
 
+pytestmark = pytest.mark.unit
+
 _test_logger = logging.getLogger("test.services.sse.kafka_redis_bridge")
 
 
@@ -42,7 +41,7 @@ def __init__(self, execution_id: str | None, et: EventType) -> None:
         self.event_type = et
         self.execution_id = execution_id
 
-    def model_dump(self) -> dict:
+    def model_dump(self) -> dict[str, str | None]:
         return {"execution_id": self.execution_id}
 
 
@@ -50,25 +49,26 @@ def model_dump(self) -> dict:
 async def test_register_and_route_events_without_kafka() -> None:
     # Build the bridge but don't call start(); directly test routing handlers
     bridge = SSEKafkaRedisBridge(
-        schema_registry=_FakeSchema(),
-        settings=_FakeSettings(),
-        event_metrics=_FakeEventMetrics(),
-        sse_bus=_FakeBus(),
+        schema_registry=_FakeSchema(),  # type: ignore[arg-type]
+        settings=_FakeSettings(),  # type: ignore[arg-type]
+        event_metrics=_FakeEventMetrics(),  # type: ignore[arg-type]
+        sse_bus=_FakeBus(),  # type: ignore[arg-type]
         logger=_test_logger,
     )
 
     disp = _StubDispatcher()
-    bridge._register_routing_handlers(disp)
+    bridge._register_routing_handlers(disp)  # type: ignore[arg-type]
     assert EventType.EXECUTION_STARTED in disp.handlers
 
     # Event without execution_id is ignored
     h = disp.handlers[EventType.EXECUTION_STARTED]
-    await h(_DummyEvent(None, EventType.EXECUTION_STARTED))
-    assert bridge.sse_bus.published == []
+    await h(_DummyEvent(None, EventType.EXECUTION_STARTED))  # type: ignore[operator]
+    fake_bus: _FakeBus = bridge.sse_bus  # type: ignore[assignment]
+    assert fake_bus.published == []
 
     # Proper event is published
-    await h(_DummyEvent("exec-123", EventType.EXECUTION_STARTED))
-    assert bridge.sse_bus.published and bridge.sse_bus.published[-1][0] == "exec-123"
+    await h(_DummyEvent("exec-123", EventType.EXECUTION_STARTED))  # type: ignore[operator]
+    assert fake_bus.published and fake_bus.published[-1][0] == "exec-123"
 
     s = bridge.get_stats()
     assert s["num_consumers"] == 0 and s["is_running"] is False
diff --git a/backend/tests/unit/services/sse/test_redis_bus.py b/backend/tests/unit/services/sse/test_redis_bus.py
new file mode 100644
index 00000000..a8c730e4
--- /dev/null
+++ b/backend/tests/unit/services/sse/test_redis_bus.py
@@ -0,0 +1,96 @@
+import asyncio
+import logging
+from typing import Any, TypeVar
+
+import pytest
+from app.domain.enums.notification import NotificationSeverity, NotificationStatus
+from app.domain.execution.models import ResourceUsageDomain
+from app.infrastructure.kafka.events.execution import ExecutionCompletedEvent
+from app.infrastructure.kafka.events.metadata import AvroEventMetadata
+from app.schemas_pydantic.sse import RedisNotificationMessage, RedisSSEMessage
+from app.services.sse.redis_bus import SSERedisBus
+from fakeredis import FakeAsyncRedis
+
+pytestmark = pytest.mark.unit
+
+_test_logger = logging.getLogger("test.services.sse.redis_bus")
+_T = TypeVar("_T")
+
+
+def _make_completed_event(execution_id: str) -> ExecutionCompletedEvent:
+    return ExecutionCompletedEvent(
+        execution_id=execution_id,
+        exit_code=0,
+        stdout="ok",
+        stderr="",
+        resource_usage=ResourceUsageDomain(),
+        metadata=AvroEventMetadata(service_name="test", service_version="1.0"),
+    )
+
+
+async def _wait_for_message(sub: Any, model: type[_T], timeout: float = 1.0) -> _T:
+    """Wait for a non-None message with explicit timeout."""
+    async with asyncio.timeout(timeout):
+        while True:
+            msg: _T | None = await sub.get(model)
+            if msg is not None:
+                return msg
+            await asyncio.sleep(0.01)  # Yield, not timing dependency
+
+
+@pytest.mark.asyncio
+async def test_publish_and_subscribe_round_trip() -> None:
+    redis = FakeAsyncRedis()
+    bus = SSERedisBus(redis, logger=_test_logger)
+
+    sub = await bus.open_subscription("exec-1")
+    evt = _make_completed_event("exec-1")
+
+    # Publish directly (subscription is already open and ready)
+    await bus.publish_event("exec-1", evt)
+
+    # Wait with explicit timeout
+    msg = await _wait_for_message(sub, RedisSSEMessage)
+    assert msg.execution_id == "exec-1"
+
+    # Invalid JSON should be skipped - verify by sending valid message after invalid
+    # and confirming we receive only the valid one (no crash, no stale data)
+    await redis.publish("sse:exec:exec-1", "not-json")
+    evt2 = _make_completed_event("exec-1")
+    await bus.publish_event("exec-1", evt2)
+
+    # Should receive the valid message, proving invalid JSON was skipped
+    msg2 = await _wait_for_message(sub, RedisSSEMessage)
+    assert msg2.execution_id == "exec-1"
+
+    await sub.close()
+    await redis.aclose()
+
+
+@pytest.mark.asyncio
+async def test_notifications_channels() -> None:
+    redis = FakeAsyncRedis()
+    bus = SSERedisBus(redis, logger=_test_logger)
+
+    nsub = await bus.open_notification_subscription("user-1")
+
+    notif = RedisNotificationMessage(
+        notification_id="n1",
+        severity=NotificationSeverity.LOW,
+        status=NotificationStatus.PENDING,
+        tags=[],
+        subject="test",
+        body="body",
+        action_url="",
+        created_at="2025-01-01T00:00:00Z",
+    )
+
+    # Publish directly (subscription is already open and ready)
+    await bus.publish_notification("user-1", notif)
+
+    # Wait with explicit timeout
+    got = await _wait_for_message(nsub, RedisNotificationMessage)
+    assert got.notification_id == "n1"
+
+    await nsub.close()
+    await redis.aclose()
diff --git a/backend/tests/unit/services/sse/test_shutdown_manager.py b/backend/tests/unit/services/sse/test_shutdown_manager.py
index 6db2190e..4bcc30c5 100644
--- a/backend/tests/unit/services/sse/test_shutdown_manager.py
+++ b/backend/tests/unit/services/sse/test_shutdown_manager.py
@@ -2,7 +2,6 @@
 import logging
 
 import pytest
-
 from app.services.sse.sse_shutdown_manager import SSEShutdownManager
 
 _test_logger = logging.getLogger("test.services.sse.shutdown_manager")
@@ -17,7 +16,7 @@ async def aclose(self) -> None:
 
 
 @pytest.mark.asyncio
-async def test_shutdown_graceful_notify_and_drain():
+async def test_shutdown_graceful_notify_and_drain() -> None:
     mgr = SSEShutdownManager(drain_timeout=1.0, notification_timeout=0.01, force_close_timeout=0.1, logger=_test_logger)
 
     # Register two connections and arrange that they unregister when notified
@@ -25,7 +24,7 @@ async def test_shutdown_graceful_notify_and_drain():
     ev2 = await mgr.register_connection("e1", "c2")
     assert ev1 is not None and ev2 is not None
 
-    async def on_shutdown(event, cid):  # noqa: ANN001
+    async def on_shutdown(event: asyncio.Event, cid: str) -> None:
         await asyncio.wait_for(event.wait(), timeout=0.5)
         await mgr.unregister_connection("e1", cid)
 
@@ -41,10 +40,12 @@ async def on_shutdown(event, cid):  # noqa: ANN001
 
 
 @pytest.mark.asyncio
-async def test_shutdown_force_close_calls_router_stop_and_rejects_new():
-    mgr = SSEShutdownManager(drain_timeout=0.01, notification_timeout=0.01, force_close_timeout=0.01, logger=_test_logger)
+async def test_shutdown_force_close_calls_router_stop_and_rejects_new() -> None:
+    mgr = SSEShutdownManager(
+        drain_timeout=0.01, notification_timeout=0.01, force_close_timeout=0.01, logger=_test_logger
+    )
     router = DummyRouter()
-    mgr.set_router(router)
+    mgr.set_router(router)  # type: ignore[arg-type]
 
     # Register a connection but never unregister -> force close path
     ev = await mgr.register_connection("e1", "c1")
@@ -63,7 +64,7 @@ async def test_shutdown_force_close_calls_router_stop_and_rejects_new():
 
 
 @pytest.mark.asyncio
-async def test_get_shutdown_status_transitions():
+async def test_get_shutdown_status_transitions() -> None:
     m = SSEShutdownManager(drain_timeout=0.01, notification_timeout=0.0, force_close_timeout=0.0, logger=_test_logger)
     st0 = m.get_shutdown_status()
     assert st0.phase == "ready"
diff --git a/backend/tests/unit/services/sse/test_sse_service.py b/backend/tests/unit/services/sse/test_sse_service.py
index 63299b4e..6ca62fc0 100644
--- a/backend/tests/unit/services/sse/test_sse_service.py
+++ b/backend/tests/unit/services/sse/test_sse_service.py
@@ -4,18 +4,16 @@
 from typing import Any, Type
 
 import pytest
+from app.domain.enums.events import EventType
+from app.domain.execution import DomainExecution, ResourceUsageDomain
+from app.domain.sse import ShutdownStatus, SSEHealthDomain
+from app.services.sse.sse_service import SSEService
 from pydantic import BaseModel
 
 pytestmark = pytest.mark.unit
 
 _test_logger = logging.getLogger("test.services.sse.sse_service")
 
-from app.domain.enums.events import EventType
-from app.domain.execution import DomainExecution, ResourceUsageDomain
-from app.domain.sse import ShutdownStatus, SSEHealthDomain
-from app.schemas_pydantic.sse import RedisNotificationMessage, RedisSSEMessage
-from app.services.sse.sse_service import SSEService
-
 T = Any  # TypeVar for fake
 
 
@@ -78,11 +76,11 @@ def __init__(self) -> None:
         self.registered: list[tuple[str, str]] = []
         self.unregistered: list[tuple[str, str]] = []
 
-    async def register_connection(self, execution_id: str, connection_id: str):
+    async def register_connection(self, execution_id: str, connection_id: str) -> Any:
         self.registered.append((execution_id, connection_id))
         return self._evt
 
-    async def unregister_connection(self, execution_id: str, connection_id: str):
+    async def unregister_connection(self, execution_id: str, connection_id: str) -> None:
         self.unregistered.append((execution_id, connection_id))
 
     def is_shutting_down(self) -> bool:
@@ -114,7 +112,8 @@ def get_stats(self) -> dict[str, int | bool]:
 def _decode(evt: dict[str, Any]) -> dict[str, Any]:
     import json
 
-    return json.loads(evt["data"])  # type: ignore[index]
+    result: dict[str, Any] = json.loads(evt["data"])
+    return result
 
 
 @pytest.mark.asyncio
@@ -122,7 +121,14 @@ async def test_execution_stream_closes_on_failed_event() -> None:
     repo = _FakeRepo()
     bus = _FakeBus()
     sm = _FakeShutdown()
-    svc = SSEService(repository=repo, router=_FakeRouter(), sse_bus=bus, shutdown_manager=sm, settings=_FakeSettings(), logger=_test_logger)
+    svc = SSEService(
+        repository=repo,  # type: ignore[arg-type]
+        router=_FakeRouter(),  # type: ignore[arg-type]
+        sse_bus=bus,  # type: ignore[arg-type]
+        shutdown_manager=sm,  # type: ignore[arg-type]
+        settings=_FakeSettings(),  # type: ignore[arg-type]
+        logger=_test_logger,
+    )
 
     agen = svc.create_execution_stream("exec-1", user_id="u1")
     first = await agen.__anext__()
@@ -159,7 +165,14 @@ async def test_execution_stream_result_stored_includes_result_payload() -> None:
     )
     bus = _FakeBus()
     sm = _FakeShutdown()
-    svc = SSEService(repository=repo, router=_FakeRouter(), sse_bus=bus, shutdown_manager=sm, settings=_FakeSettings(), logger=_test_logger)
+    svc = SSEService(
+        repository=repo,  # type: ignore[arg-type]
+        router=_FakeRouter(),  # type: ignore[arg-type]
+        sse_bus=bus,  # type: ignore[arg-type]
+        shutdown_manager=sm,  # type: ignore[arg-type]
+        settings=_FakeSettings(),  # type: ignore[arg-type]
+        logger=_test_logger,
+    )
 
     agen = svc.create_execution_stream("exec-2", user_id="u1")
     await agen.__anext__()  # connected
@@ -182,7 +195,14 @@ async def test_notification_stream_connected_and_heartbeat_and_message() -> None
     sm = _FakeShutdown()
     settings = _FakeSettings()
     settings.SSE_HEARTBEAT_INTERVAL = 0  # emit immediately
-    svc = SSEService(repository=repo, router=_FakeRouter(), sse_bus=bus, shutdown_manager=sm, settings=settings, logger=_test_logger)
+    svc = SSEService(
+        repository=repo,  # type: ignore[arg-type]
+        router=_FakeRouter(),  # type: ignore[arg-type]
+        sse_bus=bus,  # type: ignore[arg-type]
+        shutdown_manager=sm,  # type: ignore[arg-type]
+        settings=settings,  # type: ignore[arg-type]
+        logger=_test_logger,
+    )
 
     agen = svc.create_notification_stream("u1")
     connected = await agen.__anext__()
@@ -217,7 +237,14 @@ async def test_notification_stream_connected_and_heartbeat_and_message() -> None
 
 @pytest.mark.asyncio
 async def test_health_status_shape() -> None:
-    svc = SSEService(repository=_FakeRepo(), router=_FakeRouter(), sse_bus=_FakeBus(), shutdown_manager=_FakeShutdown(), settings=_FakeSettings(), logger=_test_logger)
+    svc = SSEService(
+        repository=_FakeRepo(),  # type: ignore[arg-type]
+        router=_FakeRouter(),  # type: ignore[arg-type]
+        sse_bus=_FakeBus(),  # type: ignore[arg-type]
+        shutdown_manager=_FakeShutdown(),  # type: ignore[arg-type]
+        settings=_FakeSettings(),  # type: ignore[arg-type]
+        logger=_test_logger,
+    )
     h = await svc.get_health_status()
     assert isinstance(h, SSEHealthDomain)
     assert h.active_consumers == 3 and h.active_executions == 2
diff --git a/backend/tests/unit/services/sse/test_sse_shutdown_manager.py b/backend/tests/unit/services/sse/test_sse_shutdown_manager.py
index 4e7300b3..e24e3727 100644
--- a/backend/tests/unit/services/sse/test_sse_shutdown_manager.py
+++ b/backend/tests/unit/services/sse/test_sse_shutdown_manager.py
@@ -1,12 +1,12 @@
 import asyncio
 import logging
 
+import backoff
 import pytest
+from app.services.sse.sse_shutdown_manager import SSEShutdownManager
 
 pytestmark = pytest.mark.unit
 
-from app.services.sse.sse_shutdown_manager import SSEShutdownManager
-
 _test_logger = logging.getLogger("test.services.sse.sse_shutdown_manager")
 
 
@@ -21,7 +21,7 @@ async def stop(self) -> None:
 @pytest.mark.asyncio
 async def test_register_unregister_and_shutdown_flow() -> None:
     mgr = SSEShutdownManager(drain_timeout=0.5, notification_timeout=0.1, force_close_timeout=0.1, logger=_test_logger)
-    mgr.set_router(_FakeRouter())
+    mgr.set_router(_FakeRouter())  # type: ignore[arg-type]
 
     # Register two connections
     e1 = await mgr.register_connection("exec-1", "c1")
@@ -32,12 +32,11 @@ async def test_register_unregister_and_shutdown_flow() -> None:
     task = asyncio.create_task(mgr.initiate_shutdown())
 
     # Wait until manager enters NOTIFYING phase (event-driven)
-    from tests.helpers.eventually import eventually
-
-    async def _is_notifying():
-        return mgr.get_shutdown_status().phase == "notifying"
+    @backoff.on_exception(backoff.constant, AssertionError, max_time=1.0, interval=0.02)
+    async def _wait_notifying() -> None:
+        assert mgr.get_shutdown_status().phase == "notifying"
 
-    await eventually(_is_notifying, timeout=1.0, interval=0.02)
+    await _wait_notifying()
 
     # Simulate clients acknowledging and disconnecting
     e1.set()
@@ -51,19 +50,21 @@ async def _is_notifying():
 
 @pytest.mark.asyncio
 async def test_reject_new_connection_during_shutdown() -> None:
-    mgr = SSEShutdownManager(drain_timeout=0.1, notification_timeout=0.01, force_close_timeout=0.01, logger=_test_logger)
+    mgr = SSEShutdownManager(
+        drain_timeout=0.1, notification_timeout=0.01, force_close_timeout=0.01, logger=_test_logger
+    )
     # Pre-register one active connection to reflect realistic state
     e = await mgr.register_connection("e", "c0")
     assert e is not None
 
     # Start shutdown and wait until initiated
     t = asyncio.create_task(mgr.initiate_shutdown())
-    from tests.helpers.eventually import eventually
 
-    async def _initiated():
+    @backoff.on_exception(backoff.constant, AssertionError, max_time=1.0, interval=0.02)
+    async def _wait_initiated() -> None:
         assert mgr.is_shutting_down() is True
 
-    await eventually(_initiated, timeout=1.0, interval=0.02)
+    await _wait_initiated()
 
     # New registrations rejected once shutdown initiated
     denied = await mgr.register_connection("e", "c1")
diff --git a/backend/tests/unit/services/test_pod_builder.py b/backend/tests/unit/services/test_pod_builder.py
index cd271631..282d2701 100644
--- a/backend/tests/unit/services/test_pod_builder.py
+++ b/backend/tests/unit/services/test_pod_builder.py
@@ -1,12 +1,11 @@
 from uuid import uuid4
 
 import pytest
-from kubernetes import client as k8s_client
-
 from app.infrastructure.kafka.events.metadata import AvroEventMetadata
 from app.infrastructure.kafka.events.saga import CreatePodCommandEvent
 from app.services.k8s_worker.config import K8sWorkerConfig
 from app.services.k8s_worker.pod_builder import PodBuilder
+from kubernetes_asyncio import client as k8s_client
 
 
 class TestPodBuilder:
diff --git a/backend/uv.lock b/backend/uv.lock
index 8bf078fe..909706cf 100644
--- a/backend/uv.lock
+++ b/backend/uv.lock
@@ -640,6 +640,19 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/ab/84/02fc1827e8cdded4aa65baef11296a9bbe595c474f0d6d758af082d849fd/execnet-2.1.2-py3-none-any.whl", hash = "sha256:67fba928dd5a544b783f6056f449e5e3931a5c378b128bc18501f7ea79e296ec", size = 40708, upload-time = "2025-11-12T09:56:36.333Z" },
 ]
 
+[[package]]
+name = "fakeredis"
+version = "2.33.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "redis" },
+    { name = "sortedcontainers" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/5f/f9/57464119936414d60697fcbd32f38909bb5688b616ae13de6e98384433e0/fakeredis-2.33.0.tar.gz", hash = "sha256:d7bc9a69d21df108a6451bbffee23b3eba432c21a654afc7ff2d295428ec5770", size = 175187, upload-time = "2025-12-16T19:45:52.269Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/6e/78/a850fed8aeef96d4a99043c90b818b2ed5419cd5b24a4049fd7cfb9f1471/fakeredis-2.33.0-py3-none-any.whl", hash = "sha256:de535f3f9ccde1c56672ab2fdd6a8efbc4f2619fc2f1acc87b8737177d71c965", size = 119605, upload-time = "2025-12-16T19:45:51.08Z" },
+]
+
 [[package]]
 name = "fastapi"
 version = "0.128.0"
@@ -1050,7 +1063,7 @@ dependencies = [
     { name = "itsdangerous" },
     { name = "jinja2" },
     { name = "kiwisolver" },
-    { name = "kubernetes" },
+    { name = "kubernetes-asyncio" },
     { name = "limits" },
     { name = "markdown-it-py" },
     { name = "markupsafe" },
@@ -1126,6 +1139,7 @@ dependencies = [
 [package.dev-dependencies]
 dev = [
     { name = "coverage" },
+    { name = "fakeredis" },
     { name = "hypothesis" },
     { name = "iniconfig" },
     { name = "matplotlib" },
@@ -1192,7 +1206,7 @@ requires-dist = [
     { name = "itsdangerous", specifier = "==2.2.0" },
     { name = "jinja2", specifier = "==3.1.6" },
     { name = "kiwisolver", specifier = "==1.4.9" },
-    { name = "kubernetes", specifier = "==31.0.0" },
+    { name = "kubernetes-asyncio", specifier = "==33.3.0" },
     { name = "limits", specifier = "==3.13.0" },
     { name = "markdown-it-py", specifier = "==4.0.0" },
     { name = "markupsafe", specifier = "==3.0.2" },
@@ -1256,7 +1270,7 @@ requires-dist = [
     { name = "tiktoken", specifier = "==0.11.0" },
     { name = "tomli", specifier = "==2.0.2" },
     { name = "typing-extensions", specifier = "==4.12.2" },
-    { name = "urllib3", specifier = "==2.6.2" },
+    { name = "urllib3", specifier = "==2.6.3" },
     { name = "uvicorn", specifier = "==0.34.2" },
     { name = "websocket-client", specifier = "==1.8.0" },
     { name = "werkzeug", specifier = "==3.1.4" },
@@ -1268,6 +1282,7 @@ requires-dist = [
 [package.metadata.requires-dev]
 dev = [
     { name = "coverage", specifier = "==7.13.0" },
+    { name = "fakeredis", specifier = ">=2.33.0" },
     { name = "hypothesis", specifier = "==6.103.4" },
     { name = "iniconfig", specifier = "==2.0.0" },
     { name = "matplotlib", specifier = "==3.10.8" },
@@ -1379,25 +1394,20 @@ wheels = [
 ]
 
 [[package]]
-name = "kubernetes"
-version = "31.0.0"
+name = "kubernetes-asyncio"
+version = "33.3.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
+    { name = "aiohttp" },
     { name = "certifi" },
-    { name = "durationpy" },
-    { name = "google-auth" },
-    { name = "oauthlib" },
     { name = "python-dateutil" },
     { name = "pyyaml" },
-    { name = "requests" },
-    { name = "requests-oauthlib" },
     { name = "six" },
     { name = "urllib3" },
-    { name = "websocket-client" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/7e/bd/ffcd3104155b467347cd9b3a64eb24182e459579845196b3a200569c8912/kubernetes-31.0.0.tar.gz", hash = "sha256:28945de906c8c259c1ebe62703b56a03b714049372196f854105afe4e6d014c0", size = 916096, upload-time = "2024-09-20T03:16:08.089Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/41/5f/c175f86b92ff5f19444e3be1423819491ae9859d1f6f7d83d404eab8b10d/kubernetes_asyncio-33.3.0.tar.gz", hash = "sha256:4c59cd4c99b197995ef38ef0c8ff45aab24b84830ebf0ddcb67355caea9674c9", size = 1124931, upload-time = "2025-08-11T21:39:37.825Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/fb/a8/17f5e28cecdbd6d48127c22abdb794740803491f422a11905c4569d8e139/kubernetes-31.0.0-py2.py3-none-any.whl", hash = "sha256:bf141e2d380c8520eada8b351f4e319ffee9636328c137aa432bc486ca1200e1", size = 1857013, upload-time = "2024-09-20T03:16:06.05Z" },
+    { url = "https://files.pythonhosted.org/packages/bc/20/90985f53c141e6f3464b7295a617ffd36574168861882f9291847d09f9b1/kubernetes_asyncio-33.3.0-py3-none-any.whl", hash = "sha256:25e6e265932ebb1aeecbdb30a107dbef3ee0bcd388ed12d092be70915733982b", size = 2174591, upload-time = "2025-08-11T21:39:35.697Z" },
 ]
 
 [[package]]
@@ -2883,11 +2893,11 @@ wheels = [
 
 [[package]]
 name = "urllib3"
-version = "2.6.2"
+version = "2.6.3"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/1e/24/a2a2ed9addd907787d7aa0355ba36a6cadf1768b934c652ea78acbd59dcd/urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797", size = 432930, upload-time = "2025-12-11T15:56:40.252Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/c7/24/5f1b3bdffd70275f6661c76461e25f024d5a38a46f04aaca912426a2b1d3/urllib3-2.6.3.tar.gz", hash = "sha256:1b62b6884944a57dbe321509ab94fd4d3b307075e0c2eae991ac71ee15ad38ed", size = 435556, upload-time = "2026-01-07T16:24:43.925Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/6d/b9/4095b668ea3678bf6a0af005527f39de12fb026516fb3df17495a733b7f8/urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd", size = 131182, upload-time = "2025-12-11T15:56:38.584Z" },
+    { url = "https://files.pythonhosted.org/packages/39/08/aaaad47bc4e9dc8c725e68f9d04865dbcb2052843ff09c97b08904852d84/urllib3-2.6.3-py3-none-any.whl", hash = "sha256:bf272323e553dfb2e87d9bfd225ca7b0f467b919d7bbd355436d3fd37cb0acd4", size = 131584, upload-time = "2026-01-07T16:24:42.685Z" },
 ]
 
 [[package]]
diff --git a/backend/workers/dlq_processor.py b/backend/workers/dlq_processor.py
index 711d1ff2..22cf8898 100644
--- a/backend/workers/dlq_processor.py
+++ b/backend/workers/dlq_processor.py
@@ -5,12 +5,9 @@
 from typing import Optional
 
 from app.core.container import create_dlq_processor_container
-from app.core.database_context import Database
-from app.db.docs import ALL_DOCUMENTS
 from app.dlq import DLQMessage, RetryPolicy, RetryStrategy
 from app.dlq.manager import DLQManager
 from app.settings import Settings, get_settings
-from beanie import init_beanie
 
 
 def _configure_retry_policies(manager: DLQManager, logger: logging.Logger) -> None:
@@ -109,9 +106,6 @@ async def main(settings: Settings | None = None) -> None:
     logger = await container.get(logging.Logger)
     logger.info("Starting DLQ Processor with DI container...")
 
-    db = await container.get(Database)
-    await init_beanie(database=db, document_models=ALL_DOCUMENTS)
-
     manager = await container.get(DLQManager)
 
     _configure_retry_policies(manager, logger)
diff --git a/backend/workers/run_coordinator.py b/backend/workers/run_coordinator.py
index ef617444..c9fb7a60 100644
--- a/backend/workers/run_coordinator.py
+++ b/backend/workers/run_coordinator.py
@@ -3,15 +3,11 @@
 import signal
 
 from app.core.container import create_coordinator_container
-from app.core.database_context import Database
 from app.core.logging import setup_logger
 from app.core.tracing import init_tracing
-from app.db.docs import ALL_DOCUMENTS
 from app.domain.enums.kafka import GroupId
-from app.events.schema.schema_registry import SchemaRegistryManager, initialize_event_schemas
 from app.services.coordinator.coordinator import ExecutionCoordinator
 from app.settings import Settings, get_settings
-from beanie import init_beanie
 
 
 async def run_coordinator(settings: Settings | None = None) -> None:
@@ -23,12 +19,6 @@ async def run_coordinator(settings: Settings | None = None) -> None:
     logger = await container.get(logging.Logger)
     logger.info("Starting ExecutionCoordinator with DI container...")
 
-    db = await container.get(Database)
-    await init_beanie(database=db, document_models=ALL_DOCUMENTS)
-
-    schema_registry = await container.get(SchemaRegistryManager)
-    await initialize_event_schemas(schema_registry)
-
     # Services are already started by the DI container providers
     coordinator = await container.get(ExecutionCoordinator)
 
diff --git a/backend/workers/run_event_replay.py b/backend/workers/run_event_replay.py
index 949cf8af..74dc01b5 100644
--- a/backend/workers/run_event_replay.py
+++ b/backend/workers/run_event_replay.py
@@ -3,14 +3,11 @@
 from contextlib import AsyncExitStack
 
 from app.core.container import create_event_replay_container
-from app.core.database_context import Database
 from app.core.logging import setup_logger
 from app.core.tracing import init_tracing
-from app.db.docs import ALL_DOCUMENTS
 from app.events.core import UnifiedProducer
 from app.services.event_replay.replay_service import EventReplayService
 from app.settings import Settings, get_settings
-from beanie import init_beanie
 
 
 async def cleanup_task(replay_service: EventReplayService, logger: logging.Logger, interval_hours: int = 6) -> None:
@@ -33,9 +30,6 @@ async def run_replay_service(settings: Settings | None = None) -> None:
     logger = await container.get(logging.Logger)
     logger.info("Starting EventReplayService with DI container...")
 
-    db = await container.get(Database)
-    await init_beanie(database=db, document_models=ALL_DOCUMENTS)
-
     producer = await container.get(UnifiedProducer)
     replay_service = await container.get(EventReplayService)
 
diff --git a/backend/workers/run_k8s_worker.py b/backend/workers/run_k8s_worker.py
index 49b945fa..777e876f 100644
--- a/backend/workers/run_k8s_worker.py
+++ b/backend/workers/run_k8s_worker.py
@@ -3,15 +3,11 @@
 import signal
 
 from app.core.container import create_k8s_worker_container
-from app.core.database_context import Database
 from app.core.logging import setup_logger
 from app.core.tracing import init_tracing
-from app.db.docs import ALL_DOCUMENTS
 from app.domain.enums.kafka import GroupId
-from app.events.schema.schema_registry import SchemaRegistryManager, initialize_event_schemas
 from app.services.k8s_worker.worker import KubernetesWorker
 from app.settings import Settings, get_settings
-from beanie import init_beanie
 
 
 async def run_kubernetes_worker(settings: Settings | None = None) -> None:
@@ -23,12 +19,6 @@ async def run_kubernetes_worker(settings: Settings | None = None) -> None:
     logger = await container.get(logging.Logger)
     logger.info("Starting KubernetesWorker with DI container...")
 
-    db = await container.get(Database)
-    await init_beanie(database=db, document_models=ALL_DOCUMENTS)
-
-    schema_registry = await container.get(SchemaRegistryManager)
-    await initialize_event_schemas(schema_registry)
-
     # Services are already started by the DI container providers
     worker = await container.get(KubernetesWorker)
 
diff --git a/backend/workers/run_pod_monitor.py b/backend/workers/run_pod_monitor.py
index 9c1fe09e..9baba6b1 100644
--- a/backend/workers/run_pod_monitor.py
+++ b/backend/workers/run_pod_monitor.py
@@ -3,15 +3,11 @@
 import signal
 
 from app.core.container import create_pod_monitor_container
-from app.core.database_context import Database
 from app.core.logging import setup_logger
 from app.core.tracing import init_tracing
-from app.db.docs import ALL_DOCUMENTS
 from app.domain.enums.kafka import GroupId
-from app.events.schema.schema_registry import SchemaRegistryManager, initialize_event_schemas
 from app.services.pod_monitor.monitor import MonitorState, PodMonitor
 from app.settings import Settings, get_settings
-from beanie import init_beanie
 
 RECONCILIATION_LOG_INTERVAL: int = 60
 
@@ -25,12 +21,6 @@ async def run_pod_monitor(settings: Settings | None = None) -> None:
     logger = await container.get(logging.Logger)
     logger.info("Starting PodMonitor with DI container...")
 
-    db = await container.get(Database)
-    await init_beanie(database=db, document_models=ALL_DOCUMENTS)
-
-    schema_registry = await container.get(SchemaRegistryManager)
-    await initialize_event_schemas(schema_registry)
-
     # Services are already started by the DI container providers
     monitor = await container.get(PodMonitor)
 
diff --git a/backend/workers/run_result_processor.py b/backend/workers/run_result_processor.py
index 0151ad9f..6b7ecb36 100644
--- a/backend/workers/run_result_processor.py
+++ b/backend/workers/run_result_processor.py
@@ -6,7 +6,6 @@
 from app.core.container import create_result_processor_container
 from app.core.logging import setup_logger
 from app.core.tracing import init_tracing
-from app.db.docs import ALL_DOCUMENTS
 from app.db.repositories.execution_repository import ExecutionRepository
 from app.domain.enums.kafka import GroupId
 from app.events.core import UnifiedProducer
@@ -14,26 +13,19 @@
 from app.services.idempotency import IdempotencyManager
 from app.services.result_processor.processor import ProcessingState, ResultProcessor
 from app.settings import Settings, get_settings
-from beanie import init_beanie
-from pymongo.asynchronous.mongo_client import AsyncMongoClient
 
 
 async def run_result_processor(settings: Settings | None = None) -> None:
     if settings is None:
         settings = get_settings()
 
-    db_client: AsyncMongoClient[dict[str, object]] = AsyncMongoClient(
-        settings.MONGODB_URL, tz_aware=True, serverSelectionTimeoutMS=5000
-    )
-    await init_beanie(database=db_client[settings.DATABASE_NAME], document_models=ALL_DOCUMENTS)
-
     container = create_result_processor_container(settings)
     producer = await container.get(UnifiedProducer)
     schema_registry = await container.get(SchemaRegistryManager)
     idempotency_manager = await container.get(IdempotencyManager)
     execution_repo = await container.get(ExecutionRepository)
     logger = await container.get(logging.Logger)
-    logger.info(f"Beanie ODM initialized with {len(ALL_DOCUMENTS)} document models")
+    logger.info("Starting ResultProcessor with DI container...")
 
     # ResultProcessor is manually created (not from DI), so we own its lifecycle
     processor = ResultProcessor(
@@ -53,7 +45,6 @@ async def run_result_processor(settings: Settings | None = None) -> None:
 
     # We own the processor, so we use async with to manage its lifecycle
     async with AsyncExitStack() as stack:
-        stack.callback(db_client.close)
         stack.push_async_callback(container.close)
         await stack.enter_async_context(processor)
 
diff --git a/backend/workers/run_saga_orchestrator.py b/backend/workers/run_saga_orchestrator.py
index 04ad8a8d..666fd684 100644
--- a/backend/workers/run_saga_orchestrator.py
+++ b/backend/workers/run_saga_orchestrator.py
@@ -3,15 +3,11 @@
 import signal
 
 from app.core.container import create_saga_orchestrator_container
-from app.core.database_context import Database
 from app.core.logging import setup_logger
 from app.core.tracing import init_tracing
-from app.db.docs import ALL_DOCUMENTS
 from app.domain.enums.kafka import GroupId
-from app.events.schema.schema_registry import SchemaRegistryManager, initialize_event_schemas
 from app.services.saga import SagaOrchestrator
 from app.settings import Settings, get_settings
-from beanie import init_beanie
 
 
 async def run_saga_orchestrator(settings: Settings | None = None) -> None:
@@ -23,12 +19,6 @@ async def run_saga_orchestrator(settings: Settings | None = None) -> None:
     logger = await container.get(logging.Logger)
     logger.info("Starting SagaOrchestrator with DI container...")
 
-    db = await container.get(Database)
-    await init_beanie(database=db, document_models=ALL_DOCUMENTS)
-
-    schema_registry = await container.get(SchemaRegistryManager)
-    await initialize_event_schemas(schema_registry)
-
     # Services are already started by the DI container providers
     orchestrator = await container.get(SagaOrchestrator)
 
diff --git a/docker-bake.hcl b/docker-bake.hcl
new file mode 100644
index 00000000..ffc4a6eb
--- /dev/null
+++ b/docker-bake.hcl
@@ -0,0 +1,217 @@
+// Docker Bake file for building all services with proper caching
+// Usage: docker buildx bake -f docker-bake.hcl [target]
+//
+// Targets:
+//   base          - Shared Python base image (dependencies only)
+//   backend       - Backend API server
+//   workers       - All worker services (saga, k8s, pod-monitor, etc.)
+//   all           - Everything needed for E2E tests
+//
+// CI Usage:
+//   docker buildx bake -f docker-bake.hcl all \
+//     --set *.cache-from=type=gha \
+//     --set *.cache-to=type=gha,mode=max
+
+// Variables for cache configuration (can be overridden via --set)
+variable "CACHE_FROM" {
+  default = ""
+}
+
+variable "CACHE_TO" {
+  default = ""
+}
+
+// Base image - contains Python, system deps, and all Python dependencies
+// This is the most important layer to cache since it rarely changes
+target "base" {
+  context    = "./backend"
+  dockerfile = "Dockerfile.base"
+  tags       = ["integr8scode-base:latest"]
+  cache-from = CACHE_FROM != "" ? [CACHE_FROM] : []
+  cache-to   = CACHE_TO != "" ? [CACHE_TO] : []
+}
+
+// Backend API server
+target "backend" {
+  context    = "./backend"
+  dockerfile = "Dockerfile"
+  tags       = ["integr8scode-backend:latest"]
+  contexts = {
+    base = "target:base"
+  }
+  cache-from = CACHE_FROM != "" ? [CACHE_FROM] : []
+  cache-to   = CACHE_TO != "" ? [CACHE_TO] : []
+}
+
+// Certificate generator for Zookeeper/Kafka
+target "zookeeper-certgen" {
+  context    = "./backend/zookeeper"
+  dockerfile = "Dockerfile.certgen"
+  tags       = ["integr8scode-zookeeper-certgen:latest"]
+  cache-from = CACHE_FROM != "" ? [CACHE_FROM] : []
+  cache-to   = CACHE_TO != "" ? [CACHE_TO] : []
+}
+
+// Certificate generator for TLS (mkcert)
+target "cert-generator" {
+  context    = "./cert-generator"
+  dockerfile = "Dockerfile"
+  tags       = ["integr8scode-cert-generator:latest"]
+  cache-from = CACHE_FROM != "" ? [CACHE_FROM] : []
+  cache-to   = CACHE_TO != "" ? [CACHE_TO] : []
+}
+
+// Execution Coordinator worker
+target "coordinator" {
+  context    = "./backend"
+  dockerfile = "workers/Dockerfile.coordinator"
+  tags       = ["integr8scode-coordinator:latest"]
+  contexts = {
+    base = "target:base"
+  }
+  cache-from = CACHE_FROM != "" ? [CACHE_FROM] : []
+  cache-to   = CACHE_TO != "" ? [CACHE_TO] : []
+}
+
+// Saga Orchestrator worker
+target "saga-orchestrator" {
+  context    = "./backend"
+  dockerfile = "workers/Dockerfile.saga_orchestrator"
+  tags       = ["integr8scode-saga-orchestrator:latest"]
+  contexts = {
+    base = "target:base"
+  }
+  cache-from = CACHE_FROM != "" ? [CACHE_FROM] : []
+  cache-to   = CACHE_TO != "" ? [CACHE_TO] : []
+}
+
+// Kubernetes Worker
+target "k8s-worker" {
+  context    = "./backend"
+  dockerfile = "workers/Dockerfile.k8s_worker"
+  tags       = ["integr8scode-k8s-worker:latest"]
+  contexts = {
+    base = "target:base"
+  }
+  cache-from = CACHE_FROM != "" ? [CACHE_FROM] : []
+  cache-to   = CACHE_TO != "" ? [CACHE_TO] : []
+}
+
+// Pod Monitor worker
+target "pod-monitor" {
+  context    = "./backend"
+  dockerfile = "workers/Dockerfile.pod_monitor"
+  tags       = ["integr8scode-pod-monitor:latest"]
+  contexts = {
+    base = "target:base"
+  }
+  cache-from = CACHE_FROM != "" ? [CACHE_FROM] : []
+  cache-to   = CACHE_TO != "" ? [CACHE_TO] : []
+}
+
+// Result Processor worker
+target "result-processor" {
+  context    = "./backend"
+  dockerfile = "workers/Dockerfile.result_processor"
+  tags       = ["integr8scode-result-processor:latest"]
+  contexts = {
+    base = "target:base"
+  }
+  cache-from = CACHE_FROM != "" ? [CACHE_FROM] : []
+  cache-to   = CACHE_TO != "" ? [CACHE_TO] : []
+}
+
+// Event Replay service
+target "event-replay" {
+  context    = "./backend"
+  dockerfile = "workers/Dockerfile.event_replay"
+  tags       = ["integr8scode-event-replay:latest"]
+  contexts = {
+    base = "target:base"
+  }
+  cache-from = CACHE_FROM != "" ? [CACHE_FROM] : []
+  cache-to   = CACHE_TO != "" ? [CACHE_TO] : []
+}
+
+// DLQ Processor service
+target "dlq-processor" {
+  context    = "./backend"
+  dockerfile = "workers/Dockerfile.dlq_processor"
+  tags       = ["integr8scode-dlq-processor:latest"]
+  contexts = {
+    base = "target:base"
+  }
+  cache-from = CACHE_FROM != "" ? [CACHE_FROM] : []
+  cache-to   = CACHE_TO != "" ? [CACHE_TO] : []
+}
+
+// Frontend
+target "frontend" {
+  context    = "./frontend"
+  dockerfile = "Dockerfile"
+  tags       = ["integr8scode-frontend:latest"]
+  cache-from = CACHE_FROM != "" ? [CACHE_FROM] : []
+  cache-to   = CACHE_TO != "" ? [CACHE_TO] : []
+}
+
+// =============================================================================
+// GROUP TARGETS
+// =============================================================================
+
+// All worker services
+group "workers" {
+  targets = [
+    "coordinator",
+    "saga-orchestrator",
+    "k8s-worker",
+    "pod-monitor",
+    "result-processor",
+    "event-replay",
+    "dlq-processor",
+  ]
+}
+
+// Infrastructure build targets (certs)
+group "infra" {
+  targets = [
+    "zookeeper-certgen",
+  ]
+}
+
+// Backend E2E tests - everything needed except frontend
+group "backend-e2e" {
+  targets = [
+    "base",
+    "backend",
+    "zookeeper-certgen",
+    "cert-generator",
+    "coordinator",
+    "saga-orchestrator",
+    "k8s-worker",
+    "pod-monitor",
+    "result-processor",
+  ]
+}
+
+// Full stack
+group "all" {
+  targets = [
+    "base",
+    "backend",
+    "zookeeper-certgen",
+    "cert-generator",
+    "coordinator",
+    "saga-orchestrator",
+    "k8s-worker",
+    "pod-monitor",
+    "result-processor",
+    "event-replay",
+    "dlq-processor",
+    "frontend",
+  ]
+}
+
+// Default target
+group "default" {
+  targets = ["backend-e2e"]
+}
diff --git a/docker-compose.ci.yaml b/docker-compose.ci.yaml
deleted file mode 100644
index 3367c677..00000000
--- a/docker-compose.ci.yaml
+++ /dev/null
@@ -1,243 +0,0 @@
-# CI-optimized Docker Compose configuration
-#
-# Usage:
-#   Backend integration tests (infra only, no builds):
-#     docker compose -f docker-compose.ci.yaml up -d --wait
-#
-#   Frontend E2E tests (full stack with builds):
-#     docker compose -f docker-compose.ci.yaml --profile full up -d --wait
-#
-# Key differences from docker-compose.yaml:
-#   - KRaft Kafka (no Zookeeper) - simpler, faster startup
-#   - No SASL/TLS for Kafka - not needed for tests
-#   - Profiles separate infra from app services
-#   - Minimal services for fast CI
-
-services:
-  # =============================================================================
-  # INFRASTRUCTURE SERVICES (no profile = always started)
-  # =============================================================================
-
-  mongo:
-    image: mongo:8.0
-    container_name: mongo
-    ports:
-      - "27017:27017"
-    environment:
-      MONGO_INITDB_ROOT_USERNAME: root
-      MONGO_INITDB_ROOT_PASSWORD: rootpassword
-      MONGO_INITDB_DATABASE: integr8scode
-    tmpfs:
-      - /data/db  # Use tmpfs for faster CI
-    networks:
-      - ci-network
-    healthcheck:
-      test: mongosh --eval 'db.runCommand("ping").ok' --quiet
-      interval: 2s
-      timeout: 3s
-      retries: 15
-      start_period: 5s
-
-  redis:
-    image: redis:7-alpine
-    container_name: redis
-    ports:
-      - "6379:6379"
-    command: redis-server --maxmemory 128mb --maxmemory-policy allkeys-lru --save ""
-    networks:
-      - ci-network
-    healthcheck:
-      test: ["CMD", "redis-cli", "ping"]
-      interval: 2s
-      timeout: 2s
-      retries: 10
-      start_period: 2s
-
-  # KRaft mode Kafka - official Apache image, no Zookeeper needed
-  kafka:
-    image: apache/kafka:3.9.0
-    container_name: kafka
-    ports:
-      - "9092:9092"
-    environment:
-      # KRaft mode configuration
-      KAFKA_NODE_ID: 1
-      KAFKA_PROCESS_ROLES: broker,controller
-      KAFKA_CONTROLLER_QUORUM_VOTERS: 1@localhost:9093
-      # Listeners: CONTROLLER for raft, HOST for external, DOCKER for internal
-      KAFKA_LISTENERS: CONTROLLER://localhost:9093,HOST://0.0.0.0:9092,DOCKER://0.0.0.0:29092
-      KAFKA_ADVERTISED_LISTENERS: HOST://localhost:9092,DOCKER://kafka:29092
-      KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: CONTROLLER:PLAINTEXT,HOST:PLAINTEXT,DOCKER:PLAINTEXT
-      KAFKA_CONTROLLER_LISTENER_NAMES: CONTROLLER
-      KAFKA_INTER_BROKER_LISTENER_NAME: DOCKER
-      # CI optimizations
-      KAFKA_AUTO_CREATE_TOPICS_ENABLE: "true"
-      KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
-      KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 1
-      KAFKA_TRANSACTION_STATE_LOG_MIN_ISR: 1
-      KAFKA_NUM_PARTITIONS: 1
-      KAFKA_DEFAULT_REPLICATION_FACTOR: 1
-      # Reduce memory usage
-      KAFKA_HEAP_OPTS: "-Xms256m -Xmx512m"
-    networks:
-      - ci-network
-    healthcheck:
-      test: /opt/kafka/bin/kafka-broker-api-versions.sh --bootstrap-server localhost:9092 || exit 1
-      interval: 2s
-      timeout: 5s
-      retries: 30
-      start_period: 10s
-
-  schema-registry:
-    image: confluentinc/cp-schema-registry:7.5.0
-    container_name: schema-registry
-    ports:
-      - "8081:8081"
-    environment:
-      SCHEMA_REGISTRY_HOST_NAME: schema-registry
-      SCHEMA_REGISTRY_KAFKASTORE_BOOTSTRAP_SERVERS: kafka:29092
-      SCHEMA_REGISTRY_LISTENERS: http://0.0.0.0:8081
-      SCHEMA_REGISTRY_HEAP_OPTS: "-Xms128m -Xmx256m"
-    depends_on:
-      kafka:
-        condition: service_healthy
-    networks:
-      - ci-network
-    healthcheck:
-      test: curl -f http://localhost:8081/config || exit 1
-      interval: 2s
-      timeout: 3s
-      retries: 20
-      start_period: 3s
-
-  # =============================================================================
-  # APPLICATION SERVICES (profile: full - only for E2E tests)
-  # =============================================================================
-
-  # Shared base image for backend
-  base:
-    build:
-      context: ./backend
-      dockerfile: Dockerfile.base
-    image: integr8scode-base:latest
-    profiles: ["full"]
-
-  # Certificate generator for TLS
-  shared-ca:
-    image: alpine:latest
-    profiles: ["full"]
-    volumes:
-      - shared_ca:/shared_ca
-    command: sh -c "mkdir -p /shared_ca && chmod 777 /shared_ca && sleep 1"
-    networks:
-      - ci-network
-
-  cert-generator:
-    build:
-      context: ./cert-generator
-      dockerfile: Dockerfile
-    image: integr8scode-cert-generator:latest
-    profiles: ["full"]
-    volumes:
-      - ./backend/certs:/backend-certs
-      - ./frontend/certs:/frontend-certs
-      - shared_ca:/shared_ca
-      - ./backend:/backend
-    environment:
-      - SHARED_CA_DIR=/shared_ca
-      - BACKEND_CERT_DIR=/backend-certs
-      - FRONTEND_CERT_DIR=/frontend-certs
-      - CI=true
-    extra_hosts:
-      - "host.docker.internal:host-gateway"
-    restart: "no"
-    network_mode: host
-    depends_on:
-      shared-ca:
-        condition: service_completed_successfully
-
-  backend:
-    build:
-      context: ./backend
-      dockerfile: Dockerfile
-      additional_contexts:
-        base: service:base
-    image: integr8scode-backend:latest
-    profiles: ["full"]
-    container_name: backend
-    ports:
-      - "443:443"
-    environment:
-      - SERVER_HOST=0.0.0.0
-      - TESTING=true
-      - MONGODB_URL=mongodb://root:rootpassword@mongo:27017/integr8scode?authSource=admin
-      - KAFKA_BOOTSTRAP_SERVERS=kafka:29092
-      - SCHEMA_REGISTRY_URL=http://schema-registry:8081
-      - REDIS_HOST=redis
-      - REDIS_PORT=6379
-      - OTEL_SDK_DISABLED=true
-      - ENABLE_TRACING=false
-      - SECRET_KEY=ci-test-secret-key-for-testing-only-32chars!!
-    volumes:
-      - ./backend/certs:/app/certs:ro
-      - shared_ca:/shared_ca:ro
-      - ./backend/kubeconfig.yaml:/app/kubeconfig.yaml:ro
-    extra_hosts:
-      - "host.docker.internal:host-gateway"
-    depends_on:
-      base:
-        condition: service_completed_successfully
-      cert-generator:
-        condition: service_completed_successfully
-      mongo:
-        condition: service_healthy
-      redis:
-        condition: service_healthy
-      kafka:
-        condition: service_healthy
-      schema-registry:
-        condition: service_healthy
-    networks:
-      - ci-network
-    healthcheck:
-      test: ["CMD-SHELL", "curl -k -f -s https://localhost:443/api/v1/health/live || exit 1"]
-      interval: 5s
-      timeout: 5s
-      retries: 20
-      start_period: 30s
-
-  frontend:
-    build:
-      context: ./frontend
-      dockerfile: Dockerfile
-    image: integr8scode-frontend:latest
-    profiles: ["full"]
-    container_name: frontend
-    ports:
-      - "5001:5001"
-    environment:
-      - VITE_BACKEND_URL=https://backend:443
-      - NODE_EXTRA_CA_CERTS=/shared_ca/mkcert-ca.pem
-    volumes:
-      - ./frontend/certs:/app/certs:ro
-      - shared_ca:/shared_ca:ro
-    depends_on:
-      cert-generator:
-        condition: service_completed_successfully
-      backend:
-        condition: service_healthy
-    networks:
-      - ci-network
-    healthcheck:
-      test: ["CMD-SHELL", "curl -k -f -s https://localhost:5001/ || exit 1"]
-      interval: 5s
-      timeout: 5s
-      retries: 20
-      start_period: 30s
-
-volumes:
-  shared_ca:
-
-networks:
-  ci-network:
-    driver: bridge
diff --git a/docker-compose.yaml b/docker-compose.yaml
index f68ec656..d2c25d65 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -15,6 +15,7 @@ services:
       - app-network
 
   cert-generator:
+    image: integr8scode-cert-generator:latest
     build:
       context: ./cert-generator
       dockerfile: Dockerfile
@@ -49,10 +50,10 @@ services:
     container_name: mongo
     healthcheck:
       test: echo 'db.runCommand("ping").ok' | mongosh localhost/integr8scode -u ${MONGO_ROOT_USER:-root} -p ${MONGO_ROOT_PASSWORD:-rootpassword} --authenticationDatabase admin --quiet
-      interval: 10s
-      timeout: 10s
-      retries: 5
-      start_period: 30s
+      interval: 5s
+      timeout: 5s
+      retries: 10
+      start_period: 10s
 
   redis:
     image: redis:7-alpine
@@ -66,12 +67,13 @@ services:
       - app-network
     healthcheck:
       test: ["CMD", "redis-cli", "ping"]
-      interval: 10s
-      timeout: 5s
-      retries: 5
-      start_period: 10s
+      interval: 3s
+      timeout: 3s
+      retries: 10
+      start_period: 5s
 
   backend:
+    image: integr8scode-backend:latest
     build:
       context: ./backend
       dockerfile: Dockerfile
@@ -123,6 +125,7 @@ services:
       start_period: 10s
 
   frontend:
+    image: integr8scode-frontend:latest
     container_name: frontend
     build:
       context: ./frontend
@@ -165,6 +168,7 @@ services:
   # Kafka Infrastructure for Event-Driven Design
   # Certificate generator for Zookeeper/Kafka SSL
   zookeeper-certgen:
+    image: integr8scode-zookeeper-certgen:latest
     build:
       context: ./backend/zookeeper
       dockerfile: Dockerfile.certgen
@@ -247,10 +251,10 @@ services:
         hard: 65536
     healthcheck:
       test: ["CMD-SHELL", "echo ruok | nc localhost 2181 | grep imok"]
-      interval: 30s
-      timeout: 10s
-      retries: 3
-      start_period: 40s
+      interval: 5s
+      timeout: 5s
+      retries: 15
+      start_period: 15s
 
   kafka:
     image: confluentinc/cp-kafka:7.5.0
@@ -292,8 +296,8 @@ services:
       KAFKA_LOG_RETENTION_CHECK_INTERVAL_MS: 300000
       KAFKA_LOG_CLEANUP_POLICY: 'delete'
       
-      # JVM settings
-      KAFKA_HEAP_OPTS: '-Xms2G -Xmx2G'
+      # JVM settings (CI overrides with smaller heap via KAFKA_HEAP_OPTS env var)
+      KAFKA_HEAP_OPTS: ${KAFKA_HEAP_OPTS:--Xms2G -Xmx2G}
       KAFKA_JVM_PERFORMANCE_OPTS: '-server -XX:+UseG1GC -XX:MaxGCPauseMillis=20 -XX:InitiatingHeapOccupancyPercent=35 -XX:+ExplicitGCInvokesConcurrent -Djava.awt.headless=true'
       
     volumes:
@@ -308,10 +312,10 @@ services:
         hard: 65536
     healthcheck:
       test: ["CMD-SHELL", "kafka-broker-api-versions --bootstrap-server localhost:9092"]
-      interval: 30s
+      interval: 5s
       timeout: 10s
-      retries: 3
-      start_period: 60s
+      retries: 20
+      start_period: 20s
 
   schema-registry:
     image: confluentinc/cp-schema-registry:7.5.0
@@ -329,10 +333,10 @@ services:
       - app-network
     healthcheck:
       test: ["CMD", "curl", "-f", "http://localhost:8081/config"]
-      interval: 10s
+      interval: 5s
       timeout: 5s
-      retries: 5
-      start_period: 30s
+      retries: 15
+      start_period: 10s
 
   kafdrop:
     image: obsidiandynamics/kafdrop:3.31.0
@@ -351,6 +355,7 @@ services:
 
   # Kafka topic initialization
   kafka-init:
+    image: integr8scode-backend:latest
     build:
       context: ./backend
       dockerfile: Dockerfile
@@ -374,6 +379,7 @@ services:
 
   # Seed default users (runs once after mongo is ready)
   user-seed:
+    image: integr8scode-backend:latest
     build:
       context: ./backend
       dockerfile: Dockerfile
@@ -396,6 +402,7 @@ services:
 
   # Event-driven workers
   coordinator:
+    image: integr8scode-coordinator:latest
     build:
       context: ./backend
       dockerfile: workers/Dockerfile.coordinator
@@ -427,6 +434,7 @@ services:
     restart: unless-stopped
 
   k8s-worker:
+    image: integr8scode-k8s-worker:latest
     build:
       context: ./backend
       dockerfile: workers/Dockerfile.k8s_worker
@@ -465,6 +473,7 @@ services:
     restart: unless-stopped
 
   pod-monitor:
+    image: integr8scode-pod-monitor:latest
     build:
       context: ./backend
       dockerfile: workers/Dockerfile.pod_monitor
@@ -500,6 +509,7 @@ services:
     restart: unless-stopped
 
   result-processor:
+    image: integr8scode-result-processor:latest
     build:
       context: ./backend
       dockerfile: workers/Dockerfile.result_processor
@@ -537,6 +547,7 @@ services:
     restart: unless-stopped
 
   saga-orchestrator:
+    image: integr8scode-saga-orchestrator:latest
     build:
       context: ./backend
       dockerfile: workers/Dockerfile.saga_orchestrator
@@ -587,6 +598,7 @@ services:
 
   # Event replay service
   event-replay:
+    image: integr8scode-event-replay:latest
     build:
       context: ./backend
       dockerfile: workers/Dockerfile.event_replay
@@ -618,6 +630,7 @@ services:
 
   # DLQ Processor Service
   dlq-processor:
+    image: integr8scode-dlq-processor:latest
     build:
       context: ./backend
       dockerfile: workers/Dockerfile.dlq_processor
diff --git a/docs/operations/cicd.md b/docs/operations/cicd.md
index 07440893..72ea2e98 100644
--- a/docs/operations/cicd.md
+++ b/docs/operations/cicd.md
@@ -152,13 +152,9 @@ The workflow starts by installing [k3s](https://k3s.io/), a lightweight Kubernet
 interact with a real cluster during tests. It pre-pulls container images in parallel to avoid cold-start delays during
 the build step.
 
-Before building, the workflow modifies `docker-compose.yaml` using [yq](https://github.com/mikefarah/yq) to create a
-CI-specific configuration. These modifications disable SASL authentication on Kafka and Zookeeper (unnecessary for
-isolated CI), remove volume mounts that cause permission conflicts, inject test credentials for MongoDB, and disable
-OpenTelemetry export to avoid connection errors. The result is a `docker-compose.ci.yaml` that works reliably in the
-ephemeral CI environment.
-
-The [docker/bake-action](https://github.com/docker/bake-action) builds all services with GitHub Actions cache support.
+The [docker/bake-action](https://github.com/docker/bake-action) builds all services using `docker-bake.hcl` with GitHub
+Actions cache support. The bake file defines build targets with proper dependencies (e.g., backend depends on base) and
+cache configuration. Using a single `docker-compose.yaml` for both development and CI ensures consistency.
 It reads cache layers from previous runs and writes new layers back, so unchanged dependencies don't rebuild. The cache
 scopes are branch-specific with a fallback to main, meaning feature branches benefit from the main branch cache even on
 their first run.
diff --git a/tests.md b/tests.md
new file mode 100644
index 00000000..b33ecccf
--- /dev/null
+++ b/tests.md
@@ -0,0 +1,72 @@
+● Missing Test Coverage Report
+
+  Critical Gaps (Fix Immediately)
+
+  | Component                               | Type        | Missing Tests                                       | Priority |
+  |-----------------------------------------|-------------|-----------------------------------------------------|----------|
+  | auth_service.py                         | Unit        | Authentication flow, token validation, error cases  | CRITICAL |
+  | grafana_alert_processor.py              | Unit        | Severity mapping, alert parsing, webhook processing | CRITICAL |
+  | All Kafka Events (8 modules, 790 lines) | Unit        | Serialization, validation, construction             | CRITICAL |
+  | event_repository.py                     | Integration | Filtering, aggregation, pagination                  | HIGH     |
+  | notification_repository.py              | Integration | CRUD, status updates, queries                       | HIGH     |
+  | saga_repository.py                      | Integration | Persistence, state updates, step tracking           | HIGH     |
+
+  Repositories with ZERO Tests (7 of 14)
+
+  app/db/repositories/
+  ├── event_repository.py        (295 lines) ❌
+  ├── notification_repository.py (233 lines) ❌
+  ├── replay_repository.py       (99 lines)  ❌
+  ├── saga_repository.py         (146 lines) ❌
+  ├── sse_repository.py          (23 lines)  ❌
+  ├── user_repository.py         (70 lines)  ❌
+  └── user_settings_repository.py(74 lines)  ❌
+
+  Services with Inadequate Coverage
+
+  app/services/
+  ├── auth_service.py           (39 lines)  - 0 direct tests ❌
+  ├── grafana_alert_processor.py(150 lines) - 0 tests ❌
+  ├── event_bus.py              (350 lines) - limited ⚠️
+  ├── notification_service.py   (951 lines) - 2 imports only ⚠️
+  └── rate_limit_service.py     (592 lines) - 1 import only ⚠️
+
+  Kafka Events - ALL UNTESTED
+
+  app/infrastructure/kafka/events/
+  ├── execution.py   (136 lines) ❌
+  ├── saga.py        (112 lines) ❌
+  ├── system.py      (123 lines) ❌
+  ├── notification.py(63 lines)  ❌
+  ├── user.py        (86 lines)  ❌
+  ├── pod.py         (69 lines)  ❌
+  ├── base.py        (37 lines)  ❌
+  └── metadata.py    (31 lines)  ❌
+
+  Workers - NO UNIT TESTS
+
+  workers/
+  ├── run_saga_orchestrator.py ❌
+  ├── run_event_replay.py      ❌
+  ├── run_coordinator.py       ❌
+  ├── run_pod_monitor.py       ❌
+  └── dlq_processor.py         ❌
+
+  Middleware - 4 of 5 UNTESTED
+
+  app/core/middlewares/
+  ├── cache.py            ❌
+  ├── metrics.py          ❌
+  ├── rate_limit.py       ❌
+  └── request_size_limit.py ❌
+
+  Summary
+
+  | Category     | Coverage | Missing Tests |
+  |--------------|----------|---------------|
+  | Services     | 30%      | ~150 tests    |
+  | Repositories | 29%      | ~120 tests    |
+  | Kafka Events | 0%       | ~70 tests     |
+  | Middleware   | 20%      | ~30 tests     |
+  | Workers      | 0%       | ~50 tests     |
+  | TOTAL        | ~40%     | ~420 tests    |
\ No newline at end of file