diff --git a/.github/workflows/backend-ci.yml b/.github/workflows/backend-ci.yml
index 1ea5c6fa..f3c56abd 100644
--- a/.github/workflows/backend-ci.yml
+++ b/.github/workflows/backend-ci.yml
@@ -90,6 +90,15 @@ jobs:
           docker compose -f docker-compose.ci.yaml up -d --wait --wait-timeout 120
           docker compose -f docker-compose.ci.yaml ps
 
+      - name: Create Kafka topics
+        timeout-minutes: 2
+        env:
+          KAFKA_BOOTSTRAP_SERVERS: localhost:9092
+          KAFKA_TOPIC_PREFIX: "ci.${{ github.run_id }}."
+        run: |
+          cd backend
+          uv run python -m scripts.create_topics
+
       - name: Run integration tests
         timeout-minutes: 10
         env:
@@ -99,6 +108,7 @@ jobs:
           MONGODB_PORT: 27017
           MONGODB_URL: mongodb://root:rootpassword@127.0.0.1:27017/?authSource=admin
           KAFKA_BOOTSTRAP_SERVERS: localhost:9092
+          KAFKA_TOPIC_PREFIX: "ci.${{ github.run_id }}."
           SCHEMA_REGISTRY_URL: http://localhost:8081
           REDIS_HOST: localhost
           REDIS_PORT: 6379
@@ -174,6 +184,15 @@ jobs:
           timeout 90 bash -c 'until sudo k3s kubectl cluster-info; do sleep 5; done'
           kubectl create namespace integr8scode --dry-run=client -o yaml | kubectl apply -f -
 
+      - name: Create Kafka topics
+        timeout-minutes: 2
+        env:
+          KAFKA_BOOTSTRAP_SERVERS: localhost:9092
+          KAFKA_TOPIC_PREFIX: "ci.${{ github.run_id }}."
+        run: |
+          cd backend
+          uv run python -m scripts.create_topics
+
       - name: Run E2E tests
         timeout-minutes: 10
         env:
@@ -181,6 +200,7 @@ jobs:
           MONGO_ROOT_PASSWORD: rootpassword
           MONGODB_URL: mongodb://root:rootpassword@127.0.0.1:27017/?authSource=admin
           KAFKA_BOOTSTRAP_SERVERS: localhost:9092
+          KAFKA_TOPIC_PREFIX: "ci.${{ github.run_id }}."
           SCHEMA_REGISTRY_URL: http://localhost:8081
           REDIS_HOST: localhost
           REDIS_PORT: 6379
diff --git a/backend/.env.test b/backend/.env.test
index ae7222f4..efc653de 100644
--- a/backend/.env.test
+++ b/backend/.env.test
@@ -22,6 +22,7 @@ REDIS_DECODE_RESPONSES=true
 
 # Kafka - use localhost for tests
 KAFKA_BOOTSTRAP_SERVERS=localhost:9092
+KAFKA_TOPIC_PREFIX=test.
 SCHEMA_REGISTRY_URL=http://localhost:8081
 
 # Security
@@ -31,9 +32,12 @@ CORS_ALLOWED_ORIGINS=["http://localhost:3000","https://localhost:3000"]
 # Features
 RATE_LIMIT_ENABLED=true
 ENABLE_TRACING=false
-OTEL_SDK_DISABLED=true
+
+# OpenTelemetry - explicitly disabled for tests (no endpoint = NoOp meter)
+OTEL_EXPORTER_OTLP_ENDPOINT=
 OTEL_METRICS_EXPORTER=none
 OTEL_TRACES_EXPORTER=none
+OTEL_LOGS_EXPORTER=none
 
 # Development
 DEVELOPMENT_MODE=false
diff --git a/backend/app/core/container.py b/backend/app/core/container.py
index c7f8b9d7..97411a49 100644
--- a/backend/app/core/container.py
+++ b/backend/app/core/container.py
@@ -5,22 +5,35 @@
     AdminServicesProvider,
     AuthProvider,
     BusinessServicesProvider,
-    ConnectionProvider,
+    CoordinatorProvider,
     CoreServicesProvider,
     DatabaseProvider,
+    DLQProcessorProvider,
     EventProvider,
+    EventReplayProvider,
+    K8sWorkerProvider,
+    KafkaServicesProvider,
+    KubernetesProvider,
     LoggingProvider,
     MessagingProvider,
+    MetricsProvider,
+    PodMonitorProvider,
     RedisProvider,
-    ResultProcessorProvider,
+    RepositoryProvider,
+    SagaOrchestratorProvider,
     SettingsProvider,
+    SSEProvider,
     UserServicesProvider,
 )
+from app.settings import Settings
 
 
-def create_app_container() -> AsyncContainer:
+def create_app_container(settings: Settings) -> AsyncContainer:
     """
     Create the application DI container.
+
+    Args:
+        settings: Application settings (injected via from_context).
""" return make_async_container( SettingsProvider(), @@ -28,30 +41,138 @@ def create_app_container() -> AsyncContainer: DatabaseProvider(), RedisProvider(), CoreServicesProvider(), + MetricsProvider(), + RepositoryProvider(), MessagingProvider(), EventProvider(), - ConnectionProvider(), + KafkaServicesProvider(), + SSEProvider(), AuthProvider(), UserServicesProvider(), AdminServicesProvider(), BusinessServicesProvider(), FastapiProvider(), + context={Settings: settings}, ) -def create_result_processor_container() -> AsyncContainer: +def create_result_processor_container(settings: Settings) -> AsyncContainer: """ Create a minimal DI container for the ResultProcessor worker. - Includes only settings, database, event/kafka, and required repositories. + + Args: + settings: Application settings (injected via from_context). """ + return make_async_container( + SettingsProvider(), + LoggingProvider(), + DatabaseProvider(), + RedisProvider(), + CoreServicesProvider(), + MetricsProvider(), + RepositoryProvider(), + EventProvider(), + MessagingProvider(), + context={Settings: settings}, + ) + + +def create_coordinator_container(settings: Settings) -> AsyncContainer: + """Create DI container for the ExecutionCoordinator worker.""" + return make_async_container( + SettingsProvider(), + LoggingProvider(), + DatabaseProvider(), + RedisProvider(), + CoreServicesProvider(), + MetricsProvider(), + RepositoryProvider(), + MessagingProvider(), + EventProvider(), + CoordinatorProvider(), + context={Settings: settings}, + ) + + +def create_k8s_worker_container(settings: Settings) -> AsyncContainer: + """Create DI container for the KubernetesWorker.""" + return make_async_container( + SettingsProvider(), + LoggingProvider(), + DatabaseProvider(), + RedisProvider(), + CoreServicesProvider(), + MetricsProvider(), + RepositoryProvider(), + MessagingProvider(), + EventProvider(), + KubernetesProvider(), + K8sWorkerProvider(), + context={Settings: settings}, + ) + + +def create_pod_monitor_container(settings: Settings) -> AsyncContainer: + """Create DI container for the PodMonitor worker.""" return make_async_container( SettingsProvider(), LoggingProvider(), DatabaseProvider(), CoreServicesProvider(), - ConnectionProvider(), + MetricsProvider(), + RepositoryProvider(), + MessagingProvider(), + EventProvider(), + KafkaServicesProvider(), + KubernetesProvider(), + PodMonitorProvider(), + context={Settings: settings}, + ) + + +def create_saga_orchestrator_container(settings: Settings) -> AsyncContainer: + """Create DI container for the SagaOrchestrator worker.""" + return make_async_container( + SettingsProvider(), + LoggingProvider(), + DatabaseProvider(), RedisProvider(), + CoreServicesProvider(), + MetricsProvider(), + RepositoryProvider(), + MessagingProvider(), EventProvider(), + SagaOrchestratorProvider(), + context={Settings: settings}, + ) + + +def create_event_replay_container(settings: Settings) -> AsyncContainer: + """Create DI container for the EventReplay worker.""" + return make_async_container( + SettingsProvider(), + LoggingProvider(), + DatabaseProvider(), + CoreServicesProvider(), + MetricsProvider(), + RepositoryProvider(), MessagingProvider(), - ResultProcessorProvider(), + EventProvider(), + EventReplayProvider(), + context={Settings: settings}, + ) + + +def create_dlq_processor_container(settings: Settings) -> AsyncContainer: + """Create DI container for the DLQ processor worker.""" + return make_async_container( + SettingsProvider(), + LoggingProvider(), + DatabaseProvider(), + 
CoreServicesProvider(), + MetricsProvider(), + RepositoryProvider(), + EventProvider(), + DLQProcessorProvider(), + context={Settings: settings}, ) diff --git a/backend/app/core/lifecycle.py b/backend/app/core/lifecycle.py index eec4fe93..2e0d8f85 100644 --- a/backend/app/core/lifecycle.py +++ b/backend/app/core/lifecycle.py @@ -1,24 +1,62 @@ from __future__ import annotations from types import TracebackType -from typing import Optional, Self, Type +from typing import Self class LifecycleEnabled: - async def start(self) -> None: # pragma: no cover - raise NotImplementedError + """Base class for services with async lifecycle management. - async def stop(self) -> None: # pragma: no cover - raise NotImplementedError + Usage: + async with MyService() as service: + # service is running + # service is stopped + + Subclasses override _on_start() and _on_stop() for their logic. + Base class handles idempotency and context manager protocol. + + For internal component cleanup, use aclose() which follows Python's + standard async cleanup pattern (like aiofiles, aiohttp). + """ + + def __init__(self) -> None: + self._lifecycle_started: bool = False + + async def _on_start(self) -> None: + """Override with startup logic. Called once on enter.""" + pass + + async def _on_stop(self) -> None: + """Override with cleanup logic. Called once on exit.""" + pass + + async def aclose(self) -> None: + """Close the service. For internal component cleanup. + + Mirrors Python's standard aclose() pattern (like aiofiles, aiohttp). + Idempotent - safe to call multiple times. + """ + if not self._lifecycle_started: + return + self._lifecycle_started = False + await self._on_stop() + + @property + def is_running(self) -> bool: + """Check if service is currently running.""" + return self._lifecycle_started async def __aenter__(self) -> Self: - await self.start() + if self._lifecycle_started: + return self # Already started, idempotent + await self._on_start() + self._lifecycle_started = True return self async def __aexit__( self, - exc_type: Optional[Type[BaseException]], - exc: Optional[BaseException], - tb: Optional[TracebackType], + exc_type: type[BaseException] | None, + exc: BaseException | None, + tb: TracebackType | None, ) -> None: - await self.stop() + await self.aclose() diff --git a/backend/app/core/providers.py b/backend/app/core/providers.py index 0ab818d4..c1f29693 100644 --- a/backend/app/core/providers.py +++ b/backend/app/core/providers.py @@ -2,7 +2,7 @@ from typing import AsyncIterator import redis.asyncio as redis -from dishka import Provider, Scope, provide +from dishka import Provider, Scope, from_context, provide from pymongo.asynchronous.mongo_client import AsyncMongoClient from app.core.database_context import Database @@ -40,6 +40,7 @@ from app.db.repositories.resource_allocation_repository import ResourceAllocationRepository from app.db.repositories.user_settings_repository import UserSettingsRepository from app.dlq.manager import DLQManager, create_dlq_manager +from app.domain.enums.kafka import KafkaTopic from app.domain.saga.models import SagaConfig from app.events.core import ProducerConfig, UnifiedProducer from app.events.event_store import EventStore, create_event_store @@ -57,8 +58,12 @@ from app.services.idempotency import IdempotencyConfig, IdempotencyManager from app.services.idempotency.idempotency_manager import create_idempotency_manager from app.services.idempotency.redis_repository import RedisIdempotencyRepository +from app.services.k8s_worker.config import K8sWorkerConfig +from 
app.services.k8s_worker.worker import KubernetesWorker from app.services.kafka_event_service import KafkaEventService from app.services.notification_service import NotificationService +from app.services.pod_monitor.config import PodMonitorConfig +from app.services.pod_monitor.monitor import PodMonitor from app.services.rate_limit_service import RateLimitService from app.services.replay_service import ReplayService from app.services.saga import SagaOrchestrator, create_saga_orchestrator @@ -69,15 +74,13 @@ from app.services.sse.sse_service import SSEService from app.services.sse.sse_shutdown_manager import SSEShutdownManager, create_sse_shutdown_manager from app.services.user_settings_service import UserSettingsService -from app.settings import Settings, get_settings +from app.settings import Settings class SettingsProvider(Provider): - scope = Scope.APP + """Provides Settings from context (passed to make_async_container).""" - @provide - def get_settings(self) -> Settings: - return get_settings() + settings = from_context(provides=Settings, scope=Scope.APP) class LoggingProvider(Provider): @@ -106,7 +109,7 @@ async def get_redis_client(self, settings: Settings, logger: logging.Logger) -> socket_timeout=5, ) # Test connection - await client.execute_command("PING") # type: ignore[no-untyped-call] + await client.ping() # type: ignore[misc] # redis-py dual sync/async return type logger.info(f"Redis connected: {settings.REDIS_HOST}:{settings.REDIS_PORT}/{settings.REDIS_DB}") try: yield client @@ -152,23 +155,15 @@ async def get_kafka_producer( self, settings: Settings, schema_registry: SchemaRegistryManager, logger: logging.Logger ) -> AsyncIterator[UnifiedProducer]: config = ProducerConfig(bootstrap_servers=settings.KAFKA_BOOTSTRAP_SERVERS) - producer = UnifiedProducer(config, schema_registry, logger) - await producer.start() - try: + async with UnifiedProducer(config, schema_registry, logger) as producer: yield producer - finally: - await producer.stop() @provide async def get_dlq_manager( - self, schema_registry: SchemaRegistryManager, logger: logging.Logger + self, settings: Settings, schema_registry: SchemaRegistryManager, logger: logging.Logger ) -> AsyncIterator[DLQManager]: - manager = create_dlq_manager(schema_registry, logger) - await manager.start() - try: + async with create_dlq_manager(settings, schema_registry, logger) as manager: yield manager - finally: - await manager.stop() @provide def get_idempotency_repository(self, redis_client: redis.Redis) -> RedisIdempotencyRepository: @@ -190,8 +185,8 @@ class EventProvider(Provider): scope = Scope.APP @provide - def get_schema_registry(self, logger: logging.Logger) -> SchemaRegistryManager: - return create_schema_registry_manager(logger) + def get_schema_registry(self, settings: Settings, logger: logging.Logger) -> SchemaRegistryManager: + return create_schema_registry_manager(settings, logger) @provide async def get_event_store(self, schema_registry: SchemaRegistryManager, logger: logging.Logger) -> EventStore: @@ -203,17 +198,20 @@ async def get_event_store_consumer( self, event_store: EventStore, schema_registry: SchemaRegistryManager, + settings: Settings, kafka_producer: UnifiedProducer, logger: logging.Logger, - ) -> EventStoreConsumer: + ) -> AsyncIterator[EventStoreConsumer]: topics = get_all_topics() - return create_event_store_consumer( + async with create_event_store_consumer( event_store=event_store, topics=list(topics), schema_registry_manager=schema_registry, + settings=settings, producer=kafka_producer, logger=logger, - 
) + ) as consumer: + yield consumer @provide async def get_event_bus_manager(self, logger: logging.Logger) -> AsyncIterator[EventBusManager]: @@ -236,18 +234,17 @@ async def get_k8s_clients(self, settings: Settings, logger: logging.Logger) -> A close_k8s_clients(clients) -class ConnectionProvider(Provider): +class MetricsProvider(Provider): + """Provides all metrics instances.""" + scope = Scope.APP @provide def get_event_metrics(self) -> EventMetrics: - # Create the metrics instance that will be placed in context - # No longer a singleton - context manages the single instance return EventMetrics() @provide def get_connection_metrics(self) -> ConnectionMetrics: - # Create the metrics instance that will be placed in context return ConnectionMetrics() @provide @@ -290,11 +287,80 @@ def get_replay_metrics(self) -> ReplayMetrics: def get_security_metrics(self) -> SecurityMetrics: return SecurityMetrics() - @provide(scope=Scope.REQUEST) - def get_sse_shutdown_manager(self, logger: logging.Logger) -> SSEShutdownManager: - return create_sse_shutdown_manager(logger=logger) - @provide(scope=Scope.APP) +class RepositoryProvider(Provider): + """Provides all repository instances. Repositories are stateless facades over database operations.""" + + scope = Scope.APP + + @provide + def get_execution_repository(self, logger: logging.Logger) -> ExecutionRepository: + return ExecutionRepository(logger) + + @provide + def get_saga_repository(self) -> SagaRepository: + return SagaRepository() + + @provide + def get_resource_allocation_repository(self) -> ResourceAllocationRepository: + return ResourceAllocationRepository() + + @provide + def get_saved_script_repository(self) -> SavedScriptRepository: + return SavedScriptRepository() + + @provide + def get_dlq_repository(self, logger: logging.Logger) -> DLQRepository: + return DLQRepository(logger) + + @provide + def get_replay_repository(self, logger: logging.Logger) -> ReplayRepository: + return ReplayRepository(logger) + + @provide + def get_event_repository(self, logger: logging.Logger) -> EventRepository: + return EventRepository(logger) + + @provide + def get_user_settings_repository(self, logger: logging.Logger) -> UserSettingsRepository: + return UserSettingsRepository(logger) + + @provide + def get_admin_events_repository(self) -> AdminEventsRepository: + return AdminEventsRepository() + + @provide + def get_admin_settings_repository(self, logger: logging.Logger) -> AdminSettingsRepository: + return AdminSettingsRepository(logger) + + @provide + def get_admin_user_repository(self) -> AdminUserRepository: + return AdminUserRepository() + + @provide + def get_notification_repository(self, logger: logging.Logger) -> NotificationRepository: + return NotificationRepository(logger) + + @provide + def get_sse_repository(self) -> SSERepository: + return SSERepository() + + @provide + def get_user_repository(self) -> UserRepository: + return UserRepository() + + +class SSEProvider(Provider): + """Provides SSE (Server-Sent Events) related services.""" + + scope = Scope.APP + + @provide + async def get_sse_redis_bus(self, redis_client: redis.Redis, logger: logging.Logger) -> AsyncIterator[SSERedisBus]: + bus = SSERedisBus(redis_client, logger) + yield bus + + @provide async def get_sse_kafka_redis_bridge( self, schema_registry: SchemaRegistryManager, @@ -302,23 +368,19 @@ async def get_sse_kafka_redis_bridge( event_metrics: EventMetrics, sse_redis_bus: SSERedisBus, logger: logging.Logger, - ) -> SSEKafkaRedisBridge: - return create_sse_kafka_redis_bridge( + ) -> 
AsyncIterator[SSEKafkaRedisBridge]: + async with create_sse_kafka_redis_bridge( schema_registry=schema_registry, settings=settings, event_metrics=event_metrics, sse_bus=sse_redis_bus, logger=logger, - ) + ) as bridge: + yield bridge - @provide - def get_sse_repository(self) -> SSERepository: - return SSERepository() - - @provide - async def get_sse_redis_bus(self, redis_client: redis.Redis, logger: logging.Logger) -> AsyncIterator[SSERedisBus]: - bus = SSERedisBus(redis_client, logger) - yield bus + @provide(scope=Scope.REQUEST) + def get_sse_shutdown_manager(self, logger: logging.Logger) -> SSEShutdownManager: + return create_sse_shutdown_manager(logger=logger) @provide(scope=Scope.REQUEST) def get_sse_service( @@ -330,7 +392,6 @@ def get_sse_service( settings: Settings, logger: logging.Logger, ) -> SSEService: - # Ensure shutdown manager coordinates with the router in this request scope shutdown_manager.set_router(router) return SSEService( repository=sse_repository, @@ -345,36 +406,30 @@ def get_sse_service( class AuthProvider(Provider): scope = Scope.APP - @provide - def get_user_repository(self) -> UserRepository: - return UserRepository() - @provide def get_auth_service(self, user_repository: UserRepository, logger: logging.Logger) -> AuthService: return AuthService(user_repository, logger) -class UserServicesProvider(Provider): - scope = Scope.APP - - @provide - def get_user_settings_repository(self, logger: logging.Logger) -> UserSettingsRepository: - return UserSettingsRepository(logger) +class KafkaServicesProvider(Provider): + """Provides Kafka-related event services used by both main app and workers.""" - @provide - def get_event_repository(self, logger: logging.Logger) -> EventRepository: - return EventRepository(logger) + scope = Scope.APP @provide - async def get_event_service(self, event_repository: EventRepository) -> EventService: + def get_event_service(self, event_repository: EventRepository) -> EventService: return EventService(event_repository) @provide - async def get_kafka_event_service( + def get_kafka_event_service( self, event_repository: EventRepository, kafka_producer: UnifiedProducer, logger: logging.Logger ) -> KafkaEventService: return KafkaEventService(event_repository=event_repository, kafka_producer=kafka_producer, logger=logger) + +class UserServicesProvider(Provider): + scope = Scope.APP + @provide async def get_user_settings_service( self, @@ -391,10 +446,6 @@ async def get_user_settings_service( class AdminServicesProvider(Provider): scope = Scope.APP - @provide - def get_admin_events_repository(self) -> AdminEventsRepository: - return AdminEventsRepository() - @provide(scope=Scope.REQUEST) def get_admin_events_service( self, @@ -404,10 +455,6 @@ def get_admin_events_service( ) -> AdminEventsService: return AdminEventsService(admin_events_repository, replay_service, logger) - @provide - def get_admin_settings_repository(self, logger: logging.Logger) -> AdminSettingsRepository: - return AdminSettingsRepository(logger) - @provide def get_admin_settings_service( self, @@ -416,18 +463,6 @@ def get_admin_settings_service( ) -> AdminSettingsService: return AdminSettingsService(admin_settings_repository, logger) - @provide - def get_admin_user_repository(self) -> AdminUserRepository: - return AdminUserRepository() - - @provide - def get_saga_repository(self) -> SagaRepository: - return SagaRepository() - - @provide - def get_notification_repository(self, logger: logging.Logger) -> NotificationRepository: - return NotificationRepository(logger) - @provide def 
get_notification_service( self, @@ -460,60 +495,75 @@ def get_grafana_alert_processor( return GrafanaAlertProcessor(notification_service, logger) -class BusinessServicesProvider(Provider): - scope = Scope.REQUEST - - @provide - def get_execution_repository(self, logger: logging.Logger) -> ExecutionRepository: - return ExecutionRepository(logger) - - @provide - def get_resource_allocation_repository(self) -> ResourceAllocationRepository: - return ResourceAllocationRepository() +def _create_default_saga_config() -> SagaConfig: + """Factory for default SagaConfig used by orchestrators.""" + return SagaConfig( + name="main-orchestrator", + timeout_seconds=300, + max_retries=3, + retry_delay_seconds=5, + enable_compensation=True, + store_events=True, + publish_commands=True, + ) + + +# Standalone factory functions for lifecycle-managed services (eliminates duplication) +async def _provide_saga_orchestrator( + saga_repository: SagaRepository, + kafka_producer: UnifiedProducer, + schema_registry: SchemaRegistryManager, + settings: Settings, + event_store: EventStore, + idempotency_manager: IdempotencyManager, + resource_allocation_repository: ResourceAllocationRepository, + logger: logging.Logger, +) -> AsyncIterator[SagaOrchestrator]: + """Shared factory for SagaOrchestrator with lifecycle management.""" + async with create_saga_orchestrator( + saga_repository=saga_repository, + producer=kafka_producer, + schema_registry_manager=schema_registry, + settings=settings, + event_store=event_store, + idempotency_manager=idempotency_manager, + resource_allocation_repository=resource_allocation_repository, + config=_create_default_saga_config(), + logger=logger, + ) as orchestrator: + yield orchestrator + + +async def _provide_execution_coordinator( + kafka_producer: UnifiedProducer, + schema_registry: SchemaRegistryManager, + settings: Settings, + event_store: EventStore, + execution_repository: ExecutionRepository, + idempotency_manager: IdempotencyManager, + logger: logging.Logger, +) -> AsyncIterator[ExecutionCoordinator]: + """Shared factory for ExecutionCoordinator with lifecycle management.""" + async with ExecutionCoordinator( + producer=kafka_producer, + schema_registry_manager=schema_registry, + settings=settings, + event_store=event_store, + execution_repository=execution_repository, + idempotency_manager=idempotency_manager, + logger=logger, + ) as coordinator: + yield coordinator - @provide - def get_saved_script_repository(self) -> SavedScriptRepository: - return SavedScriptRepository() - @provide - def get_dlq_repository(self, logger: logging.Logger) -> DLQRepository: - return DLQRepository(logger) +class BusinessServicesProvider(Provider): + scope = Scope.REQUEST - @provide - def get_replay_repository(self, logger: logging.Logger) -> ReplayRepository: - return ReplayRepository(logger) - - @provide - async def get_saga_orchestrator( - self, - saga_repository: SagaRepository, - kafka_producer: UnifiedProducer, - event_store: EventStore, - idempotency_manager: IdempotencyManager, - resource_allocation_repository: ResourceAllocationRepository, - settings: Settings, - ) -> AsyncIterator[SagaOrchestrator]: - config = SagaConfig( - name="main-orchestrator", - timeout_seconds=300, - max_retries=3, - retry_delay_seconds=5, - enable_compensation=True, - store_events=True, - publish_commands=True, - ) - orchestrator = create_saga_orchestrator( - saga_repository=saga_repository, - producer=kafka_producer, - event_store=event_store, - idempotency_manager=idempotency_manager, - 
resource_allocation_repository=resource_allocation_repository, - config=config, - ) - try: - yield orchestrator - finally: - await orchestrator.stop() + def __init__(self) -> None: + super().__init__() + # Register shared factory functions on instance (avoids warning about missing self) + self.provide(_provide_saga_orchestrator) + self.provide(_provide_execution_coordinator) @provide def get_saga_service( @@ -583,33 +633,103 @@ def get_admin_user_service( logger=logger, ) + +class CoordinatorProvider(Provider): + scope = Scope.APP + + def __init__(self) -> None: + super().__init__() + self.provide(_provide_execution_coordinator) + + +class K8sWorkerProvider(Provider): + scope = Scope.APP + @provide - async def get_execution_coordinator( + async def get_kubernetes_worker( self, kafka_producer: UnifiedProducer, schema_registry: SchemaRegistryManager, + settings: Settings, event_store: EventStore, - execution_repository: ExecutionRepository, idempotency_manager: IdempotencyManager, logger: logging.Logger, - ) -> AsyncIterator[ExecutionCoordinator]: - coordinator = ExecutionCoordinator( + ) -> AsyncIterator[KubernetesWorker]: + config = K8sWorkerConfig() + async with KubernetesWorker( + config=config, producer=kafka_producer, schema_registry_manager=schema_registry, + settings=settings, event_store=event_store, - execution_repository=execution_repository, idempotency_manager=idempotency_manager, logger=logger, + ) as worker: + yield worker + + +class PodMonitorProvider(Provider): + scope = Scope.APP + + @provide + async def get_pod_monitor( + self, + kafka_event_service: KafkaEventService, + k8s_clients: K8sClients, + logger: logging.Logger, + ) -> AsyncIterator[PodMonitor]: + config = PodMonitorConfig() + async with PodMonitor( + config=config, + kafka_event_service=kafka_event_service, + logger=logger, + k8s_clients=k8s_clients, + ) as monitor: + yield monitor + + +class SagaOrchestratorProvider(Provider): + scope = Scope.APP + + def __init__(self) -> None: + super().__init__() + self.provide(_provide_saga_orchestrator) + + +class EventReplayProvider(Provider): + scope = Scope.APP + + @provide + def get_event_replay_service( + self, + replay_repository: ReplayRepository, + kafka_producer: UnifiedProducer, + event_store: EventStore, + logger: logging.Logger, + ) -> EventReplayService: + return EventReplayService( + repository=replay_repository, + producer=kafka_producer, + event_store=event_store, + logger=logger, ) - try: - yield coordinator - finally: - await coordinator.stop() -class ResultProcessorProvider(Provider): +class DLQProcessorProvider(Provider): scope = Scope.APP @provide - def get_execution_repository(self, logger: logging.Logger) -> ExecutionRepository: - return ExecutionRepository(logger) + async def get_dlq_manager( + self, + settings: Settings, + schema_registry: SchemaRegistryManager, + logger: logging.Logger, + ) -> AsyncIterator[DLQManager]: + async with create_dlq_manager( + settings=settings, + schema_registry=schema_registry, + logger=logger, + dlq_topic=KafkaTopic.DEAD_LETTER_QUEUE, + retry_topic_suffix="-retry", + ) as manager: + yield manager diff --git a/backend/app/db/repositories/notification_repository.py b/backend/app/db/repositories/notification_repository.py index 6facbe8e..ffed3a1a 100644 --- a/backend/app/db/repositories/notification_repository.py +++ b/backend/app/db/repositories/notification_repository.py @@ -1,6 +1,7 @@ import logging from dataclasses import asdict from datetime import UTC, datetime, timedelta +from typing import Any from 
beanie.odm.enums import SortDirection from beanie.operators import GTE, LT, LTE, ElemMatch, In, NotIn, Or @@ -90,7 +91,7 @@ async def list_notifications( ) return [DomainNotification(**doc.model_dump(exclude={"id"})) for doc in docs] - async def count_notifications(self, user_id: str, *additional_conditions) -> int: # type: ignore[no-untyped-def] + async def count_notifications(self, user_id: str, *additional_conditions: Any) -> int: conditions = [NotificationDocument.user_id == user_id, *additional_conditions] return await NotificationDocument.find(*conditions).count() diff --git a/backend/app/dlq/manager.py b/backend/app/dlq/manager.py index 96f1528f..f64ca00f 100644 --- a/backend/app/dlq/manager.py +++ b/backend/app/dlq/manager.py @@ -21,12 +21,13 @@ ) from app.domain.enums.kafka import GroupId, KafkaTopic from app.events.schema.schema_registry import SchemaRegistryManager -from app.settings import get_settings +from app.settings import Settings class DLQManager(LifecycleEnabled): def __init__( self, + settings: Settings, consumer: Consumer, producer: Producer, schema_registry: SchemaRegistryManager, @@ -35,6 +36,8 @@ def __init__( retry_topic_suffix: str = "-retry", default_retry_policy: RetryPolicy | None = None, ): + super().__init__() + self.settings = settings self.metrics = get_dlq_metrics() self.schema_registry = schema_registry self.logger = logger @@ -46,7 +49,6 @@ def __init__( self.consumer: Consumer = consumer self.producer: Producer = producer - self._running = False self._process_task: asyncio.Task[None] | None = None self._monitor_task: asyncio.Task[None] | None = None @@ -142,29 +144,19 @@ def _kafka_msg_to_message(self, msg: Message) -> DLQMessage: headers=headers, ) - async def start(self) -> None: - """Start DLQ manager""" - if self._running: - return - - topic_name = f"{get_settings().KAFKA_TOPIC_PREFIX}{str(self.dlq_topic)}" + async def _on_start(self) -> None: + """Start DLQ manager.""" + topic_name = f"{self.settings.KAFKA_TOPIC_PREFIX}{str(self.dlq_topic)}" self.consumer.subscribe([topic_name]) - self._running = True - # Start processing tasks self._process_task = asyncio.create_task(self._process_messages()) self._monitor_task = asyncio.create_task(self._monitor_dlq()) self.logger.info("DLQ Manager started") - async def stop(self) -> None: - """Stop DLQ manager""" - if not self._running: - return - - self._running = False - + async def _on_stop(self) -> None: + """Stop DLQ manager.""" # Cancel tasks for task in [self._process_task, self._monitor_task]: if task: @@ -181,7 +173,7 @@ async def stop(self) -> None: self.logger.info("DLQ Manager stopped") async def _process_messages(self) -> None: - while self._running: + while self.is_running: try: msg = await self._poll_message() if msg is None: @@ -190,7 +182,7 @@ async def _process_messages(self) -> None: if not await self._validate_message(msg): continue - start_time = asyncio.get_event_loop().time() + start_time = asyncio.get_running_loop().time() dlq_message = self._kafka_msg_to_message(msg) await self._record_message_metrics(dlq_message) @@ -249,7 +241,7 @@ async def _process_message_with_tracing(self, msg: Message, dlq_message: DLQMess async def _commit_and_record_duration(self, start_time: float) -> None: """Commit offset and record processing duration.""" await asyncio.to_thread(self.consumer.commit, asynchronous=False) - duration = asyncio.get_event_loop().time() - start_time + duration = asyncio.get_running_loop().time() - start_time self.metrics.record_dlq_processing_duration(duration, "process") async 
def _process_dlq_message(self, message: DLQMessage) -> None: @@ -395,7 +387,7 @@ async def _discard_message(self, message: DLQMessage, reason: str) -> None: self.logger.warning("Discarded message", extra={"event_id": message.event_id, "reason": reason}) async def _monitor_dlq(self) -> None: - while self._running: + while self.is_running: try: # Find messages ready for retry using Beanie now = datetime.now(timezone.utc) @@ -469,13 +461,13 @@ async def retry_message_manually(self, event_id: str) -> bool: def create_dlq_manager( + settings: Settings, schema_registry: SchemaRegistryManager, logger: logging.Logger, dlq_topic: KafkaTopic = KafkaTopic.DEAD_LETTER_QUEUE, retry_topic_suffix: str = "-retry", default_retry_policy: RetryPolicy | None = None, ) -> DLQManager: - settings = get_settings() consumer = Consumer( { "bootstrap.servers": settings.KAFKA_BOOTSTRAP_SERVERS, @@ -499,6 +491,7 @@ def create_dlq_manager( if default_retry_policy is None: default_retry_policy = RetryPolicy(topic="default", strategy=RetryStrategy.EXPONENTIAL_BACKOFF) return DLQManager( + settings=settings, consumer=consumer, producer=producer, schema_registry=schema_registry, diff --git a/backend/app/events/admin_utils.py b/backend/app/events/admin_utils.py index 759c3630..a0a50679 100644 --- a/backend/app/events/admin_utils.py +++ b/backend/app/events/admin_utils.py @@ -41,7 +41,7 @@ async def create_topic(self, topic: str, num_partitions: int = 1, replication_fa futures = self._admin.create_topics([new_topic], operation_timeout=30.0) # Wait for result - result() returns None on success, raises exception on failure - await asyncio.get_event_loop().run_in_executor(None, lambda: futures[topic].result(timeout=30.0)) + await asyncio.get_running_loop().run_in_executor(None, lambda: futures[topic].result(timeout=30.0)) self.logger.info(f"Topic {topic} created successfully") return True except Exception as e: diff --git a/backend/app/events/core/consumer.py b/backend/app/events/core/consumer.py index aad22ac3..ab5656d5 100644 --- a/backend/app/events/core/consumer.py +++ b/backend/app/events/core/consumer.py @@ -15,7 +15,7 @@ from app.domain.enums.kafka import KafkaTopic from app.events.schema.schema_registry import SchemaRegistryManager from app.infrastructure.kafka.events.base import BaseEvent -from app.settings import get_settings +from app.settings import Settings from .dispatcher import EventDispatcher from .types import ConsumerConfig, ConsumerMetrics, ConsumerMetricsSnapshot, ConsumerState, ConsumerStatus @@ -26,12 +26,14 @@ def __init__( self, config: ConsumerConfig, event_dispatcher: EventDispatcher, + schema_registry: SchemaRegistryManager, + settings: Settings, logger: logging.Logger, stats_callback: Callable[[dict[str, Any]], None] | None = None, ): self._config = config self.logger = logger - self._schema_registry = SchemaRegistryManager(logger=logger) + self._schema_registry = schema_registry self._dispatcher = event_dispatcher self._stats_callback = stats_callback self._consumer: Consumer | None = None @@ -41,7 +43,7 @@ def __init__( self._event_metrics = get_event_metrics() # Singleton for Kafka metrics self._error_callback: "Callable[[Exception, BaseEvent], Awaitable[None]] | None" = None self._consume_task: asyncio.Task[None] | None = None - self._topic_prefix = get_settings().KAFKA_TOPIC_PREFIX + self._topic_prefix = settings.KAFKA_TOPIC_PREFIX async def start(self, topics: list[KafkaTopic]) -> None: self._state = self._state if self._state != ConsumerState.STOPPED else ConsumerState.STARTING diff --git 
a/backend/app/events/core/dispatcher.py b/backend/app/events/core/dispatcher.py index 7f972524..b2f527ca 100644 --- a/backend/app/events/core/dispatcher.py +++ b/backend/app/events/core/dispatcher.py @@ -44,19 +44,25 @@ def _build_topic_mapping(self) -> None: self._topic_event_types[topic].add(event_class) self.logger.debug(f"Mapped {event_class.__name__} to topic {topic}") - def register(self, event_type: EventType) -> Callable[[EventHandler], EventHandler]: + def register( + self, event_type: EventType + ) -> Callable[[Callable[[T], Awaitable[None]]], Callable[[T], Awaitable[None]]]: """ Decorator for registering type-safe event handlers. + Generic over T (any BaseEvent subtype) - accepts handlers with specific + event types while preserving their type signature for callers. + Usage: @dispatcher.register(EventType.EXECUTION_REQUESTED) async def handle_execution(event: ExecutionRequestedEvent) -> None: - # Handler logic here + # Handler logic here - event is properly typed """ - def decorator(handler: EventHandler) -> EventHandler: + def decorator(handler: Callable[[T], Awaitable[None]]) -> Callable[[T], Awaitable[None]]: self.logger.info(f"Registering handler '{handler.__name__}' for event type '{event_type.value}'") - self._handlers[event_type].append(handler) + # Safe: dispatch() routes by event_type, guaranteeing correct types at runtime + self._handlers[event_type].append(handler) # type: ignore[arg-type] return handler return decorator diff --git a/backend/app/events/core/producer.py b/backend/app/events/core/producer.py index 76115e56..b45858ea 100644 --- a/backend/app/events/core/producer.py +++ b/backend/app/events/core/producer.py @@ -35,13 +35,13 @@ def __init__( logger: logging.Logger, stats_callback: StatsCallback | None = None, ): + super().__init__() self._config = config self._schema_registry = schema_registry_manager self.logger = logger self._producer: Producer | None = None self._stats_callback = stats_callback self._state = ProducerState.STOPPED - self._running = False self._metrics = ProducerMetrics() self._event_metrics = get_event_metrics() # Singleton for Kafka metrics self._poll_task: asyncio.Task[None] | None = None @@ -108,11 +108,8 @@ def _handle_stats(self, stats_json: str) -> None: except Exception as e: self.logger.error(f"Error parsing producer stats: {e}") - async def start(self) -> None: - if self._state not in (ProducerState.STOPPED, ProducerState.ERROR): - self.logger.warning(f"Producer already in state {self._state}, skipping start") - return - + async def _on_start(self) -> None: + """Start the Kafka producer.""" self._state = ProducerState.STARTING self.logger.info("Starting producer...") @@ -123,7 +120,6 @@ async def start(self) -> None: # Serialize Producer initialization to prevent librdkafka race condition with _producer_init_lock: self._producer = Producer(producer_config) - self._running = True self._poll_task = asyncio.create_task(self._poll_loop()) self._state = ProducerState.RUNNING @@ -150,14 +146,10 @@ def get_status(self) -> dict[str, Any]: }, } - async def stop(self) -> None: - if self._state in (ProducerState.STOPPED, ProducerState.STOPPING): - self.logger.info(f"Producer already in state {self._state}, skipping stop") - return - + async def _on_stop(self) -> None: + """Stop the Kafka producer.""" self._state = ProducerState.STOPPING self.logger.info("Stopping producer...") - self._running = False if self._poll_task: self._poll_task.cancel() @@ -174,7 +166,7 @@ async def stop(self) -> None: async def _poll_loop(self) -> None: 
self.logger.info("Started producer poll loop") - while self._running and self._producer: + while self.is_running and self._producer: self._producer.poll(timeout=0.1) await asyncio.sleep(0.01) diff --git a/backend/app/events/event_store.py b/backend/app/events/event_store.py index e6b633b4..10007c01 100644 --- a/backend/app/events/event_store.py +++ b/backend/app/events/event_store.py @@ -54,7 +54,7 @@ async def initialize(self) -> None: self.logger.info("Event store initialized with Beanie") async def store_event(self, event: BaseEvent) -> bool: - start = asyncio.get_event_loop().time() + start = asyncio.get_running_loop().time() try: now = datetime.now(timezone.utc) data = event.model_dump(exclude={"topic"}) @@ -71,7 +71,7 @@ async def store_event(self, event: BaseEvent) -> bool: } ) - duration = asyncio.get_event_loop().time() - start + duration = asyncio.get_running_loop().time() - start self.metrics.record_event_store_duration(duration, "store_single", "event_store") self.metrics.record_event_stored(event.event_type, "event_store") return True @@ -84,7 +84,7 @@ async def store_event(self, event: BaseEvent) -> bool: return False async def store_batch(self, events: list[BaseEvent]) -> dict[str, int]: - start = asyncio.get_event_loop().time() + start = asyncio.get_running_loop().time() results = {"total": len(events), "stored": 0, "duplicates": 0, "failed": 0} if not events: return results @@ -112,7 +112,7 @@ async def store_batch(self, events: list[BaseEvent]) -> dict[str, int]: else: raise - duration = asyncio.get_event_loop().time() - start + duration = asyncio.get_running_loop().time() - start self.metrics.record_event_store_duration(duration, "store_batch", "event_store") add_span_attributes(**{"events.batch.count": len(events)}) if results["stored"] > 0: @@ -125,14 +125,14 @@ async def store_batch(self, events: list[BaseEvent]) -> dict[str, int]: return results async def get_event(self, event_id: str) -> BaseEvent | None: - start = asyncio.get_event_loop().time() + start = asyncio.get_running_loop().time() doc = await EventDocument.find_one({"event_id": event_id}) if not doc: return None event = self.schema_registry.deserialize_json(_flatten_doc(doc)) - duration = asyncio.get_event_loop().time() - start + duration = asyncio.get_running_loop().time() - start self.metrics.record_event_query_duration(duration, "get_by_id", "event_store") return event @@ -144,7 +144,7 @@ async def get_events_by_type( limit: int = 100, offset: int = 0, ) -> list[BaseEvent]: - start = asyncio.get_event_loop().time() + start = asyncio.get_running_loop().time() query: dict[str, Any] = {"event_type": event_type} if tr := self._time_range(start_time, end_time): query["timestamp"] = tr @@ -158,7 +158,7 @@ async def get_events_by_type( ) events = [self.schema_registry.deserialize_json(_flatten_doc(doc)) for doc in docs] - duration = asyncio.get_event_loop().time() - start + duration = asyncio.get_running_loop().time() - start self.metrics.record_event_query_duration(duration, "get_by_type", "event_store") return events @@ -167,7 +167,7 @@ async def get_execution_events( execution_id: str, event_types: list[EventType] | None = None, ) -> list[BaseEvent]: - start = asyncio.get_event_loop().time() + start = asyncio.get_running_loop().time() query: dict[str, Any] = {"$or": [{"payload.execution_id": execution_id}, {"aggregate_id": execution_id}]} if event_types: query["event_type"] = {"$in": event_types} @@ -175,7 +175,7 @@ async def get_execution_events( docs = await EventDocument.find(query).sort([("timestamp", 
SortDirection.ASCENDING)]).to_list() events = [self.schema_registry.deserialize_json(_flatten_doc(doc)) for doc in docs] - duration = asyncio.get_event_loop().time() - start + duration = asyncio.get_running_loop().time() - start self.metrics.record_event_query_duration(duration, "get_execution_events", "event_store") return events @@ -187,7 +187,7 @@ async def get_user_events( end_time: datetime | None = None, limit: int = 100, ) -> list[BaseEvent]: - start = asyncio.get_event_loop().time() + start = asyncio.get_running_loop().time() query: dict[str, Any] = {"metadata.user_id": str(user_id)} if event_types: query["event_type"] = {"$in": event_types} @@ -197,7 +197,7 @@ async def get_user_events( docs = await EventDocument.find(query).sort([("timestamp", SortDirection.DESCENDING)]).limit(limit).to_list() events = [self.schema_registry.deserialize_json(_flatten_doc(doc)) for doc in docs] - duration = asyncio.get_event_loop().time() - start + duration = asyncio.get_running_loop().time() - start self.metrics.record_event_query_duration(duration, "get_user_events", "event_store") return events @@ -208,7 +208,7 @@ async def get_security_events( user_id: str | None = None, limit: int = 100, ) -> list[BaseEvent]: - start = asyncio.get_event_loop().time() + start = asyncio.get_running_loop().time() query: dict[str, Any] = {"event_type": {"$in": self._SECURITY_TYPES}} if user_id: query["metadata.user_id"] = str(user_id) @@ -218,12 +218,12 @@ async def get_security_events( docs = await EventDocument.find(query).sort([("timestamp", SortDirection.DESCENDING)]).limit(limit).to_list() events = [self.schema_registry.deserialize_json(_flatten_doc(doc)) for doc in docs] - duration = asyncio.get_event_loop().time() - start + duration = asyncio.get_running_loop().time() - start self.metrics.record_event_query_duration(duration, "get_security_events", "event_store") return events async def get_correlation_chain(self, correlation_id: str) -> list[BaseEvent]: - start = asyncio.get_event_loop().time() + start = asyncio.get_running_loop().time() docs = await ( EventDocument.find({"metadata.correlation_id": str(correlation_id)}) .sort([("timestamp", SortDirection.ASCENDING)]) @@ -231,7 +231,7 @@ async def get_correlation_chain(self, correlation_id: str) -> list[BaseEvent]: ) events = [self.schema_registry.deserialize_json(_flatten_doc(doc)) for doc in docs] - duration = asyncio.get_event_loop().time() - start + duration = asyncio.get_running_loop().time() - start self.metrics.record_event_query_duration(duration, "get_correlation_chain", "event_store") return events @@ -242,7 +242,7 @@ async def replay_events( event_types: list[EventType] | None = None, callback: Callable[[BaseEvent], Awaitable[None]] | None = None, ) -> int: - start = asyncio.get_event_loop().time() + start = asyncio.get_running_loop().time() count = 0 try: @@ -258,7 +258,7 @@ async def replay_events( await callback(event) count += 1 - duration = asyncio.get_event_loop().time() - start + duration = asyncio.get_running_loop().time() - start self.metrics.record_event_query_duration(duration, "replay_events", "event_store") self.logger.info(f"Replayed {count} events from {start_time} to {end_time}") return count diff --git a/backend/app/events/event_store_consumer.py b/backend/app/events/event_store_consumer.py index bb26612e..af6239f7 100644 --- a/backend/app/events/event_store_consumer.py +++ b/backend/app/events/event_store_consumer.py @@ -11,7 +11,7 @@ from app.events.event_store import EventStore from app.events.schema.schema_registry import 
SchemaRegistryManager from app.infrastructure.kafka.events.base import BaseEvent -from app.settings import get_settings +from app.settings import Settings class EventStoreConsumer(LifecycleEnabled): @@ -22,14 +22,17 @@ def __init__( event_store: EventStore, topics: list[KafkaTopic], schema_registry_manager: SchemaRegistryManager, + settings: Settings, logger: logging.Logger, producer: UnifiedProducer | None = None, group_id: GroupId = GroupId.EVENT_STORE_CONSUMER, batch_size: int = 100, batch_timeout_seconds: float = 5.0, ): + super().__init__() self.event_store = event_store self.topics = topics + self.settings = settings self.group_id = group_id self.batch_size = batch_size self.batch_timeout = batch_timeout_seconds @@ -40,24 +43,26 @@ def __init__( self.producer = producer # For DLQ handling self._batch_buffer: list[BaseEvent] = [] self._batch_lock = asyncio.Lock() - self._last_batch_time = asyncio.get_event_loop().time() + self._last_batch_time: float = 0.0 self._batch_task: asyncio.Task[None] | None = None - self._running = False - async def start(self) -> None: + async def _on_start(self) -> None: """Start consuming and storing events.""" - if self._running: - return - - settings = get_settings() + self._last_batch_time = asyncio.get_running_loop().time() config = ConsumerConfig( - bootstrap_servers=settings.KAFKA_BOOTSTRAP_SERVERS, - group_id=f"{self.group_id}.{settings.KAFKA_GROUP_SUFFIX}", + bootstrap_servers=self.settings.KAFKA_BOOTSTRAP_SERVERS, + group_id=f"{self.group_id}.{self.settings.KAFKA_GROUP_SUFFIX}", enable_auto_commit=False, max_poll_records=self.batch_size, ) - self.consumer = UnifiedConsumer(config, event_dispatcher=self.dispatcher, logger=self.logger) + self.consumer = UnifiedConsumer( + config, + event_dispatcher=self.dispatcher, + schema_registry=self.schema_registry_manager, + settings=self.settings, + logger=self.logger, + ) # Register handler for all event types - store everything for event_type in EventType: @@ -78,19 +83,13 @@ async def start(self) -> None: self.consumer.register_error_callback(self._handle_error_with_event) await self.consumer.start(self.topics) - self._running = True self._batch_task = asyncio.create_task(self._batch_processor()) self.logger.info(f"Event store consumer started for topics: {self.topics}") - async def stop(self) -> None: + async def _on_stop(self) -> None: """Stop consumer.""" - if not self._running: - return - - self._running = False - await self._flush_batch() if self._batch_task: @@ -121,12 +120,12 @@ async def _handle_error_with_event(self, error: Exception, event: BaseEvent) -> async def _batch_processor(self) -> None: """Periodically flush batches based on timeout.""" - while self._running: + while self.is_running: try: await asyncio.sleep(1) async with self._batch_lock: - time_since_last_batch = asyncio.get_event_loop().time() - self._last_batch_time + time_since_last_batch = asyncio.get_running_loop().time() - self._last_batch_time if self._batch_buffer and time_since_last_batch >= self.batch_timeout: await self._flush_batch() @@ -140,7 +139,7 @@ async def _flush_batch(self) -> None: batch = self._batch_buffer.copy() self._batch_buffer.clear() - self._last_batch_time = asyncio.get_event_loop().time() + self._last_batch_time = asyncio.get_running_loop().time() self.logger.info(f"Event store flushing batch of {len(batch)} events") with trace_span( @@ -161,6 +160,7 @@ def create_event_store_consumer( event_store: EventStore, topics: list[KafkaTopic], schema_registry_manager: SchemaRegistryManager, + settings: Settings, 
     logger: logging.Logger,
     producer: UnifiedProducer | None = None,
     group_id: GroupId = GroupId.EVENT_STORE_CONSUMER,
@@ -174,6 +174,7 @@ def create_event_store_consumer(
         batch_size=batch_size,
         batch_timeout_seconds=batch_timeout_seconds,
         schema_registry_manager=schema_registry_manager,
+        settings=settings,
         logger=logger,
         producer=producer,
     )
diff --git a/backend/app/events/schema/schema_registry.py b/backend/app/events/schema/schema_registry.py
index fa5b28da..e149a192 100644
--- a/backend/app/events/schema/schema_registry.py
+++ b/backend/app/events/schema/schema_registry.py
@@ -12,7 +12,7 @@
 
 from app.domain.enums.events import EventType
 from app.infrastructure.kafka.events.base import BaseEvent
-from app.settings import get_settings
+from app.settings import Settings
 
 T = TypeVar("T", bound=BaseEvent)
 
@@ -54,9 +54,8 @@ def _get_event_type_to_class_mapping() -> Dict[EventType, Type[BaseEvent]]:
 class SchemaRegistryManager:
     """Schema registry manager for Avro serialization with Confluent wire format."""
 
-    def __init__(self, logger: logging.Logger, schema_registry_url: str | None = None):
+    def __init__(self, settings: Settings, logger: logging.Logger, schema_registry_url: str | None = None):
         self.logger = logger
-        settings = get_settings()
         self.url = schema_registry_url or settings.SCHEMA_REGISTRY_URL
         self.namespace = "com.integr8scode.events"
         # Optional per-session/worker subject prefix for tests/local isolation
@@ -232,9 +231,9 @@ async def initialize_schemas(self) -> None:
 
 
 def create_schema_registry_manager(
-    logger: logging.Logger, schema_registry_url: str | None = None
+    settings: Settings, logger: logging.Logger, schema_registry_url: str | None = None
 ) -> SchemaRegistryManager:
-    return SchemaRegistryManager(logger, schema_registry_url)
+    return SchemaRegistryManager(settings, logger, schema_registry_url)
 
 
 async def initialize_event_schemas(registry: SchemaRegistryManager) -> None:
diff --git a/backend/app/infrastructure/kafka/events/metadata.py b/backend/app/infrastructure/kafka/events/metadata.py
index 71cba2bf..a6522ae1 100644
--- a/backend/app/infrastructure/kafka/events/metadata.py
+++ b/backend/app/infrastructure/kafka/events/metadata.py
@@ -1,7 +1,7 @@
 from uuid import uuid4
 
 from pydantic import ConfigDict, Field
-from pydantic_avro import AvroBase  # type: ignore[attr-defined]
+from pydantic_avro.to_avro.base import AvroBase
 
 from app.domain.enums.common import Environment
 
diff --git a/backend/app/main.py b/backend/app/main.py
index f1c58209..466df23e 100644
--- a/backend/app/main.py
+++ b/backend/app/main.py
@@ -38,11 +38,18 @@
     RequestSizeLimitMiddleware,
     setup_metrics,
 )
-from app.settings import get_settings
+from app.settings import Settings, get_settings
 
 
-def create_app() -> FastAPI:
-    settings = get_settings()
+def create_app(settings: Settings | None = None) -> FastAPI:
+    """
+    Create the FastAPI application.
+
+    Args:
+        settings: Optional pre-configured settings (e.g., TestSettings for testing).
+                  If None, uses get_settings() which reads from .env.
+ """ + settings = settings or get_settings() logger = setup_logger(settings.LOG_LEVEL) # Disable OpenAPI/Docs in production for security; health endpoints provide readiness app = FastAPI( @@ -53,7 +60,7 @@ def create_app() -> FastAPI: redoc_url=None, ) - container = create_app_container() + container = create_app_container(settings) setup_dishka(container, app) setup_metrics(app, logger) diff --git a/backend/app/services/coordinator/coordinator.py b/backend/app/services/coordinator/coordinator.py index fb4cb8ba..ef915720 100644 --- a/backend/app/services/coordinator/coordinator.py +++ b/backend/app/services/coordinator/coordinator.py @@ -1,31 +1,21 @@ import asyncio import logging -import signal import time from collections.abc import Coroutine from typing import Any, TypeAlias from uuid import uuid4 -import redis.asyncio as redis -from beanie import init_beanie -from pymongo.asynchronous.mongo_client import AsyncMongoClient - -from app.core.database_context import DBClient from app.core.lifecycle import LifecycleEnabled from app.core.metrics.context import get_coordinator_metrics -from app.db.docs import ALL_DOCUMENTS from app.db.repositories.execution_repository import ExecutionRepository from app.domain.enums.events import EventType from app.domain.enums.kafka import KafkaTopic from app.domain.enums.storage import ExecutionErrorType -from app.events.core import ConsumerConfig, EventDispatcher, ProducerConfig, UnifiedConsumer, UnifiedProducer -from app.events.event_store import EventStore, create_event_store +from app.events.core import ConsumerConfig, EventDispatcher, UnifiedConsumer, UnifiedProducer +from app.events.event_store import EventStore from app.events.schema.schema_registry import ( SchemaRegistryManager, - create_schema_registry_manager, - initialize_event_schemas, ) -from app.infrastructure.kafka.events.base import BaseEvent from app.infrastructure.kafka.events.execution import ( ExecutionAcceptedEvent, ExecutionCancelledEvent, @@ -38,10 +28,8 @@ from app.services.coordinator.queue_manager import QueueManager, QueuePriority from app.services.coordinator.resource_manager import ResourceAllocation, ResourceManager from app.services.idempotency import IdempotencyManager -from app.services.idempotency.idempotency_manager import IdempotencyConfig, create_idempotency_manager from app.services.idempotency.middleware import IdempotentConsumerWrapper -from app.services.idempotency.redis_repository import RedisIdempotencyRepository -from app.settings import get_settings +from app.settings import Settings EventHandler: TypeAlias = Coroutine[Any, Any, None] ExecutionMap: TypeAlias = dict[str, ResourceAllocation] @@ -63,6 +51,7 @@ def __init__( self, producer: UnifiedProducer, schema_registry_manager: SchemaRegistryManager, + settings: Settings, event_store: EventStore, execution_repository: ExecutionRepository, idempotency_manager: IdempotencyManager, @@ -71,12 +60,13 @@ def __init__( max_concurrent_scheduling: int = 10, scheduling_interval_seconds: float = 0.5, ): + super().__init__() self.logger = logger self.metrics = get_coordinator_metrics() - settings = get_settings() + self._settings = settings # Kafka configuration - self.kafka_servers = settings.KAFKA_BOOTSTRAP_SERVERS + self.kafka_servers = self._settings.KAFKA_BOOTSTRAP_SERVERS self.consumer_group = consumer_group # Components @@ -104,29 +94,23 @@ def __init__( self._scheduling_semaphore = asyncio.Semaphore(max_concurrent_scheduling) # State tracking - self._running = False self._scheduling_task: asyncio.Task[None] | None = 
None self._active_executions: set[str] = set() self._execution_resources: ExecutionMap = {} self._schema_registry_manager = schema_registry_manager self.dispatcher = EventDispatcher(logger=self.logger) - async def start(self) -> None: - """Start the coordinator service""" - if self._running: - self.logger.warning("ExecutionCoordinator already running") - return - + async def _on_start(self) -> None: + """Start the coordinator service.""" self.logger.info("Starting ExecutionCoordinator service...") await self.queue_manager.start() await self.idempotency_manager.initialize() - settings = get_settings() consumer_config = ConsumerConfig( bootstrap_servers=self.kafka_servers, - group_id=f"{self.consumer_group}.{settings.KAFKA_GROUP_SUFFIX}", + group_id=f"{self.consumer_group}.{self._settings.KAFKA_GROUP_SUFFIX}", enable_auto_commit=False, session_timeout_ms=30000, # 30 seconds heartbeat_interval_ms=10000, # 10 seconds (must be < session_timeout / 3) @@ -136,24 +120,30 @@ async def start(self) -> None: fetch_min_bytes=1, # Return immediately if any data available ) - self.consumer = UnifiedConsumer(consumer_config, event_dispatcher=self.dispatcher, logger=self.logger) + self.consumer = UnifiedConsumer( + consumer_config, + event_dispatcher=self.dispatcher, + schema_registry=self._schema_registry_manager, + settings=self._settings, + logger=self.logger, + ) # Register handlers with EventDispatcher BEFORE wrapping with idempotency @self.dispatcher.register(EventType.EXECUTION_REQUESTED) - async def handle_requested(event: BaseEvent) -> None: + async def handle_requested(event: ExecutionRequestedEvent) -> None: await self._route_execution_event(event) @self.dispatcher.register(EventType.EXECUTION_COMPLETED) - async def handle_completed(event: BaseEvent) -> None: + async def handle_completed(event: ExecutionCompletedEvent) -> None: await self._route_execution_result(event) @self.dispatcher.register(EventType.EXECUTION_FAILED) - async def handle_failed(event: BaseEvent) -> None: + async def handle_failed(event: ExecutionFailedEvent) -> None: await self._route_execution_result(event) @self.dispatcher.register(EventType.EXECUTION_CANCELLED) - async def handle_cancelled(event: BaseEvent) -> None: - await self._route_execution_result(event) + async def handle_cancelled(event: ExecutionCancelledEvent) -> None: + await self._route_execution_event(event) self.idempotent_consumer = IdempotentConsumerWrapper( consumer=self.consumer, @@ -170,18 +160,13 @@ async def handle_cancelled(event: BaseEvent) -> None: await self.idempotent_consumer.start([KafkaTopic.EXECUTION_EVENTS]) # Start scheduling task - self._running = True self._scheduling_task = asyncio.create_task(self._scheduling_loop()) self.logger.info("ExecutionCoordinator service started successfully") - async def stop(self) -> None: - """Stop the coordinator service""" - if not self._running: - return - + async def _on_stop(self) -> None: + """Stop the coordinator service.""" self.logger.info("Stopping ExecutionCoordinator service...") - self._running = False # Stop scheduling task if self._scheduling_task: @@ -203,7 +188,7 @@ async def stop(self) -> None: self.logger.info(f"ExecutionCoordinator service stopped. 
Active executions: {len(self._active_executions)}") - async def _route_execution_event(self, event: BaseEvent) -> None: + async def _route_execution_event(self, event: ExecutionRequestedEvent | ExecutionCancelledEvent) -> None: """Route execution events to appropriate handlers based on event type""" self.logger.info( f"COORDINATOR: Routing execution event - type: {event.event_type}, " @@ -212,18 +197,18 @@ async def _route_execution_event(self, event: BaseEvent) -> None: ) if event.event_type == EventType.EXECUTION_REQUESTED: - await self._handle_execution_requested(event) # type: ignore + await self._handle_execution_requested(event) elif event.event_type == EventType.EXECUTION_CANCELLED: - await self._handle_execution_cancelled(event) # type: ignore + await self._handle_execution_cancelled(event) else: self.logger.debug(f"Ignoring execution event type: {event.event_type}") - async def _route_execution_result(self, event: BaseEvent) -> None: + async def _route_execution_result(self, event: ExecutionCompletedEvent | ExecutionFailedEvent) -> None: """Route execution result events to appropriate handlers based on event type""" if event.event_type == EventType.EXECUTION_COMPLETED: - await self._handle_execution_completed(event) # type: ignore + await self._handle_execution_completed(event) elif event.event_type == EventType.EXECUTION_FAILED: - await self._handle_execution_failed(event) # type: ignore + await self._handle_execution_failed(event) else: self.logger.debug(f"Ignoring execution result event type: {event.event_type}") @@ -306,7 +291,7 @@ async def _handle_execution_failed(self, event: ExecutionFailedEvent) -> None: async def _scheduling_loop(self) -> None: """Main scheduling loop""" - while self._running: + while self.is_running: try: # Get next execution from queue execution = await self.queue_manager.get_next_execution() @@ -488,86 +473,8 @@ async def _publish_scheduling_failed(self, request: ExecutionRequestedEvent, err async def get_status(self) -> dict[str, Any]: """Get coordinator status""" return { - "running": self._running, + "running": self.is_running, "active_executions": len(self._active_executions), "queue_stats": await self.queue_manager.get_queue_stats(), "resource_stats": await self.resource_manager.get_resource_stats(), } - - -async def run_coordinator() -> None: - """Run the execution coordinator service""" - import os - from contextlib import AsyncExitStack - - from app.core.logging import setup_logger - - logger = setup_logger(os.environ.get("LOG_LEVEL", "INFO")) - logger.info("Initializing schema registry for coordinator...") - schema_registry_manager = create_schema_registry_manager(logger) - await initialize_event_schemas(schema_registry_manager) - - settings = get_settings() - config = ProducerConfig(bootstrap_servers=settings.KAFKA_BOOTSTRAP_SERVERS) - producer = UnifiedProducer(config, schema_registry_manager, logger) - - db_client: DBClient = AsyncMongoClient(settings.MONGODB_URL, tz_aware=True, serverSelectionTimeoutMS=5000) - db_name = settings.DATABASE_NAME - database = db_client[db_name] - - # Initialize Beanie ODM (indexes are idempotently created via Document.Settings.indexes) - await init_beanie(database=database, document_models=ALL_DOCUMENTS) - - logger.info("Creating event store for coordinator...") - event_store = create_event_store(schema_registry_manager, logger, ttl_days=90) - - exec_repo = ExecutionRepository(logger) - r = redis.Redis( - host=settings.REDIS_HOST, - port=settings.REDIS_PORT, - db=settings.REDIS_DB, - 
password=settings.REDIS_PASSWORD, - ssl=settings.REDIS_SSL, - max_connections=settings.REDIS_MAX_CONNECTIONS, - decode_responses=settings.REDIS_DECODE_RESPONSES, - socket_connect_timeout=5, - socket_timeout=5, - ) - idem_repo = RedisIdempotencyRepository(r, key_prefix="idempotency") - idem_manager = create_idempotency_manager(repository=idem_repo, config=IdempotencyConfig(), logger=logger) - await idem_manager.initialize() - - coordinator = ExecutionCoordinator( - producer=producer, - schema_registry_manager=schema_registry_manager, - event_store=event_store, - execution_repository=exec_repo, - idempotency_manager=idem_manager, - logger=logger, - ) - - def signal_handler(sig: int, frame: Any) -> None: - logger.info(f"Received signal {sig}, initiating shutdown...") - asyncio.create_task(coordinator.stop()) - - signal.signal(signal.SIGINT, signal_handler) - signal.signal(signal.SIGTERM, signal_handler) - - async with AsyncExitStack() as stack: - await stack.enter_async_context(producer) - await stack.enter_async_context(coordinator) - stack.push_async_callback(idem_manager.close) - stack.push_async_callback(r.aclose) - stack.callback(db_client.close) - - while coordinator._running: - await asyncio.sleep(60) - status = await coordinator.get_status() - logger.info(f"Coordinator status: {status}") - - -if __name__ == "__main__": - # Run coordinator as standalone service - - settings = get_settings() - asyncio.run(run_coordinator()) diff --git a/backend/app/services/event_bus.py b/backend/app/services/event_bus.py index 78c875ca..93208612 100644 --- a/backend/app/services/event_bus.py +++ b/backend/app/services/event_bus.py @@ -46,6 +46,7 @@ class EventBus(LifecycleEnabled): """ def __init__(self, logger: logging.Logger) -> None: + super().__init__() self.logger = logger self.settings = get_settings() self.metrics = get_connection_metrics() @@ -53,22 +54,15 @@ def __init__(self, logger: logging.Logger) -> None: self.consumer: Optional[Consumer] = None self._subscriptions: dict[str, Subscription] = {} # id -> Subscription self._pattern_index: dict[str, set[str]] = {} # pattern -> set of subscription ids - self._running = False self._consumer_task: Optional[asyncio.Task[None]] = None self._lock = asyncio.Lock() self._topic = f"{self.settings.KAFKA_TOPIC_PREFIX}{KafkaTopic.EVENT_BUS_STREAM}" self._executor: Optional[Callable[..., Any]] = None # Will store the executor function - async def start(self) -> None: + async def _on_start(self) -> None: """Start the event bus with Kafka backing.""" - if self._running: - return - - self._running = True - await self._initialize_kafka() self._consumer_task = asyncio.create_task(self._kafka_listener()) - self._running = True self.logger.info("Event bus started with Kafka backing") async def _initialize_kafka(self) -> None: @@ -96,18 +90,11 @@ async def _initialize_kafka(self) -> None: self.consumer.subscribe([self._topic]) # Store the executor function for sync operations - loop = asyncio.get_event_loop() + loop = asyncio.get_running_loop() self._executor = loop.run_in_executor - async def stop(self) -> None: + async def _on_stop(self) -> None: """Stop the event bus and clean up resources.""" - await self._cleanup() - self.logger.info("Event bus stopped") - - async def _cleanup(self) -> None: - """Clean up all resources.""" - self._running = False - # Cancel consumer task if self._consumer_task and not self._consumer_task.done(): self._consumer_task.cancel() @@ -131,6 +118,8 @@ async def _cleanup(self) -> None: self._subscriptions.clear() 
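
Every service in this diff drops its hand-rolled `_running` flag in favor of `LifecycleEnabled` hooks: subclasses call `super().__init__()`, implement `_on_start`/`_on_stop`, and callers rely on `is_running`, `__aenter__`, and `aclose` (all visible in the hunks around this one). A minimal sketch of what such a base class could look like — only those five names come from the diff; the flag and lock internals are assumptions, and the real `app.core.lifecycle` may differ:

```python
import asyncio
from types import TracebackType


class LifecycleEnabled:
    """Sketch: idempotent async lifecycle; subclasses override the two hooks."""

    def __init__(self) -> None:
        self._started = False                  # assumed internal flag
        self._lifecycle_lock = asyncio.Lock()  # assumed guard against start/stop races

    @property
    def is_running(self) -> bool:
        return self._started

    async def _on_start(self) -> None:  # hook: subclass performs real startup
        pass

    async def _on_stop(self) -> None:  # hook: subclass performs real shutdown
        pass

    async def start(self) -> None:
        async with self._lifecycle_lock:
            if self._started:
                return  # double-start is a no-op; no per-service guard needed
            await self._on_start()
            self._started = True

    async def aclose(self) -> None:
        async with self._lifecycle_lock:
            if not self._started:
                return  # double-stop is a no-op
            self._started = False  # loops polling is_running exit first
            await self._on_stop()

    async def __aenter__(self) -> "LifecycleEnabled":
        await self.start()
        return self

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc: BaseException | None,
        tb: TracebackType | None,
    ) -> None:
        await self.aclose()
```

With idempotence handled once in the base, each `_on_start`/`_on_stop` can assume it runs exactly once, which is why the "already running" guard clauses disappear throughout this diff.
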
self._pattern_index.clear() + self.logger.info("Event bus stopped") + async def publish(self, event_type: str, data: dict[str, Any]) -> None: """ Publish an event to Kafka and local subscribers. @@ -278,7 +267,7 @@ async def _kafka_listener(self) -> None: self.logger.info("Kafka listener started") try: - while self._running: + while self.is_running: # Poll for messages with small timeout if self._executor: msg = await self._executor(None, self.consumer.poll, 0.1) @@ -312,7 +301,6 @@ async def _kafka_listener(self) -> None: self.logger.info("Kafka listener cancelled") except Exception as e: self.logger.error(f"Fatal error in Kafka listener: {e}") - self._running = False def _update_metrics(self, pattern: str) -> None: """Update metrics for a pattern (must be called within lock).""" @@ -328,7 +316,7 @@ async def get_statistics(self) -> dict[str, Any]: "total_patterns": len(self._pattern_index), "total_subscriptions": len(self._subscriptions), "kafka_enabled": self.producer is not None, - "running": self._running, + "running": self.is_running, } @@ -345,14 +333,14 @@ async def get_event_bus(self) -> EventBus: async with self._lock: if self._event_bus is None: self._event_bus = EventBus(self.logger) - await self._event_bus.start() + await self._event_bus.__aenter__() return self._event_bus async def close(self) -> None: """Stop and clean up the event bus.""" async with self._lock: if self._event_bus: - await self._event_bus.stop() + await self._event_bus.aclose() self._event_bus = None diff --git a/backend/app/services/event_replay/replay_service.py b/backend/app/services/event_replay/replay_service.py index aa00c7ec..837d8147 100644 --- a/backend/app/services/event_replay/replay_service.py +++ b/backend/app/services/event_replay/replay_service.py @@ -67,7 +67,7 @@ async def start_replay(self, session_id: str) -> None: self.logger.info("Started replay session", extra={"session_id": session_id}) async def _run_replay(self, session: ReplaySessionState) -> None: - start_time = asyncio.get_event_loop().time() + start_time = asyncio.get_running_loop().time() try: with trace_span( @@ -119,7 +119,7 @@ async def _complete_session(self, session: ReplaySessionState, start_time: float session.status = ReplayStatus.COMPLETED session.completed_at = datetime.now(timezone.utc) - duration = asyncio.get_event_loop().time() - start_time + duration = asyncio.get_running_loop().time() - start_time self._metrics.record_replay_duration(duration, session.config.replay_type) await self._update_session_in_db(session) @@ -281,7 +281,7 @@ async def _write_event_to_file(self, event: BaseEvent, file_path: str) -> None: self._file_locks[file_path] = asyncio.Lock() async with self._file_locks[file_path]: - loop = asyncio.get_event_loop() + loop = asyncio.get_running_loop() await loop.run_in_executor(None, self._write_to_file_sync, event, file_path) def _write_to_file_sync(self, event: BaseEvent, file_path: str) -> None: diff --git a/backend/app/services/idempotency/redis_repository.py b/backend/app/services/idempotency/redis_repository.py index c8d18624..8bbc0fc8 100644 --- a/backend/app/services/idempotency/redis_repository.py +++ b/backend/app/services/idempotency/redis_repository.py @@ -138,4 +138,4 @@ async def aggregate_status_counts(self, key_prefix: str) -> dict[str, int]: return counts async def health_check(self) -> None: - await self._r.execute_command("PING") # type: ignore[no-untyped-call] + await self._r.ping() # type: ignore[misc] # redis-py dual sync/async return type diff --git 
a/backend/app/services/k8s_worker/worker.py b/backend/app/services/k8s_worker/worker.py index 96ef651d..8bad97c2 100644 --- a/backend/app/services/k8s_worker/worker.py +++ b/backend/app/services/k8s_worker/worker.py @@ -1,31 +1,23 @@ import asyncio import logging import os -import signal import time from pathlib import Path from typing import Any -import redis.asyncio as redis -from beanie import init_beanie from kubernetes import client as k8s_client from kubernetes import config as k8s_config from kubernetes.client.rest import ApiException -from pymongo.asynchronous.mongo_client import AsyncMongoClient -from app.core.database_context import DBClient from app.core.lifecycle import LifecycleEnabled from app.core.metrics import ExecutionMetrics, KubernetesMetrics -from app.db.docs import ALL_DOCUMENTS from app.domain.enums.events import EventType from app.domain.enums.kafka import KafkaTopic from app.domain.enums.storage import ExecutionErrorType -from app.events.core import ConsumerConfig, EventDispatcher, ProducerConfig, UnifiedConsumer, UnifiedProducer -from app.events.event_store import EventStore, create_event_store +from app.events.core import ConsumerConfig, EventDispatcher, UnifiedConsumer, UnifiedProducer +from app.events.event_store import EventStore from app.events.schema.schema_registry import ( SchemaRegistryManager, - create_schema_registry_manager, - initialize_event_schemas, ) from app.infrastructure.kafka.events.base import BaseEvent from app.infrastructure.kafka.events.execution import ( @@ -36,12 +28,10 @@ from app.infrastructure.kafka.events.saga import CreatePodCommandEvent, DeletePodCommandEvent from app.runtime_registry import RUNTIME_REGISTRY from app.services.idempotency import IdempotencyManager -from app.services.idempotency.idempotency_manager import IdempotencyConfig, create_idempotency_manager from app.services.idempotency.middleware import IdempotentConsumerWrapper -from app.services.idempotency.redis_repository import RedisIdempotencyRepository from app.services.k8s_worker.config import K8sWorkerConfig from app.services.k8s_worker.pod_builder import PodBuilder -from app.settings import get_settings +from app.settings import Settings class KubernetesWorker(LifecycleEnabled): @@ -61,17 +51,19 @@ def __init__( config: K8sWorkerConfig, producer: UnifiedProducer, schema_registry_manager: SchemaRegistryManager, + settings: Settings, event_store: EventStore, idempotency_manager: IdempotencyManager, logger: logging.Logger, ): + super().__init__() self.logger = logger self.metrics = KubernetesMetrics() self.execution_metrics = ExecutionMetrics() self.config = config or K8sWorkerConfig() - settings = get_settings() + self._settings = settings - self.kafka_servers = self.config.kafka_bootstrap_servers or settings.KAFKA_BOOTSTRAP_SERVERS + self.kafka_servers = self.config.kafka_bootstrap_servers or self._settings.KAFKA_BOOTSTRAP_SERVERS self._event_store = event_store # Kubernetes clients @@ -88,17 +80,12 @@ def __init__( self.producer: UnifiedProducer = producer # State tracking - self._running = False self._active_creations: set[str] = set() self._creation_semaphore = asyncio.Semaphore(self.config.max_concurrent_pods) self._schema_registry_manager = schema_registry_manager - async def start(self) -> None: - """Start the Kubernetes worker""" - if self._running: - self.logger.warning("KubernetesWorker already running") - return - + async def _on_start(self) -> None: + """Start the Kubernetes worker.""" self.logger.info("Starting KubernetesWorker service...") 
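
The consumer wiring repeated across these workers follows one pattern: `UnifiedConsumer` now receives `schema_registry` and `settings` explicitly instead of calling `get_settings()` internally, and an `IdempotentConsumerWrapper` fronts the dispatcher. Condensed into a single hedged sketch — every keyword argument appears in the hunks of this diff, but the group id, topic, TTL, and key strategy are illustrative placeholders, not any one service's actual tuning:

```python
import logging

from app.domain.enums.kafka import KafkaTopic
from app.events.core import ConsumerConfig, EventDispatcher, UnifiedConsumer
from app.events.schema.schema_registry import SchemaRegistryManager
from app.services.idempotency import IdempotencyManager
from app.services.idempotency.middleware import IdempotentConsumerWrapper
from app.settings import Settings


async def build_idempotent_consumer(
    settings: Settings,
    schema_registry: SchemaRegistryManager,
    idempotency_manager: IdempotencyManager,
    logger: logging.Logger,
) -> IdempotentConsumerWrapper:
    config = ConsumerConfig(
        bootstrap_servers=settings.KAFKA_BOOTSTRAP_SERVERS,
        group_id=f"my-service.{settings.KAFKA_GROUP_SUFFIX}",  # placeholder group
        enable_auto_commit=False,
    )
    dispatcher = EventDispatcher(logger=logger)
    consumer = UnifiedConsumer(
        config,
        event_dispatcher=dispatcher,
        schema_registry=schema_registry,  # explicit dependency, no global lookup
        settings=settings,
        logger=logger,
    )
    wrapper = IdempotentConsumerWrapper(
        consumer=consumer,
        idempotency_manager=idempotency_manager,
        dispatcher=dispatcher,
        logger=logger,
        default_key_strategy="event_based",  # illustrative; services differ
        default_ttl_seconds=7200,
        enable_for_all_handlers=False,
    )
    await wrapper.start([KafkaTopic.SAGA_COMMANDS])  # placeholder topic
    return wrapper
```

Handlers still register on the dispatcher before wrapping, matching the "register handlers BEFORE wrapping with idempotency" note in the coordinator hunk.
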
self.logger.info("DEBUG: About to initialize Kubernetes client") @@ -118,7 +105,7 @@ async def start(self) -> None: # Create consumer configuration consumer_config = ConsumerConfig( bootstrap_servers=self.kafka_servers, - group_id=f"{self.config.consumer_group}.{get_settings().KAFKA_GROUP_SUFFIX}", + group_id=f"{self.config.consumer_group}.{self._settings.KAFKA_GROUP_SUFFIX}", enable_auto_commit=False, ) @@ -128,7 +115,13 @@ async def start(self) -> None: self.dispatcher.register_handler(EventType.DELETE_POD_COMMAND, self._handle_delete_pod_command_wrapper) # Create consumer with dispatcher - self.consumer = UnifiedConsumer(consumer_config, event_dispatcher=self.dispatcher, logger=self.logger) + self.consumer = UnifiedConsumer( + consumer_config, + event_dispatcher=self.dispatcher, + schema_registry=self._schema_registry_manager, + settings=self._settings, + logger=self.logger, + ) # Wrap consumer with idempotency - use content hash for pod commands self.idempotent_consumer = IdempotentConsumerWrapper( @@ -143,7 +136,6 @@ async def start(self) -> None: # Start the consumer with idempotency - listen to saga commands topic await self.idempotent_consumer.start([KafkaTopic.SAGA_COMMANDS]) - self._running = True # Create daemonset for image pre-pulling asyncio.create_task(self.ensure_image_pre_puller_daemonset()) @@ -151,13 +143,9 @@ async def start(self) -> None: self.logger.info("KubernetesWorker service started successfully") - async def stop(self) -> None: - """Stop the Kubernetes worker""" - if not self._running: - return - + async def _on_stop(self) -> None: + """Stop the Kubernetes worker.""" self.logger.info("Stopping KubernetesWorker service...") - self._running = False # Wait for active creations to complete if self._active_creations: @@ -178,9 +166,7 @@ async def stop(self) -> None: # Close idempotency manager await self.idempotency_manager.close() - # Stop producer if we created it - if self.producer: - await self.producer.stop() + # Note: producer is managed by DI container, not stopped here self.logger.info("KubernetesWorker service stopped") @@ -437,7 +423,7 @@ async def _publish_pod_creation_failed(self, command: CreatePodCommandEvent, err async def get_status(self) -> dict[str, Any]: """Get worker status""" return { - "running": self._running, + "running": self.is_running, "active_creations": len(self._active_creations), "config": { "namespace": self.config.namespace, @@ -514,83 +500,3 @@ async def ensure_image_pre_puller_daemonset(self) -> None: self.logger.error(f"K8s API error applying DaemonSet '{daemonset_name}': {e.reason}", exc_info=True) except Exception as e: self.logger.error(f"Unexpected error applying image-puller DaemonSet: {e}", exc_info=True) - - -async def run_kubernetes_worker() -> None: - """Run the Kubernetes worker service""" - import os - from contextlib import AsyncExitStack - - from app.core.logging import setup_logger - - logger = setup_logger(os.environ.get("LOG_LEVEL", "INFO")) - logger.info("Initializing database connection...") - settings = get_settings() - db_client: DBClient = AsyncMongoClient(settings.MONGODB_URL, tz_aware=True, serverSelectionTimeoutMS=5000) - db_name = settings.DATABASE_NAME - database = db_client[db_name] - await db_client.admin.command("ping") - logger.info(f"Connected to database: {db_name}") - - # Initialize Beanie ODM (indexes are idempotently created via Document.Settings.indexes) - await init_beanie(database=database, document_models=ALL_DOCUMENTS) - - logger.info("Initializing schema registry...") - 
schema_registry_manager = create_schema_registry_manager(logger) - await initialize_event_schemas(schema_registry_manager) - - logger.info("Creating event store...") - event_store = create_event_store(schema_registry_manager, logger) - - logger.info("Creating producer for Kubernetes worker...") - producer_config = ProducerConfig(bootstrap_servers=settings.KAFKA_BOOTSTRAP_SERVERS) - producer = UnifiedProducer(producer_config, schema_registry_manager, logger) - - config = K8sWorkerConfig() - r = redis.Redis( - host=settings.REDIS_HOST, - port=settings.REDIS_PORT, - db=settings.REDIS_DB, - password=settings.REDIS_PASSWORD, - ssl=settings.REDIS_SSL, - max_connections=settings.REDIS_MAX_CONNECTIONS, - decode_responses=settings.REDIS_DECODE_RESPONSES, - socket_connect_timeout=5, - socket_timeout=5, - ) - idem_repo = RedisIdempotencyRepository(r, key_prefix="idempotency") - idem_manager = create_idempotency_manager(repository=idem_repo, config=IdempotencyConfig(), logger=logger) - await idem_manager.initialize() - - worker = KubernetesWorker( - config=config, - producer=producer, - schema_registry_manager=schema_registry_manager, - event_store=event_store, - idempotency_manager=idem_manager, - logger=logger, - ) - - def signal_handler(sig: int, frame: Any) -> None: - logger.info(f"Received signal {sig}, initiating shutdown...") - asyncio.create_task(worker.stop()) - - signal.signal(signal.SIGINT, signal_handler) - signal.signal(signal.SIGTERM, signal_handler) - - async with AsyncExitStack() as stack: - await stack.enter_async_context(producer) - await stack.enter_async_context(worker) - stack.push_async_callback(idem_manager.close) - stack.push_async_callback(r.aclose) - stack.callback(db_client.close) - - while worker._running: - await asyncio.sleep(60) - status = await worker.get_status() - logger.info(f"Kubernetes worker status: {status}") - - -if __name__ == "__main__": - # Run worker as standalone service - asyncio.run(run_kubernetes_worker()) diff --git a/backend/app/services/kafka_event_service.py b/backend/app/services/kafka_event_service.py index 25a13f46..26bdbd21 100644 --- a/backend/app/services/kafka_event_service.py +++ b/backend/app/services/kafka_event_service.py @@ -286,4 +286,4 @@ async def publish_base_event(self, event: BaseEvent, key: str | None = None) -> async def close(self) -> None: """Close event service resources""" - await self.kafka_producer.stop() + await self.kafka_producer.aclose() diff --git a/backend/app/services/notification_service.py b/backend/app/services/notification_service.py index e04754e0..084de72c 100644 --- a/backend/app/services/notification_service.py +++ b/backend/app/services/notification_service.py @@ -43,7 +43,7 @@ from app.services.event_bus import EventBusManager from app.services.kafka_event_service import KafkaEventService from app.services.sse.redis_bus import SSERedisBus -from app.settings import Settings, get_settings +from app.settings import Settings # Constants ENTITY_EXECUTION_TAG = "entity:execution" @@ -218,7 +218,7 @@ async def _subscribe_to_events(self) -> None: # Configure consumer for notification-relevant events consumer_config = ConsumerConfig( bootstrap_servers=self.settings.KAFKA_BOOTSTRAP_SERVERS, - group_id=f"{GroupId.NOTIFICATION_SERVICE}.{get_settings().KAFKA_GROUP_SUFFIX}", + group_id=f"{GroupId.NOTIFICATION_SERVICE}.{self.settings.KAFKA_GROUP_SUFFIX}", max_poll_records=10, enable_auto_commit=True, auto_offset_reset="latest", # Only process new events @@ -237,7 +237,13 @@ async def _subscribe_to_events(self) -> None: 
self._dispatcher.register_handler(EventType.EXECUTION_TIMEOUT, self._handle_execution_event) # Create consumer with dispatcher - self._consumer = UnifiedConsumer(consumer_config, event_dispatcher=self._dispatcher, logger=self.logger) + self._consumer = UnifiedConsumer( + consumer_config, + event_dispatcher=self._dispatcher, + schema_registry=self.schema_registry_manager, + settings=self.settings, + logger=self.logger, + ) # Start consumer await self._consumer.start([execution_results_topic]) @@ -858,7 +864,7 @@ async def _deliver_notification(self, notification: DomainNotification) -> None: assert subscription is not None # Send through channel - start_time = asyncio.get_event_loop().time() + start_time = asyncio.get_running_loop().time() try: handler = self._channel_handlers.get(notification.channel) if handler is None: @@ -869,7 +875,7 @@ async def _deliver_notification(self, notification: DomainNotification) -> None: self.logger.debug(f"Using handler {handler.__name__} for channel {notification.channel}") await handler(notification, subscription) - delivery_time = asyncio.get_event_loop().time() - start_time + delivery_time = asyncio.get_running_loop().time() - start_time # Mark delivered await self.repository.update_notification( diff --git a/backend/app/services/pod_monitor/monitor.py b/backend/app/services/pod_monitor/monitor.py index b2512e68..bdc61583 100644 --- a/backend/app/services/pod_monitor/monitor.py +++ b/backend/app/services/pod_monitor/monitor.py @@ -1,34 +1,25 @@ import asyncio import logging -import signal import time from collections.abc import AsyncIterator -from contextlib import AsyncExitStack, asynccontextmanager +from contextlib import asynccontextmanager from dataclasses import dataclass from enum import auto from typing import Any -from beanie import init_beanie from kubernetes import client as k8s_client from kubernetes import config as k8s_config from kubernetes import watch from kubernetes.client.rest import ApiException -from pymongo.asynchronous.mongo_client import AsyncMongoClient -from app.core.k8s_clients import K8sClients, close_k8s_clients, create_k8s_clients +from app.core.k8s_clients import K8sClients from app.core.lifecycle import LifecycleEnabled -from app.core.logging import setup_logger from app.core.metrics.context import get_kubernetes_metrics from app.core.utils import StringEnum -from app.db.docs import ALL_DOCUMENTS -from app.db.repositories.event_repository import EventRepository -from app.events.core import ProducerConfig, UnifiedProducer -from app.events.schema.schema_registry import create_schema_registry_manager, initialize_event_schemas from app.infrastructure.kafka.events import BaseEvent from app.services.kafka_event_service import KafkaEventService from app.services.pod_monitor.config import PodMonitorConfig from app.services.pod_monitor.event_mapper import PodEventMapper -from app.settings import get_settings # Type aliases type PodName = str @@ -116,6 +107,7 @@ def __init__( k8s_clients: K8sClients | None = None, ) -> None: """Initialize the pod monitor.""" + super().__init__() self.logger = logger self.config = config or PodMonitorConfig() @@ -146,12 +138,8 @@ def state(self) -> MonitorState: """Get current monitor state.""" return self._state - async def start(self) -> None: + async def _on_start(self) -> None: """Start the pod monitor.""" - if self._state != MonitorState.IDLE: - self.logger.warning(f"Cannot start monitor in state: {self._state}") - return - self.logger.info("Starting PodMonitor service...") # Initialize 
components @@ -167,11 +155,8 @@ async def start(self) -> None: self.logger.info("PodMonitor service started successfully") - async def stop(self) -> None: + async def _on_stop(self) -> None: """Stop the pod monitor.""" - if self._state == MonitorState.STOPPED: - return - self.logger.info("Stopping PodMonitor service...") self._state = MonitorState.STOPPING @@ -526,79 +511,5 @@ async def create_pod_monitor( k8s_clients=k8s_clients, ) - try: - await monitor.start() + async with monitor: yield monitor - finally: - await monitor.stop() - - -async def run_pod_monitor() -> None: - """Run the pod monitor service.""" - import os - - logger = setup_logger(os.environ.get("LOG_LEVEL", "INFO")) - settings = get_settings() - - # Initialize MongoDB - db_client: AsyncMongoClient[Any] = AsyncMongoClient( - settings.MONGODB_URL, tz_aware=True, serverSelectionTimeoutMS=5000 - ) - database = db_client[settings.DATABASE_NAME] - await db_client.admin.command("ping") - logger.info(f"Connected to database: {settings.DATABASE_NAME}") - await init_beanie(database=database, document_models=ALL_DOCUMENTS) - - # Initialize schema registry - schema_registry_manager = create_schema_registry_manager(logger) - await initialize_event_schemas(schema_registry_manager) - - # Create producer - producer_config = ProducerConfig(bootstrap_servers=settings.KAFKA_BOOTSTRAP_SERVERS) - producer = UnifiedProducer(producer_config, schema_registry_manager, logger) - - # Create KafkaEventService (stores events + publishes to Kafka) - event_repository = EventRepository(logger) - kafka_event_service = KafkaEventService( - event_repository=event_repository, - kafka_producer=producer, - logger=logger, - ) - - # Create monitor - monitor_config = PodMonitorConfig() - clients = create_k8s_clients(logger) - monitor = PodMonitor( - config=monitor_config, - kafka_event_service=kafka_event_service, - logger=logger, - k8s_clients=clients, - ) - - # Setup signal handlers - loop = asyncio.get_running_loop() - - async def shutdown() -> None: - """Shutdown handler.""" - logger.info("Initiating graceful shutdown...") - await monitor.stop() - await producer.stop() - await db_client.close() - - for sig in (signal.SIGINT, signal.SIGTERM): - loop.add_signal_handler(sig, lambda: asyncio.create_task(shutdown())) - - async with AsyncExitStack() as stack: - stack.callback(close_k8s_clients, clients) - await stack.enter_async_context(producer) - await stack.enter_async_context(monitor) - - while monitor.state == MonitorState.RUNNING: - await asyncio.sleep(RECONCILIATION_LOG_INTERVAL) - status = await monitor.get_status() - logger.info(f"Pod monitor status: {status}") - - -if __name__ == "__main__": - settings = get_settings() - asyncio.run(run_pod_monitor()) diff --git a/backend/app/services/result_processor/processor.py b/backend/app/services/result_processor/processor.py index cb88b9c0..45ed07ca 100644 --- a/backend/app/services/result_processor/processor.py +++ b/backend/app/services/result_processor/processor.py @@ -1,18 +1,12 @@ -import asyncio import logging -from contextlib import AsyncExitStack from enum import auto from typing import Any -from beanie import init_beanie from pydantic import BaseModel, ConfigDict, Field -from pymongo.asynchronous.mongo_client import AsyncMongoClient -from app.core.container import create_result_processor_container from app.core.lifecycle import LifecycleEnabled from app.core.metrics.context import get_execution_metrics from app.core.utils import StringEnum -from app.db.docs import ALL_DOCUMENTS from 
app.db.repositories.execution_repository import ExecutionRepository from app.domain.enums.events import EventType from app.domain.enums.execution import ExecutionStatus @@ -20,6 +14,7 @@ from app.domain.enums.storage import ExecutionErrorType, StorageType from app.domain.execution import ExecutionNotFoundError, ExecutionResultDomain from app.events.core import ConsumerConfig, EventDispatcher, UnifiedConsumer, UnifiedProducer +from app.events.schema.schema_registry import SchemaRegistryManager from app.infrastructure.kafka import BaseEvent from app.infrastructure.kafka.events.execution import ( ExecutionCompletedEvent, @@ -33,7 +28,7 @@ ) from app.services.idempotency import IdempotencyManager from app.services.idempotency.middleware import IdempotentConsumerWrapper -from app.settings import get_settings +from app.settings import Settings class ProcessingState(StringEnum): @@ -69,13 +64,18 @@ def __init__( self, execution_repo: ExecutionRepository, producer: UnifiedProducer, + schema_registry: SchemaRegistryManager, + settings: Settings, idempotency_manager: IdempotencyManager, logger: logging.Logger, ) -> None: """Initialize the result processor.""" + super().__init__() self.config = ResultProcessorConfig() self._execution_repo = execution_repo self._producer = producer + self._schema_registry = schema_registry + self._settings = settings self._metrics = get_execution_metrics() self._idempotency_manager: IdempotencyManager = idempotency_manager self._state = ProcessingState.IDLE @@ -83,12 +83,8 @@ def __init__( self._dispatcher: EventDispatcher | None = None self.logger = logger - async def start(self) -> None: + async def _on_start(self) -> None: """Start the result processor.""" - if self._state != ProcessingState.IDLE: - self.logger.warning(f"Cannot start processor in state: {self._state}") - return - self.logger.info("Starting ResultProcessor...") # Initialize idempotency manager (safe to call multiple times) @@ -100,11 +96,8 @@ async def start(self) -> None: self._state = ProcessingState.PROCESSING self.logger.info("ResultProcessor started successfully with idempotency protection") - async def stop(self) -> None: + async def _on_stop(self) -> None: """Stop the result processor.""" - if self._state == ProcessingState.STOPPED: - return - self.logger.info("Stopping ResultProcessor...") self._state = ProcessingState.STOPPED @@ -112,7 +105,7 @@ async def stop(self) -> None: await self._consumer.stop() await self._idempotency_manager.close() - await self._producer.stop() + # Note: producer is managed by DI container, not stopped here self.logger.info("ResultProcessor stopped") def _create_dispatcher(self) -> EventDispatcher: @@ -128,10 +121,9 @@ def _create_dispatcher(self) -> EventDispatcher: async def _create_consumer(self) -> IdempotentConsumerWrapper: """Create and configure idempotent Kafka consumer.""" - settings = get_settings() consumer_config = ConsumerConfig( - bootstrap_servers=settings.KAFKA_BOOTSTRAP_SERVERS, - group_id=f"{self.config.consumer_group}.{settings.KAFKA_GROUP_SUFFIX}", + bootstrap_servers=self._settings.KAFKA_BOOTSTRAP_SERVERS, + group_id=f"{self.config.consumer_group}.{self._settings.KAFKA_GROUP_SUFFIX}", max_poll_records=1, enable_auto_commit=True, auto_offset_reset="earliest", @@ -141,7 +133,13 @@ async def _create_consumer(self) -> IdempotentConsumerWrapper: if not self._dispatcher: raise RuntimeError("Event dispatcher not initialized") - base_consumer = UnifiedConsumer(consumer_config, event_dispatcher=self._dispatcher, logger=self.logger) + base_consumer = 
UnifiedConsumer( + consumer_config, + event_dispatcher=self._dispatcher, + schema_registry=self._schema_registry, + settings=self._settings, + logger=self.logger, + ) wrapper = IdempotentConsumerWrapper( consumer=base_consumer, idempotency_manager=self._idempotency_manager, @@ -187,7 +185,7 @@ async def _handle_completed(self, event: ExecutionCompletedEvent) -> None: self._metrics.record_memory_usage(memory_mib, lang_and_version) # Calculate and record memory utilization percentage - settings_limit = get_settings().K8S_POD_MEMORY_LIMIT + settings_limit = self._settings.K8S_POD_MEMORY_LIMIT memory_limit_mib = int(settings_limit.rstrip("Mi")) # TODO: Less brittle acquisition of limit memory_percent = (memory_mib / memory_limit_mib) * 100 self._metrics.memory_utilization_percent.record( @@ -308,37 +306,3 @@ async def get_status(self) -> dict[str, Any]: "state": self._state.value, "consumer_active": self._consumer is not None, } - - -async def run_result_processor() -> None: - settings = get_settings() - - # Initialize MongoDB and Beanie ODM (required per-process) - db_client: AsyncMongoClient[dict[str, object]] = AsyncMongoClient( - settings.MONGODB_URL, tz_aware=True, serverSelectionTimeoutMS=5000 - ) - await init_beanie(database=db_client[settings.DATABASE_NAME], document_models=ALL_DOCUMENTS) - - container = create_result_processor_container() - producer = await container.get(UnifiedProducer) - idempotency_manager = await container.get(IdempotencyManager) - execution_repo = await container.get(ExecutionRepository) - logger = await container.get(logging.Logger) - logger.info(f"Beanie ODM initialized with {len(ALL_DOCUMENTS)} document models") - - processor = ResultProcessor( - execution_repo=execution_repo, - producer=producer, - idempotency_manager=idempotency_manager, - logger=logger, - ) - - async with AsyncExitStack() as stack: - await stack.enter_async_context(processor) - stack.push_async_callback(container.close) - stack.callback(db_client.close) - - while True: - await asyncio.sleep(60) - status = await processor.get_status() - logger.info(f"ResultProcessor status: {status}") diff --git a/backend/app/services/result_processor/resource_cleaner.py b/backend/app/services/result_processor/resource_cleaner.py index 1a48a1da..db6ff518 100644 --- a/backend/app/services/result_processor/resource_cleaner.py +++ b/backend/app/services/result_processor/resource_cleaner.py @@ -87,7 +87,7 @@ async def _delete_pod(self, pod_name: str, namespace: str) -> None: raise InvalidStateError("Kubernetes client not initialized") try: - loop = asyncio.get_event_loop() + loop = asyncio.get_running_loop() await loop.run_in_executor(None, self.v1.read_namespaced_pod, pod_name, namespace) await loop.run_in_executor( @@ -134,7 +134,7 @@ async def _delete_labeled_resources( ) -> None: """Generic function to delete labeled resources""" try: - loop = asyncio.get_event_loop() + loop = asyncio.get_running_loop() label_selector = f"execution-id={execution_id}" resources = await loop.run_in_executor(None, partial(list_func, namespace, label_selector=label_selector)) @@ -179,7 +179,7 @@ async def _cleanup_orphaned_pods( if not self.v1: raise InvalidStateError("Kubernetes client not initialized") - loop = asyncio.get_event_loop() + loop = asyncio.get_running_loop() pods = await loop.run_in_executor( None, partial(self.v1.list_namespaced_pod, namespace, label_selector="app=integr8s") ) @@ -206,7 +206,7 @@ async def _cleanup_orphaned_configmaps( if not self.v1: raise InvalidStateError("Kubernetes client not initialized") - 
loop = asyncio.get_event_loop() + loop = asyncio.get_running_loop() configmaps = await loop.run_in_executor( None, partial(self.v1.list_namespaced_config_map, namespace, label_selector="app=integr8s") ) @@ -227,7 +227,7 @@ async def get_resource_usage(self, namespace: str = "default") -> CountDict: """Get current resource usage counts""" await self.initialize() - loop = asyncio.get_event_loop() + loop = asyncio.get_running_loop() label_selector = "app=integr8s" default_counts = {"pods": 0, "configmaps": 0, "network_policies": 0} diff --git a/backend/app/services/saga/saga_orchestrator.py b/backend/app/services/saga/saga_orchestrator.py index e200bf63..6a6f0987 100644 --- a/backend/app/services/saga/saga_orchestrator.py +++ b/backend/app/services/saga/saga_orchestrator.py @@ -14,20 +14,19 @@ from app.domain.saga.models import Saga, SagaConfig from app.events.core import ConsumerConfig, EventDispatcher, UnifiedConsumer, UnifiedProducer from app.events.event_store import EventStore +from app.events.schema.schema_registry import SchemaRegistryManager from app.infrastructure.kafka.events.base import BaseEvent from app.infrastructure.kafka.events.metadata import AvroEventMetadata as EventMetadata from app.infrastructure.kafka.events.saga import SagaCancelledEvent from app.infrastructure.kafka.mappings import get_topic_for_event from app.services.idempotency import IdempotentConsumerWrapper from app.services.idempotency.idempotency_manager import IdempotencyManager -from app.settings import get_settings +from app.settings import Settings from .base_saga import BaseSaga from .execution_saga import ExecutionSaga from .saga_step import SagaContext -logger = logging.getLogger(__name__) - class SagaOrchestrator(LifecycleEnabled): """Orchestrates saga execution and compensation""" @@ -37,36 +36,39 @@ def __init__( config: SagaConfig, saga_repository: SagaRepository, producer: UnifiedProducer, + schema_registry_manager: SchemaRegistryManager, + settings: Settings, event_store: EventStore, idempotency_manager: IdempotencyManager, resource_allocation_repository: ResourceAllocationRepository, + logger: logging.Logger, ): + super().__init__() self.config = config self._sagas: dict[str, type[BaseSaga]] = {} self._running_instances: dict[str, Saga] = {} self._consumer: IdempotentConsumerWrapper | None = None self._idempotency_manager: IdempotencyManager = idempotency_manager self._producer = producer + self._schema_registry_manager = schema_registry_manager + self._settings = settings self._event_store = event_store self._repo: SagaRepository = saga_repository self._alloc_repo: ResourceAllocationRepository = resource_allocation_repository - self._running = False self._tasks: list[asyncio.Task[None]] = [] + self.logger = logger def register_saga(self, saga_class: type[BaseSaga]) -> None: self._sagas[saga_class.get_name()] = saga_class - logger.info(f"Registered saga: {saga_class.get_name()}") + self.logger.info(f"Registered saga: {saga_class.get_name()}") def _register_default_sagas(self) -> None: self.register_saga(ExecutionSaga) - logger.info("Registered default sagas") - - @property - def is_running(self) -> bool: - return self._running + self.logger.info("Registered default sagas") - async def start(self) -> None: - logger.info(f"Starting saga orchestrator: {self.config.name}") + async def _on_start(self) -> None: + """Start the saga orchestrator.""" + self.logger.info(f"Starting saga orchestrator: {self.config.name}") self._register_default_sagas() @@ -75,13 +77,11 @@ async def start(self) -> None: 
timeout_task = asyncio.create_task(self._check_timeouts()) self._tasks.append(timeout_task) - self._running = True - logger.info("Saga orchestrator started") - - async def stop(self) -> None: - logger.info("Stopping saga orchestrator...") + self.logger.info("Saga orchestrator started") - self._running = False + async def _on_stop(self) -> None: + """Stop the saga orchestrator.""" + self.logger.info("Stopping saga orchestrator...") if self._consumer: await self._consumer.stop() @@ -95,50 +95,51 @@ async def stop(self) -> None: if self._tasks: await asyncio.gather(*self._tasks, return_exceptions=True) - logger.info("Saga orchestrator stopped") + self.logger.info("Saga orchestrator stopped") async def _start_consumer(self) -> None: - logger.info(f"Registered sagas: {list(self._sagas.keys())}") + self.logger.info(f"Registered sagas: {list(self._sagas.keys())}") topics = set() event_types_to_register = set() for saga_class in self._sagas.values(): trigger_event_types = saga_class.get_trigger_events() - logger.info(f"Saga {saga_class.get_name()} triggers on event types: {trigger_event_types}") + self.logger.info(f"Saga {saga_class.get_name()} triggers on event types: {trigger_event_types}") # Convert event types to topics for subscription for event_type in trigger_event_types: topic = get_topic_for_event(event_type) topics.add(topic) event_types_to_register.add(event_type) - logger.debug(f"Event type {event_type} maps to topic {topic}") + self.logger.debug(f"Event type {event_type} maps to topic {topic}") if not topics: - logger.warning("No trigger events found in registered sagas") + self.logger.warning("No trigger events found in registered sagas") return - settings = get_settings() consumer_config = ConsumerConfig( - bootstrap_servers=settings.KAFKA_BOOTSTRAP_SERVERS, - group_id=f"saga-{self.config.name}.{settings.KAFKA_GROUP_SUFFIX}", + bootstrap_servers=self._settings.KAFKA_BOOTSTRAP_SERVERS, + group_id=f"saga-{self.config.name}.{self._settings.KAFKA_GROUP_SUFFIX}", enable_auto_commit=False, ) - dispatcher = EventDispatcher(logger=logger) + dispatcher = EventDispatcher(logger=self.logger) for event_type in event_types_to_register: dispatcher.register_handler(event_type, self._handle_event) - logger.info(f"Registered handler for event type: {event_type}") + self.logger.info(f"Registered handler for event type: {event_type}") base_consumer = UnifiedConsumer( config=consumer_config, event_dispatcher=dispatcher, - logger=logger, + schema_registry=self._schema_registry_manager, + settings=self._settings, + logger=self.logger, ) self._consumer = IdempotentConsumerWrapper( consumer=base_consumer, idempotency_manager=self._idempotency_manager, dispatcher=dispatcher, - logger=logger, + logger=self.logger, default_key_strategy="event_based", default_ttl_seconds=7200, enable_for_all_handlers=False, @@ -147,33 +148,33 @@ async def _start_consumer(self) -> None: assert self._consumer is not None await self._consumer.start(list(topics)) - logger.info(f"Saga consumer started for topics: {topics}") + self.logger.info(f"Saga consumer started for topics: {topics}") async def _handle_event(self, event: BaseEvent) -> None: """Handle incoming event""" - logger.info(f"Saga orchestrator handling event: type={event.event_type}, id={event.event_id}") + self.logger.info(f"Saga orchestrator handling event: type={event.event_type}, id={event.event_id}") try: saga_triggered = False for saga_name, saga_class in self._sagas.items(): - logger.debug(f"Checking if {saga_name} should be triggered by {event.event_type}") + 
self.logger.debug(f"Checking if {saga_name} should be triggered by {event.event_type}") if self._should_trigger_saga(saga_class, event): - logger.info(f"Event {event.event_type} triggers saga {saga_name}") + self.logger.info(f"Event {event.event_type} triggers saga {saga_name}") saga_triggered = True saga_id = await self._start_saga(saga_name, event) if not saga_id: raise RuntimeError(f"Failed to create saga {saga_name} for event {event.event_id}") if not saga_triggered: - logger.debug(f"Event {event.event_type} did not trigger any saga") + self.logger.debug(f"Event {event.event_type} did not trigger any saga") except Exception as e: - logger.error(f"Error handling event {event.event_id}: {e}", exc_info=True) + self.logger.error(f"Error handling event {event.event_id}: {e}", exc_info=True) raise def _should_trigger_saga(self, saga_class: type[BaseSaga], event: BaseEvent) -> bool: trigger_event_types = saga_class.get_trigger_events() should_trigger = event.event_type in trigger_event_types - logger.debug( + self.logger.debug( f"Saga {saga_class.get_name()} triggers on {trigger_event_types}, " f"event is {event.event_type}, should trigger: {should_trigger}" ) @@ -181,20 +182,20 @@ def _should_trigger_saga(self, saga_class: type[BaseSaga], event: BaseEvent) -> async def _start_saga(self, saga_name: str, trigger_event: BaseEvent) -> str | None: """Start a new saga instance""" - logger.info(f"Starting saga {saga_name} for event {trigger_event.event_type}") + self.logger.info(f"Starting saga {saga_name} for event {trigger_event.event_type}") saga_class = self._sagas.get(saga_name) if not saga_class: raise ValueError(f"Unknown saga: {saga_name}") execution_id = getattr(trigger_event, "execution_id", None) - logger.debug(f"Extracted execution_id={execution_id} from event") + self.logger.debug(f"Extracted execution_id={execution_id} from event") if not execution_id: - logger.warning(f"Could not extract execution ID from event: {trigger_event}") + self.logger.warning(f"Could not extract execution ID from event: {trigger_event}") return None existing = await self._repo.get_saga_by_execution_and_name(execution_id, saga_name) if existing: - logger.info(f"Saga {saga_name} already exists for execution {execution_id}") + self.logger.info(f"Saga {saga_name} already exists for execution {execution_id}") saga_id: str = existing.saga_id return saga_id @@ -208,7 +209,7 @@ async def _start_saga(self, saga_name: str, trigger_event: BaseEvent) -> str | N await self._save_saga(instance) self._running_instances[instance.saga_id] = instance - logger.info(f"Started saga {saga_name} (ID: {instance.saga_id}) for execution {execution_id}") + self.logger.info(f"Started saga {saga_name} (ID: {instance.saga_id}) for execution {execution_id}") saga = saga_class() # Inject runtime dependencies explicitly (no DI via context) @@ -243,14 +244,14 @@ async def _execute_saga( # Execute each step for step in steps: - if not self._running: + if not self.is_running: break # Update current step instance.current_step = step.name await self._save_saga(instance) - logger.info(f"Executing saga step: {step.name} for saga {instance.saga_id}") + self.logger.info(f"Executing saga step: {step.name} for saga {instance.saga_id}") # Execute step within a span with tracer.start_as_current_span( @@ -277,7 +278,7 @@ async def _execute_saga( context.add_compensation(compensation) else: # Step failed, start compensation - logger.error(f"Saga step {step.name} failed for saga {instance.saga_id}") + self.logger.error(f"Saga step {step.name} failed for 
saga {instance.saga_id}") if self.config.enable_compensation: await self._compensate_saga(instance, context) @@ -290,7 +291,7 @@ async def _execute_saga( await self._complete_saga(instance) except Exception as e: - logger.error(f"Error executing saga {instance.saga_id}: {e}", exc_info=True) + self.logger.error(f"Error executing saga {instance.saga_id}: {e}", exc_info=True) if self.config.enable_compensation: await self._compensate_saga(instance, context) @@ -299,7 +300,7 @@ async def _execute_saga( async def _compensate_saga(self, instance: Saga, context: SagaContext) -> None: """Execute compensation steps""" - logger.info(f"Starting compensation for saga {instance.saga_id}") + self.logger.info(f"Starting compensation for saga {instance.saga_id}") # Only update state if not already cancelled if instance.state != SagaState.CANCELLED: @@ -309,24 +310,24 @@ async def _compensate_saga(self, instance: Saga, context: SagaContext) -> None: # Execute compensations in reverse order for compensation in reversed(context.compensations): try: - logger.info(f"Executing compensation: {compensation.name} for saga {instance.saga_id}") + self.logger.info(f"Executing compensation: {compensation.name} for saga {instance.saga_id}") success = await compensation.compensate(context) if success: instance.compensated_steps.append(compensation.name) else: - logger.error(f"Compensation {compensation.name} failed for saga {instance.saga_id}") + self.logger.error(f"Compensation {compensation.name} failed for saga {instance.saga_id}") except Exception as e: - logger.error(f"Error in compensation {compensation.name}: {e}", exc_info=True) + self.logger.error(f"Error in compensation {compensation.name}: {e}", exc_info=True) # Mark saga as failed or keep as cancelled if instance.state == SagaState.CANCELLED: # Keep cancelled state but update compensated steps instance.updated_at = datetime.now(UTC) await self._save_saga(instance) - logger.info(f"Saga {instance.saga_id} compensation completed after cancellation") + self.logger.info(f"Saga {instance.saga_id} compensation completed after cancellation") else: # Mark as failed for non-cancelled compensations await self._fail_saga(instance, "Saga compensated due to failure") @@ -340,7 +341,7 @@ async def _complete_saga(self, instance: Saga) -> None: # Remove from running instances self._running_instances.pop(instance.saga_id, None) - logger.info(f"Saga {instance.saga_id} completed successfully") + self.logger.info(f"Saga {instance.saga_id} completed successfully") async def _fail_saga(self, instance: Saga, error_message: str) -> None: """Mark saga as failed""" @@ -352,11 +353,11 @@ async def _fail_saga(self, instance: Saga, error_message: str) -> None: # Remove from running instances self._running_instances.pop(instance.saga_id, None) - logger.error(f"Saga {instance.saga_id} failed: {error_message}") + self.logger.error(f"Saga {instance.saga_id} failed: {error_message}") async def _check_timeouts(self) -> None: """Check for saga timeouts""" - while self._running: + while self.is_running: try: # Check every 30 seconds await asyncio.sleep(30) @@ -366,7 +367,7 @@ async def _check_timeouts(self) -> None: timed_out = await self._repo.find_timed_out_sagas(cutoff_time) for instance in timed_out: - logger.warning(f"Saga {instance.saga_id} timed out") + self.logger.warning(f"Saga {instance.saga_id} timed out") instance.state = SagaState.TIMEOUT instance.error_message = f"Saga timed out after {self.config.timeout_seconds} seconds" @@ -376,7 +377,7 @@ async def _check_timeouts(self) -> 
None: self._running_instances.pop(instance.saga_id, None) except Exception as e: - logger.error(f"Error checking timeouts: {e}") + self.logger.error(f"Error checking timeouts: {e}") async def _save_saga(self, instance: Saga) -> None: """Persist saga through repository""" @@ -409,14 +410,14 @@ async def cancel_saga(self, saga_id: str) -> bool: # Get saga instance saga_instance = await self.get_saga_status(saga_id) if not saga_instance: - logger.error(f"Saga {saga_id} not found") + self.logger.error("Saga not found", extra={"saga_id": saga_id}) return False # Check if saga can be cancelled if saga_instance.state not in [SagaState.RUNNING, SagaState.CREATED]: - logger.warning( - f"Cannot cancel saga {saga_id} in state {saga_instance.state}. " - f"Only RUNNING or CREATED sagas can be cancelled." + self.logger.warning( + "Cannot cancel saga in current state. Only RUNNING or CREATED sagas can be cancelled.", + extra={"saga_id": saga_id, "state": saga_instance.state.value}, ) return False @@ -427,10 +428,14 @@ async def cancel_saga(self, saga_id: str) -> bool: # Log cancellation with user context if available user_id = saga_instance.context_data.get("user_id") - if user_id: - logger.info(f"User {user_id} cancelled saga {saga_id} (execution: {saga_instance.execution_id})") - else: - logger.info(f"Saga {saga_id} cancelled (execution: {saga_instance.execution_id})") + self.logger.info( + "Saga cancellation initiated", + extra={ + "saga_id": saga_id, + "execution_id": saga_instance.execution_id, + "user_id": user_id, + }, + ) # Save state await self._save_saga(saga_instance) @@ -474,13 +479,20 @@ async def cancel_saga(self, saga_id: str) -> bool: # Execute compensation await self._compensate_saga(saga_instance, context) else: - logger.error(f"Saga class {saga_instance.saga_name} not found for compensation") + self.logger.error( + "Saga class not found for compensation", + extra={"saga_name": saga_instance.saga_name, "saga_id": saga_id}, + ) - logger.info(f"Saga {saga_id} cancelled successfully") + self.logger.info("Saga cancelled successfully", extra={"saga_id": saga_id}) return True except Exception as e: - logger.error(f"Error cancelling saga {saga_id}: {e}", exc_info=True) + self.logger.error( + "Error cancelling saga", + extra={"saga_id": saga_id, "error": str(e)}, + exc_info=True, + ) return False async def _publish_saga_cancelled_event(self, saga_instance: Saga) -> None: @@ -512,26 +524,35 @@ async def _publish_saga_cancelled_event(self, saga_instance: Saga) -> None: if self._producer: await self._producer.produce(event_to_produce=event, key=saga_instance.execution_id) - logger.info(f"Published cancellation event for saga {saga_instance.saga_id}") + self.logger.info(f"Published cancellation event for saga {saga_instance.saga_id}") except Exception as e: - logger.error(f"Failed to publish saga cancellation event: {e}") + self.logger.error(f"Failed to publish saga cancellation event: {e}") def create_saga_orchestrator( saga_repository: SagaRepository, producer: UnifiedProducer, + schema_registry_manager: SchemaRegistryManager, + settings: Settings, event_store: EventStore, idempotency_manager: IdempotencyManager, resource_allocation_repository: ResourceAllocationRepository, config: SagaConfig, + logger: logging.Logger, ) -> SagaOrchestrator: """Factory function to create a saga orchestrator. 
Args: + saga_repository: Repository for saga persistence producer: Kafka producer instance + schema_registry_manager: Schema registry manager for event serialization + settings: Application settings event_store: Event store instance for event sourcing - config: Optional saga configuration (uses defaults if not provided) + idempotency_manager: Manager for idempotent event processing + resource_allocation_repository: Repository for resource allocations + config: Saga configuration + logger: Logger instance Returns: A new saga orchestrator instance @@ -540,7 +561,10 @@ def create_saga_orchestrator( config, saga_repository=saga_repository, producer=producer, + schema_registry_manager=schema_registry_manager, + settings=settings, event_store=event_store, idempotency_manager=idempotency_manager, resource_allocation_repository=resource_allocation_repository, + logger=logger, ) diff --git a/backend/app/services/sse/kafka_redis_bridge.py b/backend/app/services/sse/kafka_redis_bridge.py index f34b29c3..478e3420 100644 --- a/backend/app/services/sse/kafka_redis_bridge.py +++ b/backend/app/services/sse/kafka_redis_bridge.py @@ -1,6 +1,5 @@ from __future__ import annotations -import asyncio import logging import os @@ -32,6 +31,7 @@ def __init__( sse_bus: SSERedisBus, logger: logging.Logger, ) -> None: + super().__init__() self.schema_registry = schema_registry self.settings = settings self.event_metrics = event_metrics @@ -41,39 +41,25 @@ def __init__( self.num_consumers = settings.SSE_CONSUMER_POOL_SIZE self.consumers: list[UnifiedConsumer] = [] - self._lock = asyncio.Lock() - self._running = False - self._initialized = False + async def _on_start(self) -> None: + """Start the SSE Kafka→Redis bridge.""" + self.logger.info(f"Starting SSE Kafka→Redis bridge with {self.num_consumers} consumers") - async def start(self) -> None: - async with self._lock: - if self._initialized: - return - - self.logger.info(f"Starting SSE Kafka→Redis bridge with {self.num_consumers} consumers") - - for i in range(self.num_consumers): - consumer = await self._create_consumer(i) - self.consumers.append(consumer) + for i in range(self.num_consumers): + consumer = await self._create_consumer(i) + self.consumers.append(consumer) - self._running = True - self._initialized = True - self.logger.info("SSE Kafka→Redis bridge started successfully") + self.logger.info("SSE Kafka→Redis bridge started successfully") - async def stop(self) -> None: - async with self._lock: - if not self._initialized: - return - - self.logger.info("Stopping SSE Kafka→Redis bridge") - self._running = False + async def _on_stop(self) -> None: + """Stop the SSE Kafka→Redis bridge.""" + self.logger.info("Stopping SSE Kafka→Redis bridge") - for consumer in self.consumers: - await consumer.stop() + for consumer in self.consumers: + await consumer.stop() - self.consumers.clear() - self._initialized = False - self.logger.info("SSE Kafka→Redis bridge stopped") + self.consumers.clear() + self.logger.info("SSE Kafka→Redis bridge stopped") async def _create_consumer(self, consumer_index: int) -> UnifiedConsumer: suffix = os.environ.get("KAFKA_GROUP_SUFFIX", "") @@ -98,7 +84,13 @@ async def _create_consumer(self, consumer_index: int) -> UnifiedConsumer: dispatcher = EventDispatcher(logger=self.logger) self._register_routing_handlers(dispatcher) - consumer = UnifiedConsumer(config=config, event_dispatcher=dispatcher, logger=self.logger) + consumer = UnifiedConsumer( + config=config, + event_dispatcher=dispatcher, + schema_registry=self.schema_registry, + 
settings=self.settings, + logger=self.logger, + ) topics = [ KafkaTopic.EXECUTION_EVENTS, @@ -158,7 +150,7 @@ def get_stats(self) -> dict[str, int | bool]: "num_consumers": len(self.consumers), "active_executions": 0, "total_buffers": 0, - "is_running": self._running, + "is_running": self.is_running, } diff --git a/backend/app/services/sse/redis_bus.py b/backend/app/services/sse/redis_bus.py index 979edd16..03123138 100644 --- a/backend/app/services/sse/redis_bus.py +++ b/backend/app/services/sse/redis_bus.py @@ -38,7 +38,7 @@ async def close(self) -> None: try: await self._pubsub.unsubscribe(self._channel) finally: - await self._pubsub.aclose() # type: ignore[no-untyped-call] + await self._pubsub.aclose() # type: ignore[no-untyped-call] # redis-py PubSub.aclose lacks annotations class SSERedisBus: diff --git a/backend/app/services/sse/sse_shutdown_manager.py b/backend/app/services/sse/sse_shutdown_manager.py index 086682b9..dcc74873 100644 --- a/backend/app/services/sse/sse_shutdown_manager.py +++ b/backend/app/services/sse/sse_shutdown_manager.py @@ -257,7 +257,7 @@ async def _force_close_connections(self) -> None: # If we have a router, tell it to stop accepting new subscriptions if self._router: - await self._router.stop() + await self._router.aclose() self.metrics.update_sse_draining_connections(0) self.logger.info("Force close phase complete") diff --git a/backend/app/services/user_settings_service.py b/backend/app/services/user_settings_service.py index 1c9a888d..34f996cf 100644 --- a/backend/app/services/user_settings_service.py +++ b/backend/app/services/user_settings_service.py @@ -119,9 +119,7 @@ async def update_user_settings( await self.repository.create_snapshot(new_settings) return new_settings - async def _publish_settings_event( - self, user_id: str, changes: dict[str, Any], reason: str | None - ) -> None: + async def _publish_settings_event(self, user_id: str, changes: dict[str, Any], reason: str | None) -> None: """Publish settings update event with typed payload fields.""" await self.event_service.publish_event( event_type=EventType.USER_SETTINGS_UPDATED, diff --git a/backend/pyproject.toml b/backend/pyproject.toml index d92e2dde..9f68a2c7 100644 --- a/backend/pyproject.toml +++ b/backend/pyproject.toml @@ -96,7 +96,6 @@ dependencies = [ "pymongo==4.12.1", "pyparsing==3.2.3", "python-dateutil==2.9.0.post0", - "python-dotenv==1.0.1", "python-json-logger==2.0.7", "python-multipart==0.0.18", "PyYAML==6.0.2", @@ -145,6 +144,7 @@ dev = [ "pytest==8.3.3", "pytest-asyncio==1.3.0", "pytest-cov==5.0.0", + "pytest-env>=1.1.5", "pytest-xdist==3.6.1", "ruff==0.14.10", "types-cachetools==6.2.0.20250827", @@ -210,6 +210,7 @@ log_cli = false log_cli_level = "ERROR" log_level = "ERROR" addopts = "-n 4 --dist loadfile --tb=short -q --no-header -q" +env = ["OTEL_SDK_DISABLED=true"] # Coverage configuration [tool.coverage.run] diff --git a/backend/tests/conftest.py b/backend/tests/conftest.py index 7a076ffa..2b1b00a1 100644 --- a/backend/tests/conftest.py +++ b/backend/tests/conftest.py @@ -1,207 +1,138 @@ -import asyncio import os import uuid from contextlib import asynccontextmanager -from pathlib import Path -from typing import AsyncGenerator, Callable, Awaitable +from typing import AsyncGenerator import httpx import pytest import pytest_asyncio +import redis.asyncio as redis +from app.core.database_context import Database +from app.main import create_app +from app.settings import Settings from dishka import AsyncContainer -from dotenv import load_dotenv from httpx import ASGITransport 
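
Two test-environment changes above work together: `pytest-env` injects `OTEL_SDK_DISABLED=true` before collection, and the blank `OTEL_EXPORTER_OTLP_ENDPOINT` plus `none` exporters stop the retry noise against `otel-collector:4317`. A hedged sanity check that the combination yields non-recording spans — it assumes standard `opentelemetry-api` behavior, and the tracer and test names are illustrative:

```python
import os

from opentelemetry import trace


def test_otel_is_noop_under_pytest_env() -> None:
    # pytest-env sets this from pyproject.toml's `env = ["OTEL_SDK_DISABLED=true"]`
    assert os.environ.get("OTEL_SDK_DISABLED") == "true"
    # With no configured tracer provider, the API hands back non-recording
    # spans, so nothing is buffered or exported during the test run.
    span = trace.get_tracer("otel-noop-check").start_span("probe")
    try:
        assert not span.is_recording()
    finally:
        span.end()
```
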
-from app.core.database_context import Database -import redis.asyncio as redis +from pydantic_settings import SettingsConfigDict -# Load test environment variables BEFORE any app imports -test_env_path = Path(__file__).parent.parent / ".env.test" -if test_env_path.exists(): - load_dotenv(test_env_path, override=True) - -# IMPORTANT: avoid importing app.main at module import time because it -# constructs the FastAPI app immediately (reading settings from .env). -# We import lazily inside the fixture after test env vars are set.y -# DO NOT import any app.* modules at import time here, as it would -# construct global singletons (logger, settings) before we set test env. - -# ===== Early, host-friendly defaults (applied at import time) ===== -# Ensure tests connect to localhost services when run outside Docker. -os.environ.setdefault("TESTING", "true") -os.environ.setdefault("ENABLE_TRACING", "false") -os.environ.setdefault("OTEL_SDK_DISABLED", "true") -os.environ.setdefault("OTEL_METRICS_EXPORTER", "none") -os.environ.setdefault("OTEL_TRACES_EXPORTER", "none") - -# Force localhost endpoints to avoid Docker DNS names like 'mongo' -# Do not override if MONGODB_URL is already provided in the environment. -if "MONGODB_URL" not in os.environ: - from urllib.parse import quote_plus - - user = os.environ.get("MONGO_ROOT_USER", "root") - pwd = os.environ.get("MONGO_ROOT_PASSWORD", "rootpassword") - host = os.environ.get("MONGODB_HOST", "127.0.0.1") - port = os.environ.get("MONGODB_PORT", "27017") - try: - u = quote_plus(user) - p = quote_plus(pwd) - except Exception: - u = user - p = pwd - os.environ["MONGODB_URL"] = ( - f"mongodb://{u}:{p}@{host}:{port}/?authSource=admin&authMechanism=SCRAM-SHA-256" + +class TestSettings(Settings): + """Test configuration - loads from .env.test instead of .env""" + + model_config = SettingsConfigDict( + env_file=".env.test", + env_file_encoding="utf-8", + case_sensitive=True, + extra="ignore", ) -os.environ.setdefault("KAFKA_BOOTSTRAP_SERVERS", "localhost:9092") -os.environ.setdefault("REDIS_HOST", "localhost") -os.environ.setdefault("REDIS_PORT", "6379") -os.environ.setdefault("SCHEMA_REGISTRY_URL", "http://localhost:8081") -os.environ.setdefault("RATE_LIMIT_ENABLED", "false") -os.environ.setdefault("SECRET_KEY", "test-secret-key-for-testing-only-32chars!!") -# ===== Global test environment (reinforce and isolation) ===== +# ===== Worker-specific isolation for pytest-xdist ===== def _compute_worker_id() -> str: return os.environ.get("PYTEST_XDIST_WORKER", "gw0") -@pytest.fixture(scope="session", autouse=True) -def _test_env() -> None: - # Core toggles - os.environ.setdefault("TESTING", "true") - os.environ.setdefault("ENABLE_TRACING", "false") - os.environ.setdefault("OTEL_SDK_DISABLED", "true") - os.environ.setdefault("OTEL_METRICS_EXPORTER", "none") - os.environ.setdefault("OTEL_TRACES_EXPORTER", "none") +def _setup_worker_env() -> None: + """Set worker-specific environment variables for pytest-xdist isolation. 
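# A minimal sketch of the settings pattern above, assuming pydantic-settings v2
# resolution order (init kwargs > real environment variables > env_file values >
# field defaults). That ordering is why the per-worker overrides must be exported
# before TestSettings() is instantiated. ExampleSettings is a hypothetical
# stand-in for app.settings.Settings.
import os
from pydantic_settings import BaseSettings, SettingsConfigDict

class ExampleSettings(BaseSettings):
    DATABASE_NAME: str = "integr8scode"
    KAFKA_TOPIC_PREFIX: str = ""

class ExampleTestSettings(ExampleSettings):
    # Same fields, but values come from .env.test instead of .env
    model_config = SettingsConfigDict(env_file=".env.test", extra="ignore")

os.environ["DATABASE_NAME"] = "integr8scode_test_abc123_gw0"
# The real environment variable beats whatever .env.test says:
assert ExampleTestSettings().DATABASE_NAME == "integr8scode_test_abc123_gw0"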
- # External services - force localhost when running tests on host - os.environ["MONGODB_URL"] = os.environ.get( - "MONGODB_URL", - "mongodb://root:rootpassword@localhost:27017/?authSource=admin", - ) - os.environ.setdefault("KAFKA_BOOTSTRAP_SERVERS", "localhost:9092") - os.environ.setdefault("REDIS_HOST", "localhost") - os.environ.setdefault("REDIS_PORT", "6379") - os.environ.setdefault("SCHEMA_REGISTRY_URL", "http://localhost:8081") - os.environ.setdefault("RATE_LIMIT_ENABLED", "false") - os.environ.setdefault("SECRET_KEY", "test-secret-key-for-testing-only-32chars!!") - - # Isolation identifiers + Must be called BEFORE TestSettings is instantiated so env vars are picked up. + """ session_id = os.environ.get("PYTEST_SESSION_ID") or uuid.uuid4().hex[:8] worker_id = _compute_worker_id() os.environ["PYTEST_SESSION_ID"] = session_id - # Unique database name for test isolation + # Unique database name per worker os.environ["DATABASE_NAME"] = f"integr8scode_test_{session_id}_{worker_id}" - # Try to distribute Redis DBs across workers (0-15 by default). Fallback to 0. + # Distribute Redis DBs across workers (0-15) try: worker_num = int(worker_id[2:]) if worker_id.startswith("gw") else 0 os.environ["REDIS_DB"] = str(worker_num % 16) except Exception: os.environ.setdefault("REDIS_DB", "0") - # Use a single shared test topic prefix for all tests - # This avoids creating unique topics per worker/session - os.environ.setdefault("KAFKA_TOPIC_PREFIX", "test.") + # Unique Kafka consumer group per worker + os.environ["KAFKA_GROUP_SUFFIX"] = f"{session_id}.{worker_id}" + + # Unique Schema Registry prefix per worker + os.environ["SCHEMA_SUBJECT_PREFIX"] = f"test.{session_id}.{worker_id}." - # Schema Registry subject prefix for isolation across local runs/workers - # Example: test... - os.environ.setdefault("SCHEMA_SUBJECT_PREFIX", f"test.{session_id}.{worker_id}.") + # Disable OpenTelemetry exporters to prevent "otel-collector:4317" retry noise + os.environ["OTEL_EXPORTER_OTLP_ENDPOINT"] = "" + os.environ["OTEL_METRICS_EXPORTER"] = "none" + os.environ["OTEL_TRACES_EXPORTER"] = "none" + os.environ["OTEL_LOGS_EXPORTER"] = "none" - # Keep unique consumer groups per worker to avoid conflicts - # Code under test reads this suffix and appends it to base group IDs. - os.environ.setdefault("KAFKA_GROUP_SUFFIX", f"{session_id}.{worker_id}") +# Set up worker env at module load time (before any Settings instantiation) +_setup_worker_env() -# ===== App creation for tests ===== -def create_test_app(): - """Create the FastAPI app for testing.""" - # Clear settings cache to ensure .env.test values are used - from app.settings import get_settings - get_settings.cache_clear() - from importlib import import_module - mainmod = import_module("app.main") - return getattr(mainmod, "create_app")() +# ===== Settings fixture ===== +@pytest.fixture(scope="session") +def test_settings() -> Settings: + """Provide TestSettings for tests that need to create their own components.""" + return TestSettings() -# ===== App without lifespan for tests ===== +# ===== App fixture ===== @pytest_asyncio.fixture(scope="session") -async def app(_test_env): # type: ignore[valid-type] - """Create FastAPI app once per session/worker. +async def app(): + """Create FastAPI app with TestSettings. Session-scoped to avoid Pydantic schema validator memory issues when FastAPI recreates OpenAPI schemas hundreds of times with pytest-xdist. 
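# A minimal sketch of the per-worker isolation math used for REDIS_DB above,
# assuming pytest-xdist names its workers gw0..gwN via PYTEST_XDIST_WORKER:
# each worker maps onto its own Redis logical database, wrapping around
# Redis's default 16 databases.
def redis_db_for(worker_id: str) -> int:
    try:
        return int(worker_id[2:]) % 16 if worker_id.startswith("gw") else 0
    except ValueError:
        return 0

assert redis_db_for("gw0") == 0
assert redis_db_for("gw17") == 1   # wraps past the 16-DB default
assert redis_db_for("master") == 0  # non-xdist run falls back to db 0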
- See: https://github.com/pydantic/pydantic/issues/1864 - - Depends on _test_env to ensure env vars (REDIS_DB, DATABASE_NAME, etc.) - are set before the app/Settings are created. - - Note: Tests must not modify app.state or registered routes. - Use function-scoped `client` fixture for test isolation. """ - application = create_test_app() + application = create_app(settings=TestSettings()) yield application - if hasattr(application.state, 'dishka_container'): - container: AsyncContainer = application.state.dishka_container - await container.close() + if hasattr(application.state, "dishka_container"): + await application.state.dishka_container.close() @pytest_asyncio.fixture(scope="session") -async def app_container(app): # type: ignore[valid-type] +async def app_container(app): """Expose the Dishka container attached to the app.""" - container: AsyncContainer = app.state.dishka_container # type: ignore[attr-defined] + container: AsyncContainer = app.state.dishka_container return container -# ===== Client (function-scoped for clean cookies per test) ===== @pytest_asyncio.fixture -async def client(app) -> AsyncGenerator[httpx.AsyncClient, None]: # type: ignore[valid-type] - # Use httpx with ASGI app directly - # The app fixture already handles lifespan via LifespanManager - # Use HTTPS scheme so 'Secure' cookies set by the app (access_token, csrf_token) - # are accepted and sent by the client during tests. +async def client(app) -> AsyncGenerator[httpx.AsyncClient, None]: + """HTTP client for testing API endpoints.""" async with httpx.AsyncClient( - transport=ASGITransport(app=app), - base_url="https://test", - timeout=30.0, - follow_redirects=True + transport=ASGITransport(app=app), + base_url="https://test", + timeout=30.0, + follow_redirects=True, ) as c: yield c -# ===== Request-scope accessor ===== @asynccontextmanager async def _container_scope(container: AsyncContainer): - async with container() as scope: # type: ignore[misc] + async with container() as scope: yield scope @pytest_asyncio.fixture -async def scope(app_container: AsyncContainer): # type: ignore[valid-type] +async def scope(app_container: AsyncContainer): async with _container_scope(app_container) as s: yield s @pytest_asyncio.fixture -async def db(scope) -> AsyncGenerator[Database, None]: # type: ignore[valid-type] +async def db(scope) -> AsyncGenerator[Database, None]: database: Database = await scope.get(Database) yield database @pytest_asyncio.fixture -async def redis_client(scope) -> AsyncGenerator[redis.Redis, None]: # type: ignore[valid-type] +async def redis_client(scope) -> AsyncGenerator[redis.Redis, None]: client: redis.Redis = await scope.get(redis.Redis) yield client -# ===== Per-test cleanup (only for integration tests, see integration/conftest.py) ===== -# Note: autouse cleanup moved to tests/integration/conftest.py to avoid -# requiring DB/Redis for unit tests. Unit tests use tests/unit/conftest.py instead. 
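# A minimal sketch (with a hypothetical echo app) of why the client fixture
# keeps base_url="https://test": httpx's cookie jar only sends cookies flagged
# `Secure` when the request scheme is https, so the app's Secure session
# cookies would silently stop round-tripping under plain http.
import asyncio
import httpx
from httpx import ASGITransport

async def demo_app(scope, receive, send):
    if scope["path"] == "/set":
        headers = [(b"set-cookie", b"token=abc; Secure; Path=/")]
        body = b"ok"
    else:  # echo back whatever Cookie header the client attached
        headers = []
        body = b"; ".join(v for k, v in scope["headers"] if k.lower() == b"cookie")
    await send({"type": "http.response.start", "status": 200, "headers": headers})
    await send({"type": "http.response.body", "body": body})

async def main() -> None:
    async with httpx.AsyncClient(
        transport=ASGITransport(app=demo_app), base_url="https://test"
    ) as c:
        await c.get("/set")
        assert b"token=abc" in (await c.get("/echo")).content  # sent over https

asyncio.run(main())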
- - # ===== HTTP helpers (auth) ===== async def _http_login(client: httpx.AsyncClient, username: str, password: str) -> str: data = {"username": username, "password": password} @@ -210,10 +141,9 @@ async def _http_login(client: httpx.AsyncClient, username: str, password: str) - return resp.json().get("csrf_token", "") -# Session-scoped shared users for convenience -@pytest.fixture(scope="session") +@pytest.fixture def test_user_credentials(): - uid = os.environ.get("PYTEST_SESSION_ID", uuid.uuid4().hex[:8]) + uid = uuid.uuid4().hex[:8] return { "username": f"test_user_{uid}", "email": f"test_user_{uid}@example.com", @@ -222,9 +152,9 @@ def test_user_credentials(): } -@pytest.fixture(scope="session") +@pytest.fixture def test_admin_credentials(): - uid = os.environ.get("PYTEST_SESSION_ID", uuid.uuid4().hex[:8]) + uid = uuid.uuid4().hex[:8] return { "username": f"admin_user_{uid}", "email": f"admin_user_{uid}@example.com", @@ -235,22 +165,22 @@ def test_admin_credentials(): @pytest_asyncio.fixture async def test_user(client: httpx.AsyncClient, test_user_credentials): - """Function-scoped authenticated user. Recreated each test (DB wiped between tests).""" + """Function-scoped authenticated user.""" creds = test_user_credentials r = await client.post("/api/v1/auth/register", json=creds) if r.status_code not in (200, 201, 400): - pytest.skip(f"Cannot create test user (status {r.status_code}).") + pytest.fail(f"Cannot create test user (status {r.status_code}): {r.text}") csrf = await _http_login(client, creds["username"], creds["password"]) return {**creds, "csrf_token": csrf, "headers": {"X-CSRF-Token": csrf}} @pytest_asyncio.fixture async def test_admin(client: httpx.AsyncClient, test_admin_credentials): - """Function-scoped authenticated admin. Recreated each test (DB wiped between tests).""" + """Function-scoped authenticated admin.""" creds = test_admin_credentials r = await client.post("/api/v1/auth/register", json=creds) if r.status_code not in (200, 201, 400): - pytest.skip(f"Cannot create test admin (status {r.status_code}).") + pytest.fail(f"Cannot create test admin (status {r.status_code}): {r.text}") csrf = await _http_login(client, creds["username"], creds["password"]) return {**creds, "csrf_token": csrf, "headers": {"X-CSRF-Token": csrf}} @@ -260,12 +190,20 @@ async def another_user(client: httpx.AsyncClient): username = f"test_user_{uuid.uuid4().hex[:8]}" email = f"{username}@example.com" password = "TestPass123!" 
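# A minimal sketch of the flow the auth fixtures above drive: register, log in
# (the ASGI client captures the session cookies), then attach the returned
# csrf_token as an X-CSRF-Token header on state-changing requests. LOGIN_PATH
# is hypothetical -- the actual login URL is elided in this hunk.
import httpx

LOGIN_PATH = "/api/v1/auth/login"  # hypothetical; real path not shown above

async def authed_headers(client: httpx.AsyncClient, username: str, password: str) -> dict[str, str]:
    resp = await client.post(LOGIN_PATH, data={"username": username, "password": password})
    resp.raise_for_status()
    return {"X-CSRF-Token": resp.json().get("csrf_token", "")}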
- await client.post("/api/v1/auth/register", json={ + await client.post( + "/api/v1/auth/register", + json={ + "username": username, + "email": email, + "password": password, + "role": "user", + }, + ) + csrf = await _http_login(client, username, password) + return { "username": username, "email": email, "password": password, - "role": "user", - }) - csrf = await _http_login(client, username, password) - return {"username": username, "email": email, "password": password, "csrf_token": csrf, - "headers": {"X-CSRF-Token": csrf}} + "csrf_token": csrf, + "headers": {"X-CSRF-Token": csrf}, + } diff --git a/backend/tests/e2e/conftest.py b/backend/tests/e2e/conftest.py index 7b2dff4b..e8243e1c 100644 --- a/backend/tests/e2e/conftest.py +++ b/backend/tests/e2e/conftest.py @@ -1,9 +1,9 @@ +"""E2E tests conftest - with infrastructure cleanup.""" import pytest_asyncio import redis.asyncio as redis -from beanie import init_beanie from app.core.database_context import Database -from app.db.docs import ALL_DOCUMENTS +from tests.helpers.cleanup import cleanup_db_and_redis @pytest_asyncio.fixture(autouse=True) @@ -12,20 +12,7 @@ async def _cleanup(db: Database, redis_client: redis.Redis): Only pre-test cleanup - post-test cleanup causes event loop issues when SSE/streaming tests hold connections across loop boundaries. - - NOTE: With pytest-xdist, each worker uses a separate Redis database - (gw0→db0, gw1→db1, etc.), so flushdb() is safe and only affects - that worker's database. See tests/conftest.py for REDIS_DB setup. """ - collections = await db.list_collection_names() - for name in collections: - if not name.startswith("system."): - await db.drop_collection(name) - - await redis_client.flushdb() - - # Initialize Beanie with document models - await init_beanie(database=db, document_models=ALL_DOCUMENTS) - + await cleanup_db_and_redis(db, redis_client) yield # No post-test cleanup to avoid "Event loop is closed" errors diff --git a/backend/tests/e2e/test_k8s_worker_create_pod.py b/backend/tests/e2e/test_k8s_worker_create_pod.py index c9e4b400..63c6c0ee 100644 --- a/backend/tests/e2e/test_k8s_worker_create_pod.py +++ b/backend/tests/e2e/test_k8s_worker_create_pod.py @@ -11,6 +11,8 @@ from app.services.idempotency import IdempotencyManager from app.services.k8s_worker.config import K8sWorkerConfig from app.services.k8s_worker.worker import KubernetesWorker +from app.settings import Settings +from dishka import AsyncContainer from kubernetes.client.rest import ApiException pytestmark = [pytest.mark.e2e, pytest.mark.k8s] @@ -19,7 +21,9 @@ @pytest.mark.asyncio -async def test_worker_creates_configmap_and_pod(scope, monkeypatch): # type: ignore[valid-type] +async def test_worker_creates_configmap_and_pod( + scope: AsyncContainer, monkeypatch: pytest.MonkeyPatch, test_settings: Settings +) -> None: # Ensure non-default namespace for worker validation ns = os.environ.get("K8S_NAMESPACE", "integr8scode") if ns == "default": @@ -36,6 +40,7 @@ async def test_worker_creates_configmap_and_pod(scope, monkeypatch): # type: ig config=cfg, producer=producer, schema_registry_manager=schema, + settings=test_settings, event_store=store, idempotency_manager=idem, logger=_test_logger, diff --git a/backend/tests/e2e/test_resource_cleaner_k8s.py b/backend/tests/e2e/test_resource_cleaner_k8s.py index 1616b555..33e57386 100644 --- a/backend/tests/e2e/test_resource_cleaner_k8s.py +++ b/backend/tests/e2e/test_resource_cleaner_k8s.py @@ -42,14 +42,14 @@ async def test_cleanup_nonexistent_pod() -> None: nonexistent_pod = 
"integr8s-test-nonexistent-pod" # Should complete within timeout and not raise any exceptions - start_time = asyncio.get_event_loop().time() + start_time = asyncio.get_running_loop().time() await rc.cleanup_pod_resources( pod_name=nonexistent_pod, namespace=namespace, execution_id="test-exec-nonexistent", timeout=5, ) - elapsed = asyncio.get_event_loop().time() - start_time + elapsed = asyncio.get_running_loop().time() - start_time # Verify it completed quickly (not waiting full timeout for non-existent resources) assert elapsed < 5, f"Cleanup took {elapsed}s, should be quick for non-existent resources" diff --git a/backend/tests/fixtures/real_services.py b/backend/tests/fixtures/real_services.py deleted file mode 100644 index 7a51e602..00000000 --- a/backend/tests/fixtures/real_services.py +++ /dev/null @@ -1,356 +0,0 @@ -""" -Real service fixtures for integration testing. -Uses actual MongoDB, Redis, Kafka from docker-compose instead of mocks. -""" -import asyncio -import uuid -from typing import AsyncGenerator, Optional, Dict, Any -from contextlib import asynccontextmanager - -import pytest -import pytest_asyncio -import redis.asyncio as redis -from aiokafka import AIOKafkaProducer, AIOKafkaConsumer -from aiokafka.errors import KafkaConnectionError -from pymongo.asynchronous.mongo_client import AsyncMongoClient - -from app.core.database_context import Database, DBClient -from app.settings import Settings - - -class TestServiceConnections: - """Manages connections to real services for testing.""" - - def __init__(self, test_id: str): - self.test_id = test_id - self.mongo_client: Optional[DBClient] = None - self.redis_client: Optional[redis.Redis] = None - self.kafka_producer: Optional[AIOKafkaProducer] = None - self.kafka_consumer: Optional[AIOKafkaConsumer] = None - self.db_name = f"test_{test_id}" - - async def connect_mongodb(self, url: str) -> Database: - """Connect to MongoDB and return test-specific database.""" - self.mongo_client = AsyncMongoClient( - url, - serverSelectionTimeoutMS=5000, - connectTimeoutMS=5000, - maxPoolSize=10 - ) - # Verify connection - await self.mongo_client.admin.command("ping") - return self.mongo_client[self.db_name] - - async def connect_redis(self, host: str = "localhost", port: int = 6379, db: int = 1) -> redis.Redis: - """Connect to Redis using test database (db=1).""" - self.redis_client = redis.Redis( - host=host, - port=port, - db=db, # Use db 1 for tests, 0 for production - decode_responses=True, - max_connections=10, - socket_connect_timeout=5, - socket_timeout=5 - ) - # Verify connection - await self.redis_client.execute_command("PING") - # Clear test namespace - await self.redis_client.flushdb() - return self.redis_client - - async def connect_kafka_producer(self, bootstrap_servers: str) -> Optional[AIOKafkaProducer]: - """Connect Kafka producer if available.""" - try: - self.kafka_producer = AIOKafkaProducer( - bootstrap_servers=bootstrap_servers, - compression_type="gzip", - acks="all", - enable_idempotence=True, - max_in_flight_requests_per_connection=5, - request_timeout_ms=30000, - metadata_max_age_ms=60000 - ) - await self.kafka_producer.start() - return self.kafka_producer - except (KafkaConnectionError, OSError): - # Kafka not available, tests can still run without it - return None - - async def connect_kafka_consumer(self, bootstrap_servers: str, group_id: str) -> Optional[AIOKafkaConsumer]: - """Connect Kafka consumer if available.""" - try: - self.kafka_consumer = AIOKafkaConsumer( - bootstrap_servers=bootstrap_servers, - 
group_id=group_id, - auto_offset_reset="earliest", - enable_auto_commit=False, - max_poll_records=100, - session_timeout_ms=30000, - heartbeat_interval_ms=10000 - ) - await self.kafka_consumer.start() - return self.kafka_consumer - except (KafkaConnectionError, OSError): - return None - - async def cleanup(self): - """Clean up all connections and test data.""" - # Drop test MongoDB database - if self.mongo_client: - await self.mongo_client.drop_database(self.db_name) - await self.mongo_client.close() - - # Clear Redis test database - if self.redis_client: - await self.redis_client.flushdb() - await self.redis_client.aclose() - - # Close Kafka connections - if self.kafka_producer: - await self.kafka_producer.stop() - if self.kafka_consumer: - await self.kafka_consumer.stop() - - -@pytest_asyncio.fixture -async def real_services(request) -> AsyncGenerator[TestServiceConnections, None]: - """ - Provides real service connections for testing. - Each test gets its own isolated database. - """ - # Generate unique test ID - test_id = f"{request.node.name}_{uuid.uuid4().hex[:8]}" - test_id = test_id.replace("[", "_").replace("]", "_").replace("-", "_") - - connections = TestServiceConnections(test_id) - - yield connections - - # Cleanup after test - await connections.cleanup() - - -@pytest_asyncio.fixture -async def real_mongodb(real_services: TestServiceConnections) -> Database: - """Get real MongoDB database for testing.""" - # Use MongoDB from docker-compose with auth - return await real_services.connect_mongodb( - "mongodb://root:rootpassword@localhost:27017" - ) - - -@pytest_asyncio.fixture -async def real_redis(real_services: TestServiceConnections) -> redis.Redis: - """Get real Redis client for testing.""" - return await real_services.connect_redis() - - -@pytest_asyncio.fixture -async def real_kafka_producer(real_services: TestServiceConnections) -> Optional[AIOKafkaProducer]: - """Get real Kafka producer if available.""" - return await real_services.connect_kafka_producer("localhost:9092") - - -@pytest_asyncio.fixture -async def real_kafka_consumer(real_services: TestServiceConnections) -> Optional[AIOKafkaConsumer]: - """Get real Kafka consumer if available.""" - test_group = f"test_group_{real_services.test_id}" - return await real_services.connect_kafka_consumer("localhost:9092", test_group) - - -@asynccontextmanager -async def mongodb_transaction(db: Database): - """ - Context manager for MongoDB transactions. - Automatically rolls back on error. 
- """ - client = db.client - async with await client.start_session() as session: - async with session.start_transaction(): - try: - yield session - await session.commit_transaction() - except Exception: - await session.abort_transaction() - raise - - -@asynccontextmanager -async def redis_pipeline(client: redis.Redis): - """Context manager for Redis pipeline operations.""" - pipe = client.pipeline() - try: - yield pipe - await pipe.execute() - except Exception: - # Redis doesn't support rollback, but we can clear the pipeline - pipe.reset() - raise - - -class TestDataFactory: - """Factory for creating test data in real services.""" - - @staticmethod - async def create_test_user(db: Database, **kwargs) -> Dict[str, Any]: - """Create a test user in MongoDB.""" - user_data = { - "user_id": str(uuid.uuid4()), - "username": kwargs.get("username", f"testuser_{uuid.uuid4().hex[:8]}"), - "email": kwargs.get("email", f"test_{uuid.uuid4().hex[:8]}@example.com"), - "password_hash": "$2b$12$test_hash", # bcrypt format - "role": kwargs.get("role", "user"), - "is_active": kwargs.get("is_active", True), - "is_superuser": kwargs.get("is_superuser", False), - "created_at": asyncio.get_event_loop().time(), - "updated_at": asyncio.get_event_loop().time() - } - user_data.update(kwargs) - - result = await db.users.insert_one(user_data) - user_data["_id"] = result.inserted_id - return user_data - - @staticmethod - async def create_test_execution(db: Database, **kwargs) -> Dict[str, Any]: - """Create a test execution in MongoDB.""" - execution_data = { - "execution_id": str(uuid.uuid4()), - "user_id": kwargs.get("user_id", str(uuid.uuid4())), - "script": kwargs.get("script", "print('test')"), - "language": kwargs.get("language", "python"), - "language_version": kwargs.get("language_version", "3.11"), - "status": kwargs.get("status", "queued"), - "created_at": asyncio.get_event_loop().time(), - "updated_at": asyncio.get_event_loop().time() - } - execution_data.update(kwargs) - - result = await db.executions.insert_one(execution_data) - execution_data["_id"] = result.inserted_id - return execution_data - - @staticmethod - async def create_test_event(db: Database, **kwargs) -> Dict[str, Any]: - """Create a test event in MongoDB.""" - event_data = { - "event_id": str(uuid.uuid4()), - "event_type": kwargs.get("event_type", "test.event"), - "aggregate_id": kwargs.get("aggregate_id", str(uuid.uuid4())), - "correlation_id": kwargs.get("correlation_id", str(uuid.uuid4())), - "payload": kwargs.get("payload", {}), - "metadata": kwargs.get("metadata", {}), - "timestamp": asyncio.get_event_loop().time(), - "user_id": kwargs.get("user_id", str(uuid.uuid4())) - } - event_data.update(kwargs) - - result = await db.events.insert_one(event_data) - event_data["_id"] = result.inserted_id - return event_data - - @staticmethod - async def publish_test_event(producer: Optional[AIOKafkaProducer], topic: str, event: Dict[str, Any]): - """Publish test event to Kafka if available.""" - if not producer: - return None - - import json - value = json.dumps(event).encode("utf-8") - key = event.get("aggregate_id", str(uuid.uuid4())).encode("utf-8") - - return await producer.send_and_wait(topic, value=value, key=key) - - @staticmethod - async def cache_test_data(client: redis.Redis, key: str, data: Any, ttl: int = 60): - """Cache test data in Redis.""" - import json - if isinstance(data, dict): - data = json.dumps(data) - await client.setex(key, ttl, data) - - @staticmethod - async def get_cached_data(client: redis.Redis, key: str) -> 
Optional[Any]: - """Get cached test data from Redis.""" - import json - data = await client.get(key) - if data: - try: - return json.loads(data) - except (json.JSONDecodeError, TypeError): - return data - return None - - -@pytest.fixture -def test_data_factory(): - """Provide test data factory.""" - return TestDataFactory() - - -async def wait_for_service(check_func, timeout: int = 30, service_name: str = "service"): - """Wait for a service to be ready.""" - import time - start = time.time() - last_error = None - - while time.time() - start < timeout: - try: - await check_func() - return True - except Exception as e: - last_error = e - await asyncio.sleep(0.5) - - raise TimeoutError(f"{service_name} not ready after {timeout}s: {last_error}") - - -@pytest_asyncio.fixture(scope="session") -async def ensure_services_running(): - """Ensure required Docker services are running.""" - import subprocess - - # Check MongoDB - async def check_mongo() -> None: - client = AsyncMongoClient( - "mongodb://root:rootpassword@localhost:27017", - serverSelectionTimeoutMS=5000 - ) - try: - await client.admin.command("ping") - finally: - await client.close() - - try: - await check_mongo() - except Exception: - print("Starting MongoDB...") - subprocess.run(["docker-compose", "up", "-d", "mongo"], check=False) - await wait_for_service(check_mongo, service_name="MongoDB") - - # Check Redis - async def check_redis() -> None: - r = redis.Redis(host="localhost", port=6379, socket_connect_timeout=5) - try: - await r.execute_command("PING") - finally: - await r.aclose() - - try: - await check_redis() - except Exception: - print("Starting Redis...") - subprocess.run(["docker-compose", "up", "-d", "redis"], check=False) - await wait_for_service(check_redis, service_name="Redis") - - # Kafka is optional - don't fail if not available - try: - producer = AIOKafkaProducer(bootstrap_servers="localhost:9092") - await asyncio.wait_for(producer.start(), timeout=5) - await producer.stop() - except Exception: - print("Kafka not available - some tests may be skipped") - - yield - - # Services stay running for next test run \ No newline at end of file diff --git a/backend/tests/helpers/cleanup.py b/backend/tests/helpers/cleanup.py new file mode 100644 index 00000000..33a4cdfd --- /dev/null +++ b/backend/tests/helpers/cleanup.py @@ -0,0 +1,23 @@ +"""Shared cleanup utilities for integration and E2E tests.""" +import redis.asyncio as redis +from beanie import init_beanie + +from app.core.database_context import Database +from app.db.docs import ALL_DOCUMENTS + + +async def cleanup_db_and_redis(db: Database, redis_client: redis.Redis) -> None: + """Clean DB and Redis before a test. + + NOTE: With pytest-xdist, each worker uses a separate Redis database + (gw0→db0, gw1→db1, etc.), so flushdb() is safe and only affects + that worker's database. See tests/conftest.py for REDIS_DB setup. + """ + collections = await db.list_collection_names() + for name in collections: + if not name.startswith("system."): + await db.drop_collection(name) + + await redis_client.flushdb() + + await init_beanie(database=db, document_models=ALL_DOCUMENTS) diff --git a/backend/tests/helpers/eventually.py b/backend/tests/helpers/eventually.py index 76be7ef4..f72689f3 100644 --- a/backend/tests/helpers/eventually.py +++ b/backend/tests/helpers/eventually.py @@ -17,7 +17,7 @@ async def eventually( - Returns the value of `fn` on success. - Raises the last exception after timeout. 
""" - deadline = asyncio.get_event_loop().time() + timeout + deadline = asyncio.get_running_loop().time() + timeout last_exc: BaseException | None = None while True: try: @@ -27,7 +27,7 @@ async def eventually( return res # type: ignore[return-value] except exceptions as exc: # type: ignore[misc] last_exc = exc - if asyncio.get_event_loop().time() >= deadline: + if asyncio.get_running_loop().time() >= deadline: raise await asyncio.sleep(interval) diff --git a/backend/tests/integration/conftest.py b/backend/tests/integration/conftest.py index 02ad99d2..a59a32a9 100644 --- a/backend/tests/integration/conftest.py +++ b/backend/tests/integration/conftest.py @@ -1,10 +1,9 @@ """Integration tests conftest - with infrastructure cleanup.""" import pytest_asyncio import redis.asyncio as redis -from beanie import init_beanie from app.core.database_context import Database -from app.db.docs import ALL_DOCUMENTS +from tests.helpers.cleanup import cleanup_db_and_redis @pytest_asyncio.fixture(autouse=True) @@ -13,21 +12,7 @@ async def _cleanup(db: Database, redis_client: redis.Redis): Only pre-test cleanup - post-test cleanup causes event loop issues when SSE/streaming tests hold connections across loop boundaries. - - NOTE: With pytest-xdist, each worker uses a separate Redis database - (gw0→db0, gw1→db1, etc.), so flushdb() is safe and only affects - that worker's database. See tests/conftest.py for REDIS_DB setup. """ - collections = await db.list_collection_names() - for name in collections: - if not name.startswith("system."): - await db.drop_collection(name) - - await redis_client.flushdb() - - # Initialize Beanie with document models - # Note: db fixture is already the AsyncDatabase object (type alias Database = AsyncDatabase[MongoDocument]) - await init_beanie(database=db, document_models=ALL_DOCUMENTS) - + await cleanup_db_and_redis(db, redis_client) yield # No post-test cleanup to avoid "Event loop is closed" errors diff --git a/backend/tests/integration/dlq/test_dlq_discard_policy.py b/backend/tests/integration/dlq/test_dlq_discard_policy.py index 000e7b19..ba625f58 100644 --- a/backend/tests/integration/dlq/test_dlq_discard_policy.py +++ b/backend/tests/integration/dlq/test_dlq_discard_policy.py @@ -1,18 +1,16 @@ -import asyncio import json import logging -import os import uuid from datetime import datetime, timezone import pytest -from confluent_kafka import Producer - from app.db.docs import DLQMessageDocument from app.dlq.manager import create_dlq_manager from app.dlq.models import DLQMessageStatus, RetryPolicy, RetryStrategy from app.domain.enums.kafka import KafkaTopic from app.events.schema.schema_registry import create_schema_registry_manager +from confluent_kafka import Producer + from tests.helpers import make_execution_requested_event from tests.helpers.eventually import eventually @@ -25,10 +23,11 @@ @pytest.mark.asyncio -async def test_dlq_manager_discards_with_manual_policy(db) -> None: # type: ignore[valid-type] - schema_registry = create_schema_registry_manager(_test_logger) - manager = create_dlq_manager(schema_registry=schema_registry, logger=_test_logger) - prefix = os.environ.get("KAFKA_TOPIC_PREFIX", "") +async def test_dlq_manager_discards_with_manual_policy(db, test_settings) -> None: # type: ignore[valid-type] + schema_registry = create_schema_registry_manager(test_settings, _test_logger) + manager = create_dlq_manager(settings=test_settings, schema_registry=schema_registry, logger=_test_logger) + # Use prefix from test_settings to match what the manager uses + prefix = 
test_settings.KAFKA_TOPIC_PREFIX topic = f"{prefix}{str(KafkaTopic.EXECUTION_EVENTS)}" manager.set_retry_policy(topic, RetryPolicy(topic=topic, strategy=RetryStrategy.MANUAL)) @@ -53,6 +52,7 @@ async def test_dlq_manager_discards_with_manual_policy(db) -> None: # type: ign producer.flush(5) async with manager: + async def _discarded() -> None: doc = await DLQMessageDocument.find_one({"event_id": ev.event_id}) assert doc is not None diff --git a/backend/tests/integration/dlq/test_dlq_manager.py b/backend/tests/integration/dlq/test_dlq_manager.py index a6fbc8f3..b6da245e 100644 --- a/backend/tests/integration/dlq/test_dlq_manager.py +++ b/backend/tests/integration/dlq/test_dlq_manager.py @@ -1,16 +1,15 @@ -import asyncio import json import logging -import os +import uuid from datetime import datetime, timezone import pytest -from confluent_kafka import Producer - from app.db.docs import DLQMessageDocument from app.dlq.manager import create_dlq_manager from app.domain.enums.kafka import KafkaTopic from app.events.schema.schema_registry import create_schema_registry_manager +from confluent_kafka import Producer + from tests.helpers import make_execution_requested_event from tests.helpers.eventually import eventually @@ -23,14 +22,15 @@ @pytest.mark.asyncio -async def test_dlq_manager_persists_in_mongo(db) -> None: # type: ignore[valid-type] - schema_registry = create_schema_registry_manager(_test_logger) - manager = create_dlq_manager(schema_registry=schema_registry, logger=_test_logger) +async def test_dlq_manager_persists_in_mongo(db, test_settings) -> None: # type: ignore[valid-type] + schema_registry = create_schema_registry_manager(test_settings, _test_logger) + manager = create_dlq_manager(settings=test_settings, schema_registry=schema_registry, logger=_test_logger) - # Build a DLQ payload - ev = make_execution_requested_event(execution_id="exec-dlq-1") + # Use prefix from test_settings to match what the manager uses + prefix = test_settings.KAFKA_TOPIC_PREFIX - prefix = os.environ.get("KAFKA_TOPIC_PREFIX", "") + # Use unique execution_id to avoid conflicts with parallel test workers + ev = make_execution_requested_event(execution_id=f"exec-dlq-persist-{uuid.uuid4().hex[:8]}") payload = { "event": ev.to_dict(), "original_topic": f"{prefix}{str(KafkaTopic.EXECUTION_EVENTS)}", @@ -51,6 +51,7 @@ async def test_dlq_manager_persists_in_mongo(db) -> None: # type: ignore[valid- # Run the manager briefly to consume and persist async with manager: + async def _exists(): doc = await DLQMessageDocument.find_one({"event_id": ev.event_id}) assert doc is not None diff --git a/backend/tests/integration/dlq/test_dlq_retry_immediate.py b/backend/tests/integration/dlq/test_dlq_retry_immediate.py index 0adf91aa..5c435b92 100644 --- a/backend/tests/integration/dlq/test_dlq_retry_immediate.py +++ b/backend/tests/integration/dlq/test_dlq_retry_immediate.py @@ -1,18 +1,16 @@ -import asyncio import json import logging -import os import uuid from datetime import datetime, timezone import pytest -from confluent_kafka import Producer - from app.db.docs import DLQMessageDocument from app.dlq.manager import create_dlq_manager from app.dlq.models import DLQMessageStatus, RetryPolicy, RetryStrategy from app.domain.enums.kafka import KafkaTopic from app.events.schema.schema_registry import create_schema_registry_manager +from confluent_kafka import Producer + from tests.helpers import make_execution_requested_event from tests.helpers.eventually import eventually @@ -25,10 +23,11 @@ @pytest.mark.asyncio -async def 
test_dlq_manager_immediate_retry_updates_doc(db) -> None: # type: ignore[valid-type] - schema_registry = create_schema_registry_manager(_test_logger) - manager = create_dlq_manager(schema_registry=schema_registry, logger=_test_logger) - prefix = os.environ.get("KAFKA_TOPIC_PREFIX", "") +async def test_dlq_manager_immediate_retry_updates_doc(db, test_settings) -> None: # type: ignore[valid-type] + schema_registry = create_schema_registry_manager(test_settings, _test_logger) + manager = create_dlq_manager(settings=test_settings, schema_registry=schema_registry, logger=_test_logger) + # Use prefix from test_settings to match what the manager uses + prefix = test_settings.KAFKA_TOPIC_PREFIX topic = f"{prefix}{str(KafkaTopic.EXECUTION_EVENTS)}" manager.set_retry_policy( topic, @@ -56,6 +55,7 @@ async def test_dlq_manager_immediate_retry_updates_doc(db) -> None: # type: ign prod.flush(5) async with manager: + async def _retried() -> None: doc = await DLQMessageDocument.find_one({"event_id": ev.event_id}) assert doc is not None diff --git a/backend/tests/integration/events/test_consume_roundtrip.py b/backend/tests/integration/events/test_consume_roundtrip.py index 185196e5..b2ceb48b 100644 --- a/backend/tests/integration/events/test_consume_roundtrip.py +++ b/backend/tests/integration/events/test_consume_roundtrip.py @@ -9,7 +9,7 @@ from app.events.core.dispatcher import EventDispatcher from app.events.core.types import ConsumerConfig from app.events.schema.schema_registry import SchemaRegistryManager, initialize_event_schemas -from app.settings import get_settings +from app.settings import Settings from tests.helpers import make_execution_requested_event @@ -22,13 +22,13 @@ async def test_produce_consume_roundtrip(scope) -> None: # type: ignore[valid-type] # Ensure schemas are registered registry: SchemaRegistryManager = await scope.get(SchemaRegistryManager) + settings: Settings = await scope.get(Settings) await initialize_event_schemas(registry) # Real producer from DI producer: UnifiedProducer = await scope.get(UnifiedProducer) # Build a consumer that handles EXECUTION_REQUESTED - settings = get_settings() dispatcher = EventDispatcher(logger=_test_logger) received = asyncio.Event() @@ -44,7 +44,13 @@ async def _handle(_event) -> None: # noqa: ANN001 auto_offset_reset="earliest", ) - consumer = UnifiedConsumer(config, dispatcher, logger=_test_logger) + consumer = UnifiedConsumer( + config, + dispatcher, + schema_registry=registry, + settings=settings, + logger=_test_logger, + ) await consumer.start([str(KafkaTopic.EXECUTION_EVENTS)]) try: diff --git a/backend/tests/integration/events/test_consumer_lifecycle.py b/backend/tests/integration/events/test_consumer_lifecycle.py index 70f2878b..eb63b770 100644 --- a/backend/tests/integration/events/test_consumer_lifecycle.py +++ b/backend/tests/integration/events/test_consumer_lifecycle.py @@ -4,6 +4,8 @@ import pytest from app.domain.enums.kafka import KafkaTopic from app.events.core import ConsumerConfig, EventDispatcher, UnifiedConsumer +from app.events.schema.schema_registry import SchemaRegistryManager +from app.settings import Settings pytestmark = [pytest.mark.integration, pytest.mark.kafka] @@ -11,10 +13,18 @@ @pytest.mark.asyncio -async def test_consumer_start_status_seek_and_stop(): - cfg = ConsumerConfig(bootstrap_servers="localhost:9092", group_id=f"test-consumer-{uuid4().hex[:6]}") +async def test_consumer_start_status_seek_and_stop(scope) -> None: # type: ignore[valid-type] + registry: SchemaRegistryManager = await 
scope.get(SchemaRegistryManager) + settings: Settings = await scope.get(Settings) + cfg = ConsumerConfig(bootstrap_servers=settings.KAFKA_BOOTSTRAP_SERVERS, group_id=f"test-consumer-{uuid4().hex[:6]}") disp = EventDispatcher(logger=_test_logger) - c = UnifiedConsumer(cfg, event_dispatcher=disp, logger=_test_logger) + c = UnifiedConsumer( + cfg, + event_dispatcher=disp, + schema_registry=registry, + settings=settings, + logger=_test_logger, + ) await c.start([KafkaTopic.EXECUTION_EVENTS]) try: st = c.get_status() diff --git a/backend/tests/integration/events/test_event_dispatcher.py b/backend/tests/integration/events/test_event_dispatcher.py index c88e3fa6..aa65d181 100644 --- a/backend/tests/integration/events/test_event_dispatcher.py +++ b/backend/tests/integration/events/test_event_dispatcher.py @@ -9,7 +9,7 @@ from app.events.core.dispatcher import EventDispatcher from app.events.core.types import ConsumerConfig from app.events.schema.schema_registry import SchemaRegistryManager, initialize_event_schemas -from app.settings import get_settings +from app.settings import Settings from tests.helpers import make_execution_requested_event @@ -22,6 +22,7 @@ async def test_dispatcher_with_multiple_handlers(scope) -> None: # type: ignore[valid-type] # Ensure schema registry is ready registry: SchemaRegistryManager = await scope.get(SchemaRegistryManager) + settings: Settings = await scope.get(Settings) await initialize_event_schemas(registry) # Build dispatcher with two handlers for the same event @@ -38,14 +39,19 @@ async def h2(_e) -> None: # noqa: ANN001 h2_called.set() # Real consumer against execution-events - settings = get_settings() cfg = ConsumerConfig( bootstrap_servers=settings.KAFKA_BOOTSTRAP_SERVERS, group_id=f"dispatcher-it.{uuid.uuid4().hex[:6]}", enable_auto_commit=True, auto_offset_reset="earliest", ) - consumer = UnifiedConsumer(cfg, dispatcher, logger=_test_logger) + consumer = UnifiedConsumer( + cfg, + dispatcher, + schema_registry=registry, + settings=settings, + logger=_test_logger, + ) await consumer.start([str(KafkaTopic.EXECUTION_EVENTS)]) # Produce a request event via DI diff --git a/backend/tests/integration/events/test_event_store_consumer.py b/backend/tests/integration/events/test_event_store_consumer.py index 111d6fe2..ec35a99b 100644 --- a/backend/tests/integration/events/test_event_store_consumer.py +++ b/backend/tests/integration/events/test_event_store_consumer.py @@ -8,8 +8,10 @@ from app.events.event_store import EventStore from app.events.event_store_consumer import EventStoreConsumer, create_event_store_consumer from app.events.schema.schema_registry import SchemaRegistryManager, initialize_event_schemas +from app.domain.enums.auth import LoginMethod from app.infrastructure.kafka.events.metadata import AvroEventMetadata from app.infrastructure.kafka.events.user import UserLoggedInEvent +from app.settings import Settings pytestmark = [pytest.mark.integration, pytest.mark.kafka, pytest.mark.mongodb] @@ -26,11 +28,12 @@ async def test_event_store_consumer_stores_events(scope) -> None: # type: ignor producer: UnifiedProducer = await scope.get(UnifiedProducer) db: Database = await scope.get(Database) store: EventStore = await scope.get(EventStore) + settings: Settings = await scope.get(Settings) # Build an event ev = UserLoggedInEvent( user_id=f"u-{uuid.uuid4().hex[:6]}", - login_method="password", + login_method=LoginMethod.PASSWORD, metadata=AvroEventMetadata(service_name="tests", service_version="1.0.0"), ) @@ -39,6 +42,7 @@ async def 
test_event_store_consumer_stores_events(scope) -> None: # type: ignor event_store=store, topics=[KafkaTopic.USER_EVENTS], schema_registry_manager=registry, + settings=settings, logger=_test_logger, producer=producer, batch_size=10, diff --git a/backend/tests/integration/events/test_producer_roundtrip.py b/backend/tests/integration/events/test_producer_roundtrip.py index 05e53cc4..c35364b9 100644 --- a/backend/tests/integration/events/test_producer_roundtrip.py +++ b/backend/tests/integration/events/test_producer_roundtrip.py @@ -17,9 +17,8 @@ async def test_unified_producer_start_produce_send_to_dlq_stop(scope): # type: ignore[valid-type] schema: SchemaRegistryManager = await scope.get(SchemaRegistryManager) prod = UnifiedProducer(ProducerConfig(bootstrap_servers="localhost:9092"), schema, logger=_test_logger) - await prod.start() - try: + async with prod: ev = make_execution_requested_event(execution_id=f"exec-{uuid4().hex[:8]}") await prod.produce(ev) @@ -28,8 +27,6 @@ async def test_unified_producer_start_produce_send_to_dlq_stop(scope): # type: st = prod.get_status() assert st["running"] is True and st["state"] == "running" - finally: - await prod.stop() def test_producer_handle_stats_path(): diff --git a/backend/tests/integration/events/test_schema_registry_real.py b/backend/tests/integration/events/test_schema_registry_real.py index 895f109d..273c7706 100644 --- a/backend/tests/integration/events/test_schema_registry_real.py +++ b/backend/tests/integration/events/test_schema_registry_real.py @@ -4,15 +4,16 @@ from app.events.schema.schema_registry import SchemaRegistryManager from app.infrastructure.kafka.events.metadata import AvroEventMetadata from app.infrastructure.kafka.events.pod import PodCreatedEvent +from app.settings import Settings pytestmark = [pytest.mark.integration, pytest.mark.kafka] _test_logger = logging.getLogger("test.events.schema_registry_real") -def test_serialize_and_deserialize_event_real_registry() -> None: +def test_serialize_and_deserialize_event_real_registry(test_settings: Settings) -> None: # Uses real Schema Registry configured via env (SCHEMA_REGISTRY_URL) - m = SchemaRegistryManager(logger=_test_logger) + m = SchemaRegistryManager(settings=test_settings, logger=_test_logger) ev = PodCreatedEvent( execution_id="e1", pod_name="p", diff --git a/backend/tests/integration/events/test_schema_registry_roundtrip.py b/backend/tests/integration/events/test_schema_registry_roundtrip.py index a218c59b..4791c16f 100644 --- a/backend/tests/integration/events/test_schema_registry_roundtrip.py +++ b/backend/tests/integration/events/test_schema_registry_roundtrip.py @@ -2,6 +2,7 @@ import pytest from app.events.schema.schema_registry import MAGIC_BYTE, SchemaRegistryManager +from app.settings import Settings from tests.helpers import make_execution_requested_event @@ -24,7 +25,7 @@ async def test_schema_registry_serialize_deserialize_roundtrip(scope): # type: await reg.initialize_schemas() -def test_schema_registry_deserialize_invalid_header(): - reg = SchemaRegistryManager(logger=_test_logger) +def test_schema_registry_deserialize_invalid_header(test_settings: Settings) -> None: + reg = SchemaRegistryManager(settings=test_settings, logger=_test_logger) with pytest.raises(ValueError): reg.deserialize_event(b"\x01\x00\x00\x00\x01", topic="t") # wrong magic byte diff --git a/backend/tests/integration/idempotency/test_consumer_idempotent.py b/backend/tests/integration/idempotency/test_consumer_idempotent.py index e5334149..bdcc04d9 100644 --- 
a/backend/tests/integration/idempotency/test_consumer_idempotent.py +++ b/backend/tests/integration/idempotency/test_consumer_idempotent.py @@ -8,10 +8,11 @@ from app.domain.enums.kafka import KafkaTopic from app.events.core import ConsumerConfig, EventDispatcher, UnifiedConsumer, UnifiedProducer from app.events.core.dispatcher import EventDispatcher as Disp +from app.events.schema.schema_registry import SchemaRegistryManager from tests.helpers import make_execution_requested_event from app.services.idempotency.idempotency_manager import IdempotencyManager from app.services.idempotency.middleware import IdempotentConsumerWrapper -from app.settings import get_settings +from app.settings import Settings from tests.helpers.eventually import eventually pytestmark = [pytest.mark.integration, pytest.mark.kafka, pytest.mark.redis] @@ -23,6 +24,8 @@ async def test_consumer_idempotent_wrapper_blocks_duplicates(scope) -> None: # type: ignore[valid-type] producer: UnifiedProducer = await scope.get(UnifiedProducer) idm: IdempotencyManager = await scope.get(IdempotencyManager) + registry: SchemaRegistryManager = await scope.get(SchemaRegistryManager) + settings: Settings = await scope.get(Settings) # Build a dispatcher with a counter disp: Disp = EventDispatcher(logger=_test_logger) @@ -33,14 +36,19 @@ async def handle(_ev): # noqa: ANN001 seen["n"] += 1 # Real consumer with idempotent wrapper - settings = get_settings() cfg = ConsumerConfig( bootstrap_servers=settings.KAFKA_BOOTSTRAP_SERVERS, group_id=f"test-idem-consumer.{uuid.uuid4().hex[:6]}", enable_auto_commit=True, auto_offset_reset="earliest", ) - base = UnifiedConsumer(cfg, event_dispatcher=disp, logger=_test_logger) + base = UnifiedConsumer( + cfg, + event_dispatcher=disp, + schema_registry=registry, + settings=settings, + logger=_test_logger, + ) wrapper = IdempotentConsumerWrapper( consumer=base, idempotency_manager=idm, diff --git a/backend/tests/integration/result_processor/test_result_processor.py b/backend/tests/integration/result_processor/test_result_processor.py index e400e8dc..5c9a98c4 100644 --- a/backend/tests/integration/result_processor/test_result_processor.py +++ b/backend/tests/integration/result_processor/test_result_processor.py @@ -20,7 +20,7 @@ from app.infrastructure.kafka.events.metadata import AvroEventMetadata from app.services.idempotency import IdempotencyManager from app.services.result_processor.processor import ResultProcessor -from app.settings import get_settings +from app.settings import Settings pytestmark = [pytest.mark.integration, pytest.mark.kafka, pytest.mark.mongodb] @@ -31,6 +31,7 @@ async def test_result_processor_persists_and_emits(scope) -> None: # type: ignore[valid-type] # Ensure schemas registry: SchemaRegistryManager = await scope.get(SchemaRegistryManager) + settings: Settings = await scope.get(Settings) await initialize_event_schemas(registry) # Dependencies @@ -53,12 +54,13 @@ async def test_result_processor_persists_and_emits(scope) -> None: # type: igno processor = ResultProcessor( execution_repo=repo, producer=producer, + schema_registry=registry, + settings=settings, idempotency_manager=idem, logger=_test_logger, ) # Setup a small consumer to capture ResultStoredEvent - settings = get_settings() dispatcher = EventDispatcher(logger=_test_logger) stored_received = asyncio.Event() @@ -73,7 +75,13 @@ async def _stored(_event) -> None: # noqa: ANN001 enable_auto_commit=True, auto_offset_reset="earliest", ) - stored_consumer = UnifiedConsumer(cconf, dispatcher, logger=_test_logger) + 
stored_consumer = UnifiedConsumer( + cconf, + dispatcher, + schema_registry=registry, + settings=settings, + logger=_test_logger, + ) await stored_consumer.start([str(KafkaTopic.EXECUTION_RESULTS)]) try: diff --git a/backend/tests/integration/services/sse/test_partitioned_event_router.py b/backend/tests/integration/services/sse/test_partitioned_event_router.py index 40720ee4..040a62b5 100644 --- a/backend/tests/integration/services/sse/test_partitioned_event_router.py +++ b/backend/tests/integration/services/sse/test_partitioned_event_router.py @@ -1,27 +1,25 @@ -import asyncio import logging from uuid import uuid4 -from tests.helpers.eventually import eventually -import pytest +import pytest from app.core.metrics.events import EventMetrics from app.events.core import EventDispatcher from app.events.schema.schema_registry import SchemaRegistryManager -from tests.helpers import make_execution_requested_event -from app.infrastructure.kafka.events.pod import PodCreatedEvent from app.schemas_pydantic.sse import RedisSSEMessage from app.services.sse.kafka_redis_bridge import SSEKafkaRedisBridge from app.services.sse.redis_bus import SSERedisBus from app.settings import Settings +from tests.helpers import make_execution_requested_event +from tests.helpers.eventually import eventually + pytestmark = [pytest.mark.integration, pytest.mark.redis] _test_logger = logging.getLogger("test.services.sse.partitioned_event_router_integration") @pytest.mark.asyncio -async def test_router_bridges_to_redis(redis_client) -> None: # type: ignore[valid-type] - settings = Settings() +async def test_router_bridges_to_redis(redis_client, test_settings: Settings) -> None: suffix = uuid4().hex[:6] bus = SSERedisBus( redis_client, @@ -30,8 +28,8 @@ async def test_router_bridges_to_redis(redis_client) -> None: # type: ignore[va logger=_test_logger, ) router = SSEKafkaRedisBridge( - schema_registry=SchemaRegistryManager(logger=_test_logger), - settings=settings, + schema_registry=SchemaRegistryManager(settings=test_settings, logger=_test_logger), + settings=test_settings, event_metrics=EventMetrics(), sse_bus=bus, logger=_test_logger, @@ -57,13 +55,12 @@ async def _recv(): @pytest.mark.asyncio -async def test_router_start_and_stop(redis_client) -> None: # type: ignore[valid-type] - settings = Settings() - settings.SSE_CONSUMER_POOL_SIZE = 1 +async def test_router_start_and_stop(redis_client, test_settings: Settings) -> None: + test_settings.SSE_CONSUMER_POOL_SIZE = 1 suffix = uuid4().hex[:6] router = SSEKafkaRedisBridge( - schema_registry=SchemaRegistryManager(logger=_test_logger), - settings=settings, + schema_registry=SchemaRegistryManager(settings=test_settings, logger=_test_logger), + settings=test_settings, event_metrics=EventMetrics(), sse_bus=SSERedisBus( redis_client, @@ -74,13 +71,13 @@ async def test_router_start_and_stop(redis_client) -> None: # type: ignore[vali logger=_test_logger, ) - await router.start() + await router.__aenter__() stats = router.get_stats() assert stats["num_consumers"] == 1 - await router.stop() + await router.aclose() assert router.get_stats()["num_consumers"] == 0 # idempotent start/stop - await router.start() - await router.start() - await router.stop() - await router.stop() + await router.__aenter__() + await router.__aenter__() + await router.aclose() + await router.aclose() diff --git a/backend/tests/integration/services/test_rate_limit_service.py b/backend/tests/integration/services/test_rate_limit_service.py deleted file mode 100644 index 07ecf895..00000000 --- 
a/backend/tests/integration/services/test_rate_limit_service.py +++ /dev/null @@ -1,48 +0,0 @@ -import asyncio -import pytest - -from app.domain.rate_limit import EndpointGroup, RateLimitAlgorithm, RateLimitConfig, RateLimitRule, UserRateLimit -from app.services.rate_limit_service import RateLimitService - - -pytestmark = [pytest.mark.integration, pytest.mark.redis] - - -@pytest.mark.asyncio -async def test_rate_limit_happy_path_and_block(scope) -> None: # type: ignore[valid-type] - svc: RateLimitService = await scope.get(RateLimitService) - - # Ensure rate limiting enabled for this test - svc.settings.RATE_LIMIT_ENABLED = True - - # Install a small custom rule for a test user on a synthetic endpoint - user_id = "user-test" - endpoint = "/api/v1/limits/demo" - - cfg = RateLimitConfig.get_default_config() - rule = RateLimitRule( - endpoint_pattern=r"^/api/v1/limits/demo$", - group=EndpointGroup.API, - requests=2, - window_seconds=2, - burst_multiplier=1.0, - algorithm=RateLimitAlgorithm.SLIDING_WINDOW, - priority=50, - enabled=True, - ) - cfg.user_overrides[user_id] = UserRateLimit(user_id=user_id, rules=[rule]) - await svc.update_config(cfg) - - # First two requests allowed, third blocked - s1 = await svc.check_rate_limit(user_id, endpoint) - s2 = await svc.check_rate_limit(user_id, endpoint) - s3 = await svc.check_rate_limit(user_id, endpoint) - - assert s1.allowed is True and s2.allowed is True - assert s3.allowed is False and s3.retry_after is not None and s3.retry_after > 0 - - # Reset user keys and verify reset works (stats may be empty due to Redis timing) - await svc.reset_user_limits(user_id) - stats_after = await svc.get_usage_stats(user_id) - assert stats_after == {} - diff --git a/backend/tests/unit/.env.unit b/backend/tests/unit/.env.unit deleted file mode 100644 index f3205c30..00000000 --- a/backend/tests/unit/.env.unit +++ /dev/null @@ -1,4 +0,0 @@ -TESTING=true -SECRET_KEY=test-secret-key-for-testing-only-32chars!! -ENABLE_TRACING=false -OTEL_SDK_DISABLED=true diff --git a/backend/tests/unit/conftest.py b/backend/tests/unit/conftest.py index 617233cf..e89e4163 100644 --- a/backend/tests/unit/conftest.py +++ b/backend/tests/unit/conftest.py @@ -1,18 +1,9 @@ -import os -from pathlib import Path +"""Unit test configuration. +Unit tests should NOT access real infrastructure (DB, Redis, HTTP). +These fixtures raise errors to catch accidental usage. 
+""" import pytest -from dotenv import load_dotenv - -# Load unit test env -unit_env = Path(__file__).parent / ".env.unit" -load_dotenv(unit_env, override=True) - - -@pytest.fixture(scope="function", autouse=False) -def _cleanup(): - """No-op - unit tests don't need DB/Redis cleanup.""" - yield @pytest.fixture diff --git a/backend/tests/unit/db/__init__.py b/backend/tests/unit/db/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/backend/tests/unit/db/schema/__init__.py b/backend/tests/unit/db/schema/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/backend/tests/unit/dlq/__init__.py b/backend/tests/unit/dlq/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/backend/tests/unit/events/test_schema_registry_manager.py b/backend/tests/unit/events/test_schema_registry_manager.py index 9d155b5d..77562a2e 100644 --- a/backend/tests/unit/events/test_schema_registry_manager.py +++ b/backend/tests/unit/events/test_schema_registry_manager.py @@ -1,17 +1,14 @@ import logging import pytest - from app.events.schema.schema_registry import SchemaRegistryManager +from app.infrastructure.kafka.events.execution import ExecutionRequestedEvent _test_logger = logging.getLogger("test.events.schema_registry_manager") -from app.infrastructure.kafka.events.execution import ExecutionRequestedEvent -from app.infrastructure.kafka.events.metadata import AvroEventMetadata -from app.infrastructure.kafka.events.pod import PodCreatedEvent -def test_deserialize_json_execution_requested() -> None: - m = SchemaRegistryManager(logger=_test_logger) +def test_deserialize_json_execution_requested(test_settings) -> None: # type: ignore[valid-type] + m = SchemaRegistryManager(test_settings, logger=_test_logger) data = { "event_type": "execution_requested", "execution_id": "e1", @@ -35,10 +32,7 @@ def test_deserialize_json_execution_requested() -> None: assert ev.language == "python" -def test_deserialize_json_missing_type_raises() -> None: - m = SchemaRegistryManager(logger=_test_logger) +def test_deserialize_json_missing_type_raises(test_settings) -> None: # type: ignore[valid-type] + m = SchemaRegistryManager(test_settings, logger=_test_logger) with pytest.raises(ValueError): m.deserialize_json({}) - - - diff --git a/backend/tests/fixtures/__init__.py b/backend/tests/unit/services/pod_monitor/__init__.py similarity index 100% rename from backend/tests/fixtures/__init__.py rename to backend/tests/unit/services/pod_monitor/__init__.py diff --git a/backend/tests/unit/services/pod_monitor/test_monitor.py b/backend/tests/unit/services/pod_monitor/test_monitor.py index 4aff899f..1e6d5081 100644 --- a/backend/tests/unit/services/pod_monitor/test_monitor.py +++ b/backend/tests/unit/services/pod_monitor/test_monitor.py @@ -1,12 +1,14 @@ import asyncio import logging import types -import pytest +from unittest.mock import MagicMock +import pytest +from app.core.k8s_clients import K8sClients from app.services.pod_monitor.config import PodMonitorConfig -from app.services.pod_monitor.monitor import PodMonitor -from tests.helpers.k8s_fakes import make_pod, make_watch, FakeApi +from app.services.pod_monitor.monitor import PodMonitor, create_pod_monitor +from tests.helpers.k8s_fakes import FakeApi, make_pod, make_watch pytestmark = pytest.mark.unit @@ -115,10 +117,10 @@ async def _quick_watch(): pm._watch_pods = _quick_watch - await pm.start() + await pm.__aenter__() assert pm.state.name == "RUNNING" - await pm.stop() + await pm.aclose() assert pm.state.name == "STOPPED" and 
spy.cleared is True @@ -505,8 +507,6 @@ async def mock_handle(): @pytest.mark.asyncio async def test_create_pod_monitor_context_manager(monkeypatch) -> None: - from app.services.pod_monitor.monitor import create_pod_monitor - _patch_k8s(monkeypatch) cfg = PodMonitorConfig() @@ -520,29 +520,64 @@ async def test_create_pod_monitor_context_manager(monkeypatch) -> None: assert monitor.state == monitor.state.__class__.STOPPED +@pytest.mark.asyncio +async def test_create_pod_monitor_with_injected_k8s_clients(monkeypatch) -> None: + """Test create_pod_monitor with injected K8sClients (DI path).""" + _patch_k8s(monkeypatch) + + cfg = PodMonitorConfig() + cfg.enable_state_reconciliation = False + + fake_service = _FakeKafkaEventService() + + mock_v1 = MagicMock() + mock_v1.get_api_resources.return_value = None + mock_k8s_clients = K8sClients( + api_client=MagicMock(), + v1=mock_v1, + apps_v1=MagicMock(), + networking_v1=MagicMock(), + ) + + async with create_pod_monitor(cfg, fake_service, _test_logger, k8s_clients=mock_k8s_clients) as monitor: + assert monitor.state == monitor.state.__class__.RUNNING + assert monitor._clients is mock_k8s_clients + assert monitor._v1 is mock_v1 + + assert monitor.state == monitor.state.__class__.STOPPED + + @pytest.mark.asyncio async def test_start_already_running() -> None: + """Test idempotent start via __aenter__.""" cfg = PodMonitorConfig() pm = PodMonitor(cfg, kafka_event_service=_FakeKafkaEventService(), logger=_test_logger) + # Simulate already started state + pm._lifecycle_started = True pm._state = pm.state.__class__.RUNNING - await pm.start() + # Should be idempotent - just return self + await pm.__aenter__() @pytest.mark.asyncio async def test_stop_already_stopped() -> None: + """Test idempotent stop via aclose().""" cfg = PodMonitorConfig() pm = PodMonitor(cfg, kafka_event_service=_FakeKafkaEventService(), logger=_test_logger) pm._state = pm.state.__class__.STOPPED + # Not started, so aclose should be a no-op - await pm.stop() + await pm.aclose() @pytest.mark.asyncio async def test_stop_with_tasks() -> None: + """Test cleanup of tasks on aclose().""" cfg = PodMonitorConfig() pm = PodMonitor(cfg, kafka_event_service=_FakeKafkaEventService(), logger=_test_logger) pm._state = pm.state.__class__.RUNNING + pm._lifecycle_started = True # Simulate started state async def dummy_task(): await asyncio.Event().wait() @@ -552,7 +587,7 @@ async def dummy_task(): pm._watch = _StubWatch() pm._tracked_pods = {"pod1"} - await pm.stop() + await pm.aclose() assert pm._state == pm.state.__class__.STOPPED assert len(pm._tracked_pods) == 0 @@ -786,8 +821,8 @@ async def mock_reconcile(): pm._watch_pods = mock_watch pm._reconciliation_loop = mock_reconcile - await pm.start() + await pm.__aenter__() assert pm._watch_task is not None assert pm._reconcile_task is not None - await pm.stop() + await pm.aclose() diff --git a/backend/tests/unit/services/result_processor/test_processor.py b/backend/tests/unit/services/result_processor/test_processor.py index 4c44dd59..26ef9fdd 100644 --- a/backend/tests/unit/services/result_processor/test_processor.py +++ b/backend/tests/unit/services/result_processor/test_processor.py @@ -1,13 +1,11 @@ import logging - -import pytest from unittest.mock import MagicMock +import pytest from app.domain.enums.events import EventType from app.domain.enums.kafka import GroupId, KafkaTopic from app.services.result_processor.processor import ResultProcessor, ResultProcessorConfig - pytestmark = pytest.mark.unit _test_logger = 
logging.getLogger("test.services.result_processor.processor") @@ -31,7 +29,14 @@ def test_custom_values(self): def test_create_dispatcher_registers_handlers(): - rp = ResultProcessor(execution_repo=MagicMock(), producer=MagicMock(), idempotency_manager=MagicMock(), logger=_test_logger) + rp = ResultProcessor( + execution_repo=MagicMock(), + producer=MagicMock(), + schema_registry=MagicMock(), + settings=MagicMock(), + idempotency_manager=MagicMock(), + logger=_test_logger, + ) dispatcher = rp._create_dispatcher() assert dispatcher is not None assert EventType.EXECUTION_COMPLETED in dispatcher._handlers diff --git a/backend/tests/unit/services/saga/test_saga_orchestrator_unit.py b/backend/tests/unit/services/saga/test_saga_orchestrator_unit.py index 4ef10e5c..75fb2e25 100644 --- a/backend/tests/unit/services/saga/test_saga_orchestrator_unit.py +++ b/backend/tests/unit/services/saga/test_saga_orchestrator_unit.py @@ -1,6 +1,6 @@ -import asyncio -import pytest +import logging +import pytest from app.domain.enums.events import EventType from app.domain.enums.saga import SagaState from app.domain.saga.models import Saga, SagaConfig @@ -8,9 +8,10 @@ from app.services.saga.saga_orchestrator import SagaOrchestrator from app.services.saga.saga_step import SagaStep - pytestmark = pytest.mark.unit +_test_logger = logging.getLogger("test.services.saga.orchestrator") + class _Evt: def __init__(self, et: EventType, execution_id: str): @@ -44,6 +45,8 @@ async def close(self): class _Store: ... class _Alloc: ... +class _SchemaRegistry: ... +class _Settings: ... class _StepOK(SagaStep[_Evt]): @@ -69,9 +72,12 @@ def _orch() -> SagaOrchestrator: config=SagaConfig(name="t", enable_compensation=True, store_events=True, publish_commands=False), saga_repository=_Repo(), producer=_Prod(), + schema_registry_manager=_SchemaRegistry(), # type: ignore[arg-type] + settings=_Settings(), # type: ignore[arg-type] event_store=_Store(), idempotency_manager=_Idem(), resource_allocation_repository=_Alloc(), + logger=_test_logger, ) diff --git a/backend/tests/unit/services/saga/test_saga_step_and_base.py b/backend/tests/unit/services/saga/test_saga_step_and_base.py index 76e4dcc9..a8ab93bd 100644 --- a/backend/tests/unit/services/saga/test_saga_step_and_base.py +++ b/backend/tests/unit/services/saga/test_saga_step_and_base.py @@ -79,4 +79,4 @@ def get_compensation(self): return None assert str(s) == "SagaStep(nm)" # can_execute default True import asyncio - assert asyncio.get_event_loop().run_until_complete(s.can_execute(SagaContext("s","e"), object())) is True + assert asyncio.run(s.can_execute(SagaContext("s","e"), object())) is True diff --git a/backend/tests/unit/services/sse/test_shutdown_manager.py b/backend/tests/unit/services/sse/test_shutdown_manager.py index 64ac5211..6db2190e 100644 --- a/backend/tests/unit/services/sse/test_shutdown_manager.py +++ b/backend/tests/unit/services/sse/test_shutdown_manager.py @@ -9,9 +9,11 @@ class DummyRouter: - def __init__(self): self.stopped = False + def __init__(self) -> None: + self.stopped = False - async def stop(self): self.stopped = True # noqa: ANN001 + async def aclose(self) -> None: + self.stopped = True @pytest.mark.asyncio diff --git a/backend/uv.lock b/backend/uv.lock index 257eafe7..adafaab9 100644 --- a/backend/uv.lock +++ b/backend/uv.lock @@ -1077,7 +1077,6 @@ dependencies = [ { name = "pymongo" }, { name = "pyparsing" }, { name = "python-dateutil" }, - { name = "python-dotenv" }, { name = "python-json-logger" }, { name = "python-multipart" }, { name = "pyyaml" }, 
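# pytest-env (added to the dev group in the hunks below) applies environment
# variables from pytest's own config before any conftest.py is imported,
# which is what lets the explicit load_dotenv(".env.unit") call removed above
# go away. A rough, illustrative sketch of the mechanism it relies on -- the
# hook name and signature are real pytest API, but the body is not
# pytest-env's actual source, and the `env` ini option is assumed to be
# declared under [tool.pytest.ini_options] in pyproject.toml:
import os

import pytest


@pytest.hookimpl(tryfirst=True)
def pytest_load_initial_conftests(early_config, parser, args):
    # Entries look like "KAFKA_TOPIC_PREFIX=test."; applying them this early
    # removes the need for dotenv loading inside conftest.py.
    for entry in early_config.getini("env"):
        key, _, value = entry.partition("=")
        os.environ[key.strip()] = value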
@@ -1119,6 +1118,7 @@ dev = [
    { name = "pytest" },
    { name = "pytest-asyncio" },
    { name = "pytest-cov" },
+   { name = "pytest-env" },
    { name = "pytest-xdist" },
    { name = "ruff" },
    { name = "types-cachetools" },
@@ -1219,7 +1219,6 @@ requires-dist = [
    { name = "pymongo", specifier = "==4.12.1" },
    { name = "pyparsing", specifier = "==3.2.3" },
    { name = "python-dateutil", specifier = "==2.9.0.post0" },
-   { name = "python-dotenv", specifier = "==1.0.1" },
    { name = "python-json-logger", specifier = "==2.0.7" },
    { name = "python-multipart", specifier = "==0.0.18" },
    { name = "pyyaml", specifier = "==6.0.2" },
@@ -1261,6 +1260,7 @@ dev = [
    { name = "pytest", specifier = "==8.3.3" },
    { name = "pytest-asyncio", specifier = "==1.3.0" },
    { name = "pytest-cov", specifier = "==5.0.0" },
+   { name = "pytest-env", specifier = ">=1.1.5" },
    { name = "pytest-xdist", specifier = "==3.6.1" },
    { name = "ruff", specifier = "==0.14.10" },
    { name = "types-cachetools", specifier = "==6.2.0.20250827" },
@@ -2457,6 +2457,18 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/78/3a/af5b4fa5961d9a1e6237b530eb87dd04aea6eb83da09d2a4073d81b54ccf/pytest_cov-5.0.0-py3-none-any.whl", hash = "sha256:4f0764a1219df53214206bf1feea4633c3b558a2925c8b59f144f682861ce652", size = 21990, upload-time = "2024-03-24T20:16:32.444Z" },
 ]

+[[package]]
+name = "pytest-env"
+version = "1.1.5"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "pytest" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/1f/31/27f28431a16b83cab7a636dce59cf397517807d247caa38ee67d65e71ef8/pytest_env-1.1.5.tar.gz", hash = "sha256:91209840aa0e43385073ac464a554ad2947cc2fd663a9debf88d03b01e0cc1cf", size = 8911, upload-time = "2024-09-17T22:39:18.566Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/de/b8/87cfb16045c9d4092cfcf526135d73b88101aac83bc1adcf82dfb5fd3833/pytest_env-1.1.5-py3-none-any.whl", hash = "sha256:ce90cf8772878515c24b31cd97c7fa1f4481cd68d588419fd45f10ecaee6bc30", size = 6141, upload-time = "2024-09-17T22:39:16.942Z" },
+]
+
 [[package]]
 name = "pytest-xdist"
 version = "3.6.1"
diff --git a/backend/workers/dlq_processor.py b/backend/workers/dlq_processor.py
index 727f8e8e..711d1ff2 100644
--- a/backend/workers/dlq_processor.py
+++ b/backend/workers/dlq_processor.py
@@ -1,21 +1,19 @@
 import asyncio
-import os
+import logging
 import signal
+from contextlib import AsyncExitStack
 from typing import Optional

-from app.core.database_context import DBClient
-from app.core.logging import setup_logger
+from app.core.container import create_dlq_processor_container
+from app.core.database_context import Database
+from app.db.docs import ALL_DOCUMENTS
 from app.dlq import DLQMessage, RetryPolicy, RetryStrategy
-from app.dlq.manager import DLQManager, create_dlq_manager
-from app.domain.enums.kafka import KafkaTopic
-from app.events.schema.schema_registry import create_schema_registry_manager
-from app.settings import get_settings
-from pymongo.asynchronous.mongo_client import AsyncMongoClient
+from app.dlq.manager import DLQManager
+from app.settings import Settings, get_settings
+from beanie import init_beanie

-logger = setup_logger(os.environ.get("LOG_LEVEL", "INFO"))

-def _configure_retry_policies(manager: DLQManager) -> None:
+def _configure_retry_policies(manager: DLQManager, logger: logging.Logger) -> None:
     manager.set_retry_policy(
         "execution-requests",
         RetryPolicy(
@@ -58,7 +56,7 @@ def _configure_retry_policies(manager: DLQManager) -> None:
     )


-def _configure_filters(manager: DLQManager, testing: bool) -> None:
+def _configure_filters(manager: DLQManager, testing: bool, logger: logging.Logger) -> None:
     if not testing:

         def filter_test_events(message: DLQMessage) -> bool:
@@ -74,7 +72,7 @@ def filter_old_messages(message: DLQMessage) -> bool:
         manager.add_filter(filter_old_messages)


-def _configure_callbacks(manager: DLQManager, testing: bool) -> None:
+def _configure_callbacks(manager: DLQManager, testing: bool, logger: logging.Logger) -> None:
     async def log_before_retry(message: DLQMessage) -> None:
         logger.info(
             f"Retrying message {message.event_id} (type: {message.event_type}, "
@@ -102,44 +100,36 @@ async def alert_on_discard(message: DLQMessage, reason: str) -> None:
     manager.add_callback("on_discard", alert_on_discard)


-async def main() -> None:
-    settings = get_settings()
-    db_client: DBClient = AsyncMongoClient(
-        settings.MONGODB_URL,
-        tz_aware=True,
-        serverSelectionTimeoutMS=5000,
-    )
-    db_name = settings.DATABASE_NAME
-    _ = db_client[db_name]  # Access database to verify connection
-    await db_client.admin.command("ping")
-    logger.info(f"Connected to database: {db_name}")
-
-    schema_registry = create_schema_registry_manager(logger)
-    manager = create_dlq_manager(
-        schema_registry=schema_registry,
-        logger=logger,
-        dlq_topic=KafkaTopic.DEAD_LETTER_QUEUE,
-        retry_topic_suffix="-retry",
-    )
+async def main(settings: Settings | None = None) -> None:
+    """Run the DLQ processor."""
+    if settings is None:
+        settings = get_settings()
+
+    container = create_dlq_processor_container(settings)
+    logger = await container.get(logging.Logger)
+    logger.info("Starting DLQ Processor with DI container...")

-    _configure_retry_policies(manager)
-    _configure_filters(manager, testing=settings.TESTING)
-    _configure_callbacks(manager, testing=settings.TESTING)
+    db = await container.get(Database)
+    await init_beanie(database=db, document_models=ALL_DOCUMENTS)
+
+    manager = await container.get(DLQManager)
+
+    _configure_retry_policies(manager, logger)
+    _configure_filters(manager, testing=settings.TESTING, logger=logger)
+    _configure_callbacks(manager, testing=settings.TESTING, logger=logger)

     stop_event = asyncio.Event()
+    loop = asyncio.get_running_loop()

-    def signal_handler(signum: int, frame: object | None) -> None:
-        logger.info(f"Received signal {signum}, initiating shutdown...")
+    def signal_handler() -> None:
+        logger.info("Received signal, initiating shutdown...")
         stop_event.set()

-    signal.signal(signal.SIGINT, signal_handler)
-    signal.signal(signal.SIGTERM, signal_handler)
-
-    from contextlib import AsyncExitStack
+    for sig in (signal.SIGINT, signal.SIGTERM):
+        loop.add_signal_handler(sig, signal_handler)

     async with AsyncExitStack() as stack:
-        await stack.enter_async_context(manager)
-        stack.callback(db_client.close)
+        stack.push_async_callback(container.close)

         await stop_event.wait()
diff --git a/backend/workers/run_coordinator.py b/backend/workers/run_coordinator.py
index 29f1c1dd..ef617444 100644
--- a/backend/workers/run_coordinator.py
+++ b/backend/workers/run_coordinator.py
@@ -1,28 +1,66 @@
-"""Run ExecutionCoordinator as a standalone worker service"""
-
 import asyncio
 import logging
+import signal

+from app.core.container import create_coordinator_container
+from app.core.database_context import Database
 from app.core.logging import setup_logger
 from app.core.tracing import init_tracing
+from app.db.docs import ALL_DOCUMENTS
 from app.domain.enums.kafka import GroupId
-from app.services.coordinator.coordinator import run_coordinator
-from app.settings import get_settings
+from app.events.schema.schema_registry import SchemaRegistryManager, initialize_event_schemas
+from app.services.coordinator.coordinator import ExecutionCoordinator
+from app.settings import Settings, get_settings
+from beanie import init_beanie
+
+
+async def run_coordinator(settings: Settings | None = None) -> None:
+    """Run the execution coordinator service."""
+    if settings is None:
+        settings = get_settings()
+
+    container = create_coordinator_container(settings)
+    logger = await container.get(logging.Logger)
+    logger.info("Starting ExecutionCoordinator with DI container...")
+
+    db = await container.get(Database)
+    await init_beanie(database=db, document_models=ALL_DOCUMENTS)
+
+    schema_registry = await container.get(SchemaRegistryManager)
+    await initialize_event_schemas(schema_registry)
+
+    # Services are already started by the DI container providers
+    coordinator = await container.get(ExecutionCoordinator)
+
+    # Shutdown event - signal handlers just set this
+    shutdown_event = asyncio.Event()
+    loop = asyncio.get_running_loop()
+    for sig in (signal.SIGINT, signal.SIGTERM):
+        loop.add_signal_handler(sig, shutdown_event.set)
+
+    logger.info("ExecutionCoordinator started and running")
+
+    try:
+        # Wait for shutdown signal or service to stop
+        while coordinator.is_running and not shutdown_event.is_set():
+            await asyncio.sleep(60)
+            status = await coordinator.get_status()
+            logger.info(f"Coordinator status: {status}")
+    finally:
+        # Container cleanup stops everything
+        logger.info("Initiating graceful shutdown...")
+        await container.close()


 def main() -> None:
     """Main entry point for coordinator worker"""
     settings = get_settings()

-    # Setup logging
     logger = setup_logger(settings.LOG_LEVEL)
-
-    # Configure root logger for worker
     logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")

     logger.info("Starting ExecutionCoordinator worker...")

-    # Initialize tracing
     if settings.ENABLE_TRACING:
         init_tracing(
             service_name=GroupId.EXECUTION_COORDINATOR,
@@ -33,7 +71,7 @@ def main() -> None:
         )
         logger.info("Tracing initialized for ExecutionCoordinator")

-    asyncio.run(run_coordinator())
+    asyncio.run(run_coordinator(settings))


 if __name__ == "__main__":
diff --git a/backend/workers/run_event_replay.py b/backend/workers/run_event_replay.py
index c2fda75b..949cf8af 100644
--- a/backend/workers/run_event_replay.py
+++ b/backend/workers/run_event_replay.py
@@ -2,72 +2,50 @@
 import logging
 from contextlib import AsyncExitStack

-from app.core.database_context import DBClient
+from app.core.container import create_event_replay_container
+from app.core.database_context import Database
 from app.core.logging import setup_logger
 from app.core.tracing import init_tracing
 from app.db.docs import ALL_DOCUMENTS
-from app.db.repositories.replay_repository import ReplayRepository
-from app.events.core import ProducerConfig, UnifiedProducer
-from app.events.event_store import create_event_store
-from app.events.schema.schema_registry import SchemaRegistryManager
+from app.events.core import UnifiedProducer
 from app.services.event_replay.replay_service import EventReplayService
-from app.settings import get_settings
+from app.settings import Settings, get_settings
 from beanie import init_beanie
-from pymongo.asynchronous.mongo_client import AsyncMongoClient


-async def cleanup_task(replay_service: EventReplayService, interval_hours: int = 6) -> None:
+async def cleanup_task(replay_service: EventReplayService, logger: logging.Logger, interval_hours: int = 6) -> None:
     """Periodically clean up old replay sessions"""
-    logger = logging.getLogger(__name__)
-
     while True:
         try:
-            await asyncio.sleep(interval_hours * 3600)  # Convert hours to seconds
+            await asyncio.sleep(interval_hours * 3600)
             removed = await replay_service.cleanup_old_sessions(older_than_hours=48)
             logger.info(f"Cleaned up {removed} old replay sessions")
         except Exception as e:
             logger.error(f"Error during cleanup: {e}")


-async def run_replay_service(logger: logging.Logger) -> None:
-    """Run the event replay service with cleanup task"""
-    # Get settings
-    settings = get_settings()
-
-    # Create database connection
-    db_client: DBClient = AsyncMongoClient(settings.MONGODB_URL, tz_aware=True, serverSelectionTimeoutMS=5000)
-    db_name = settings.DATABASE_NAME
-    database = db_client[db_name]
-
-    # Verify connection
-    await db_client.admin.command("ping")
-    logger.info(f"Connected to database: {db_name}")
-
-    # Initialize Beanie ODM (indexes are idempotently created via Document.Settings.indexes)
-    await init_beanie(database=database, document_models=ALL_DOCUMENTS)
+async def run_replay_service(settings: Settings | None = None) -> None:
+    """Run the event replay service with cleanup task."""
+    if settings is None:
+        settings = get_settings()

-    # Initialize services
-    schema_registry = SchemaRegistryManager(logger)
-    producer_config = ProducerConfig(bootstrap_servers=settings.KAFKA_BOOTSTRAP_SERVERS)
-    producer = UnifiedProducer(producer_config, schema_registry, logger)
+    container = create_event_replay_container(settings)
+    logger = await container.get(logging.Logger)
+    logger.info("Starting EventReplayService with DI container...")

-    # Create event store
-    event_store = create_event_store(schema_registry=schema_registry, logger=logger)
+    db = await container.get(Database)
+    await init_beanie(database=db, document_models=ALL_DOCUMENTS)

-    # Create repository
-    replay_repository = ReplayRepository(logger)
+    producer = await container.get(UnifiedProducer)
+    replay_service = await container.get(EventReplayService)

-    # Create replay service
-    replay_service = EventReplayService(
-        repository=replay_repository, producer=producer, event_store=event_store, logger=logger
-    )
     logger.info("Event replay service initialized")

     async with AsyncExitStack() as stack:
+        stack.push_async_callback(container.close)
         await stack.enter_async_context(producer)
-        stack.callback(db_client.close)

-        task = asyncio.create_task(cleanup_task(replay_service))
+        task = asyncio.create_task(cleanup_task(replay_service, logger))

         async def _cancel_task() -> None:
             task.cancel()
@@ -85,15 +63,11 @@ def main() -> None:
     """Main entry point for event replay service"""
     settings = get_settings()

-    # Setup logging
     logger = setup_logger(settings.LOG_LEVEL)
-
-    # Configure root logger for worker
     logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")

     logger.info("Starting Event Replay Service...")

-    # Initialize tracing
     if settings.ENABLE_TRACING:
         init_tracing(
             service_name="event-replay",
@@ -104,7 +78,7 @@ def main() -> None:
         )
         logger.info("Tracing initialized for Event Replay Service")

-    asyncio.run(run_replay_service(logger))
+    asyncio.run(run_replay_service(settings))


 if __name__ == "__main__":
diff --git a/backend/workers/run_k8s_worker.py b/backend/workers/run_k8s_worker.py
index 0e080e6b..49b945fa 100644
--- a/backend/workers/run_k8s_worker.py
+++ b/backend/workers/run_k8s_worker.py
@@ -1,28 +1,66 @@
-"""Run KubernetesWorker as a standalone worker service"""
-
 import asyncio
 import logging
+import signal

+from app.core.container import create_k8s_worker_container
+from app.core.database_context import Database
 from app.core.logging import setup_logger
 from app.core.tracing import init_tracing
+from app.db.docs import ALL_DOCUMENTS
 from app.domain.enums.kafka import GroupId
-from app.services.k8s_worker.worker import run_kubernetes_worker
-from app.settings import get_settings
+from app.events.schema.schema_registry import SchemaRegistryManager, initialize_event_schemas
+from app.services.k8s_worker.worker import KubernetesWorker
+from app.settings import Settings, get_settings
+from beanie import init_beanie
+
+
+async def run_kubernetes_worker(settings: Settings | None = None) -> None:
+    """Run the Kubernetes worker service."""
+    if settings is None:
+        settings = get_settings()
+
+    container = create_k8s_worker_container(settings)
+    logger = await container.get(logging.Logger)
+    logger.info("Starting KubernetesWorker with DI container...")
+
+    db = await container.get(Database)
+    await init_beanie(database=db, document_models=ALL_DOCUMENTS)
+
+    schema_registry = await container.get(SchemaRegistryManager)
+    await initialize_event_schemas(schema_registry)
+
+    # Services are already started by the DI container providers
+    worker = await container.get(KubernetesWorker)
+
+    # Shutdown event - signal handlers just set this
+    shutdown_event = asyncio.Event()
+    loop = asyncio.get_running_loop()
+    for sig in (signal.SIGINT, signal.SIGTERM):
+        loop.add_signal_handler(sig, shutdown_event.set)
+
+    logger.info("KubernetesWorker started and running")
+
+    try:
+        # Wait for shutdown signal or service to stop
+        while worker.is_running and not shutdown_event.is_set():
+            await asyncio.sleep(60)
+            status = await worker.get_status()
+            logger.info(f"Kubernetes worker status: {status}")
+    finally:
+        # Container cleanup stops everything
+        logger.info("Initiating graceful shutdown...")
+        await container.close()


 def main() -> None:
     """Main entry point for Kubernetes worker"""
     settings = get_settings()

-    # Setup logging
     logger = setup_logger(settings.LOG_LEVEL)
-
-    # Configure root logger for worker
     logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")

     logger.info("Starting KubernetesWorker...")

-    # Initialize tracing
     if settings.ENABLE_TRACING:
         init_tracing(
             service_name=GroupId.K8S_WORKER,
@@ -33,7 +71,7 @@ def main() -> None:
         )
         logger.info("Tracing initialized for KubernetesWorker")

-    asyncio.run(run_kubernetes_worker())
+    asyncio.run(run_kubernetes_worker(settings))


 if __name__ == "__main__":
diff --git a/backend/workers/run_pod_monitor.py b/backend/workers/run_pod_monitor.py
index ebed2fc8..9c1fe09e 100644
--- a/backend/workers/run_pod_monitor.py
+++ b/backend/workers/run_pod_monitor.py
@@ -1,28 +1,68 @@
-"""Run PodMonitor as a standalone worker service"""
-
 import asyncio
 import logging
+import signal

+from app.core.container import create_pod_monitor_container
+from app.core.database_context import Database
 from app.core.logging import setup_logger
 from app.core.tracing import init_tracing
+from app.db.docs import ALL_DOCUMENTS
 from app.domain.enums.kafka import GroupId
-from app.services.pod_monitor.monitor import run_pod_monitor
-from app.settings import get_settings
+from app.events.schema.schema_registry import SchemaRegistryManager, initialize_event_schemas
+from app.services.pod_monitor.monitor import MonitorState, PodMonitor
+from app.settings import Settings, get_settings
+from beanie import init_beanie
+
+RECONCILIATION_LOG_INTERVAL: int = 60
+
+
+async def run_pod_monitor(settings: Settings | None = None) -> None:
+    """Run the pod monitor service."""
+    if settings is None:
+        settings = get_settings()
+
+    container = create_pod_monitor_container(settings)
+    logger = await container.get(logging.Logger)
+    logger.info("Starting PodMonitor with DI container...")
+
+    db = await container.get(Database)
+    await init_beanie(database=db, document_models=ALL_DOCUMENTS)
+
+    schema_registry = await container.get(SchemaRegistryManager)
+    await initialize_event_schemas(schema_registry)
+
+    # Services are already started by the DI container providers
+    monitor = await container.get(PodMonitor)
+
+    # Shutdown event - signal handlers just set this
+    shutdown_event = asyncio.Event()
+    loop = asyncio.get_running_loop()
+    for sig in (signal.SIGINT, signal.SIGTERM):
+        loop.add_signal_handler(sig, shutdown_event.set)
+
+    logger.info("PodMonitor started and running")
+
+    try:
+        # Wait for shutdown signal or service to stop
+        while monitor.state == MonitorState.RUNNING and not shutdown_event.is_set():
+            await asyncio.sleep(RECONCILIATION_LOG_INTERVAL)
+            status = await monitor.get_status()
+            logger.info(f"Pod monitor status: {status}")
+    finally:
+        # Container cleanup stops everything
+        logger.info("Initiating graceful shutdown...")
+        await container.close()


 def main() -> None:
     """Main entry point for pod monitor worker"""
     settings = get_settings()

-    # Setup logging
     logger = setup_logger(settings.LOG_LEVEL)
-
-    # Configure root logger for worker
     logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")

     logger.info("Starting PodMonitor worker...")

-    # Initialize tracing
     if settings.ENABLE_TRACING:
         init_tracing(
             service_name=GroupId.POD_MONITOR,
@@ -33,7 +73,7 @@ def main() -> None:
         )
         logger.info("Tracing initialized for PodMonitor Service")

-    asyncio.run(run_pod_monitor())
+    asyncio.run(run_pod_monitor(settings))


 if __name__ == "__main__":
diff --git a/backend/workers/run_result_processor.py b/backend/workers/run_result_processor.py
index 2150b112..0151ad9f 100644
--- a/backend/workers/run_result_processor.py
+++ b/backend/workers/run_result_processor.py
@@ -1,26 +1,82 @@
 import asyncio
 import logging
+import signal
+from contextlib import AsyncExitStack

+from app.core.container import create_result_processor_container
 from app.core.logging import setup_logger
 from app.core.tracing import init_tracing
+from app.db.docs import ALL_DOCUMENTS
+from app.db.repositories.execution_repository import ExecutionRepository
 from app.domain.enums.kafka import GroupId
-from app.services.result_processor.processor import run_result_processor
-from app.settings import get_settings
+from app.events.core import UnifiedProducer
+from app.events.schema.schema_registry import SchemaRegistryManager
+from app.services.idempotency import IdempotencyManager
+from app.services.result_processor.processor import ProcessingState, ResultProcessor
+from app.settings import Settings, get_settings
+from beanie import init_beanie
+from pymongo.asynchronous.mongo_client import AsyncMongoClient
+
+
+async def run_result_processor(settings: Settings | None = None) -> None:
+    if settings is None:
+        settings = get_settings()
+
+    db_client: AsyncMongoClient[dict[str, object]] = AsyncMongoClient(
+        settings.MONGODB_URL, tz_aware=True, serverSelectionTimeoutMS=5000
+    )
+    await init_beanie(database=db_client[settings.DATABASE_NAME], document_models=ALL_DOCUMENTS)
+
+    container = create_result_processor_container(settings)
+    producer = await container.get(UnifiedProducer)
+    schema_registry = await container.get(SchemaRegistryManager)
+    idempotency_manager = await container.get(IdempotencyManager)
+    execution_repo = await container.get(ExecutionRepository)
+    logger = await container.get(logging.Logger)
+    logger.info(f"Beanie ODM initialized with {len(ALL_DOCUMENTS)} document models")
+
+    # ResultProcessor is manually created (not from DI), so we own its lifecycle
+    processor = ResultProcessor(
+        execution_repo=execution_repo,
+        producer=producer,
+        schema_registry=schema_registry,
+        settings=settings,
+        idempotency_manager=idempotency_manager,
+        logger=logger,
+    )
+
+    # Shutdown event - signal handlers just set this
+    shutdown_event = asyncio.Event()
+    loop = asyncio.get_running_loop()
+    for sig in (signal.SIGINT, signal.SIGTERM):
+        loop.add_signal_handler(sig, shutdown_event.set)
+
+    # We own the processor, so we use async with to manage its lifecycle
+    async with AsyncExitStack() as stack:
+        # AsyncMongoClient.close() is a coroutine, so it must be registered via
+        # push_async_callback; plain callback() would leave it unawaited on exit
+        stack.push_async_callback(db_client.close)
+        stack.push_async_callback(container.close)
+        await stack.enter_async_context(processor)
+
+        logger.info("ResultProcessor started and running")
+
+        # Wait for shutdown signal or service to stop
+        while processor._state == ProcessingState.PROCESSING and not shutdown_event.is_set():
+            await asyncio.sleep(60)
+            status = await processor.get_status()
+            logger.info(f"ResultProcessor status: {status}")
+
+        logger.info("Initiating graceful shutdown...")


 def main() -> None:
     """Main entry point for result processor worker"""
     settings = get_settings()

-    # Setup logging
     logger = setup_logger(settings.LOG_LEVEL)
-
-    # Configure root logger for worker
     logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")

     logger.info("Starting ResultProcessor worker...")

-    # Initialize tracing
     if settings.ENABLE_TRACING:
         init_tracing(
             service_name=GroupId.RESULT_PROCESSOR,
@@ -31,7 +87,7 @@ def main() -> None:
         )
         logger.info("Tracing initialized for ResultProcessor Service")

-    asyncio.run(run_result_processor())
+    asyncio.run(run_result_processor(settings))


 if __name__ == "__main__":
diff --git a/backend/workers/run_saga_orchestrator.py b/backend/workers/run_saga_orchestrator.py
index 53e45a83..04ad8a8d 100644
--- a/backend/workers/run_saga_orchestrator.py
+++ b/backend/workers/run_saga_orchestrator.py
@@ -1,134 +1,66 @@
 import asyncio
 import logging
+import signal

-import redis.asyncio as redis
-from app.core.database_context import DBClient
+from app.core.container import create_saga_orchestrator_container
+from app.core.database_context import Database
 from app.core.logging import setup_logger
 from app.core.tracing import init_tracing
 from app.db.docs import ALL_DOCUMENTS
-from app.db.repositories.resource_allocation_repository import ResourceAllocationRepository
-from app.db.repositories.saga_repository import SagaRepository
 from app.domain.enums.kafka import GroupId
-from app.domain.saga.models import SagaConfig
-from app.events.core import ProducerConfig, UnifiedProducer
-from app.events.event_store import create_event_store
-from app.events.schema.schema_registry import SchemaRegistryManager
-from app.services.idempotency import IdempotencyConfig, create_idempotency_manager
-from app.services.idempotency.redis_repository import RedisIdempotencyRepository
-from app.services.saga import create_saga_orchestrator
-from app.settings import get_settings
+from app.events.schema.schema_registry import SchemaRegistryManager, initialize_event_schemas
+from app.services.saga import SagaOrchestrator
+from app.settings import Settings, get_settings
 from beanie import init_beanie
-from pymongo.asynchronous.mongo_client import AsyncMongoClient


-async def run_saga_orchestrator() -> None:
-    """Run the saga orchestrator"""
-    # Get settings
-    settings = get_settings()
-    logger = logging.getLogger(__name__)
-
-    # Create database connection
-    db_client: DBClient = AsyncMongoClient(settings.MONGODB_URL, tz_aware=True, serverSelectionTimeoutMS=5000)
-    db_name = settings.DATABASE_NAME
-    database = db_client[db_name]
-
-    # Verify connection
-    await db_client.admin.command("ping")
-    logger.info(f"Connected to database: {db_name}")
-
-    # Initialize Beanie ODM (indexes are idempotently created via Document.Settings.indexes)
-    await init_beanie(database=database, document_models=ALL_DOCUMENTS)
-
-    # Initialize schema registry
-    logger.info("Initializing schema registry...")
-    schema_registry_manager = SchemaRegistryManager(logger)
-    await schema_registry_manager.initialize_schemas()
-
-    # Initialize Kafka producer
-    logger.info("Initializing Kafka producer...")
-    producer_config = ProducerConfig(bootstrap_servers=settings.KAFKA_BOOTSTRAP_SERVERS)
-    producer = UnifiedProducer(producer_config, schema_registry_manager, logger)
-    await producer.start()
-
-    # Create event store (schema ensured separately)
-    logger.info("Creating event store...")
-    event_store = create_event_store(schema_registry=schema_registry_manager, logger=logger, ttl_days=90)
-
-    # Create repository and idempotency manager (Redis-backed)
-    saga_repository = SagaRepository()
-    r = redis.Redis(
-        host=settings.REDIS_HOST,
-        port=settings.REDIS_PORT,
-        db=settings.REDIS_DB,
-        password=settings.REDIS_PASSWORD,
-        ssl=settings.REDIS_SSL,
-        max_connections=settings.REDIS_MAX_CONNECTIONS,
-        decode_responses=settings.REDIS_DECODE_RESPONSES,
-        socket_connect_timeout=5,
-        socket_timeout=5,
-    )
-    idem_repo = RedisIdempotencyRepository(r, key_prefix="idempotency")
-    idempotency_manager = create_idempotency_manager(repository=idem_repo, config=IdempotencyConfig(), logger=logger)
-    resource_allocation_repository = ResourceAllocationRepository()
-
-    # Create saga orchestrator
-    saga_config = SagaConfig(
-        name="main-orchestrator",
-        timeout_seconds=300,
-        max_retries=3,
-        retry_delay_seconds=5,
-        enable_compensation=True,
-        store_events=True,
-        publish_commands=True,
-    )
-
-    saga_orchestrator = create_saga_orchestrator(
-        saga_repository=saga_repository,
-        producer=producer,
-        event_store=event_store,
-        idempotency_manager=idempotency_manager,
-        resource_allocation_repository=resource_allocation_repository,
-        config=saga_config,
-    )
-
-    # Start the orchestrator
-    await saga_orchestrator.start()
+async def run_saga_orchestrator(settings: Settings | None = None) -> None:
+    """Run the saga orchestrator."""
+    if settings is None:
+        settings = get_settings()

-    logger.info("Saga orchestrator started and running")
+    container = create_saga_orchestrator_container(settings)
+    logger = await container.get(logging.Logger)
+    logger.info("Starting SagaOrchestrator with DI container...")

-    try:
-        while True:
-            await asyncio.sleep(60)
+    db = await container.get(Database)
+    await init_beanie(database=db, document_models=ALL_DOCUMENTS)
+
+    schema_registry = await container.get(SchemaRegistryManager)
+    await initialize_event_schemas(schema_registry)
+
+    # Services are already started by the DI container providers
+    orchestrator = await container.get(SagaOrchestrator)

-            if saga_orchestrator.is_running:
-                logger.info("Saga orchestrator is running...")
-            else:
-                logger.warning("Saga orchestrator stopped unexpectedly")
-                break
+    # Shutdown event - signal handlers just set this
+    shutdown_event = asyncio.Event()
+    loop = asyncio.get_running_loop()
+    for sig in (signal.SIGINT, signal.SIGTERM):
+        loop.add_signal_handler(sig, shutdown_event.set)

+    logger.info("Saga orchestrator started and running")
+
+    try:
+        # Wait for shutdown signal or service to stop
+        while orchestrator.is_running and not shutdown_event.is_set():
+            await asyncio.sleep(1)
     finally:
-        logger.info("Shutting down saga orchestrator...")
-        await saga_orchestrator.stop()
-        await producer.stop()
-        await idempotency_manager.close()
-        await r.aclose()
-        await db_client.close()
-        logger.info("Saga orchestrator shutdown complete")
+        # Container cleanup stops everything
+        logger.info("Initiating graceful shutdown...")
+        await container.close()
+
+    logger.warning("Saga orchestrator stopped")


 def main() -> None:
     """Main entry point for saga orchestrator worker"""
     settings = get_settings()

-    # Setup logging
     logger = setup_logger(settings.LOG_LEVEL)
-
-    # Configure root logger for worker
     logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")

     logger.info("Starting Saga Orchestrator worker...")

-    # Initialize tracing
     if settings.ENABLE_TRACING:
         init_tracing(
             service_name=GroupId.SAGA_ORCHESTRATOR,
@@ -139,7 +71,7 @@ def main() -> None:
         )
         logger.info("Tracing initialized for Saga Orchestrator Service")

-    asyncio.run(run_saga_orchestrator())
+    asyncio.run(run_saga_orchestrator(settings))


 if __name__ == "__main__":
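# The six reworked workers above share one lifecycle shape: build a
# worker-specific DI container, init Beanie, fetch the already-started
# service, wire SIGINT/SIGTERM to an asyncio.Event, poll until the service
# stops or a signal arrives, then close the container. Below is a
# self-contained sketch of that pattern; WorkerService is a stand-in for the
# container-provided services, and only the stdlib parts are load-bearing.
# Waiting on the event with a timeout, as shown, keeps the periodic status
# cadence while reacting to SIGTERM immediately -- unlike the bare
# `await asyncio.sleep(60)` loops above, which can hold a pod past
# Kubernetes' default termination grace period.
import asyncio
import contextlib
import signal


class WorkerService:
    """Stand-in for a service started and owned by its DI provider."""

    is_running = True

    async def aclose(self) -> None:
        self.is_running = False


async def run_worker() -> None:
    service = WorkerService()  # real workers: await container.get(...)
    shutdown = asyncio.Event()
    loop = asyncio.get_running_loop()
    for sig in (signal.SIGINT, signal.SIGTERM):
        loop.add_signal_handler(sig, shutdown.set)

    try:
        while service.is_running and not shutdown.is_set():
            # Wake up every 60 s for a status check, but exit instantly
            # once the shutdown event is set.
            with contextlib.suppress(asyncio.TimeoutError):
                await asyncio.wait_for(shutdown.wait(), timeout=60)
    finally:
        await service.aclose()  # real workers: await container.close()


if __name__ == "__main__":
    asyncio.run(run_worker())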