diff --git a/backend/.env b/backend/.env index cec960d1..aa213436 100644 --- a/backend/.env +++ b/backend/.env @@ -24,7 +24,9 @@ EVENT_RETENTION_DAYS=30 KAFKA_CONSUMER_GROUP_ID=integr8scode-backend KAFKA_AUTO_OFFSET_RESET=earliest KAFKA_ENABLE_AUTO_COMMIT=true -KAFKA_SESSION_TIMEOUT_MS=30000 +KAFKA_SESSION_TIMEOUT_MS=45000 +KAFKA_HEARTBEAT_INTERVAL_MS=10000 +KAFKA_REQUEST_TIMEOUT_MS=40000 KAFKA_MAX_POLL_RECORDS=500 # WebSocket Configuration diff --git a/backend/.env.test b/backend/.env.test index 5f984770..7d175192 100644 --- a/backend/.env.test +++ b/backend/.env.test @@ -25,6 +25,13 @@ KAFKA_TOPIC_PREFIX=test. SCHEMA_SUBJECT_PREFIX=test. SCHEMA_REGISTRY_URL=http://localhost:8081 +# Reduce consumer pool and timeouts for faster test startup/teardown +# https://github.com/aio-libs/aiokafka/issues/773 +SSE_CONSUMER_POOL_SIZE=1 +KAFKA_SESSION_TIMEOUT_MS=6000 +KAFKA_HEARTBEAT_INTERVAL_MS=2000 +KAFKA_REQUEST_TIMEOUT_MS=5000 + # Security SECURE_COOKIES=true BCRYPT_ROUNDS=4 @@ -33,7 +40,9 @@ BCRYPT_ROUNDS=4 RATE_LIMIT_ENABLED=true ENABLE_TRACING=false -# OpenTelemetry - explicitly disabled for tests +# OpenTelemetry - disabled for tests +# Empty endpoint prevents OTLP exporter creation in setup_metrics() +# OTEL_SDK_DISABLED=true (set via pytest-env) provides additional safety OTEL_EXPORTER_OTLP_ENDPOINT= # Development diff --git a/backend/app/api/routes/dlq.py b/backend/app/api/routes/dlq.py index df88504a..47f788b5 100644 --- a/backend/app/api/routes/dlq.py +++ b/backend/app/api/routes/dlq.py @@ -1,6 +1,3 @@ -from datetime import datetime, timezone -from typing import List - from dishka import FromDishka from dishka.integrations.fastapi import DishkaRoute from fastapi import APIRouter, Depends, HTTPException, Query @@ -31,19 +28,7 @@ @router.get("/stats", response_model=DLQStats) async def get_dlq_statistics(repository: FromDishka[DLQRepository]) -> DLQStats: stats = await repository.get_dlq_stats() - return DLQStats( - by_status=stats.by_status, - by_topic=[{"topic": t.topic, "count": t.count, "avg_retry_count": t.avg_retry_count} for t in stats.by_topic], - by_event_type=[{"event_type": e.event_type, "count": e.count} for e in stats.by_event_type], - age_stats={ - "min_age": stats.age_stats.min_age_seconds, - "max_age": stats.age_stats.max_age_seconds, - "avg_age": stats.age_stats.avg_age_seconds, - } - if stats.age_stats - else {}, - timestamp=stats.timestamp, - ) + return DLQStats.model_validate(stats, from_attributes=True) @router.get("/messages", response_model=DLQMessagesResponse) @@ -70,27 +55,7 @@ async def get_dlq_message(event_id: str, repository: FromDishka[DLQRepository]) message = await repository.get_message_by_id(event_id) if not message: raise HTTPException(status_code=404, detail="Message not found") - - return DLQMessageDetail( - event_id=message.event_id, - event=message.event.model_dump(), - event_type=message.event_type, - original_topic=message.original_topic, - error=message.error, - retry_count=message.retry_count, - failed_at=message.failed_at or datetime(1970, 1, 1, tzinfo=timezone.utc), - status=DLQMessageStatus(message.status), - created_at=message.created_at, - last_updated=message.last_updated, - next_retry_at=message.next_retry_at, - retried_at=message.retried_at, - discarded_at=message.discarded_at, - discard_reason=message.discard_reason, - producer_id=message.producer_id, - dlq_offset=message.dlq_offset, - dlq_partition=message.dlq_partition, - last_error=message.last_error, - ) + return DLQMessageDetail.model_validate(message, from_attributes=True) 
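The DLQ route handlers above drop the hand-written field mapping in favor of Pydantic's model_validate(..., from_attributes=True). A minimal sketch of that pattern, using hypothetical Source/Target models rather than the project's real DLQStats/DLQMessageDetail schemas:

from datetime import datetime, timezone
from pydantic import BaseModel

class Source(BaseModel):  # stands in for the repository/domain object
    event_id: str
    retry_count: int
    failed_at: datetime

class Target(BaseModel):  # stands in for the API response schema
    event_id: str
    retry_count: int
    failed_at: datetime

src = Source(event_id="e-1", retry_count=2, failed_at=datetime.now(timezone.utc))
# from_attributes=True reads values off the source object's attributes, so the
# response schema is populated without copying each field by hand.
out = Target.model_validate(src, from_attributes=True)

This relies on the source and target field names lining up (or the target declaring aliases), so the sketch assumes the response schemas were aligned with the domain models accordingly.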
@router.post("/retry", response_model=DLQBatchRetryResponse) @@ -141,7 +106,7 @@ async def discard_dlq_message( return MessageResponse(message=f"Message {event_id} discarded") -@router.get("/topics", response_model=List[DLQTopicSummaryResponse]) -async def get_dlq_topics(repository: FromDishka[DLQRepository]) -> List[DLQTopicSummaryResponse]: +@router.get("/topics", response_model=list[DLQTopicSummaryResponse]) +async def get_dlq_topics(repository: FromDishka[DLQRepository]) -> list[DLQTopicSummaryResponse]: topics = await repository.get_topics_summary() return [DLQTopicSummaryResponse.model_validate(topic) for topic in topics] diff --git a/backend/app/api/routes/events.py b/backend/app/api/routes/events.py index ac1e33b1..e27aa04a 100644 --- a/backend/app/api/routes/events.py +++ b/backend/app/api/routes/events.py @@ -12,8 +12,7 @@ from app.core.utils import get_client_ip from app.domain.enums.common import SortOrder from app.domain.events.event_models import EventFilter -from app.domain.events.typed import BaseEvent -from app.infrastructure.kafka.events.metadata import AvroEventMetadata as EventMetadata +from app.domain.events.typed import BaseEvent, EventMetadata from app.schemas_pydantic.events import ( DeleteEventResponse, EventAggregationRequest, diff --git a/backend/app/api/routes/execution.py b/backend/app/api/routes/execution.py index 85762a2f..d5cf51bb 100644 --- a/backend/app/api/routes/execution.py +++ b/backend/app/api/routes/execution.py @@ -12,9 +12,8 @@ from app.domain.enums.events import EventType from app.domain.enums.execution import ExecutionStatus from app.domain.enums.user import UserRole +from app.domain.events.typed import BaseEvent, EventMetadata from app.domain.exceptions import DomainError -from app.infrastructure.kafka.events.base import BaseEvent -from app.infrastructure.kafka.events.metadata import AvroEventMetadata as EventMetadata from app.schemas_pydantic.execution import ( CancelExecutionRequest, CancelResponse, diff --git a/backend/app/core/providers.py b/backend/app/core/providers.py index 6c1f8eed..3af1e28e 100644 --- a/backend/app/core/providers.py +++ b/backend/app/core/providers.py @@ -43,7 +43,7 @@ from app.dlq.manager import DLQManager, create_dlq_manager from app.domain.enums.kafka import KafkaTopic from app.domain.saga.models import SagaConfig -from app.events.core import ProducerConfig, UnifiedProducer +from app.events.core import UnifiedProducer from app.events.event_store import EventStore, create_event_store from app.events.event_store_consumer import EventStoreConsumer, create_event_store_consumer from app.events.schema.schema_registry import SchemaRegistryManager @@ -160,8 +160,7 @@ class MessagingProvider(Provider): async def get_kafka_producer( self, settings: Settings, schema_registry: SchemaRegistryManager, logger: logging.Logger ) -> AsyncIterator[UnifiedProducer]: - config = ProducerConfig(bootstrap_servers=settings.KAFKA_BOOTSTRAP_SERVERS) - async with UnifiedProducer(config, schema_registry, logger, settings=settings) as producer: + async with UnifiedProducer(schema_registry, logger, settings) as producer: yield producer @provide diff --git a/backend/app/db/docs/dlq.py b/backend/app/db/docs/dlq.py index 192fe29c..71e2f7a3 100644 --- a/backend/app/db/docs/dlq.py +++ b/backend/app/db/docs/dlq.py @@ -1,32 +1,25 @@ from datetime import datetime, timezone -from typing import Any from beanie import Document, Indexed from pydantic import ConfigDict, Field from pymongo import ASCENDING, DESCENDING, IndexModel from app.dlq.models import 
DLQMessageStatus -from app.domain.enums.events import EventType +from app.domain.events.typed import DomainEvent class DLQMessageDocument(Document): - """Unified DLQ message document for the entire system. + """Unified DLQ message document. Access event_id/event_type via event.event_id, event.event_type.""" - Copied from DLQMessage dataclass. - """ - - # Core fields - always required - event: dict[str, Any] # The original event as dict (BaseEvent serialized) - event_id: Indexed(str, unique=True) # type: ignore[valid-type] - event_type: EventType # Indexed via Settings.indexes - original_topic: Indexed(str) # type: ignore[valid-type] - error: str # Error message from the failure - retry_count: Indexed(int) # type: ignore[valid-type] - failed_at: Indexed(datetime) # type: ignore[valid-type] - status: DLQMessageStatus # Indexed via Settings.indexes - producer_id: str # ID of the producer that sent to DLQ + model_config = ConfigDict(from_attributes=True) - # Optional fields + event: DomainEvent # Discriminated union - contains event_id, event_type + original_topic: Indexed(str) = "" # type: ignore[valid-type] + error: str = "Unknown error" + retry_count: Indexed(int) = 0 # type: ignore[valid-type] + failed_at: Indexed(datetime) = Field(default_factory=lambda: datetime.now(timezone.utc)) # type: ignore[valid-type] + status: DLQMessageStatus = DLQMessageStatus.PENDING + producer_id: str = "unknown" created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc)) last_updated: datetime | None = None next_retry_at: Indexed(datetime) | None = None # type: ignore[valid-type] @@ -36,25 +29,15 @@ class DLQMessageDocument(Document): dlq_offset: int | None = None dlq_partition: int | None = None last_error: str | None = None - - # Kafka message headers (optional) headers: dict[str, str] = Field(default_factory=dict) - model_config = ConfigDict(from_attributes=True) - class Settings: name = "dlq_messages" use_state_management = True indexes = [ - IndexModel([("event_type", ASCENDING)], name="idx_dlq_event_type"), + IndexModel([("event.event_id", ASCENDING)], unique=True, name="idx_dlq_event_id"), + IndexModel([("event.event_type", ASCENDING)], name="idx_dlq_event_type"), IndexModel([("status", ASCENDING)], name="idx_dlq_status"), IndexModel([("failed_at", DESCENDING)], name="idx_dlq_failed_desc"), - # TTL index - auto-delete after 7 days IndexModel([("created_at", ASCENDING)], name="idx_dlq_created_ttl", expireAfterSeconds=7 * 24 * 3600), ] - - @property - def age_seconds(self) -> float: - """Get message age in seconds since failure.""" - failed_at: datetime = self.failed_at - return (datetime.now(timezone.utc) - failed_at).total_seconds() diff --git a/backend/app/db/repositories/admin/admin_events_repository.py b/backend/app/db/repositories/admin/admin_events_repository.py index f5802681..bd410704 100644 --- a/backend/app/db/repositories/admin/admin_events_repository.py +++ b/backend/app/db/repositories/admin/admin_events_repository.py @@ -3,6 +3,7 @@ from beanie.odm.enums import SortDirection from beanie.operators import GTE, LTE, In, Text +from monggregate import Pipeline, S from app.db.docs import ( EventArchiveDocument, @@ -25,7 +26,6 @@ UserEventCount, domain_event_adapter, ) -from app.domain.events.query_builders import EventStatsAggregation from app.domain.replay.models import ReplayFilter, ReplaySessionState @@ -62,7 +62,7 @@ async def browse_events( return EventBrowseResult(events=events, total=total, skip=skip, limit=limit) async def get_event_detail(self, event_id: str) -> 
EventDetail | None: - doc = await EventDocument.find_one({"event_id": event_id}) + doc = await EventDocument.find_one(EventDocument.event_id == event_id) if not doc: return None @@ -86,7 +86,7 @@ async def get_event_detail(self, event_id: str) -> EventDetail | None: return EventDetail(event=event, related_events=related_events, timeline=timeline) async def delete_event(self, event_id: str) -> bool: - doc = await EventDocument.find_one({"event_id": event_id}) + doc = await EventDocument.find_one(EventDocument.event_id == event_id) if not doc: return False await doc.delete() @@ -95,9 +95,29 @@ async def delete_event(self, event_id: str) -> bool: async def get_event_stats(self, hours: int = 24) -> EventStatistics: start_time = datetime.now(timezone.utc) - timedelta(hours=hours) - overview_pipeline = EventStatsAggregation.build_overview_pipeline(start_time) - overview_result = await EventDocument.aggregate(overview_pipeline).to_list() - + # Overview stats pipeline + # Note: monggregate doesn't have S.add_to_set - use raw dict syntax + overview_pipeline = ( + Pipeline() + .match({EventDocument.timestamp: {"$gte": start_time}}) + .group( + by=None, + query={ + "total_events": S.sum(1), + "event_types": {"$addToSet": S.field(EventDocument.event_type)}, + "unique_users": {"$addToSet": S.field(EventDocument.metadata.user_id)}, + "services": {"$addToSet": S.field(EventDocument.metadata.service_name)}, + }, + ) + .project( + _id=0, + total_events=1, + event_type_count={"$size": "$event_types"}, + unique_user_count={"$size": "$unique_users"}, + service_count={"$size": "$services"}, + ) + ) + overview_result = await EventDocument.aggregate(overview_pipeline.export()).to_list() stats = ( overview_result[0] if overview_result @@ -106,41 +126,61 @@ async def get_event_stats(self, hours: int = 24) -> EventStatistics: error_count = await EventDocument.find( { - "timestamp": {"$gte": start_time}, - "event_type": {"$regex": "failed|error|timeout", "$options": "i"}, + EventDocument.timestamp: {"$gte": start_time}, + EventDocument.event_type: {"$regex": "failed|error|timeout", "$options": "i"}, } ).count() - error_rate = (error_count / stats["total_events"] * 100) if stats["total_events"] > 0 else 0 - type_pipeline = EventStatsAggregation.build_event_types_pipeline(start_time) - top_types = await EventDocument.aggregate(type_pipeline).to_list() + # Event types pipeline + type_pipeline = ( + Pipeline() + .match({EventDocument.timestamp: {"$gte": start_time}}) + .group(by=EventDocument.event_type, query={"count": S.sum(1)}) + .sort(by="count", descending=True) + .limit(10) + ) + top_types = await EventDocument.aggregate(type_pipeline.export()).to_list() events_by_type = {t["_id"]: t["count"] for t in top_types} - hourly_pipeline = EventStatsAggregation.build_hourly_events_pipeline(start_time) - hourly_result = await EventDocument.aggregate(hourly_pipeline).to_list() - events_by_hour: list[HourlyEventCount | dict[str, Any]] = [ - HourlyEventCount(hour=doc["_id"], count=doc["count"]) for doc in hourly_result - ] - - user_pipeline = EventStatsAggregation.build_top_users_pipeline(start_time) - top_users_result = await EventDocument.aggregate(user_pipeline).to_list() - top_users = [ - UserEventCount(user_id=doc["_id"], event_count=doc["count"]) for doc in top_users_result if doc["_id"] - ] - - exec_pipeline: list[dict[str, Any]] = [ - { - "$match": { - "created_at": {"$gte": start_time}, - "status": "completed", - "resource_usage.execution_time_wall_seconds": {"$exists": True}, - } - }, - {"$group": {"_id": None, 
"avg_duration": {"$avg": "$resource_usage.execution_time_wall_seconds"}}}, - ] - - exec_result = await ExecutionDocument.aggregate(exec_pipeline).to_list() + # Hourly events pipeline - project renames _id->hour + hourly_pipeline = ( + Pipeline() + .match({EventDocument.timestamp: {"$gte": start_time}}) + .group( + by={"$dateToString": {"format": "%Y-%m-%d-%H", "date": S.field(EventDocument.timestamp)}}, + query={"count": S.sum(1)}, + ) + .sort(by="_id") + .project(_id=0, hour="$_id", count=1) + ) + hourly_result = await EventDocument.aggregate(hourly_pipeline.export()).to_list() + events_by_hour: list[HourlyEventCount | dict[str, Any]] = [HourlyEventCount(**doc) for doc in hourly_result] + + # Top users pipeline - project renames _id->user_id, count->event_count + user_pipeline = ( + Pipeline() + .match({EventDocument.timestamp: {"$gte": start_time}}) + .group(by=EventDocument.metadata.user_id, query={"count": S.sum(1)}) + .sort(by="count", descending=True) + .limit(10) + .project(_id=0, user_id="$_id", event_count="$count") + ) + top_users_result = await EventDocument.aggregate(user_pipeline.export()).to_list() + top_users = [UserEventCount(**doc) for doc in top_users_result if doc["user_id"]] + + # Execution duration pipeline + exec_time_field = S.field(ExecutionDocument.resource_usage.execution_time_wall_seconds) + exec_pipeline = ( + Pipeline() + .match({ + ExecutionDocument.created_at: {"$gte": start_time}, + ExecutionDocument.status: "completed", + ExecutionDocument.resource_usage.execution_time_wall_seconds: {"$exists": True}, + }) + .group(by=None, query={"avg_duration": S.avg(exec_time_field)}) + ) + exec_result = await ExecutionDocument.aggregate(exec_pipeline.export()).to_list() avg_processing_time = ( exec_result[0]["avg_duration"] if exec_result and exec_result[0].get("avg_duration") else 0 ) @@ -190,7 +230,7 @@ async def create_replay_session(self, session: ReplaySessionState) -> str: return session.session_id async def get_replay_session(self, session_id: str) -> ReplaySessionState | None: - doc = await ReplaySessionDocument.find_one({"session_id": session_id}) + doc = await ReplaySessionDocument.find_one(ReplaySessionDocument.session_id == session_id) if not doc: return None return ReplaySessionState.model_validate(doc, from_attributes=True) @@ -199,14 +239,14 @@ async def update_replay_session(self, session_id: str, updates: ReplaySessionUpd update_dict = updates.model_dump(exclude_none=True) if not update_dict: return False - doc = await ReplaySessionDocument.find_one({"session_id": session_id}) + doc = await ReplaySessionDocument.find_one(ReplaySessionDocument.session_id == session_id) if not doc: return False await doc.set(update_dict) return True async def get_replay_status_with_progress(self, session_id: str) -> ReplaySessionStatusDetail | None: - doc = await ReplaySessionDocument.find_one({"session_id": session_id}) + doc = await ReplaySessionDocument.find_one(ReplaySessionDocument.session_id == session_id) if not doc: return None @@ -251,7 +291,7 @@ async def get_replay_status_with_progress(self, session_id: str) -> ReplaySessio execution_ids = {event.execution_id for event in original_events if event.execution_id} for exec_id in list(execution_ids)[:10]: - exec_doc = await ExecutionDocument.find_one({"execution_id": exec_id}) + exec_doc = await ExecutionDocument.find_one(ExecutionDocument.execution_id == exec_id) if exec_doc: execution_results.append( { diff --git a/backend/app/db/repositories/admin/admin_settings_repository.py 
b/backend/app/db/repositories/admin/admin_settings_repository.py index 2899e117..37e3d57f 100644 --- a/backend/app/db/repositories/admin/admin_settings_repository.py +++ b/backend/app/db/repositories/admin/admin_settings_repository.py @@ -16,7 +16,7 @@ def __init__(self, logger: logging.Logger): self.logger = logger async def get_system_settings(self) -> SystemSettings: - doc = await SystemSettingsDocument.find_one({"settings_id": "global"}) + doc = await SystemSettingsDocument.find_one(SystemSettingsDocument.settings_id == "global") if not doc: self.logger.info("System settings not found, creating defaults") doc = SystemSettingsDocument( @@ -40,7 +40,7 @@ async def update_system_settings( updated_by: str, user_id: str, ) -> SystemSettings: - doc = await SystemSettingsDocument.find_one({"settings_id": "global"}) + doc = await SystemSettingsDocument.find_one(SystemSettingsDocument.settings_id == "global") if not doc: doc = SystemSettingsDocument(settings_id="global") @@ -68,7 +68,7 @@ async def update_system_settings( ) async def reset_system_settings(self, username: str, user_id: str) -> SystemSettings: - doc = await SystemSettingsDocument.find_one({"settings_id": "global"}) + doc = await SystemSettingsDocument.find_one(SystemSettingsDocument.settings_id == "global") if doc: await doc.delete() diff --git a/backend/app/db/repositories/admin/admin_user_repository.py b/backend/app/db/repositories/admin/admin_user_repository.py index 1c2d2678..43e8ec5e 100644 --- a/backend/app/db/repositories/admin/admin_user_repository.py +++ b/backend/app/db/repositories/admin/admin_user_repository.py @@ -2,7 +2,7 @@ from datetime import datetime, timezone from beanie.odm.operators.find import BaseFindOperator -from beanie.operators import Eq, Or, RegEx +from beanie.operators import Or, RegEx from app.core.security import SecurityService from app.db.docs import ( @@ -42,7 +42,7 @@ async def list_users( ) if role: - conditions.append(Eq(UserDocument.role, role)) + conditions.append(UserDocument.role == role) query = UserDocument.find(*conditions) total = await query.count() @@ -51,11 +51,11 @@ async def list_users( return UserListResult(users=users, total=total, offset=offset, limit=limit) async def get_user_by_id(self, user_id: str) -> User | None: - doc = await UserDocument.find_one({"user_id": user_id}) + doc = await UserDocument.find_one(UserDocument.user_id == user_id) return User.model_validate(doc, from_attributes=True) if doc else None async def update_user(self, user_id: str, update_data: UserUpdate) -> User | None: - doc = await UserDocument.find_one({"user_id": user_id}) + doc = await UserDocument.find_one(UserDocument.user_id == user_id) if not doc: return None @@ -72,7 +72,7 @@ async def update_user(self, user_id: str, update_data: UserUpdate) -> User | Non async def delete_user(self, user_id: str, cascade: bool = True) -> dict[str, int]: deleted_counts = {} - doc = await UserDocument.find_one({"user_id": user_id}) + doc = await UserDocument.find_one(UserDocument.user_id == user_id) if doc: await doc.delete() deleted_counts["user"] = 1 @@ -83,28 +83,28 @@ async def delete_user(self, user_id: str, cascade: bool = True) -> dict[str, int return deleted_counts # Cascade delete related data - exec_result = await ExecutionDocument.find({"user_id": user_id}).delete() + exec_result = await ExecutionDocument.find(ExecutionDocument.user_id == user_id).delete() deleted_counts["executions"] = exec_result.deleted_count if exec_result else 0 - scripts_result = await SavedScriptDocument.find({"user_id": 
user_id}).delete() + scripts_result = await SavedScriptDocument.find(SavedScriptDocument.user_id == user_id).delete() deleted_counts["saved_scripts"] = scripts_result.deleted_count if scripts_result else 0 - notif_result = await NotificationDocument.find({"user_id": user_id}).delete() + notif_result = await NotificationDocument.find(NotificationDocument.user_id == user_id).delete() deleted_counts["notifications"] = notif_result.deleted_count if notif_result else 0 - settings_result = await UserSettingsDocument.find({"user_id": user_id}).delete() + settings_result = await UserSettingsDocument.find(UserSettingsDocument.user_id == user_id).delete() deleted_counts["user_settings"] = settings_result.deleted_count if settings_result else 0 - events_result = await EventDocument.find({"metadata.user_id": user_id}).delete() + events_result = await EventDocument.find(EventDocument.metadata.user_id == user_id).delete() deleted_counts["events"] = events_result.deleted_count if events_result else 0 - sagas_result = await SagaDocument.find({"context_data.user_id": user_id}).delete() + sagas_result = await SagaDocument.find(SagaDocument.context_data["user_id"] == user_id).delete() deleted_counts["sagas"] = sagas_result.deleted_count if sagas_result else 0 return deleted_counts async def reset_user_password(self, reset_data: PasswordReset) -> bool: - doc = await UserDocument.find_one({"user_id": reset_data.user_id}) + doc = await UserDocument.find_one(UserDocument.user_id == reset_data.user_id) if not doc: return False diff --git a/backend/app/db/repositories/dlq_repository.py b/backend/app/db/repositories/dlq_repository.py index af30ee45..da077e5a 100644 --- a/backend/app/db/repositories/dlq_repository.py +++ b/backend/app/db/repositories/dlq_repository.py @@ -1,8 +1,9 @@ import logging from datetime import datetime, timezone -from typing import Any, Dict, List, Mapping +from typing import Any from beanie.odm.enums import SortDirection +from monggregate import Pipeline, S from app.db.docs import DLQMessageDocument from app.dlq import ( @@ -16,92 +17,78 @@ TopicStatistic, ) from app.domain.enums.events import EventType -from app.infrastructure.kafka.mappings import get_event_class_for_type class DLQRepository: def __init__(self, logger: logging.Logger): self.logger = logger - def _doc_to_message(self, doc: DLQMessageDocument) -> DLQMessage: - event_class = get_event_class_for_type(doc.event_type) - if not event_class: - raise ValueError(f"Unknown event type: {doc.event_type}") - data = doc.model_dump(exclude={"id", "revision_id"}) - return DLQMessage(**{**data, "event": event_class(**data["event"])}) - async def get_dlq_stats(self) -> DLQStatistics: - # Get counts by status - status_pipeline: list[Mapping[str, object]] = [{"$group": {"_id": "$status", "count": {"$sum": 1}}}] - by_status: Dict[str, int] = {} - async for doc in DLQMessageDocument.aggregate(status_pipeline): - if doc["_id"]: - by_status[doc["_id"]] = doc["count"] - - # Get counts by topic - topic_pipeline: list[Mapping[str, object]] = [ - { - "$group": { - "_id": "$original_topic", - "count": {"$sum": 1}, - "avg_retry_count": {"$avg": "$retry_count"}, - } - }, - {"$sort": {"count": -1}}, - {"$limit": 10}, - ] - by_topic: List[TopicStatistic] = [] - async for doc in DLQMessageDocument.aggregate(topic_pipeline): - by_topic.append( - TopicStatistic(topic=doc["_id"], count=doc["count"], avg_retry_count=round(doc["avg_retry_count"], 2)) + # Counts by status + status_pipeline = Pipeline().group(by=DLQMessageDocument.status, query={"count": 
S.sum(1)}) + status_results = await DLQMessageDocument.aggregate(status_pipeline.export()).to_list() + by_status = {doc["_id"]: doc["count"] for doc in status_results if doc["_id"]} + + # Counts by topic (top 10) - project renames _id->topic and rounds avg_retry_count + topic_pipeline = ( + Pipeline() + .group( + by=DLQMessageDocument.original_topic, + query={"count": S.sum(1), "avg_retry_count": S.avg(S.field(DLQMessageDocument.retry_count))}, ) - - # Get counts by event type - event_type_pipeline: list[Mapping[str, object]] = [ - {"$group": {"_id": "$event_type", "count": {"$sum": 1}}}, - {"$sort": {"count": -1}}, - {"$limit": 10}, - ] - by_event_type: List[EventTypeStatistic] = [] - async for doc in DLQMessageDocument.aggregate(event_type_pipeline): - if doc["_id"]: - by_event_type.append(EventTypeStatistic(event_type=doc["_id"], count=doc["count"])) - - # Get age statistics - age_pipeline: list[Mapping[str, object]] = [ - { - "$project": { - "age_seconds": {"$divide": [{"$subtract": [datetime.now(timezone.utc), "$failed_at"]}, 1000]} - } - }, - { - "$group": { - "_id": None, - "min_age": {"$min": "$age_seconds"}, - "max_age": {"$max": "$age_seconds"}, - "avg_age": {"$avg": "$age_seconds"}, - } + .sort(by="count", descending=True) + .limit(10) + .project(_id=0, topic="$_id", count=1, avg_retry_count={"$round": ["$avg_retry_count", 2]}) + ) + topic_results = await DLQMessageDocument.aggregate(topic_pipeline.export()).to_list() + by_topic = [TopicStatistic.model_validate(doc) for doc in topic_results] + + # Counts by event type (top 10) - project renames _id->event_type + event_type_pipeline = ( + Pipeline() + .group(by=DLQMessageDocument.event.event_type, query={"count": S.sum(1)}) + .sort(by="count", descending=True) + .limit(10) + .project(_id=0, event_type="$_id", count=1) + ) + event_type_results = await DLQMessageDocument.aggregate(event_type_pipeline.export()).to_list() + by_event_type = [EventTypeStatistic.model_validate(doc) for doc in event_type_results if doc["event_type"]] + + # Age statistics - aggregate timestamps as ms since epoch ($sum ignores bare dates), compute ages in Python + time_pipeline = Pipeline().group( + by=None, + query={ + "oldest": S.min(S.field(DLQMessageDocument.failed_at)), + "newest": S.max(S.field(DLQMessageDocument.failed_at)), + "count": S.sum(1), + "sum_failed_at": {"$sum": {"$toLong": S.field(DLQMessageDocument.failed_at)}}, }, - ] - age_result = [] - async for doc in DLQMessageDocument.aggregate(age_pipeline): - age_result.append(doc) - age_stats_data = age_result[0] if age_result else {} - age_stats = AgeStatistics( - min_age_seconds=age_stats_data.get("min_age", 0.0), - max_age_seconds=age_stats_data.get("max_age", 0.0), - avg_age_seconds=age_stats_data.get("avg_age", 0.0), ) + time_results = await DLQMessageDocument.aggregate(time_pipeline.export()).to_list() + now = datetime.now(timezone.utc) + if time_results and time_results[0].get("oldest"): + r = time_results[0] + oldest, newest, count = r["oldest"], r["newest"], r["count"] + # sum_failed_at is sum of timestamps in ms since epoch, compute true average age + now_ms = now.timestamp() * 1000 + avg_age_seconds = (now_ms * count - r["sum_failed_at"]) / count / 1000 if count > 0 else 0.0 + age_stats = AgeStatistics( + min_age_seconds=(now - newest).total_seconds(), + max_age_seconds=(now - oldest).total_seconds(), + avg_age_seconds=avg_age_seconds, + ) + else: + age_stats = AgeStatistics() return DLQStatistics(by_status=by_status, by_topic=by_topic, by_event_type=by_event_type, age_stats=age_stats) async def get_messages( - self, -
status: DLQMessageStatus | None = None, - topic: str | None = None, - event_type: EventType | None = None, - limit: int = 50, - offset: int = 0, + self, + status: DLQMessageStatus | None = None, + topic: str | None = None, + event_type: EventType | None = None, + limit: int = 50, + offset: int = 0, ) -> DLQMessageListResult: conditions: list[Any] = [ DLQMessageDocument.status == status if status else None, @@ -115,54 +102,60 @@ async def get_messages( docs = await query.sort([("failed_at", SortDirection.DESCENDING)]).skip(offset).limit(limit).to_list() return DLQMessageListResult( - messages=[self._doc_to_message(d) for d in docs], + messages=[DLQMessage.model_validate(d, from_attributes=True) for d in docs], total=total_count, offset=offset, limit=limit, ) async def get_message_by_id(self, event_id: str) -> DLQMessage | None: - doc = await DLQMessageDocument.find_one({"event_id": event_id}) - return self._doc_to_message(doc) if doc else None + doc = await DLQMessageDocument.find_one({"event.event_id": event_id}) + return DLQMessage.model_validate(doc, from_attributes=True) if doc else None async def get_topics_summary(self) -> list[DLQTopicSummary]: - pipeline: list[Mapping[str, object]] = [ - { - "$group": { - "_id": "$original_topic", - "count": {"$sum": 1}, - "statuses": {"$push": "$status"}, - "oldest_message": {"$min": "$failed_at"}, - "newest_message": {"$max": "$failed_at"}, - "avg_retry_count": {"$avg": "$retry_count"}, - "max_retry_count": {"$max": "$retry_count"}, - } - }, - {"$sort": {"count": -1}}, - ] - - topics = [] - async for result in DLQMessageDocument.aggregate(pipeline): - status_counts: dict[str, int] = {} - for status in result["statuses"]: - status_counts[status] = status_counts.get(status, 0) + 1 - - topics.append( - DLQTopicSummary( - topic=result["_id"], - total_messages=result["count"], - status_breakdown=status_counts, - oldest_message=result["oldest_message"], - newest_message=result["newest_message"], - avg_retry_count=round(result["avg_retry_count"], 2), - max_retry_count=result["max_retry_count"], - ) + # Two-stage aggregation: group by topic+status first, then by topic with $arrayToObject + # Note: compound keys need S.field() wrapper for monggregate to add $ prefix + pipeline = ( + Pipeline() + .group( + by={"topic": S.field(DLQMessageDocument.original_topic), "status": S.field(DLQMessageDocument.status)}, + query={ + "count": S.sum(1), + "oldest": S.min(S.field(DLQMessageDocument.failed_at)), + "newest": S.max(S.field(DLQMessageDocument.failed_at)), + "sum_retry": S.sum(S.field(DLQMessageDocument.retry_count)), + "max_retry": S.max(S.field(DLQMessageDocument.retry_count)), + }, ) - - return topics + .group( + by="$_id.topic", + query={ + "status_pairs": S.push({"k": "$_id.status", "v": "$count"}), + "total_messages": S.sum("$count"), + "oldest_message": S.min("$oldest"), + "newest_message": S.max("$newest"), + "total_retry": S.sum("$sum_retry"), + "doc_count": S.sum("$count"), + "max_retry_count": S.max("$max_retry"), + }, + ) + .sort(by="total_messages", descending=True) + .project( + _id=0, + topic="$_id", + total_messages=1, + status_breakdown={"$arrayToObject": "$status_pairs"}, + oldest_message=1, + newest_message=1, + avg_retry_count={"$round": [{"$divide": ["$total_retry", "$doc_count"]}, 2]}, + max_retry_count=1, + ) + ) + results = await DLQMessageDocument.aggregate(pipeline.export()).to_list() + return [DLQTopicSummary.model_validate(r) for r in results] async def mark_message_retried(self, event_id: str) -> bool: - doc = await 
DLQMessageDocument.find_one({"event_id": event_id}) + doc = await DLQMessageDocument.find_one({"event.event_id": event_id}) if not doc: return False now = datetime.now(timezone.utc) @@ -173,7 +166,7 @@ async def mark_message_retried(self, event_id: str) -> bool: return True async def mark_message_discarded(self, event_id: str, reason: str) -> bool: - doc = await DLQMessageDocument.find_one({"event_id": event_id}) + doc = await DLQMessageDocument.find_one({"event.event_id": event_id}) if not doc: return False now = datetime.now(timezone.utc) diff --git a/backend/app/db/repositories/event_repository.py b/backend/app/db/repositories/event_repository.py index 998a863d..d0f8daee 100644 --- a/backend/app/db/repositories/event_repository.py +++ b/backend/app/db/repositories/event_repository.py @@ -4,6 +4,7 @@ from beanie.odm.enums import SortDirection from beanie.operators import GTE, LT, LTE, In, Not, Or, RegEx +from monggregate import S from app.core.tracing import EventAttributes from app.core.tracing.utils import add_span_attributes @@ -66,18 +67,18 @@ async def store_events_batch(self, events: list[DomainEvent]) -> list[str]: return [event.event_id for event in events] async def get_event(self, event_id: str) -> DomainEvent | None: - doc = await EventDocument.find_one({"event_id": event_id}) + doc = await EventDocument.find_one(EventDocument.event_id == event_id) if not doc: return None return domain_event_adapter.validate_python(doc, from_attributes=True) async def get_events_by_type( - self, - event_type: str, - start_time: datetime | None = None, - end_time: datetime | None = None, - limit: int = 100, - skip: int = 0, + self, + event_type: str, + start_time: datetime | None = None, + end_time: datetime | None = None, + limit: int = 100, + skip: int = 0, ) -> list[DomainEvent]: conditions = [ EventDocument.event_type == event_type, @@ -93,7 +94,7 @@ async def get_events_by_type( return [domain_event_adapter.validate_python(d, from_attributes=True) for d in docs] async def get_events_by_aggregate( - self, aggregate_id: str, event_types: list[EventType] | None = None, limit: int = 100 + self, aggregate_id: str, event_types: list[EventType] | None = None, limit: int = 100 ) -> list[DomainEvent]: conditions = [ EventDocument.aggregate_id == aggregate_id, @@ -119,13 +120,13 @@ async def get_events_by_correlation(self, correlation_id: str, limit: int = 100, ) async def get_events_by_user( - self, - user_id: str, - event_types: list[str] | None = None, - start_time: datetime | None = None, - end_time: datetime | None = None, - limit: int = 100, - skip: int = 0, + self, + user_id: str, + event_types: list[str] | None = None, + start_time: datetime | None = None, + end_time: datetime | None = None, + limit: int = 100, + skip: int = 0, ) -> list[DomainEvent]: conditions = [ EventDocument.metadata.user_id == user_id, @@ -143,11 +144,11 @@ async def get_events_by_user( return [domain_event_adapter.validate_python(d, from_attributes=True) for d in docs] async def get_execution_events( - self, execution_id: str, limit: int = 100, skip: int = 0, exclude_system_events: bool = False + self, execution_id: str, limit: int = 100, skip: int = 0, exclude_system_events: bool = False ) -> EventListResult: conditions: list[Any] = [ Or( - {"execution_id": execution_id}, + EventDocument.execution_id == execution_id, EventDocument.aggregate_id == execution_id, ), Not(RegEx(EventDocument.metadata.service_name, "^system-")) if exclude_system_events else None, @@ -166,10 +167,10 @@ async def get_execution_events( ) 
async def get_event_statistics( - self, - start_time: datetime | None = None, - end_time: datetime | None = None, - match: dict[str, object] | None = None, + self, + start_time: datetime | None = None, + end_time: datetime | None = None, + match: dict[str, object] | None = None, ) -> EventStatistics: pipeline: list[Mapping[str, object]] = [] if match: @@ -183,17 +184,22 @@ async def get_event_statistics( { "$facet": { "by_type": [ - {"$group": {"_id": "$event_type", "count": {"$sum": 1}}}, + {"$group": {"_id": S.field(EventDocument.event_type), "count": {"$sum": 1}}}, {"$sort": {"count": -1}}, ], "by_service": [ - {"$group": {"_id": "$metadata.service_name", "count": {"$sum": 1}}}, + {"$group": {"_id": S.field(EventDocument.metadata.service_name), "count": {"$sum": 1}}}, {"$sort": {"count": -1}}, ], "by_hour": [ { "$group": { - "_id": {"$dateToString": {"format": "%Y-%m-%d %H:00", "date": "$timestamp"}}, + "_id": { + "$dateToString": { + "format": "%Y-%m-%d %H:00", + "date": S.field(EventDocument.timestamp), + } + }, "count": {"$sum": 1}, } }, @@ -234,7 +240,7 @@ async def get_event_statistics( return EventStatistics(total_events=0, events_by_type={}, events_by_service={}, events_by_hour=[]) async def cleanup_old_events( - self, older_than_days: int = 30, event_types: list[str] | None = None, dry_run: bool = False + self, older_than_days: int = 30, event_types: list[str] | None = None, dry_run: bool = False ) -> int: cutoff_dt = datetime.now(timezone.utc) - timedelta(days=older_than_days) conditions: list[Any] = [ @@ -254,14 +260,14 @@ async def cleanup_old_events( return deleted_count async def get_user_events_paginated( - self, - user_id: str, - event_types: list[str] | None = None, - start_time: datetime | None = None, - end_time: datetime | None = None, - limit: int = 100, - skip: int = 0, - sort_order: str = "desc", + self, + user_id: str, + event_types: list[str] | None = None, + start_time: datetime | None = None, + end_time: datetime | None = None, + limit: int = 100, + skip: int = 0, + sort_order: str = "desc", ) -> EventListResult: conditions = [ EventDocument.metadata.user_id == user_id, @@ -287,11 +293,11 @@ async def count_events(self, *conditions: Any) -> int: return await EventDocument.find(*conditions).count() async def query_events( - self, - query: dict[str, Any], - sort_field: str = "timestamp", - skip: int = 0, - limit: int = 100, + self, + query: dict[str, Any], + sort_field: str = "timestamp", + skip: int = 0, + limit: int = 100, ) -> EventListResult: """Query events with filter, sort, and pagination. 
Always sorts descending (newest first).""" cursor = EventDocument.find(query) @@ -315,7 +321,7 @@ async def list_event_types(self, match: dict[str, object] | None = None) -> list pipeline.append({"$match": match}) pipeline.extend( [ - {"$group": {"_id": "$event_type"}}, + {"$group": {"_id": S.field(EventDocument.event_type)}}, {"$sort": {"_id": 1}}, ] ) @@ -323,9 +329,9 @@ async def list_event_types(self, match: dict[str, object] | None = None) -> list return [doc["_id"] for doc in results if doc.get("_id")] async def delete_event_with_archival( - self, event_id: str, deleted_by: str, deletion_reason: str = "Admin deletion via API" + self, event_id: str, deleted_by: str, deletion_reason: str = "Admin deletion via API" ) -> ArchivedEvent | None: - doc = await EventDocument.find_one({"event_id": event_id}) + doc = await EventDocument.find_one(EventDocument.event_id == event_id) if not doc: return None @@ -341,16 +347,16 @@ async def get_aggregate_events_for_replay(self, aggregate_id: str, limit: int = async def get_aggregate_replay_info(self, aggregate_id: str) -> EventReplayInfo | None: pipeline = [ - {"$match": {"aggregate_id": aggregate_id}}, - {"$sort": {"timestamp": 1}}, + {"$match": {EventDocument.aggregate_id: aggregate_id}}, + {"$sort": {EventDocument.timestamp: 1}}, { "$group": { "_id": None, "events": {"$push": "$$ROOT"}, "event_count": {"$sum": 1}, - "event_types": {"$addToSet": "$event_type"}, - "start_time": {"$min": "$timestamp"}, - "end_time": {"$max": "$timestamp"}, + "event_types": {"$addToSet": S.field(EventDocument.event_type)}, + "start_time": {"$min": S.field(EventDocument.timestamp)}, + "end_time": {"$max": S.field(EventDocument.timestamp)}, } }, {"$project": {"_id": 0}}, diff --git a/backend/app/db/repositories/execution_repository.py b/backend/app/db/repositories/execution_repository.py index 66406501..11223f06 100644 --- a/backend/app/db/repositories/execution_repository.py +++ b/backend/app/db/repositories/execution_repository.py @@ -26,7 +26,7 @@ async def create_execution(self, create_data: DomainExecutionCreate) -> DomainEx async def get_execution(self, execution_id: str) -> DomainExecution | None: self.logger.info("Searching for execution in MongoDB", extra={"execution_id": execution_id}) - doc = await ExecutionDocument.find_one({"execution_id": execution_id}) + doc = await ExecutionDocument.find_one(ExecutionDocument.execution_id == execution_id) if not doc: self.logger.warning("Execution not found in MongoDB", extra={"execution_id": execution_id}) return None @@ -35,7 +35,7 @@ async def get_execution(self, execution_id: str) -> DomainExecution | None: return DomainExecution.model_validate(doc, from_attributes=True) async def update_execution(self, execution_id: str, update_data: DomainExecutionUpdate) -> bool: - doc = await ExecutionDocument.find_one({"execution_id": execution_id}) + doc = await ExecutionDocument.find_one(ExecutionDocument.execution_id == execution_id) if not doc: return False @@ -46,7 +46,7 @@ async def update_execution(self, execution_id: str, update_data: DomainExecution return True async def write_terminal_result(self, result: ExecutionResultDomain) -> bool: - doc = await ExecutionDocument.find_one({"execution_id": result.execution_id}) + doc = await ExecutionDocument.find_one(ExecutionDocument.execution_id == result.execution_id) if not doc: self.logger.warning("No execution found", extra={"execution_id": result.execution_id}) return False @@ -81,7 +81,7 @@ async def count_executions(self, query: dict[str, Any]) -> int: return await 
ExecutionDocument.find(query).count() async def delete_execution(self, execution_id: str) -> bool: - doc = await ExecutionDocument.find_one({"execution_id": execution_id}) + doc = await ExecutionDocument.find_one(ExecutionDocument.execution_id == execution_id) if not doc: return False await doc.delete() diff --git a/backend/app/db/repositories/notification_repository.py b/backend/app/db/repositories/notification_repository.py index 51d2bb68..2132cad3 100644 --- a/backend/app/db/repositories/notification_repository.py +++ b/backend/app/db/repositories/notification_repository.py @@ -29,7 +29,10 @@ async def create_notification(self, create_data: DomainNotificationCreate) -> Do async def update_notification( self, notification_id: str, user_id: str, update_data: DomainNotificationUpdate ) -> bool: - doc = await NotificationDocument.find_one({"notification_id": notification_id, "user_id": user_id}) + doc = await NotificationDocument.find_one( + NotificationDocument.notification_id == notification_id, + NotificationDocument.user_id == user_id, + ) if not doc: return False update_dict = update_data.model_dump(exclude_none=True) @@ -38,13 +41,19 @@ async def update_notification( return True async def get_notification(self, notification_id: str, user_id: str) -> DomainNotification | None: - doc = await NotificationDocument.find_one({"notification_id": notification_id, "user_id": user_id}) + doc = await NotificationDocument.find_one( + NotificationDocument.notification_id == notification_id, + NotificationDocument.user_id == user_id, + ) if not doc: return None return DomainNotification.model_validate(doc, from_attributes=True) async def mark_as_read(self, notification_id: str, user_id: str) -> bool: - doc = await NotificationDocument.find_one({"notification_id": notification_id, "user_id": user_id}) + doc = await NotificationDocument.find_one( + NotificationDocument.notification_id == notification_id, + NotificationDocument.user_id == user_id, + ) if not doc: return False await doc.set({"status": NotificationStatus.READ, "read_at": datetime.now(UTC)}) @@ -52,12 +61,16 @@ async def mark_as_read(self, notification_id: str, user_id: str) -> bool: async def mark_all_as_read(self, user_id: str) -> int: result = await NotificationDocument.find( - {"user_id": user_id, "status": NotificationStatus.DELIVERED} + NotificationDocument.user_id == user_id, + NotificationDocument.status == NotificationStatus.DELIVERED, ).update_many({"$set": {"status": NotificationStatus.READ, "read_at": datetime.now(UTC)}}) return result.modified_count if result and hasattr(result, "modified_count") else 0 async def delete_notification(self, notification_id: str, user_id: str) -> bool: - doc = await NotificationDocument.find_one({"notification_id": notification_id, "user_id": user_id}) + doc = await NotificationDocument.find_one( + NotificationDocument.notification_id == notification_id, + NotificationDocument.user_id == user_id, + ) if not doc: return False await doc.delete() @@ -155,7 +168,10 @@ async def get_subscription( self, user_id: str, channel: NotificationChannel ) -> DomainNotificationSubscription: """Get subscription for user/channel, returning default enabled subscription if none exists.""" - doc = await NotificationSubscriptionDocument.find_one({"user_id": user_id, "channel": channel}) + doc = await NotificationSubscriptionDocument.find_one( + NotificationSubscriptionDocument.user_id == user_id, + NotificationSubscriptionDocument.channel == channel, + ) if not doc: # Default: enabled=True for new users (consistent 
with get_all_subscriptions) return DomainNotificationSubscription(user_id=user_id, channel=channel, enabled=True) @@ -164,7 +180,10 @@ async def get_subscription( async def upsert_subscription( self, user_id: str, channel: NotificationChannel, update_data: DomainSubscriptionUpdate ) -> DomainNotificationSubscription: - existing = await NotificationSubscriptionDocument.find_one({"user_id": user_id, "channel": channel}) + existing = await NotificationSubscriptionDocument.find_one( + NotificationSubscriptionDocument.user_id == user_id, + NotificationSubscriptionDocument.channel == channel, + ) update_dict = update_data.model_dump(exclude_none=True) update_dict["updated_at"] = datetime.now(UTC) @@ -183,7 +202,10 @@ async def upsert_subscription( async def get_all_subscriptions(self, user_id: str) -> dict[NotificationChannel, DomainNotificationSubscription]: subs: dict[NotificationChannel, DomainNotificationSubscription] = {} for channel in NotificationChannel: - doc = await NotificationSubscriptionDocument.find_one({"user_id": user_id, "channel": channel}) + doc = await NotificationSubscriptionDocument.find_one( + NotificationSubscriptionDocument.user_id == user_id, + NotificationSubscriptionDocument.channel == channel, + ) if doc: subs[channel] = DomainNotificationSubscription.model_validate(doc, from_attributes=True) else: diff --git a/backend/app/db/repositories/replay_repository.py b/backend/app/db/repositories/replay_repository.py index 1ec73b0d..c69593b9 100644 --- a/backend/app/db/repositories/replay_repository.py +++ b/backend/app/db/repositories/replay_repository.py @@ -16,14 +16,14 @@ def __init__(self, logger: logging.Logger) -> None: self.logger = logger async def save_session(self, session: ReplaySessionState) -> None: - existing = await ReplaySessionDocument.find_one({"session_id": session.session_id}) + existing = await ReplaySessionDocument.find_one(ReplaySessionDocument.session_id == session.session_id) doc = ReplaySessionDocument(**session.model_dump()) if existing: doc.id = existing.id await doc.save() async def get_session(self, session_id: str) -> ReplaySessionState | None: - doc = await ReplaySessionDocument.find_one({"session_id": session_id}) + doc = await ReplaySessionDocument.find_one(ReplaySessionDocument.session_id == session_id) if not doc: return None return ReplaySessionState.model_validate(doc, from_attributes=True) @@ -46,7 +46,7 @@ async def list_sessions( return [ReplaySessionState.model_validate(doc, from_attributes=True) for doc in docs] async def update_session_status(self, session_id: str, status: ReplayStatus) -> bool: - doc = await ReplaySessionDocument.find_one({"session_id": session_id}) + doc = await ReplaySessionDocument.find_one(ReplaySessionDocument.session_id == session_id) if not doc: return False doc.status = status @@ -72,7 +72,7 @@ async def update_replay_session(self, session_id: str, updates: ReplaySessionUpd update_dict = updates.model_dump(exclude_none=True) if not update_dict: return False - doc = await ReplaySessionDocument.find_one({"session_id": session_id}) + doc = await ReplaySessionDocument.find_one(ReplaySessionDocument.session_id == session_id) if not doc: return False await doc.set(update_dict) diff --git a/backend/app/db/repositories/resource_allocation_repository.py b/backend/app/db/repositories/resource_allocation_repository.py index 9a103cf6..8b209100 100644 --- a/backend/app/db/repositories/resource_allocation_repository.py +++ b/backend/app/db/repositories/resource_allocation_repository.py @@ -7,7 +7,10 @@ class 
ResourceAllocationRepository: async def count_active(self, language: str) -> int: - return await ResourceAllocationDocument.find({"status": "active", "language": language}).count() + return await ResourceAllocationDocument.find( + ResourceAllocationDocument.status == "active", + ResourceAllocationDocument.language == language, + ).count() async def create_allocation(self, create_data: DomainResourceAllocationCreate) -> DomainResourceAllocation: doc = ResourceAllocationDocument( @@ -18,7 +21,7 @@ async def create_allocation(self, create_data: DomainResourceAllocationCreate) - return DomainResourceAllocation.model_validate(doc, from_attributes=True) async def release_allocation(self, allocation_id: str) -> bool: - doc = await ResourceAllocationDocument.find_one({"allocation_id": allocation_id}) + doc = await ResourceAllocationDocument.find_one(ResourceAllocationDocument.allocation_id == allocation_id) if not doc: return False await doc.set({"status": "released", "released_at": datetime.now(timezone.utc)}) diff --git a/backend/app/db/repositories/saga_repository.py b/backend/app/db/repositories/saga_repository.py index 95527dc9..1416f85e 100644 --- a/backend/app/db/repositories/saga_repository.py +++ b/backend/app/db/repositories/saga_repository.py @@ -4,6 +4,7 @@ from beanie.odm.enums import SortDirection from beanie.odm.operators.find import BaseFindOperator from beanie.operators import GT, LT, In +from monggregate import Pipeline, S from app.db.docs import ExecutionDocument, SagaDocument from app.domain.enums.saga import SagaState @@ -28,7 +29,7 @@ def _filter_conditions(self, saga_filter: SagaFilter) -> list[BaseFindOperator]: return [c for c in conditions if c is not None] async def upsert_saga(self, saga: Saga) -> bool: - existing = await SagaDocument.find_one({"saga_id": saga.saga_id}) + existing = await SagaDocument.find_one(SagaDocument.saga_id == saga.saga_id) doc = SagaDocument(**saga.model_dump()) if existing: doc.id = existing.id @@ -43,11 +44,11 @@ async def get_saga_by_execution_and_name(self, execution_id: str, saga_name: str return Saga.model_validate(doc, from_attributes=True) if doc else None async def get_saga(self, saga_id: str) -> Saga | None: - doc = await SagaDocument.find_one({"saga_id": saga_id}) + doc = await SagaDocument.find_one(SagaDocument.saga_id == saga_id) return Saga.model_validate(doc, from_attributes=True) if doc else None async def get_sagas_by_execution( - self, execution_id: str, state: SagaState | None = None, limit: int = 100, skip: int = 0 + self, execution_id: str, state: SagaState | None = None, limit: int = 100, skip: int = 0 ) -> SagaListResult: conditions = [ SagaDocument.execution_id == execution_id, @@ -78,7 +79,7 @@ async def list_sagas(self, saga_filter: SagaFilter, limit: int = 100, skip: int ) async def update_saga_state(self, saga_id: str, state: SagaState, error_message: str | None = None) -> bool: - doc = await SagaDocument.find_one({"saga_id": saga_id}) + doc = await SagaDocument.find_one(SagaDocument.saga_id == saga_id) if not doc: return False @@ -90,21 +91,21 @@ async def update_saga_state(self, saga_id: str, state: SagaState, error_message: return True async def get_user_execution_ids(self, user_id: str) -> list[str]: - docs = await ExecutionDocument.find({"user_id": user_id}).to_list() + docs = await ExecutionDocument.find(ExecutionDocument.user_id == user_id).to_list() return [doc.execution_id for doc in docs] async def count_sagas_by_state(self) -> dict[str, int]: - pipeline = [{"$group": {"_id": "$state", "count": {"$sum": 
1}}}] + pipeline = Pipeline().group(by=SagaDocument.state, query={"count": S.sum(1)}) result = {} - async for doc in SagaDocument.aggregate(pipeline): + async for doc in SagaDocument.aggregate(pipeline.export()): result[doc["_id"]] = doc["count"] return result async def find_timed_out_sagas( - self, - cutoff_time: datetime, - states: list[SagaState] | None = None, - limit: int = 100, + self, + cutoff_time: datetime, + states: list[SagaState] | None = None, + limit: int = 100, ) -> list[Saga]: states = states or [SagaState.RUNNING, SagaState.COMPENSATING] docs = ( @@ -123,9 +124,9 @@ async def get_saga_statistics(self, saga_filter: SagaFilter | None = None) -> di total = await base_query.count() # Group by state - state_pipeline = [{"$group": {"_id": "$state", "count": {"$sum": 1}}}] + state_pipeline = Pipeline().group(by=SagaDocument.state, query={"count": S.sum(1)}) states = {} - async for doc in base_query.aggregate(state_pipeline): + async for doc in base_query.aggregate(state_pipeline.export()): states[doc["_id"]] = doc["count"] # Average duration for completed sagas @@ -134,12 +135,15 @@ async def get_saga_statistics(self, saga_filter: SagaFilter | None = None) -> di SagaDocument.state == SagaState.COMPLETED, SagaDocument.completed_at != None, # noqa: E711 ] - duration_pipeline = [ - {"$project": {"duration": {"$subtract": ["$completed_at", "$created_at"]}}}, - {"$group": {"_id": None, "avg_duration": {"$avg": "$duration"}}}, - ] + duration_pipeline = ( + Pipeline() + .project( + duration={"$subtract": [S.field(SagaDocument.completed_at), S.field(SagaDocument.created_at)]} + ) + .group(by=None, query={"avg_duration": S.avg("$duration")}) + ) avg_duration = 0.0 - async for doc in SagaDocument.find(*completed_conditions).aggregate(duration_pipeline): + async for doc in SagaDocument.find(*completed_conditions).aggregate(duration_pipeline.export()): avg_duration = doc["avg_duration"] / 1000.0 if doc["avg_duration"] else 0.0 return {"total": total, "by_state": states, "average_duration_seconds": avg_duration} diff --git a/backend/app/db/repositories/saved_script_repository.py b/backend/app/db/repositories/saved_script_repository.py index af46101d..328a1ed3 100644 --- a/backend/app/db/repositories/saved_script_repository.py +++ b/backend/app/db/repositories/saved_script_repository.py @@ -1,5 +1,3 @@ -from beanie.operators import Eq - from app.db.docs import SavedScriptDocument from app.domain.saved_script import DomainSavedScript, DomainSavedScriptCreate, DomainSavedScriptUpdate @@ -12,8 +10,8 @@ async def create_saved_script(self, create_data: DomainSavedScriptCreate, user_i async def get_saved_script(self, script_id: str, user_id: str) -> DomainSavedScript | None: doc = await SavedScriptDocument.find_one( - Eq(SavedScriptDocument.script_id, script_id), - Eq(SavedScriptDocument.user_id, user_id), + SavedScriptDocument.script_id == script_id, + SavedScriptDocument.user_id == user_id, ) return DomainSavedScript.model_validate(doc, from_attributes=True) if doc else None @@ -24,8 +22,8 @@ async def update_saved_script( update_data: DomainSavedScriptUpdate, ) -> DomainSavedScript | None: doc = await SavedScriptDocument.find_one( - Eq(SavedScriptDocument.script_id, script_id), - Eq(SavedScriptDocument.user_id, user_id), + SavedScriptDocument.script_id == script_id, + SavedScriptDocument.user_id == user_id, ) if not doc: return None @@ -36,8 +34,8 @@ async def update_saved_script( async def delete_saved_script(self, script_id: str, user_id: str) -> bool: doc = await SavedScriptDocument.find_one( 
- Eq(SavedScriptDocument.script_id, script_id), - Eq(SavedScriptDocument.user_id, user_id), + SavedScriptDocument.script_id == script_id, + SavedScriptDocument.user_id == user_id, ) if not doc: return False @@ -45,5 +43,5 @@ async def delete_saved_script(self, script_id: str, user_id: str) -> bool: return True async def list_saved_scripts(self, user_id: str) -> list[DomainSavedScript]: - docs = await SavedScriptDocument.find(Eq(SavedScriptDocument.user_id, user_id)).to_list() + docs = await SavedScriptDocument.find(SavedScriptDocument.user_id == user_id).to_list() return [DomainSavedScript.model_validate(d, from_attributes=True) for d in docs] diff --git a/backend/app/db/repositories/sse_repository.py b/backend/app/db/repositories/sse_repository.py index 339dc72a..14c9d439 100644 --- a/backend/app/db/repositories/sse_repository.py +++ b/backend/app/db/repositories/sse_repository.py @@ -7,7 +7,7 @@ class SSERepository: async def get_execution_status(self, execution_id: str) -> SSEExecutionStatusDomain | None: - doc = await ExecutionDocument.find_one({"execution_id": execution_id}) + doc = await ExecutionDocument.find_one(ExecutionDocument.execution_id == execution_id) if not doc: return None return SSEExecutionStatusDomain( @@ -17,7 +17,7 @@ async def get_execution_status(self, execution_id: str) -> SSEExecutionStatusDom ) async def get_execution(self, execution_id: str) -> DomainExecution | None: - doc = await ExecutionDocument.find_one({"execution_id": execution_id}) + doc = await ExecutionDocument.find_one(ExecutionDocument.execution_id == execution_id) if not doc: return None return DomainExecution.model_validate(doc, from_attributes=True) diff --git a/backend/app/db/repositories/user_repository.py b/backend/app/db/repositories/user_repository.py index 4af4d41c..472e0739 100644 --- a/backend/app/db/repositories/user_repository.py +++ b/backend/app/db/repositories/user_repository.py @@ -2,7 +2,7 @@ from datetime import datetime, timezone from beanie.odm.operators.find import BaseFindOperator -from beanie.operators import Eq, Or, RegEx +from beanie.operators import Or, RegEx from app.db.docs import UserDocument from app.domain.enums.user import UserRole @@ -11,7 +11,7 @@ class UserRepository: async def get_user(self, username: str) -> User | None: - doc = await UserDocument.find_one({"username": username}) + doc = await UserDocument.find_one(UserDocument.username == username) return User.model_validate(doc, from_attributes=True) if doc else None async def create_user(self, create_data: DomainUserCreate) -> User: @@ -20,7 +20,7 @@ async def create_user(self, create_data: DomainUserCreate) -> User: return User.model_validate(doc, from_attributes=True) async def get_user_by_id(self, user_id: str) -> User | None: - doc = await UserDocument.find_one({"user_id": user_id}) + doc = await UserDocument.find_one(UserDocument.user_id == user_id) return User.model_validate(doc, from_attributes=True) if doc else None async def list_users( @@ -38,7 +38,7 @@ async def list_users( ) if role: - conditions.append(Eq(UserDocument.role, role)) + conditions.append(UserDocument.role == role) query = UserDocument.find(*conditions) total = await query.count() @@ -51,7 +51,7 @@ async def list_users( ) async def update_user(self, user_id: str, update_data: DomainUserUpdate) -> User | None: - doc = await UserDocument.find_one({"user_id": user_id}) + doc = await UserDocument.find_one(UserDocument.user_id == user_id) if not doc: return None @@ -62,7 +62,7 @@ async def update_user(self, user_id: str, update_data: 
DomainUserUpdate) -> User return User.model_validate(doc, from_attributes=True) async def delete_user(self, user_id: str) -> bool: - doc = await UserDocument.find_one({"user_id": user_id}) + doc = await UserDocument.find_one(UserDocument.user_id == user_id) if not doc: return False await doc.delete() diff --git a/backend/app/db/repositories/user_settings_repository.py b/backend/app/db/repositories/user_settings_repository.py index ee05bd4f..222fdff9 100644 --- a/backend/app/db/repositories/user_settings_repository.py +++ b/backend/app/db/repositories/user_settings_repository.py @@ -14,13 +14,13 @@ def __init__(self, logger: logging.Logger) -> None: self.logger = logger async def get_snapshot(self, user_id: str) -> DomainUserSettings | None: - doc = await UserSettingsDocument.find_one({"user_id": user_id}) + doc = await UserSettingsDocument.find_one(UserSettingsDocument.user_id == user_id) if not doc: return None return DomainUserSettings.model_validate(doc, from_attributes=True) async def create_snapshot(self, settings: DomainUserSettings) -> None: - existing = await UserSettingsDocument.find_one({"user_id": settings.user_id}) + existing = await UserSettingsDocument.find_one(UserSettingsDocument.user_id == settings.user_id) doc = UserSettingsDocument(**settings.model_dump()) if existing: doc.id = existing.id @@ -71,7 +71,7 @@ async def count_events_for_user(self, user_id: str) -> int: return await EventDocument.find(EventDocument.aggregate_id == f"user_settings_{user_id}").count() async def delete_user_settings(self, user_id: str) -> None: - doc = await UserSettingsSnapshotDocument.find_one({"user_id": user_id}) + doc = await UserSettingsSnapshotDocument.find_one(UserSettingsSnapshotDocument.user_id == user_id) if doc: await doc.delete() await EventDocument.find(EventDocument.aggregate_id == f"user_settings_{user_id}").delete() diff --git a/backend/app/dlq/manager.py b/backend/app/dlq/manager.py index b3eb05c9..27aacfdf 100644 --- a/backend/app/dlq/manager.py +++ b/backend/app/dlq/manager.py @@ -2,10 +2,9 @@ import json import logging from datetime import datetime, timezone -from typing import Any, Awaitable, Callable +from typing import Any, Callable from aiokafka import AIOKafkaConsumer, AIOKafkaProducer -from aiokafka.errors import KafkaError from opentelemetry.trace import SpanKind from app.core.lifecycle import LifecycleEnabled @@ -23,6 +22,12 @@ RetryStrategy, ) from app.domain.enums.kafka import GroupId, KafkaTopic +from app.domain.events.typed import ( + DLQMessageDiscardedEvent, + DLQMessageReceivedEvent, + DLQMessageRetriedEvent, + EventMetadata, +) from app.events.schema.schema_registry import SchemaRegistryManager from app.settings import Settings @@ -61,91 +66,14 @@ def __init__( # Message filters self._filters: list[Callable[[DLQMessage], bool]] = [] - # Retry callbacks - all must be async - self._callbacks: dict[str, list[Callable[..., Awaitable[None]]]] = { - "before_retry": [], - "after_retry": [], - "on_discard": [], - } - - def _doc_to_message(self, doc: DLQMessageDocument) -> DLQMessage: - """Convert DLQMessageDocument to DLQMessage domain model.""" - event = self.schema_registry.deserialize_json(doc.event) - return DLQMessage( - event_id=doc.event_id, - event=event, - event_type=doc.event_type, - original_topic=doc.original_topic, - error=doc.error, - retry_count=doc.retry_count, - failed_at=doc.failed_at, - status=doc.status, - producer_id=doc.producer_id, - created_at=doc.created_at, - last_updated=doc.last_updated, - next_retry_at=doc.next_retry_at, - 
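# --- Editor's sketch (not part of the patch) ---
# The snapshot upsert pattern used in the settings repository above: build a fresh
# document and, if a snapshot already exists, reuse its id so save() overwrites it
# instead of inserting a duplicate. Import path for the document is assumed.
from app.db.docs import UserSettingsDocument  # import path assumed

async def upsert_settings_snapshot(user_id: str, data: dict) -> None:
    existing = await UserSettingsDocument.find_one(UserSettingsDocument.user_id == user_id)
    doc = UserSettingsDocument(**data)  # data mirrors the document's fields
    if existing:
        doc.id = existing.id  # same _id -> save() replaces the stored snapshot
    await doc.save()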
retried_at=doc.retried_at, - discarded_at=doc.discarded_at, - discard_reason=doc.discard_reason, - dlq_offset=doc.dlq_offset, - dlq_partition=doc.dlq_partition, - last_error=doc.last_error, - headers=doc.headers, - ) - - def _message_to_doc(self, message: DLQMessage) -> DLQMessageDocument: - """Convert DLQMessage domain model to DLQMessageDocument.""" - return DLQMessageDocument( - event=message.event.model_dump(), - event_id=message.event_id, - event_type=message.event_type, - original_topic=message.original_topic, - error=message.error, - retry_count=message.retry_count, - failed_at=message.failed_at, - status=message.status, - producer_id=message.producer_id, - created_at=message.created_at or datetime.now(timezone.utc), - last_updated=message.last_updated, - next_retry_at=message.next_retry_at, - retried_at=message.retried_at, - discarded_at=message.discarded_at, - discard_reason=message.discard_reason, - dlq_offset=message.dlq_offset, - dlq_partition=message.dlq_partition, - last_error=message.last_error, - headers=message.headers, - ) + self._dlq_events_topic = f"{settings.KAFKA_TOPIC_PREFIX}{KafkaTopic.DLQ_EVENTS}" + self._event_metadata = EventMetadata(service_name="dlq-manager", service_version="1.0.0") def _kafka_msg_to_message(self, msg: Any) -> DLQMessage: """Parse Kafka ConsumerRecord into DLQMessage.""" - raw_bytes = msg.value - raw: str = raw_bytes.decode("utf-8") if isinstance(raw_bytes, (bytes, bytearray)) else str(raw_bytes or "") - data: dict[str, Any] = json.loads(raw) if raw else {} - - headers_list = msg.headers or [] - headers: dict[str, str] = {} - for k, v in headers_list: - headers[str(k)] = v.decode("utf-8") if isinstance(v, (bytes, bytearray)) else (v or "") - - event = self.schema_registry.deserialize_json(data.get("event", data)) - - return DLQMessage( - event_id=data.get("event_id", event.event_id), - event=event, - event_type=event.event_type, - original_topic=data.get("original_topic", headers.get("original_topic", "")), - error=data.get("error", headers.get("error", "Unknown error")), - retry_count=data.get("retry_count", int(headers.get("retry_count", 0))), - failed_at=datetime.fromisoformat(data["failed_at"]) - if data.get("failed_at") - else datetime.now(timezone.utc), - status=DLQMessageStatus(data.get("status", DLQMessageStatus.PENDING)), - producer_id=data.get("producer_id", headers.get("producer_id", "unknown")), - dlq_offset=msg.offset, - dlq_partition=msg.partition, - headers=headers, - ) + data = json.loads(msg.value) + headers = {k: v.decode() for k, v in (msg.headers or [])} + return DLQMessage(**data, dlq_offset=msg.offset, dlq_partition=msg.partition, headers=headers) async def _on_start(self) -> None: """Start DLQ manager.""" @@ -177,75 +105,42 @@ async def _on_stop(self) -> None: self.logger.info("DLQ Manager stopped") async def _process_messages(self) -> None: - while self.is_running: + """Process DLQ messages using async iteration.""" + async for msg in self.consumer: try: - msg = await self._poll_message() - if msg is None: - continue - - start_time = asyncio.get_running_loop().time() - dlq_message = self._kafka_msg_to_message(msg) - - await self._record_message_metrics(dlq_message) - await self._process_message_with_tracing(msg, dlq_message) - await self._commit_and_record_duration(start_time) + start = asyncio.get_running_loop().time() + dlq_msg = self._kafka_msg_to_message(msg) + + # Record metrics + self.metrics.record_dlq_message_received(dlq_msg.original_topic, dlq_msg.event.event_type) + 
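# --- Editor's sketch (not part of the patch) ---
# What the simplified _kafka_msg_to_message above relies on: the DLQ payload is plain
# JSON whose "event" field is re-validated through the DomainEvent discriminated union
# when DLQMessage(**data) runs. SimpleNamespace stands in for aiokafka's ConsumerRecord.
import json
from types import SimpleNamespace
from app.dlq.models import DLQMessage

def parse_record(msg: SimpleNamespace) -> DLQMessage:
    data = json.loads(msg.value)
    headers = {k: v.decode() for k, v in (msg.headers or [])}
    return DLQMessage(**data, dlq_offset=msg.offset, dlq_partition=msg.partition, headers=headers)

# Illustrative stand-in record (attribute names match ConsumerRecord):
# record = SimpleNamespace(value=b'{"event": {...}, "original_topic": "...", "error": "..."}',
#                          headers=[("error", b"timeout")], offset=42, partition=0)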
self.metrics.record_dlq_message_age((datetime.now(timezone.utc) - dlq_msg.failed_at).total_seconds()) + + # Process with tracing + ctx = extract_trace_context(dlq_msg.headers) + with get_tracer().start_as_current_span( + name="dlq.consume", + context=ctx, + kind=SpanKind.CONSUMER, + attributes={ + EventAttributes.KAFKA_TOPIC: self.dlq_topic, + EventAttributes.EVENT_TYPE: dlq_msg.event.event_type, + EventAttributes.EVENT_ID: dlq_msg.event.event_id, + }, + ): + await self._process_dlq_message(dlq_msg) + + # Commit and record duration + await self.consumer.commit() + self.metrics.record_dlq_processing_duration(asyncio.get_running_loop().time() - start, "process") except Exception as e: - self.logger.error(f"Error in DLQ processing loop: {e}") - await asyncio.sleep(5) - - async def _poll_message(self) -> Any | None: - """Poll for a message from Kafka using async getone().""" - try: - return await asyncio.wait_for(self.consumer.getone(), timeout=1.0) - except asyncio.TimeoutError: - return None - except KafkaError as e: - self.logger.error(f"Consumer error: {e}") - return None - - def _extract_headers(self, msg: Any) -> dict[str, str]: - """Extract headers from Kafka ConsumerRecord.""" - headers_list = msg.headers or [] - headers: dict[str, str] = {} - for k, v in headers_list: - headers[str(k)] = v.decode("utf-8") if isinstance(v, (bytes, bytearray)) else (v or "") - return headers - - async def _record_message_metrics(self, dlq_message: DLQMessage) -> None: - """Record metrics for received DLQ message.""" - self.metrics.record_dlq_message_received(dlq_message.original_topic, dlq_message.event_type) - self.metrics.record_dlq_message_age(dlq_message.age_seconds) - - async def _process_message_with_tracing(self, msg: Any, dlq_message: DLQMessage) -> None: - """Process message with distributed tracing.""" - headers = self._extract_headers(msg) - ctx = extract_trace_context(headers) - tracer = get_tracer() - - with tracer.start_as_current_span( - name="dlq.consume", - context=ctx, - kind=SpanKind.CONSUMER, - attributes={ - EventAttributes.KAFKA_TOPIC: self.dlq_topic, - EventAttributes.EVENT_TYPE: dlq_message.event_type, - EventAttributes.EVENT_ID: dlq_message.event_id or "", - }, - ): - await self._process_dlq_message(dlq_message) - - async def _commit_and_record_duration(self, start_time: float) -> None: - """Commit offset and record processing duration.""" - await self.consumer.commit() - duration = asyncio.get_running_loop().time() - start_time - self.metrics.record_dlq_processing_duration(duration, "process") + self.logger.error(f"Error processing DLQ message: {e}") async def _process_dlq_message(self, message: DLQMessage) -> None: # Apply filters for filter_func in self._filters: if not filter_func(message): - self.logger.info("Message filtered out", extra={"event_id": message.event_id}) + self.logger.info("Message filtered out", extra={"event_id": message.event.event_id}) return # Store in MongoDB via Beanie @@ -264,7 +159,7 @@ async def _process_dlq_message(self, message: DLQMessage) -> None: # Update message status await self._update_message_status( - message.event_id, + message.event.event_id, DLQMessageUpdate(status=DLQMessageStatus.SCHEDULED, next_retry_at=next_retry), ) @@ -277,16 +172,18 @@ async def _store_message(self, message: DLQMessage) -> None: message.status = DLQMessageStatus.PENDING message.last_updated = datetime.now(timezone.utc) - doc = self._message_to_doc(message) + doc = DLQMessageDocument(**message.model_dump()) # Upsert using Beanie - existing = await 
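# --- Editor's sketch (not part of the patch) ---
# Minimal aiokafka consume loop in the shape the new _process_messages uses above:
# async iteration, per-message error isolation, and an explicit commit after handling
# (enable_auto_commit=False). Topic, group and server values are placeholders.
import asyncio
import logging
from aiokafka import AIOKafkaConsumer

def handle(msg) -> None:  # placeholder for _kafka_msg_to_message + _process_dlq_message
    pass

async def consume_dlq(bootstrap: str = "localhost:9092") -> None:
    logger = logging.getLogger("dlq-sketch")
    consumer = AIOKafkaConsumer(
        "dead_letter_queue",
        bootstrap_servers=bootstrap,
        group_id="dlq-manager",
        enable_auto_commit=False,
        auto_offset_reset="earliest",
    )
    await consumer.start()
    try:
        async for msg in consumer:
            try:
                start = asyncio.get_running_loop().time()
                handle(msg)
                await consumer.commit()
                logger.debug("processed in %.3fs", asyncio.get_running_loop().time() - start)
            except Exception:
                logger.exception("Error processing DLQ message")
    finally:
        await consumer.stop()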
DLQMessageDocument.find_one({"event_id": message.event_id}) + existing = await DLQMessageDocument.find_one({"event.event_id": message.event.event_id}) if existing: doc.id = existing.id await doc.save() + await self._emit_message_received_event(message) + async def _update_message_status(self, event_id: str, update: DLQMessageUpdate) -> None: - doc = await DLQMessageDocument.find_one({"event_id": event_id}) + doc = await DLQMessageDocument.find_one({"event.event_id": event_id}) if not doc: return @@ -307,9 +204,6 @@ async def _update_message_status(self, event_id: str, update: DLQMessageUpdate) await doc.set(update_dict) async def _retry_message(self, message: DLQMessage) -> None: - # Trigger before_retry callbacks - await self._trigger_callbacks("before_retry", message) - # Send to retry topic first (for monitoring) retry_topic = f"{message.original_topic}{self.retry_topic_suffix}" @@ -327,44 +221,46 @@ async def _retry_message(self, message: DLQMessage) -> None: # Send to retry topic await self.producer.send_and_wait( topic=retry_topic, - value=json.dumps(event.to_dict()).encode(), - key=message.event_id.encode(), + value=json.dumps(event.model_dump(mode="json")).encode(), + key=message.event.event_id.encode(), headers=kafka_headers, ) # Send to original topic await self.producer.send_and_wait( topic=message.original_topic, - value=json.dumps(event.to_dict()).encode(), - key=message.event_id.encode(), + value=json.dumps(event.model_dump(mode="json")).encode(), + key=message.event.event_id.encode(), headers=kafka_headers, ) # Update metrics - self.metrics.record_dlq_message_retried(message.original_topic, message.event_type, "success") + self.metrics.record_dlq_message_retried(message.original_topic, message.event.event_type, "success") + + new_retry_count = message.retry_count + 1 # Update status await self._update_message_status( - message.event_id, + message.event.event_id, DLQMessageUpdate( status=DLQMessageStatus.RETRIED, retried_at=datetime.now(timezone.utc), - retry_count=message.retry_count + 1, + retry_count=new_retry_count, ), ) - # Trigger after_retry callbacks - await self._trigger_callbacks("after_retry", message, success=True) + # Emit DLQ message retried event + await self._emit_message_retried_event(message, retry_topic, new_retry_count) - self.logger.info("Successfully retried message", extra={"event_id": message.event_id}) + self.logger.info("Successfully retried message", extra={"event_id": message.event.event_id}) async def _discard_message(self, message: DLQMessage, reason: str) -> None: # Update metrics - self.metrics.record_dlq_message_discarded(message.original_topic, message.event_type, reason) + self.metrics.record_dlq_message_discarded(message.original_topic, message.event.event_type, reason) # Update status await self._update_message_status( - message.event_id, + message.event.event_id, DLQMessageUpdate( status=DLQMessageStatus.DISCARDED, discarded_at=datetime.now(timezone.utc), @@ -372,10 +268,9 @@ async def _discard_message(self, message: DLQMessage, reason: str) -> None: ), ) - # Trigger callbacks - await self._trigger_callbacks("on_discard", message, reason) + await self._emit_message_discarded_event(message, reason) - self.logger.warning("Discarded message", extra={"event_id": message.event_id, "reason": reason}) + self.logger.warning("Discarded message", extra={"event_id": message.event.event_id, "reason": reason}) async def _monitor_dlq(self) -> None: while self.is_running: @@ -395,7 +290,7 @@ async def _monitor_dlq(self) -> None: ) for doc in docs: - 
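# --- Editor's sketch (not part of the patch) ---
# The retry publish step above, isolated: the stored event is re-serialized as JSON and
# sent first to a "<topic>.retry" monitoring topic, then back to its original topic,
# keyed by the event id. Producer construction is elided; header names are illustrative.
import json
from datetime import datetime, timezone
from aiokafka import AIOKafkaProducer
from app.dlq.models import DLQMessage

async def republish(producer: AIOKafkaProducer, message: DLQMessage, retry_suffix: str = ".retry") -> None:
    payload = json.dumps(message.event.model_dump(mode="json")).encode()
    key = message.event.event_id.encode()
    headers = [
        ("x-retry-count", str(message.retry_count + 1).encode()),
        ("x-retried-at", datetime.now(timezone.utc).isoformat().encode()),
    ]
    await producer.send_and_wait(topic=f"{message.original_topic}{retry_suffix}", value=payload, key=key, headers=headers)
    await producer.send_and_wait(topic=message.original_topic, value=payload, key=key, headers=headers)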
message = self._doc_to_message(doc) + message = DLQMessage.model_validate(doc, from_attributes=True) await self._retry_message(message) # Update queue size metrics @@ -424,19 +319,60 @@ def set_retry_policy(self, topic: str, policy: RetryPolicy) -> None: def add_filter(self, filter_func: Callable[[DLQMessage], bool]) -> None: self._filters.append(filter_func) - def add_callback(self, event_type: str, callback: Callable[..., Awaitable[None]]) -> None: - if event_type in self._callbacks: - self._callbacks[event_type].append(callback) + async def _emit_message_received_event(self, message: DLQMessage) -> None: + """Emit a DLQMessageReceivedEvent to the DLQ events topic.""" + event = DLQMessageReceivedEvent( + dlq_event_id=message.event.event_id, + original_topic=message.original_topic, + original_event_type=str(message.event.event_type), + error=message.error, + retry_count=message.retry_count, + producer_id=message.producer_id, + failed_at=message.failed_at, + metadata=self._event_metadata, + ) + await self._produce_dlq_event(event) - async def _trigger_callbacks(self, event_type: str, *args: Any, **kwargs: Any) -> None: - for callback in self._callbacks.get(event_type, []): - try: - await callback(*args, **kwargs) - except Exception as e: - self.logger.error(f"Error in DLQ callback {callback.__name__}: {e}") + async def _emit_message_retried_event(self, message: DLQMessage, retry_topic: str, new_retry_count: int) -> None: + """Emit a DLQMessageRetriedEvent to the DLQ events topic.""" + event = DLQMessageRetriedEvent( + dlq_event_id=message.event.event_id, + original_topic=message.original_topic, + original_event_type=str(message.event.event_type), + retry_count=new_retry_count, + retry_topic=retry_topic, + metadata=self._event_metadata, + ) + await self._produce_dlq_event(event) + + async def _emit_message_discarded_event(self, message: DLQMessage, reason: str) -> None: + """Emit a DLQMessageDiscardedEvent to the DLQ events topic.""" + event = DLQMessageDiscardedEvent( + dlq_event_id=message.event.event_id, + original_topic=message.original_topic, + original_event_type=str(message.event.event_type), + reason=reason, + retry_count=message.retry_count, + metadata=self._event_metadata, + ) + await self._produce_dlq_event(event) + + async def _produce_dlq_event( + self, event: DLQMessageReceivedEvent | DLQMessageRetriedEvent | DLQMessageDiscardedEvent + ) -> None: + """Produce a DLQ lifecycle event to the DLQ events topic.""" + try: + serialized = await self.schema_registry.serialize_event(event) + await self.producer.send_and_wait( + topic=self._dlq_events_topic, + value=serialized, + key=event.event_id.encode(), + ) + except Exception as e: + self.logger.error(f"Failed to emit DLQ event {event.event_type}: {e}") async def retry_message_manually(self, event_id: str) -> bool: - doc = await DLQMessageDocument.find_one({"event_id": event_id}) + doc = await DLQMessageDocument.find_one({"event.event_id": event_id}) if not doc: self.logger.error("Message not found in DLQ", extra={"event_id": event_id}) return False @@ -446,7 +382,7 @@ async def retry_message_manually(self, event_id: str) -> bool: self.logger.info("Skipping manual retry", extra={"event_id": event_id, "status": doc.status}) return False - message = self._doc_to_message(doc) + message = DLQMessage.model_validate(doc, from_attributes=True) await self._retry_message(message) return True @@ -489,7 +425,7 @@ async def discard_message_manually(self, event_id: str, reason: str) -> bool: Returns: True if discarded, False if not found or in 
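# --- Editor's sketch (not part of the patch) ---
# Building one of the new DLQ lifecycle events emitted above. Field names follow the
# event definitions added later in this diff (app/domain/events/typed.py); the service
# name/version strings are illustrative.
from app.domain.events.typed import DLQMessageDiscardedEvent, EventMetadata
from app.dlq.models import DLQMessage

def build_discarded_event(message: DLQMessage, reason: str) -> DLQMessageDiscardedEvent:
    return DLQMessageDiscardedEvent(
        dlq_event_id=message.event.event_id,
        original_topic=message.original_topic,
        original_event_type=str(message.event.event_type),
        reason=reason,
        retry_count=message.retry_count,
        metadata=EventMetadata(service_name="dlq-manager", service_version="1.0.0"),
    )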
terminal state """ - doc = await DLQMessageDocument.find_one({"event_id": event_id}) + doc = await DLQMessageDocument.find_one({"event.event_id": event_id}) if not doc: self.logger.error("Message not found in DLQ", extra={"event_id": event_id}) return False @@ -499,7 +435,7 @@ async def discard_message_manually(self, event_id: str, reason: str) -> bool: self.logger.info("Skipping manual discard", extra={"event_id": event_id, "status": doc.status}) return False - message = self._doc_to_message(doc) + message = DLQMessage.model_validate(doc, from_attributes=True) await self._discard_message(message, reason) return True @@ -520,6 +456,10 @@ def create_dlq_manager( enable_auto_commit=False, auto_offset_reset="earliest", client_id="dlq-manager-consumer", + session_timeout_ms=settings.KAFKA_SESSION_TIMEOUT_MS, + heartbeat_interval_ms=settings.KAFKA_HEARTBEAT_INTERVAL_MS, + max_poll_interval_ms=settings.KAFKA_MAX_POLL_INTERVAL_MS, + request_timeout_ms=settings.KAFKA_REQUEST_TIMEOUT_MS, ) producer = AIOKafkaProducer( bootstrap_servers=settings.KAFKA_BOOTSTRAP_SERVERS, diff --git a/backend/app/dlq/models.py b/backend/app/dlq/models.py index fc8dd8c0..66961243 100644 --- a/backend/app/dlq/models.py +++ b/backend/app/dlq/models.py @@ -2,9 +2,11 @@ from datetime import datetime, timedelta, timezone from typing import Any +from pydantic import BaseModel, ConfigDict, Field + from app.core.utils import StringEnum from app.domain.enums.events import EventType -from app.infrastructure.kafka.events import BaseEvent +from app.domain.events.typed import DomainEvent class DLQMessageStatus(StringEnum): @@ -26,20 +28,19 @@ class RetryStrategy(StringEnum): MANUAL = "manual" -@dataclass -class DLQMessage: - """Unified DLQ message model for the entire system.""" +class DLQMessage(BaseModel): + """Unified DLQ message model. 
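# --- Editor's sketch (not part of the patch) ---
# The settings threaded into create_dlq_manager above map directly onto aiokafka
# consumer keyword arguments. Sketch assuming the Settings object from this codebase;
# group id and topic are placeholders.
from aiokafka import AIOKafkaConsumer
from app.settings import Settings

def build_dlq_consumer(settings: Settings, topic: str) -> AIOKafkaConsumer:
    return AIOKafkaConsumer(
        topic,
        bootstrap_servers=settings.KAFKA_BOOTSTRAP_SERVERS,
        group_id="dlq-manager",
        enable_auto_commit=False,
        auto_offset_reset="earliest",
        session_timeout_ms=settings.KAFKA_SESSION_TIMEOUT_MS,
        heartbeat_interval_ms=settings.KAFKA_HEARTBEAT_INTERVAL_MS,
        max_poll_interval_ms=settings.KAFKA_MAX_POLL_INTERVAL_MS,
        request_timeout_ms=settings.KAFKA_REQUEST_TIMEOUT_MS,
    )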
Access event_id/event_type via event.event_id, event.event_type.""" - event_id: str - event: BaseEvent - event_type: EventType - original_topic: str - error: str - retry_count: int - failed_at: datetime - status: DLQMessageStatus - producer_id: str - created_at: datetime | None = None + model_config = ConfigDict(from_attributes=True) + + event: DomainEvent # Discriminated union - auto-validates from dict + original_topic: str = "" + error: str = "Unknown error" + retry_count: int = 0 + failed_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc)) + status: DLQMessageStatus = DLQMessageStatus.PENDING + producer_id: str = "unknown" + created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc)) last_updated: datetime | None = None next_retry_at: datetime | None = None retried_at: datetime | None = None @@ -48,11 +49,7 @@ class DLQMessage: dlq_offset: int | None = None dlq_partition: int | None = None last_error: str | None = None - headers: dict[str, str] = field(default_factory=dict) - - @property - def age_seconds(self) -> float: - return (datetime.now(timezone.utc) - self.failed_at).total_seconds() + headers: dict[str, str] = Field(default_factory=dict) @dataclass @@ -119,41 +116,45 @@ def get_next_retry_time(self, message: DLQMessage) -> datetime: # Statistics models -@dataclass -class TopicStatistic: +class TopicStatistic(BaseModel): """Statistics for a single topic.""" + model_config = ConfigDict(from_attributes=True) + topic: str count: int avg_retry_count: float -@dataclass -class EventTypeStatistic: +class EventTypeStatistic(BaseModel): """Statistics for a single event type.""" + model_config = ConfigDict(from_attributes=True) + event_type: str count: int -@dataclass -class AgeStatistics: +class AgeStatistics(BaseModel): """Age statistics for DLQ messages.""" - min_age_seconds: float - max_age_seconds: float - avg_age_seconds: float + model_config = ConfigDict(from_attributes=True) + min_age_seconds: float = 0.0 + max_age_seconds: float = 0.0 + avg_age_seconds: float = 0.0 -@dataclass -class DLQStatistics: + +class DLQStatistics(BaseModel): """Comprehensive DLQ statistics.""" - by_status: dict[str, int] + model_config = ConfigDict(from_attributes=True) + + by_status: dict[DLQMessageStatus, int] by_topic: list[TopicStatistic] by_event_type: list[EventTypeStatistic] age_stats: AgeStatistics - timestamp: datetime = field(default_factory=lambda: datetime.now(timezone.utc)) + timestamp: datetime = Field(default_factory=lambda: datetime.now(timezone.utc)) @dataclass @@ -185,10 +186,11 @@ class DLQMessageListResult: limit: int -@dataclass -class DLQTopicSummary: +class DLQTopicSummary(BaseModel): """Summary of a topic in DLQ.""" + model_config = ConfigDict(from_attributes=True) + topic: str total_messages: int status_breakdown: dict[str, int] diff --git a/backend/app/domain/enums/events.py b/backend/app/domain/enums/events.py index 021ca6bb..f337b374 100644 --- a/backend/app/domain/enums/events.py +++ b/backend/app/domain/enums/events.py @@ -80,3 +80,8 @@ class EventType(StringEnum): DELETE_POD_COMMAND = "delete_pod_command" ALLOCATE_RESOURCES_COMMAND = "allocate_resources_command" RELEASE_RESOURCES_COMMAND = "release_resources_command" + + # DLQ events + DLQ_MESSAGE_RECEIVED = "dlq_message_received" + DLQ_MESSAGE_RETRIED = "dlq_message_retried" + DLQ_MESSAGE_DISCARDED = "dlq_message_discarded" diff --git a/backend/app/domain/enums/kafka.py b/backend/app/domain/enums/kafka.py index 2824a4fc..81d78e51 100644 --- a/backend/app/domain/enums/kafka.py +++ 
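# --- Editor's sketch (not part of the patch) ---
# The dataclass -> Pydantic conversion above is what lets repositories and the DLQ
# manager hydrate these models straight from Beanie documents (or any attribute-bearing
# object) via model_validate(..., from_attributes=True). Minimal illustration:
from types import SimpleNamespace
from app.dlq.models import TopicStatistic

row = SimpleNamespace(topic="execution_events", count=12, avg_retry_count=1.5)
stat = TopicStatistic.model_validate(row, from_attributes=True)
assert stat.topic == "execution_events" and stat.count == 12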
b/backend/app/domain/enums/kafka.py @@ -49,6 +49,7 @@ class KafkaTopic(StringEnum): # Infrastructure topics DEAD_LETTER_QUEUE = "dead_letter_queue" + DLQ_EVENTS = "dlq_events" EVENT_BUS_STREAM = "event_bus_stream" WEBSOCKET_EVENTS = "websocket_events" @@ -75,14 +76,14 @@ class GroupId(StringEnum): KafkaTopic.EXECUTION_RESULTS, }, GroupId.K8S_WORKER: { - KafkaTopic.EXECUTION_EVENTS, + KafkaTopic.SAGA_COMMANDS, # Receives CreatePodCommand/DeletePodCommand from coordinator }, GroupId.POD_MONITOR: { KafkaTopic.POD_EVENTS, KafkaTopic.POD_STATUS_UPDATES, }, GroupId.RESULT_PROCESSOR: { - KafkaTopic.EXECUTION_RESULTS, + KafkaTopic.EXECUTION_EVENTS, # Listens for COMPLETED/FAILED/TIMEOUT, publishes to EXECUTION_RESULTS }, GroupId.SAGA_ORCHESTRATOR: { # Orchestrator is triggered by domain events, specifically EXECUTION_REQUESTED, @@ -95,7 +96,6 @@ class GroupId(StringEnum): KafkaTopic.EXECUTION_RESULTS, KafkaTopic.POD_EVENTS, KafkaTopic.POD_STATUS_UPDATES, - KafkaTopic.EXECUTION_RESULTS, }, GroupId.NOTIFICATION_SERVICE: { KafkaTopic.NOTIFICATION_EVENTS, diff --git a/backend/app/domain/events/query_builders.py b/backend/app/domain/events/query_builders.py deleted file mode 100644 index 1be73e4e..00000000 --- a/backend/app/domain/events/query_builders.py +++ /dev/null @@ -1,168 +0,0 @@ -from datetime import datetime -from typing import Any - - -class AggregationStages: - @staticmethod - def match(conditions: dict[str, Any]) -> dict[str, Any]: - """Create a $match stage.""" - return {"$match": conditions} - - @staticmethod - def group(group_spec: dict[str, Any]) -> dict[str, Any]: - """Create a $group stage.""" - return {"$group": group_spec} - - @staticmethod - def sort(sort_spec: dict[str, int]) -> dict[str, Any]: - """Create a $sort stage.""" - return {"$sort": sort_spec} - - @staticmethod - def limit(count: int) -> dict[str, Any]: - """Create a $limit stage.""" - return {"$limit": count} - - @staticmethod - def project(projection: dict[str, Any]) -> dict[str, Any]: - """Create a $project stage.""" - return {"$project": projection} - - @staticmethod - def add_to_set(field: str) -> dict[str, str]: - """Create an $addToSet accumulator.""" - return {"$addToSet": field} - - @staticmethod - def sum(value: int | str = 1) -> dict[str, int | str]: - """Create a $sum accumulator.""" - return {"$sum": value} - - @staticmethod - def avg(field: str) -> dict[str, str]: - """Create an $avg accumulator.""" - return {"$avg": field} - - @staticmethod - def size(field: str) -> dict[str, str]: - """Create a $size operator.""" - return {"$size": field} - - @staticmethod - def date_to_string(date_field: str, date_format: str = "%Y-%m-%d-%H") -> dict[str, Any]: - """Create a $dateToString expression.""" - return {"$dateToString": {"format": date_format, "date": date_field}} - - -class EventStatsAggregation: - @staticmethod - def build_overview_pipeline(start_time: datetime) -> list[dict[str, Any]]: - return [ - AggregationStages.match({"timestamp": {"$gte": start_time}}), - AggregationStages.group( - { - "_id": None, - "total_events": AggregationStages.sum(), - "event_types": AggregationStages.add_to_set("$event_type"), - "unique_users": AggregationStages.add_to_set("$metadata.user_id"), - "services": AggregationStages.add_to_set("$metadata.service_name"), - } - ), - AggregationStages.project( - { - "_id": 0, - "total_events": 1, - "event_type_count": AggregationStages.size("$event_types"), - "unique_user_count": AggregationStages.size("$unique_users"), - "service_count": AggregationStages.size("$services"), - } - ), - ] 
- - @staticmethod - def build_event_types_pipeline(start_time: datetime, limit: int = 10) -> list[dict[str, Any]]: - return [ - AggregationStages.match({"timestamp": {"$gte": start_time}}), - AggregationStages.group({"_id": "$event_type", "count": AggregationStages.sum()}), - AggregationStages.sort({"count": -1}), - AggregationStages.limit(limit), - ] - - @staticmethod - def build_hourly_events_pipeline(start_time: datetime) -> list[dict[str, Any]]: - return [ - AggregationStages.match({"timestamp": {"$gte": start_time}}), - AggregationStages.group( - {"_id": AggregationStages.date_to_string("$timestamp"), "count": AggregationStages.sum()} - ), - AggregationStages.sort({"_id": 1}), - ] - - @staticmethod - def build_top_users_pipeline(start_time: datetime, limit: int = 10) -> list[dict[str, Any]]: - return [ - AggregationStages.match({"timestamp": {"$gte": start_time}}), - AggregationStages.group({"_id": "$metadata.user_id", "count": AggregationStages.sum()}), - AggregationStages.sort({"count": -1}), - AggregationStages.limit(limit), - ] - - @staticmethod - def build_avg_duration_pipeline(start_time: datetime, event_type: str) -> list[dict[str, Any]]: - return [ - AggregationStages.match( - { - "timestamp": {"$gte": start_time}, - "event_type": event_type, - "payload.duration_seconds": {"$exists": True}, - } - ), - AggregationStages.group({"_id": None, "avg_duration": AggregationStages.avg("$payload.duration_seconds")}), - ] - - -class QueryBuilder: - @staticmethod - def regex_search(field: str, pattern: str, case_insensitive: bool = True) -> dict[str, Any]: - """Build regex search condition.""" - options = "i" if case_insensitive else "" - return {field: {"$regex": pattern, "$options": options}} - - @staticmethod - def time_range(field: str, start: datetime | None = None, end: datetime | None = None) -> dict[str, Any]: - """Build time range condition.""" - if not start and not end: - return {} - - condition = {} - if start: - condition["$gte"] = start - if end: - condition["$lte"] = end - - return {field: condition} - - @staticmethod - def in_list(field: str, values: list[Any]) -> dict[str, Any]: - """Build IN condition.""" - return {field: {"$in": values}} - - @staticmethod - def not_equal(field: str, value: Any) -> dict[str, Any]: - """Build not equal condition.""" - return {field: {"$ne": value}} - - @staticmethod - def exists(field: str, exists: bool = True) -> dict[str, Any]: - """Build exists condition.""" - return {field: {"$exists": exists}} - - @staticmethod - def or_conditions(conditions: list[dict[str, Any]]) -> dict[str, Any]: - """Build OR condition.""" - return {"$or": conditions} - - @staticmethod - def and_conditions(conditions: list[dict[str, Any]]) -> dict[str, Any]: - """Build AND condition.""" - return {"$and": conditions} diff --git a/backend/app/domain/events/typed.py b/backend/app/domain/events/typed.py index 92efe7df..5157be88 100644 --- a/backend/app/domain/events/typed.py +++ b/backend/app/domain/events/typed.py @@ -1,8 +1,9 @@ -from datetime import datetime, timedelta, timezone +from datetime import datetime, timezone from typing import Annotated, Literal from uuid import uuid4 -from pydantic import BaseModel, ConfigDict, Discriminator, Field, TypeAdapter +from pydantic import ConfigDict, Discriminator, Field, TypeAdapter +from pydantic_avro.to_avro.base import AvroBase from app.domain.enums.auth import LoginMethod from app.domain.enums.common import Environment @@ -12,10 +13,10 @@ from app.domain.execution import ResourceUsageDomain -class 
EventMetadata(BaseModel): +class EventMetadata(AvroBase): """Event metadata - embedded in all events.""" - model_config = ConfigDict(from_attributes=True) + model_config = ConfigDict(from_attributes=True, use_enum_values=True) service_name: str service_version: str @@ -26,18 +27,17 @@ class EventMetadata(BaseModel): environment: Environment = Environment.PRODUCTION -class BaseEvent(BaseModel): +class BaseEvent(AvroBase): """Base fields for all domain events.""" model_config = ConfigDict(from_attributes=True) event_id: str = Field(default_factory=lambda: str(uuid4())) + event_type: EventType event_version: str = "1.0" timestamp: datetime = Field(default_factory=lambda: datetime.now(timezone.utc)) aggregate_id: str | None = None metadata: EventMetadata - stored_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc)) - ttl_expires_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc) + timedelta(days=30)) # --- Execution Events --- @@ -514,10 +514,48 @@ class ServiceRecoveredEvent(BaseEvent): downtime_seconds: int +# --- DLQ Events --- + + +class DLQMessageReceivedEvent(BaseEvent): + """Emitted when a message is received and persisted in the DLQ.""" + + event_type: Literal[EventType.DLQ_MESSAGE_RECEIVED] = EventType.DLQ_MESSAGE_RECEIVED + dlq_event_id: str # The event_id of the failed message + original_topic: str + original_event_type: str + error: str + retry_count: int + producer_id: str + failed_at: datetime + + +class DLQMessageRetriedEvent(BaseEvent): + """Emitted when a DLQ message is retried.""" + + event_type: Literal[EventType.DLQ_MESSAGE_RETRIED] = EventType.DLQ_MESSAGE_RETRIED + dlq_event_id: str # The event_id of the retried message + original_topic: str + original_event_type: str + retry_count: int # New retry count after this retry + retry_topic: str # Topic the message was retried to + + +class DLQMessageDiscardedEvent(BaseEvent): + """Emitted when a DLQ message is discarded (max retries exceeded or manual discard).""" + + event_type: Literal[EventType.DLQ_MESSAGE_DISCARDED] = EventType.DLQ_MESSAGE_DISCARDED + dlq_event_id: str # The event_id of the discarded message + original_topic: str + original_event_type: str + reason: str + retry_count: int # Final retry count when discarded + + # --- Archived Event (for deleted events) --- -class ArchivedEvent(BaseModel): +class ArchivedEvent(AvroBase): """Archived event with deletion metadata. 
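# --- Editor's sketch (not part of the patch) ---
# Switching EventMetadata/BaseEvent to pydantic_avro's AvroBase is what lets the schema
# registry manager later in this diff call event_class.avro_schema(namespace=...) to
# derive an Avro record schema from the Pydantic fields. Hedged sketch; the exact
# structure of the returned schema depends on pydantic_avro's mapping rules.
from app.domain.events.typed import DLQMessageDiscardedEvent

avro = DLQMessageDiscardedEvent.avro_schema(namespace="com.integr8scode.events")
print(avro)  # record definition named "DLQMessageDiscardedEvent" (illustrative)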
Wraps the original event data.""" model_config = ConfigDict(from_attributes=True) @@ -602,7 +640,11 @@ class ArchivedEvent(BaseModel): # System Events | SystemErrorEvent | ServiceUnhealthyEvent - | ServiceRecoveredEvent, + | ServiceRecoveredEvent + # DLQ Events + | DLQMessageReceivedEvent + | DLQMessageRetriedEvent + | DLQMessageDiscardedEvent, Discriminator("event_type"), ] diff --git a/backend/app/events/consumer_group_monitor.py b/backend/app/events/consumer_group_monitor.py index 00a0c72e..d20b9c45 100644 --- a/backend/app/events/consumer_group_monitor.py +++ b/backend/app/events/consumer_group_monitor.py @@ -147,6 +147,7 @@ def __init__( min_members_threshold: int = 1, ): self.logger = logger + self._settings = settings self._bootstrap_servers = settings.KAFKA_BOOTSTRAP_SERVERS self._client_id = client_id @@ -383,6 +384,9 @@ async def _get_consumer_group_lag(self, group_id: str) -> dict[str, Any]: group_id=f"{group_id}-lag-monitor-{datetime.now().timestamp()}", enable_auto_commit=False, auto_offset_reset="earliest", + session_timeout_ms=self._settings.KAFKA_SESSION_TIMEOUT_MS, + heartbeat_interval_ms=self._settings.KAFKA_HEARTBEAT_INTERVAL_MS, + request_timeout_ms=self._settings.KAFKA_REQUEST_TIMEOUT_MS, ) try: diff --git a/backend/app/events/core/__init__.py b/backend/app/events/core/__init__.py index f0957882..3b12df76 100644 --- a/backend/app/events/core/__init__.py +++ b/backend/app/events/core/__init__.py @@ -9,7 +9,6 @@ ConsumerConfig, ConsumerMetrics, ConsumerState, - ProducerConfig, ProducerMetrics, ProducerState, ) @@ -18,7 +17,6 @@ # Types "ProducerState", "ConsumerState", - "ProducerConfig", "ConsumerConfig", "ProducerMetrics", "ConsumerMetrics", diff --git a/backend/app/events/core/consumer.py b/backend/app/events/core/consumer.py index 1f21412b..01556751 100644 --- a/backend/app/events/core/consumer.py +++ b/backend/app/events/core/consumer.py @@ -12,8 +12,8 @@ from app.core.tracing import EventAttributes from app.core.tracing.utils import extract_trace_context, get_tracer from app.domain.enums.kafka import KafkaTopic +from app.domain.events.typed import DomainEvent from app.events.schema.schema_registry import SchemaRegistryManager -from app.infrastructure.kafka.events.base import BaseEvent from app.settings import Settings from .dispatcher import EventDispatcher @@ -38,7 +38,7 @@ def __init__( self._running = False self._metrics = ConsumerMetrics() self._event_metrics = get_event_metrics() # Singleton for Kafka metrics - self._error_callback: "Callable[[Exception, BaseEvent], Awaitable[None]] | None" = None + self._error_callback: "Callable[[Exception, DomainEvent], Awaitable[None]] | None" = None self._consume_task: asyncio.Task[None] | None = None self._topic_prefix = settings.KAFKA_TOPIC_PREFIX @@ -57,6 +57,7 @@ async def start(self, topics: list[KafkaTopic]) -> None: session_timeout_ms=self._config.session_timeout_ms, heartbeat_interval_ms=self._config.heartbeat_interval_ms, max_poll_interval_ms=self._config.max_poll_interval_ms, + request_timeout_ms=self._config.request_timeout_ms, fetch_min_bytes=self._config.fetch_min_bytes, fetch_max_wait_ms=self._config.fetch_max_wait_ms, ) @@ -190,7 +191,7 @@ async def _process_message(self, message: Any) -> None: if self._error_callback: await self._error_callback(e, event) - def register_error_callback(self, callback: Callable[[Exception, BaseEvent], Awaitable[None]]) -> None: + def register_error_callback(self, callback: Callable[[Exception, DomainEvent], Awaitable[None]]) -> None: self._error_callback = callback @property 
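# --- Editor's sketch (not part of the patch) ---
# The DomainEvent union above is discriminated on event_type, so raw dicts coming off
# Kafka or out of Mongo validate into the right concrete class through a TypeAdapter.
# The payload below is illustrative; required fields follow the DLQMessageRetriedEvent
# definition earlier in this diff.
from pydantic import TypeAdapter
from app.domain.events.typed import DLQMessageRetriedEvent, DomainEvent

adapter = TypeAdapter(DomainEvent)
raw = {
    "event_type": "dlq_message_retried",
    "dlq_event_id": "abc-123",
    "original_topic": "execution_events",
    "original_event_type": "execution_requested",
    "retry_count": 1,
    "retry_topic": "execution_events.retry",
    "metadata": {"service_name": "dlq-manager", "service_version": "1.0.0"},
}
event = adapter.validate_python(raw)
assert isinstance(event, DLQMessageRetriedEvent)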
diff --git a/backend/app/events/core/dispatcher.py b/backend/app/events/core/dispatcher.py index cd7e7d4f..bc69a4a3 100644 --- a/backend/app/events/core/dispatcher.py +++ b/backend/app/events/core/dispatcher.py @@ -5,11 +5,11 @@ from typing import TypeAlias, TypeVar from app.domain.enums.events import EventType -from app.infrastructure.kafka.events.base import BaseEvent +from app.domain.events.typed import DomainEvent from app.infrastructure.kafka.mappings import get_event_class_for_type -T = TypeVar("T", bound=BaseEvent) -EventHandler: TypeAlias = Callable[[BaseEvent], Awaitable[None]] +T = TypeVar("T", bound=DomainEvent) +EventHandler: TypeAlias = Callable[[DomainEvent], Awaitable[None]] class EventDispatcher: @@ -23,34 +23,23 @@ class EventDispatcher: def __init__(self, logger: logging.Logger) -> None: self.logger = logger # Map event types to their handlers - self._handlers: dict[EventType, list[Callable[[BaseEvent], Awaitable[None]]]] = defaultdict(list) + self._handlers: dict[EventType, list[Callable[[DomainEvent], Awaitable[None]]]] = defaultdict(list) # Map topics to event types that can appear on them - self._topic_event_types: dict[str, set[type[BaseEvent]]] = defaultdict(set) + self._topic_event_types: dict[str, set[type[DomainEvent]]] = defaultdict(set) # Metrics per event type self._event_metrics: dict[EventType, dict[str, int]] = defaultdict( lambda: {"processed": 0, "failed": 0, "skipped": 0} ) - # Build topic->event type mapping from schema - self._build_topic_mapping() - - def _build_topic_mapping(self) -> None: - """Build mapping of topics to event types based on event classes.""" - for event_class in BaseEvent.__subclasses__(): - if hasattr(event_class, "topic"): - topic = str(event_class.topic) - self._topic_event_types[topic].add(event_class) - self.logger.debug(f"Mapped {event_class.__name__} to topic {topic}") - def register( self, event_type: EventType ) -> Callable[[Callable[[T], Awaitable[None]]], Callable[[T], Awaitable[None]]]: """ Decorator for registering type-safe event handlers. - Generic over T (any BaseEvent subtype) - accepts handlers with specific + Generic over T (any DomainEvent subtype) - accepts handlers with specific event types while preserving their type signature for callers. Usage: @@ -98,7 +87,7 @@ def remove_handler(self, event_type: EventType, handler: EventHandler) -> bool: return True return False - async def dispatch(self, event: BaseEvent) -> None: + async def dispatch(self, event: DomainEvent) -> None: """ Dispatch an event to all registered handlers for its type. @@ -133,7 +122,7 @@ async def dispatch(self, event: BaseEvent) -> None: else: self._event_metrics[event_type]["processed"] += 1 - async def _execute_handler(self, handler: EventHandler, event: BaseEvent) -> None: + async def _execute_handler(self, handler: EventHandler, event: DomainEvent) -> None: """ Execute a single handler with error handling. 
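# --- Editor's sketch (not part of the patch) ---
# Usage of the dispatcher below: handlers are registered per EventType with the
# decorator and invoked by dispatch() based on the event's own event_type. The concrete
# event/type names are stand-ins from this codebase.
import logging
from app.domain.enums.events import EventType
from app.domain.events.typed import DLQMessageReceivedEvent
from app.events.core import EventDispatcher

dispatcher = EventDispatcher(logging.getLogger("dispatch-sketch"))

@dispatcher.register(EventType.DLQ_MESSAGE_RECEIVED)
async def on_dlq_received(event: DLQMessageReceivedEvent) -> None:
    print(f"DLQ message for {event.original_topic}: {event.error}")

# Later, typically from the consumer's message loop:
# await dispatcher.dispatch(event)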
@@ -175,14 +164,14 @@ def clear_handlers(self) -> None: self._handlers.clear() self.logger.info("All event handlers cleared") - def get_handlers(self, event_type: EventType) -> list[Callable[[BaseEvent], Awaitable[None]]]: + def get_handlers(self, event_type: EventType) -> list[Callable[[DomainEvent], Awaitable[None]]]: """Get all handlers for a specific event type.""" return self._handlers.get(event_type, []).copy() - def get_all_handlers(self) -> dict[EventType, list[Callable[[BaseEvent], Awaitable[None]]]]: + def get_all_handlers(self) -> dict[EventType, list[Callable[[DomainEvent], Awaitable[None]]]]: """Get all registered handlers (returns a copy).""" return {k: v.copy() for k, v in self._handlers.items()} - def replace_handlers(self, event_type: EventType, handlers: list[Callable[[BaseEvent], Awaitable[None]]]) -> None: + def replace_handlers(self, event_type: EventType, handlers: list[Callable[[DomainEvent], Awaitable[None]]]) -> None: """Replace all handlers for a specific event type.""" self._handlers[event_type] = handlers diff --git a/backend/app/events/core/dlq_handler.py b/backend/app/events/core/dlq_handler.py index 0e035b2e..7de433a7 100644 --- a/backend/app/events/core/dlq_handler.py +++ b/backend/app/events/core/dlq_handler.py @@ -1,97 +1,37 @@ import logging from typing import Awaitable, Callable -from app.infrastructure.kafka.events.base import BaseEvent +from app.domain.events.typed import DomainEvent from .producer import UnifiedProducer def create_dlq_error_handler( producer: UnifiedProducer, original_topic: str, logger: logging.Logger, max_retries: int = 3 -) -> Callable[[Exception, BaseEvent], Awaitable[None]]: - """ - Create an error handler that sends failed events to DLQ. - - Args: - producer: The Kafka producer to use for sending to DLQ - original_topic: The topic where the event originally failed - logger: Logger instance for logging - max_retries: Maximum number of retries before sending to DLQ - - Returns: - An async error handler function suitable for UnifiedConsumer.register_error_callback - """ - # Track retry counts per event ID +) -> Callable[[Exception, DomainEvent], Awaitable[None]]: + """Create an error handler that sends failed events to DLQ after max retries.""" retry_counts: dict[str, int] = {} - async def handle_error_with_dlq(error: Exception, event: BaseEvent) -> None: - """ - Handle processing errors by sending to DLQ after max retries. - - Args: - error: The exception that occurred - event: The event that failed processing - """ + async def handle_error_with_dlq(error: Exception, event: DomainEvent) -> None: event_id = event.event_id or "unknown" - - # Track retry count retry_count = retry_counts.get(event_id, 0) retry_counts[event_id] = retry_count + 1 - - logger.error( - f"Error processing event {event_id} ({event.event_type}): {error}. Retry {retry_count + 1}/{max_retries}", - exc_info=True, - ) - - # Send to DLQ if we've exceeded max retries + logger.error(f"Error processing {event_id}: {error}. Retry {retry_count + 1}/{max_retries}", exc_info=True) if retry_count >= max_retries: - logger.warning(f"Event {event_id} exceeded max retries ({max_retries}). Sending to DLQ.") - - await producer.send_to_dlq( - original_event=event, original_topic=original_topic, error=error, retry_count=retry_count - ) - - # Clear retry count for this event + logger.warning(f"Event {event_id} exceeded max retries. 
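# --- Editor's sketch (not part of the patch) ---
# Wiring for the trimmed-down handlers below: create_dlq_error_handler is registered as
# the consumer's error callback so events that keep failing get routed to the DLQ via
# the producer. Construction of the producer and consumer is elided.
import logging
from app.events.core import UnifiedConsumer, UnifiedProducer, create_dlq_error_handler

def wire_dlq(consumer: UnifiedConsumer, producer: UnifiedProducer, topic: str) -> None:
    handler = create_dlq_error_handler(
        producer=producer,
        original_topic=topic,
        logger=logging.getLogger("dlq"),
        max_retries=3,
    )
    consumer.register_error_callback(handler)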
Sending to DLQ.") + await producer.send_to_dlq(event, original_topic, error, retry_count) retry_counts.pop(event_id, None) - else: - # Could implement retry logic here if needed - # For now, the event will be retried when Kafka redelivers it - pass return handle_error_with_dlq def create_immediate_dlq_handler( producer: UnifiedProducer, original_topic: str, logger: logging.Logger -) -> Callable[[Exception, BaseEvent], Awaitable[None]]: - """ - Create an error handler that immediately sends failed events to DLQ. - - This is useful for critical errors where retry won't help. - - Args: - producer: The Kafka producer to use for sending to DLQ - original_topic: The topic where the event originally failed - logger: Logger instance for logging - - Returns: - An async error handler function suitable for UnifiedConsumer.register_error_callback - """ - - async def handle_error_immediate_dlq(error: Exception, event: BaseEvent) -> None: - """ - Handle processing errors by immediately sending to DLQ. - - Args: - error: The exception that occurred - event: The event that failed processing - """ - logger.error( - f"Critical error processing event {event.event_id} ({event.event_type}): {error}. " - f"Sending immediately to DLQ.", - exc_info=True, - ) +) -> Callable[[Exception, DomainEvent], Awaitable[None]]: + """Create an error handler that immediately sends failed events to DLQ.""" - await producer.send_to_dlq(original_event=event, original_topic=original_topic, error=error, retry_count=0) + async def handle_error_immediate_dlq(error: Exception, event: DomainEvent) -> None: + logger.error(f"Critical error processing {event.event_id}: {error}. Sending to DLQ.", exc_info=True) + await producer.send_to_dlq(event, original_topic, error, 0) return handle_error_immediate_dlq diff --git a/backend/app/events/core/producer.py b/backend/app/events/core/producer.py index c6a30b75..c5848aec 100644 --- a/backend/app/events/core/producer.py +++ b/backend/app/events/core/producer.py @@ -12,11 +12,12 @@ from app.core.metrics.context import get_event_metrics from app.dlq.models import DLQMessage, DLQMessageStatus from app.domain.enums.kafka import KafkaTopic +from app.domain.events.typed import DomainEvent from app.events.schema.schema_registry import SchemaRegistryManager -from app.infrastructure.kafka.events import BaseEvent +from app.infrastructure.kafka.mappings import EVENT_TYPE_TO_TOPIC from app.settings import Settings -from .types import ProducerConfig, ProducerMetrics, ProducerState +from .types import ProducerMetrics, ProducerState class UnifiedProducer(LifecycleEnabled): @@ -24,13 +25,12 @@ class UnifiedProducer(LifecycleEnabled): def __init__( self, - config: ProducerConfig, schema_registry_manager: SchemaRegistryManager, logger: logging.Logger, settings: Settings, ): super().__init__() - self._config = config + self._settings = settings self._schema_registry = schema_registry_manager self.logger = logger self._producer: AIOKafkaProducer | None = None @@ -61,28 +61,26 @@ async def _on_start(self) -> None: self.logger.info("Starting producer...") self._producer = AIOKafkaProducer( - bootstrap_servers=self._config.bootstrap_servers, - client_id=self._config.client_id, - acks=self._config.acks, - compression_type=self._config.compression_type, - max_batch_size=self._config.batch_size, - linger_ms=self._config.linger_ms, - enable_idempotence=self._config.enable_idempotence, + bootstrap_servers=self._settings.KAFKA_BOOTSTRAP_SERVERS, + client_id=f"{self._settings.SERVICE_NAME}-producer", + acks="all", + 
compression_type="gzip", + max_batch_size=16384, + linger_ms=10, + enable_idempotence=True, ) await self._producer.start() self._state = ProducerState.RUNNING - self.logger.info(f"Producer started: {self._config.bootstrap_servers}") + self.logger.info(f"Producer started: {self._settings.KAFKA_BOOTSTRAP_SERVERS}") def get_status(self) -> dict[str, Any]: return { "state": self._state, "running": self.is_running, "config": { - "bootstrap_servers": self._config.bootstrap_servers, - "client_id": self._config.client_id, - "batch_size": self._config.batch_size, - "compression_type": self._config.compression_type, + "bootstrap_servers": self._settings.KAFKA_BOOTSTRAP_SERVERS, + "client_id": f"{self._settings.SERVICE_NAME}-producer", }, "metrics": { "messages_sent": self._metrics.messages_sent, @@ -108,25 +106,16 @@ async def _on_stop(self) -> None: self.logger.info("Producer stopped") async def produce( - self, event_to_produce: BaseEvent, key: str | None = None, headers: dict[str, str] | None = None + self, event_to_produce: DomainEvent, key: str | None = None, headers: dict[str, str] | None = None ) -> None: - """ - Produce a message to Kafka. - - Args: - event_to_produce: Message value (BaseEvent) - key: Message key - headers: Message headers - """ + """Produce a message to Kafka.""" if not self._producer: self.logger.error("Producer not running") return try: - # Serialize value using async schema registry serialized_value = await self._schema_registry.serialize_event(event_to_produce) - - topic = f"{self._topic_prefix}{str(event_to_produce.topic)}" + topic = f"{self._topic_prefix}{EVENT_TYPE_TO_TOPIC[event_to_produce.event_type]}" # Convert headers to list of tuples format header_list = [(k, v.encode()) for k, v in headers.items()] if headers else None @@ -151,24 +140,14 @@ async def produce( self._metrics.messages_failed += 1 self._metrics.last_error = str(e) self._metrics.last_error_time = datetime.now(timezone.utc) - self._event_metrics.record_kafka_production_error( - topic=f"{self._topic_prefix}{str(event_to_produce.topic)}", error_type=type(e).__name__ - ) + self._event_metrics.record_kafka_production_error(topic=topic, error_type=type(e).__name__) self.logger.error(f"Failed to produce message: {e}") raise async def send_to_dlq( - self, original_event: BaseEvent, original_topic: str, error: Exception, retry_count: int = 0 + self, original_event: DomainEvent, original_topic: str, error: Exception, retry_count: int = 0 ) -> None: - """ - Send a failed event to the Dead Letter Queue. 
- - Args: - original_event: The event that failed processing - original_topic: The topic where the event originally failed - error: The exception that caused the failure - retry_count: Number of retry attempts already made - """ + """Send a failed event to the Dead Letter Queue.""" if not self._producer: self.logger.error("Producer not running, cannot send to DLQ") return @@ -181,9 +160,7 @@ async def send_to_dlq( # Create DLQ message directly dlq_message = DLQMessage( - event_id=original_event.event_id, event=original_event, - event_type=original_event.event_type, original_topic=original_topic, error=str(error), retry_count=retry_count, @@ -194,9 +171,7 @@ async def send_to_dlq( # Create DLQ event wrapper dlq_event_data = { - "event_id": dlq_message.event_id, - "event_type": "dlq.message", - "event": dlq_message.event.to_dict(), + "event": dlq_message.event.model_dump(mode="json"), "original_topic": dlq_message.original_topic, "error": dlq_message.error, "retry_count": dlq_message.retry_count, diff --git a/backend/app/events/core/types.py b/backend/app/events/core/types.py index 33b8e3b8..1912f1be 100644 --- a/backend/app/events/core/types.py +++ b/backend/app/events/core/types.py @@ -1,6 +1,5 @@ from dataclasses import dataclass from datetime import datetime, timezone -from typing import Any from pydantic import BaseModel, ConfigDict @@ -27,41 +26,6 @@ class ConsumerState(StringEnum): ERROR = "error" -@dataclass(slots=True) -class ProducerConfig: - """Kafka producer configuration.""" - - bootstrap_servers: str - client_id: str = "integr8scode-producer" - - # Batching configuration - batch_size: int = 16384 - linger_ms: int = 10 - compression_type: str = "gzip" - - # Reliability configuration - request_timeout_ms: int = 30000 - retries: int = 3 - enable_idempotence: bool = True - acks: str = "all" - max_in_flight_requests_per_connection: int = 5 - - def to_producer_config(self) -> dict[str, Any]: - """Convert to Confluent Kafka producer configuration.""" - return { - "bootstrap.servers": self.bootstrap_servers, - "client.id": self.client_id, - "batch.size": self.batch_size, - "linger.ms": self.linger_ms, - "compression.type": self.compression_type, - "request.timeout.ms": self.request_timeout_ms, - "retries": self.retries, - "enable.idempotence": self.enable_idempotence, - "acks": self.acks, - "max.in.flight.requests.per.connection": self.max_in_flight_requests_per_connection, - } - - @dataclass(slots=True) class ConsumerConfig: """Kafka consumer configuration.""" @@ -75,34 +39,16 @@ class ConsumerConfig: enable_auto_commit: bool = False # Session configuration - session_timeout_ms: int = 30000 - heartbeat_interval_ms: int = 3000 + session_timeout_ms: int = 45000 + heartbeat_interval_ms: int = 10000 max_poll_interval_ms: int = 300000 + request_timeout_ms: int = 40000 # Fetch configuration max_poll_records: int = 500 fetch_min_bytes: int = 1 fetch_max_wait_ms: int = 500 - # Monitoring - statistics_interval_ms: int = 30000 - - def to_consumer_config(self) -> dict[str, object]: - """Convert to Confluent Kafka consumer configuration.""" - return { - "bootstrap.servers": self.bootstrap_servers, - "group.id": self.group_id, - "client.id": self.client_id, - "auto.offset.reset": self.auto_offset_reset, - "enable.auto.commit": self.enable_auto_commit, - "session.timeout.ms": self.session_timeout_ms, - "heartbeat.interval.ms": self.heartbeat_interval_ms, - "max.poll.interval.ms": self.max_poll_interval_ms, - "fetch.min.bytes": self.fetch_min_bytes, - "fetch.wait.max.ms": self.fetch_max_wait_ms, - 
"statistics.interval.ms": self.statistics_interval_ms, - } - @dataclass(slots=True) class ProducerMetrics: diff --git a/backend/app/events/event_store.py b/backend/app/events/event_store.py index fe6ce8b3..0c475cc3 100644 --- a/backend/app/events/event_store.py +++ b/backend/app/events/event_store.py @@ -12,8 +12,8 @@ from app.core.tracing.utils import add_span_attributes from app.db.docs import EventDocument from app.domain.enums.events import EventType +from app.domain.events.typed import DomainEvent from app.events.schema.schema_registry import SchemaRegistryManager -from app.infrastructure.kafka.events.base import BaseEvent class EventStore: @@ -43,7 +43,7 @@ async def initialize(self) -> None: self._initialized = True self.logger.info("Event store initialized with Beanie") - async def store_event(self, event: BaseEvent) -> bool: + async def store_event(self, event: DomainEvent) -> bool: start = asyncio.get_running_loop().time() try: now = datetime.now(timezone.utc) @@ -71,7 +71,7 @@ async def store_event(self, event: BaseEvent) -> bool: self.metrics.record_event_store_failed(event.event_type, type(e).__name__) return False - async def store_batch(self, events: list[BaseEvent]) -> dict[str, int]: + async def store_batch(self, events: list[DomainEvent]) -> dict[str, int]: start = asyncio.get_running_loop().time() results = {"total": len(events), "stored": 0, "duplicates": 0, "failed": 0} if not events: @@ -108,7 +108,7 @@ async def store_batch(self, events: list[BaseEvent]) -> dict[str, int]: results["failed"] = results["total"] - results["stored"] return results - async def get_event(self, event_id: str) -> BaseEvent | None: + async def get_event(self, event_id: str) -> DomainEvent | None: start = asyncio.get_running_loop().time() doc = await EventDocument.find_one({"event_id": event_id}) if not doc: @@ -127,7 +127,7 @@ async def get_events_by_type( end_time: datetime | None = None, limit: int = 100, offset: int = 0, - ) -> list[BaseEvent]: + ) -> list[DomainEvent]: start = asyncio.get_running_loop().time() query: dict[str, Any] = {"event_type": event_type} if tr := self._time_range(start_time, end_time): @@ -150,7 +150,7 @@ async def get_execution_events( self, execution_id: str, event_types: list[EventType] | None = None, - ) -> list[BaseEvent]: + ) -> list[DomainEvent]: start = asyncio.get_running_loop().time() query: dict[str, Any] = {"$or": [{"execution_id": execution_id}, {"aggregate_id": execution_id}]} if event_types: @@ -170,7 +170,7 @@ async def get_user_events( start_time: datetime | None = None, end_time: datetime | None = None, limit: int = 100, - ) -> list[BaseEvent]: + ) -> list[DomainEvent]: start = asyncio.get_running_loop().time() query: dict[str, Any] = {"metadata.user_id": str(user_id)} if event_types: @@ -191,7 +191,7 @@ async def get_security_events( end_time: datetime | None = None, user_id: str | None = None, limit: int = 100, - ) -> list[BaseEvent]: + ) -> list[DomainEvent]: start = asyncio.get_running_loop().time() query: dict[str, Any] = {"event_type": {"$in": self._SECURITY_TYPES}} if user_id: @@ -206,7 +206,7 @@ async def get_security_events( self.metrics.record_event_query_duration(duration, "get_security_events", "event_store") return events - async def get_correlation_chain(self, correlation_id: str) -> list[BaseEvent]: + async def get_correlation_chain(self, correlation_id: str) -> list[DomainEvent]: start = asyncio.get_running_loop().time() docs = await ( EventDocument.find({"metadata.correlation_id": str(correlation_id)}) @@ -224,7 +224,7 @@ async def 
replay_events( start_time: datetime, end_time: datetime | None = None, event_types: list[EventType] | None = None, - callback: Callable[[BaseEvent], Awaitable[None]] | None = None, + callback: Callable[[DomainEvent], Awaitable[None]] | None = None, ) -> int: start = asyncio.get_running_loop().time() count = 0 diff --git a/backend/app/events/event_store_consumer.py b/backend/app/events/event_store_consumer.py index af6239f7..4f2ba47d 100644 --- a/backend/app/events/event_store_consumer.py +++ b/backend/app/events/event_store_consumer.py @@ -7,10 +7,10 @@ from app.core.tracing.utils import trace_span from app.domain.enums.events import EventType from app.domain.enums.kafka import GroupId, KafkaTopic +from app.domain.events.typed import DomainEvent from app.events.core import ConsumerConfig, EventDispatcher, UnifiedConsumer, UnifiedProducer, create_dlq_error_handler from app.events.event_store import EventStore from app.events.schema.schema_registry import SchemaRegistryManager -from app.infrastructure.kafka.events.base import BaseEvent from app.settings import Settings @@ -41,7 +41,7 @@ def __init__( self.schema_registry_manager = schema_registry_manager self.dispatcher = EventDispatcher(logger) self.producer = producer # For DLQ handling - self._batch_buffer: list[BaseEvent] = [] + self._batch_buffer: list[DomainEvent] = [] self._batch_lock = asyncio.Lock() self._last_batch_time: float = 0.0 self._batch_task: asyncio.Task[None] | None = None @@ -54,6 +54,10 @@ async def _on_start(self) -> None: group_id=f"{self.group_id}.{self.settings.KAFKA_GROUP_SUFFIX}", enable_auto_commit=False, max_poll_records=self.batch_size, + session_timeout_ms=self.settings.KAFKA_SESSION_TIMEOUT_MS, + heartbeat_interval_ms=self.settings.KAFKA_HEARTBEAT_INTERVAL_MS, + max_poll_interval_ms=self.settings.KAFKA_MAX_POLL_INTERVAL_MS, + request_timeout_ms=self.settings.KAFKA_REQUEST_TIMEOUT_MS, ) self.consumer = UnifiedConsumer( @@ -104,7 +108,7 @@ async def _on_stop(self) -> None: self.logger.info("Event store consumer stopped") - async def _handle_event(self, event: BaseEvent) -> None: + async def _handle_event(self, event: DomainEvent) -> None: """Handle incoming event from dispatcher.""" self.logger.info(f"Event store received event: {event.event_type} - {event.event_id}") @@ -114,7 +118,7 @@ async def _handle_event(self, event: BaseEvent) -> None: if len(self._batch_buffer) >= self.batch_size: await self._flush_batch() - async def _handle_error_with_event(self, error: Exception, event: BaseEvent) -> None: + async def _handle_error_with_event(self, error: Exception, event: DomainEvent) -> None: """Handle processing errors with event context.""" self.logger.error(f"Error processing event {event.event_id} ({event.event_type}): {error}", exc_info=True) diff --git a/backend/app/events/schema/schema_registry.py b/backend/app/events/schema/schema_registry.py index b36392b2..6e4337a4 100644 --- a/backend/app/events/schema/schema_registry.py +++ b/backend/app/events/schema/schema_registry.py @@ -1,73 +1,52 @@ import logging import struct from functools import lru_cache -from typing import Any, Dict, Type, TypeVar +from typing import Any, get_args, get_origin from schema_registry.client import AsyncSchemaRegistryClient, schema from schema_registry.serializers import AsyncAvroMessageSerializer # type: ignore[attr-defined] from app.domain.enums.events import EventType -from app.infrastructure.kafka.events.base import BaseEvent +from app.domain.events.typed import DomainEvent from app.settings import Settings -T = TypeVar("T", 
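# --- Editor's sketch (not part of the patch) ---
# The buffering strategy _handle_event uses above, isolated: events accumulate under a
# lock and are flushed to the event store once the batch size is reached (the periodic
# flush task that covers partially filled batches is not shown).
import asyncio
from collections.abc import Awaitable, Callable
from app.domain.events.typed import DomainEvent

class BatchBuffer:
    def __init__(self, batch_size: int = 100) -> None:
        self._batch_size = batch_size
        self._buffer: list[DomainEvent] = []
        self._lock = asyncio.Lock()

    async def add(
        self,
        event: DomainEvent,
        flush: Callable[[list[DomainEvent]], Awaitable[None]],  # e.g. EventStore.store_batch
    ) -> None:
        async with self._lock:
            self._buffer.append(event)
            if len(self._buffer) >= self._batch_size:
                batch, self._buffer = self._buffer, []
                await flush(batch)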
bound=BaseEvent) - -# Confluent wire-format magic byte (single byte, value 0) MAGIC_BYTE = b"\x00" @lru_cache(maxsize=1) -def _get_event_class_mapping() -> Dict[str, Type[BaseEvent]]: - """ - Map Avro record name (class name) -> Python class. - Uses only direct subclasses; extend to recursive if you introduce deeper hierarchies. - """ - mapping: Dict[str, Type[BaseEvent]] = {} - for subclass in BaseEvent.__subclasses__(): - mapping[subclass.__name__] = subclass - return mapping +def _get_all_event_classes() -> list[type[DomainEvent]]: + """Get all concrete event classes from DomainEvent union.""" + union_type = get_args(DomainEvent)[0] # Annotated[Union[...], Discriminator] -> Union + return list(get_args(union_type)) if get_origin(union_type) else [union_type] @lru_cache(maxsize=1) -def _get_all_event_classes() -> list[Type[BaseEvent]]: - """All direct subclasses of BaseEvent (extend to recursive IF you add nested inheritance).""" - return list(BaseEvent.__subclasses__()) +def _get_event_class_mapping() -> dict[str, type[DomainEvent]]: + """Map class name -> class.""" + return {cls.__name__: cls for cls in _get_all_event_classes()} @lru_cache(maxsize=1) -def _get_event_type_to_class_mapping() -> Dict[EventType, Type[BaseEvent]]: - """ - EventType enum -> event class, inferred from the default of the `event_type` field on each subclass. - """ - mapping: Dict[EventType, Type[BaseEvent]] = {} - for subclass in _get_all_event_classes(): - f = subclass.model_fields.get("event_type") - if f is not None and f.default is not None: - mapping[f.default] = subclass # default is EventType thanks to Literal[...] - return mapping +def _get_event_type_to_class_mapping() -> dict[EventType, type[DomainEvent]]: + """EventType -> class mapping.""" + return {cls.model_fields["event_type"].default: cls for cls in _get_all_event_classes()} class SchemaRegistryManager: - """Schema registry manager for Avro serialization with Confluent wire format. - - Uses aiokafka-compatible python-schema-registry-client for fully async operations. 
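Note: a minimal sketch of the introspection the new module-level helpers above rely on, assuming DomainEvent is shaped as Annotated[Union[...], <discriminator metadata>]; FooEvent/BarEvent and DemoEvent here are hypothetical stand-ins, not the project's events.

from typing import Annotated, Literal, Union, get_args, get_origin

from pydantic import BaseModel, Field


class FooEvent(BaseModel):
    event_type: Literal["foo"] = "foo"


class BarEvent(BaseModel):
    event_type: Literal["bar"] = "bar"


DemoEvent = Annotated[Union[FooEvent, BarEvent], Field(discriminator="event_type")]

inner = get_args(DemoEvent)[0]  # unwrap Annotated -> Union[FooEvent, BarEvent]
classes = list(get_args(inner)) if get_origin(inner) else [inner]
by_name = {cls.__name__: cls for cls in classes}
by_type = {cls.model_fields["event_type"].default: cls for cls in classes}

assert by_name["FooEvent"] is FooEvent
assert by_type["bar"] is BarEvent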
- """ + """Schema registry manager for Avro serialization with Confluent wire format.""" def __init__(self, settings: Settings, logger: logging.Logger): self.logger = logger self.namespace = "com.integr8scode.events" self.subject_prefix = settings.SCHEMA_SUBJECT_PREFIX - parts = settings.SCHEMA_REGISTRY_AUTH.split(":", 1) auth: tuple[str, str] | None = (parts[0], parts[1]) if len(parts) == 2 else None self._client = AsyncSchemaRegistryClient(url=settings.SCHEMA_REGISTRY_URL, auth=auth) # type: ignore[arg-type] self._serializer = AsyncAvroMessageSerializer(self._client) + self._schema_id_cache: dict[type[DomainEvent], int] = {} + self._id_to_class_cache: dict[int, type[DomainEvent]] = {} - # Caches: class <-> schema_id (library caches schema_string -> id, we need class -> id) - self._schema_id_cache: Dict[Type[BaseEvent], int] = {} - self._id_to_class_cache: Dict[int, Type[BaseEvent]] = {} - - async def register_schema(self, subject: str, event_class: Type[BaseEvent]) -> int: + async def register_schema(self, subject: str, event_class: type[DomainEvent]) -> int: """Register schema and return schema ID.""" avro_schema = schema.AvroSchema(event_class.avro_schema(namespace=self.namespace)) schema_id: int = await self._client.register(subject, avro_schema) @@ -76,124 +55,67 @@ async def register_schema(self, subject: str, event_class: Type[BaseEvent]) -> i self.logger.info(f"Registered schema for {event_class.__name__}: ID {schema_id}") return schema_id - async def _get_schema_id(self, event_class: Type[BaseEvent]) -> int: - """Get or register schema ID for event class.""" - if event_class in self._schema_id_cache: - return self._schema_id_cache[event_class] - subject = f"{self.subject_prefix}{event_class.__name__}-value" - return await self.register_schema(subject, event_class) - - async def _get_event_class_by_id(self, schema_id: int) -> Type[BaseEvent] | None: - """Get event class by schema ID, via cache or registry lookup.""" + async def _get_event_class_by_id(self, schema_id: int) -> type[DomainEvent] | None: + """Get event class by schema ID.""" if schema_id in self._id_to_class_cache: return self._id_to_class_cache[schema_id] - schema_obj = await self._client.get_by_id(schema_id) - if schema_obj is None: - return None - - # Parse schema to get class name - raw_schema is already a dict - schema_dict = schema_obj.raw_schema - class_name = schema_dict.get("name") - if class_name: - cls = _get_event_class_mapping().get(class_name) - if cls: + if schema_obj and (class_name := schema_obj.raw_schema.get("name")): + if cls := _get_event_class_mapping().get(class_name): self._id_to_class_cache[schema_id] = cls self._schema_id_cache[cls] = schema_id return cls - return None - async def serialize_event(self, event: BaseEvent) -> bytes: - """ - Serialize event to Confluent wire format. 
- Format: [0x00][4-byte schema id][Avro binary] - """ + async def serialize_event(self, event: DomainEvent) -> bytes: + """Serialize event to Confluent wire format: [0x00][4-byte schema id][Avro binary].""" subject = f"{self.subject_prefix}{event.__class__.__name__}-value" avro_schema = schema.AvroSchema(event.__class__.avro_schema(namespace=self.namespace)) - - # Prepare payload dict (exclude event_type: schema id implies the concrete record) payload: dict[str, Any] = event.model_dump(mode="python", by_alias=False, exclude_unset=False) payload.pop("event_type", None) - - # Convert datetime to microseconds for Avro timestamp-micros logical type if "timestamp" in payload and payload["timestamp"] is not None: payload["timestamp"] = int(payload["timestamp"].timestamp() * 1_000_000) - return await self._serializer.encode_record_with_schema(subject, avro_schema, payload) - async def deserialize_event(self, data: bytes, topic: str) -> BaseEvent: - """ - Deserialize from Confluent wire format to a concrete BaseEvent subclass. - """ + async def deserialize_event(self, data: bytes, topic: str) -> DomainEvent: + """Deserialize from Confluent wire format to DomainEvent.""" if not data or len(data) < 5: raise ValueError("Invalid message: too short for wire format") - if data[0:1] != MAGIC_BYTE: raise ValueError(f"Unknown magic byte: {data[0]:#x}") - - # Extract schema ID from wire format schema_id = struct.unpack(">I", data[1:5])[0] event_class = await self._get_event_class_by_id(schema_id) if not event_class: raise ValueError(f"Unknown schema ID: {schema_id}") - - # Decode the message obj = await self._serializer.decode_message(data) if not isinstance(obj, dict): raise ValueError(f"Deserialization returned {type(obj)}, expected dict") - - # Restore constant event_type if schema/payload doesn't include it - f = event_class.model_fields.get("event_type") - if f is not None and f.default is not None and "event_type" not in obj: + if (f := event_class.model_fields.get("event_type")) and f.default and "event_type" not in obj: obj["event_type"] = f.default - return event_class.model_validate(obj) - def deserialize_json(self, data: dict[str, Any]) -> BaseEvent: - """ - Deserialize JSON data (from MongoDB or DLQ) to event object using event_type field. - """ - event_type_str = data.get("event_type") - if not event_type_str: + def deserialize_json(self, data: dict[str, Any]) -> DomainEvent: + """Deserialize JSON data to DomainEvent using event_type field.""" + if not (event_type_str := data.get("event_type")): raise ValueError("Missing event_type in event data") - - event_type = EventType(event_type_str) - mapping = _get_event_type_to_class_mapping() - event_class = mapping.get(event_type) - - if not event_class: - raise ValueError(f"No event class found for event type: {event_type}") - + if not (event_class := _get_event_type_to_class_mapping().get(EventType(event_type_str))): + raise ValueError(f"No event class found for event type: {event_type_str}") return event_class.model_validate(data) async def set_compatibility(self, subject: str, mode: str) -> None: - """ - Set compatibility for a subject. 
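Note: a hedged, standalone sketch of the Confluent wire format that serialize_event/deserialize_event above work with (split_wire_format is illustrative, not part of the codebase): byte 0 is the 0x00 magic byte, bytes 1-4 are the schema id as a big-endian uint32, and the Avro body follows; the last two lines mirror the timestamp-micros conversion.

import struct
from datetime import datetime, timezone

MAGIC_BYTE = b"\x00"


def split_wire_format(data: bytes) -> tuple[int, bytes]:
    """Return (schema_id, avro_payload) or raise on a malformed frame."""
    if len(data) < 5 or data[0:1] != MAGIC_BYTE:
        raise ValueError("not Confluent wire format")
    (schema_id,) = struct.unpack(">I", data[1:5])
    return schema_id, data[5:]


frame = MAGIC_BYTE + struct.pack(">I", 42) + b"avro-body"
assert split_wire_format(frame) == (42, b"avro-body")

# Avro timestamp-micros: datetimes become microseconds since the Unix epoch.
dt = datetime(2024, 1, 1, tzinfo=timezone.utc)
assert int(dt.timestamp() * 1_000_000) == 1_704_067_200_000_000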
- Valid: BACKWARD, FORWARD, FULL, NONE, BACKWARD_TRANSITIVE, FORWARD_TRANSITIVE, FULL_TRANSITIVE - """ - valid_modes = { - "BACKWARD", - "FORWARD", - "FULL", - "NONE", - "BACKWARD_TRANSITIVE", - "FORWARD_TRANSITIVE", - "FULL_TRANSITIVE", - } - if mode not in valid_modes: + """Set compatibility for a subject.""" + valid = {"BACKWARD", "FORWARD", "FULL", "NONE", "BACKWARD_TRANSITIVE", "FORWARD_TRANSITIVE", "FULL_TRANSITIVE"} + if mode not in valid: raise ValueError(f"Invalid compatibility mode: {mode}") - await self._client.update_compatibility(level=mode, subject=subject) self.logger.info(f"Set {subject} compatibility to {mode}") async def initialize_schemas(self) -> None: - """Initialize all event schemas in the registry (set compat + register).""" + """Initialize all event schemas in the registry.""" for event_class in _get_all_event_classes(): subject = f"{self.subject_prefix}{event_class.__name__}-value" await self.set_compatibility(subject, "FORWARD") await self.register_schema(subject, event_class) - self.logger.info(f"Initialized {len(_get_all_event_classes())} event schemas") diff --git a/backend/app/infrastructure/kafka/__init__.py b/backend/app/infrastructure/kafka/__init__.py index df6d0ee2..97295a56 100644 --- a/backend/app/infrastructure/kafka/__init__.py +++ b/backend/app/infrastructure/kafka/__init__.py @@ -1,13 +1,10 @@ -"""Kafka infrastructure for event-driven architecture.""" - -from app.infrastructure.kafka.events.base import BaseEvent -from app.infrastructure.kafka.events.metadata import AvroEventMetadata +from app.domain.events.typed import DomainEvent, EventMetadata from app.infrastructure.kafka.mappings import get_event_class_for_type, get_topic_for_event from app.infrastructure.kafka.topics import get_all_topics, get_topic_configs __all__ = [ - "BaseEvent", - "AvroEventMetadata", + "DomainEvent", + "EventMetadata", "get_all_topics", "get_topic_configs", "get_event_class_for_type", diff --git a/backend/app/infrastructure/kafka/events/__init__.py b/backend/app/infrastructure/kafka/events/__init__.py deleted file mode 100644 index 8f0aad85..00000000 --- a/backend/app/infrastructure/kafka/events/__init__.py +++ /dev/null @@ -1,137 +0,0 @@ -from app.infrastructure.kafka.events.base import BaseEvent -from app.infrastructure.kafka.events.execution import ( - ExecutionAcceptedEvent, - ExecutionCancelledEvent, - ExecutionCompletedEvent, - ExecutionFailedEvent, - ExecutionQueuedEvent, - ExecutionRequestedEvent, - ExecutionRunningEvent, - ExecutionStartedEvent, - ExecutionTimeoutEvent, -) -from app.infrastructure.kafka.events.metadata import AvroEventMetadata -from app.infrastructure.kafka.events.notification import ( - NotificationClickedEvent, - NotificationCreatedEvent, - NotificationDeliveredEvent, - NotificationFailedEvent, - NotificationPreferencesUpdatedEvent, - NotificationReadEvent, - NotificationSentEvent, -) -from app.infrastructure.kafka.events.pod import ( - PodCreatedEvent, - PodDeletedEvent, - PodFailedEvent, - PodRunningEvent, - PodScheduledEvent, - PodSucceededEvent, - PodTerminatedEvent, -) -from app.infrastructure.kafka.events.saga import ( - AllocateResourcesCommandEvent, - CreatePodCommandEvent, - DeletePodCommandEvent, - ReleaseResourcesCommandEvent, - SagaCancelledEvent, - SagaCompensatedEvent, - SagaCompensatingEvent, - SagaCompletedEvent, - SagaFailedEvent, - SagaStartedEvent, -) -from app.infrastructure.kafka.events.system import ( - AuthFailedEvent, - QuotaExceededEvent, - RateLimitExceededEvent, - ResourceLimitExceededEvent, - ResultFailedEvent, - 
ResultStoredEvent, - ScriptDeletedEvent, - ScriptSavedEvent, - ScriptSharedEvent, - SecurityViolationEvent, - ServiceRecoveredEvent, - ServiceUnhealthyEvent, - SystemErrorEvent, -) -from app.infrastructure.kafka.events.user import ( - UserDeletedEvent, - UserLoggedInEvent, - UserLoggedOutEvent, - UserLoginEvent, - UserRegisteredEvent, - UserSettingsUpdatedEvent, - UserUpdatedEvent, -) - -__all__ = [ - # Base - "BaseEvent", - "AvroEventMetadata", - # Execution - "ExecutionRequestedEvent", - "ExecutionAcceptedEvent", - "ExecutionQueuedEvent", - "ExecutionRunningEvent", - "ExecutionStartedEvent", - "ExecutionCompletedEvent", - "ExecutionFailedEvent", - "ExecutionTimeoutEvent", - "ExecutionCancelledEvent", - # Pod - "PodCreatedEvent", - "PodScheduledEvent", - "PodRunningEvent", - "PodTerminatedEvent", - "PodSucceededEvent", - "PodFailedEvent", - "PodDeletedEvent", - # User - "UserRegisteredEvent", - "UserLoginEvent", - "UserLoggedInEvent", - "UserLoggedOutEvent", - "UserUpdatedEvent", - "UserDeletedEvent", - "UserSettingsUpdatedEvent", - # Notification - "NotificationCreatedEvent", - "NotificationSentEvent", - "NotificationDeliveredEvent", - "NotificationFailedEvent", - "NotificationReadEvent", - "NotificationClickedEvent", - "NotificationPreferencesUpdatedEvent", - # Script - "ScriptSavedEvent", - "ScriptDeletedEvent", - "ScriptSharedEvent", - # Security - "SecurityViolationEvent", - "RateLimitExceededEvent", - "AuthFailedEvent", - # Resource - "ResourceLimitExceededEvent", - "QuotaExceededEvent", - # System - "SystemErrorEvent", - "ServiceUnhealthyEvent", - "ServiceRecoveredEvent", - # Result - "ResultStoredEvent", - "ResultFailedEvent", - # Saga - "SagaStartedEvent", - "SagaCompletedEvent", - "SagaFailedEvent", - "SagaCancelledEvent", - "SagaCompensatingEvent", - "SagaCompensatedEvent", - # Saga Commands - "CreatePodCommandEvent", - "DeletePodCommandEvent", - "AllocateResourcesCommandEvent", - "ReleaseResourcesCommandEvent", -] diff --git a/backend/app/infrastructure/kafka/events/base.py b/backend/app/infrastructure/kafka/events/base.py deleted file mode 100644 index 23592c41..00000000 --- a/backend/app/infrastructure/kafka/events/base.py +++ /dev/null @@ -1,37 +0,0 @@ -from datetime import datetime, timezone -from typing import Any, ClassVar -from uuid import uuid4 - -from pydantic import ConfigDict, Field, field_serializer -from pydantic_avro.to_avro.base import AvroBase - -from app.domain.enums.events import EventType -from app.domain.enums.kafka import KafkaTopic -from app.infrastructure.kafka.events.metadata import AvroEventMetadata - - -class BaseEvent(AvroBase): - """Base class for all events.""" - - event_id: str = Field(default_factory=lambda: str(uuid4())) - event_type: EventType - event_version: str = "1.0" - timestamp: datetime = Field(default_factory=lambda: datetime.now(timezone.utc)) - aggregate_id: str | None = None - metadata: AvroEventMetadata - - # Each subclass must define its topic - topic: ClassVar[KafkaTopic] - - model_config = ConfigDict() - - @field_serializer("timestamp", when_used="json") - def serialize_timestamp(self, dt: datetime) -> str: - return dt.isoformat() - - def to_dict(self) -> dict[str, Any]: - # Use mode='json' to properly serialize datetime objects to ISO strings - return self.model_dump(by_alias=True, mode="json") - - def to_json(self) -> str: - return self.model_dump_json(by_alias=True) diff --git a/backend/app/infrastructure/kafka/events/execution.py b/backend/app/infrastructure/kafka/events/execution.py deleted file mode 100644 index 
3030b4eb..00000000 --- a/backend/app/infrastructure/kafka/events/execution.py +++ /dev/null @@ -1,136 +0,0 @@ -from datetime import datetime -from typing import ClassVar, Literal - -from pydantic import ConfigDict - -from app.domain.enums.events import EventType -from app.domain.enums.kafka import KafkaTopic -from app.domain.enums.storage import ExecutionErrorType -from app.domain.execution import ResourceUsageDomain -from app.infrastructure.kafka.events.base import BaseEvent - - -class ExecutionRequestedEvent(BaseEvent): - event_type: Literal[EventType.EXECUTION_REQUESTED] = EventType.EXECUTION_REQUESTED - topic: ClassVar[KafkaTopic] = KafkaTopic.EXECUTION_EVENTS - execution_id: str - script: str - language: str - language_version: str - runtime_image: str - runtime_command: list[str] - runtime_filename: str - timeout_seconds: int - cpu_limit: str - memory_limit: str - cpu_request: str - memory_request: str - priority: int = 5 - - model_config = ConfigDict( - json_schema_extra={ - "example": { - "event_type": EventType.EXECUTION_REQUESTED, - "execution_id": "550e8400-e29b-41d4-a716-446655440000", - "script": "print('Hello, World!')", - "language": "python", - "language_version": "3.11", - "runtime_image": "python:3.11-slim", - "runtime_command": ["python"], - "runtime_filename": "main.py", - "timeout_seconds": 30, - "cpu_limit": "100m", - "memory_limit": "128Mi", - "cpu_request": "50m", - "memory_request": "64Mi", - "priority": 5, - } - } - ) - - -class ExecutionAcceptedEvent(BaseEvent): - event_type: Literal[EventType.EXECUTION_ACCEPTED] = EventType.EXECUTION_ACCEPTED - topic: ClassVar[KafkaTopic] = KafkaTopic.EXECUTION_EVENTS - execution_id: str - queue_position: int - estimated_wait_seconds: float | None = None - priority: int = 5 - - -class ExecutionQueuedEvent(BaseEvent): - event_type: Literal[EventType.EXECUTION_QUEUED] = EventType.EXECUTION_QUEUED - topic: ClassVar[KafkaTopic] = KafkaTopic.EXECUTION_EVENTS - execution_id: str - position_in_queue: int | None = None - estimated_start_time: datetime | None = None - - -class ExecutionRunningEvent(BaseEvent): - event_type: Literal[EventType.EXECUTION_RUNNING] = EventType.EXECUTION_RUNNING - topic: ClassVar[KafkaTopic] = KafkaTopic.EXECUTION_EVENTS - execution_id: str - pod_name: str - progress_percentage: int | None = None - - -class ExecutionStartedEvent(BaseEvent): - event_type: Literal[EventType.EXECUTION_STARTED] = EventType.EXECUTION_STARTED - topic: ClassVar[KafkaTopic] = KafkaTopic.EXECUTION_EVENTS - execution_id: str - pod_name: str - node_name: str | None = None - container_id: str | None = None - - -class ExecutionCompletedEvent(BaseEvent): - event_type: Literal[EventType.EXECUTION_COMPLETED] = EventType.EXECUTION_COMPLETED - topic: ClassVar[KafkaTopic] = KafkaTopic.EXECUTION_COMPLETED - execution_id: str - exit_code: int - resource_usage: ResourceUsageDomain - stdout: str = "" - stderr: str = "" - - -class ExecutionFailedEvent(BaseEvent): - event_type: Literal[EventType.EXECUTION_FAILED] = EventType.EXECUTION_FAILED - topic: ClassVar[KafkaTopic] = KafkaTopic.EXECUTION_FAILED - execution_id: str - stdout: str = "" - stderr: str = "" - exit_code: int - error_type: ExecutionErrorType - error_message: str - resource_usage: ResourceUsageDomain | None = None - - -class ExecutionTimeoutEvent(BaseEvent): - event_type: Literal[EventType.EXECUTION_TIMEOUT] = EventType.EXECUTION_TIMEOUT - topic: ClassVar[KafkaTopic] = KafkaTopic.EXECUTION_TIMEOUT - execution_id: str - timeout_seconds: int - resource_usage: ResourceUsageDomain - stdout: 
str = "" - stderr: str = "" - - -class ExecutionCancelledEvent(BaseEvent): - event_type: Literal[EventType.EXECUTION_CANCELLED] = EventType.EXECUTION_CANCELLED - topic: ClassVar[KafkaTopic] = KafkaTopic.EXECUTION_EVENTS - execution_id: str - reason: str - cancelled_by: str | None = None - force_terminated: bool = False - - model_config = ConfigDict( - json_schema_extra={ - "example": { - "event_type": "execution.cancelled", - "execution_id": "550e8400-e29b-41d4-a716-446655440000", - "reason": "user_requested", - "cancelled_by": "user123", - "force_terminated": False, - } - } - ) diff --git a/backend/app/infrastructure/kafka/events/metadata.py b/backend/app/infrastructure/kafka/events/metadata.py deleted file mode 100644 index a6522ae1..00000000 --- a/backend/app/infrastructure/kafka/events/metadata.py +++ /dev/null @@ -1,31 +0,0 @@ -from uuid import uuid4 - -from pydantic import ConfigDict, Field -from pydantic_avro.to_avro.base import AvroBase - -from app.domain.enums.common import Environment - - -class AvroEventMetadata(AvroBase): - """Unified event metadata for auditing and tracing.""" - - service_name: str - service_version: str - correlation_id: str = Field(default_factory=lambda: str(uuid4())) - user_id: str | None = None - ip_address: str | None = None - user_agent: str | None = None - environment: Environment = Environment.PRODUCTION - - model_config = ConfigDict(extra="allow", str_strip_whitespace=True, use_enum_values=True) - - def with_correlation(self, correlation_id: str) -> "AvroEventMetadata": - return self.model_copy(update={"correlation_id": correlation_id}) - - def with_user(self, user_id: str) -> "AvroEventMetadata": - return self.model_copy(update={"user_id": user_id}) - - def ensure_correlation_id(self) -> "AvroEventMetadata": - if self.correlation_id: - return self - return self.model_copy(update={"correlation_id": str(uuid4())}) diff --git a/backend/app/infrastructure/kafka/events/notification.py b/backend/app/infrastructure/kafka/events/notification.py deleted file mode 100644 index 197e4fa2..00000000 --- a/backend/app/infrastructure/kafka/events/notification.py +++ /dev/null @@ -1,73 +0,0 @@ -from datetime import datetime -from typing import ClassVar, Literal - -from pydantic import Field - -from app.domain.enums.events import EventType -from app.domain.enums.kafka import KafkaTopic -from app.domain.enums.notification import NotificationChannel, NotificationSeverity -from app.infrastructure.kafka.events.base import BaseEvent - - -class NotificationCreatedEvent(BaseEvent): - event_type: Literal[EventType.NOTIFICATION_CREATED] = EventType.NOTIFICATION_CREATED - topic: ClassVar[KafkaTopic] = KafkaTopic.NOTIFICATION_EVENTS - notification_id: str - user_id: str - subject: str - body: str - severity: NotificationSeverity - tags: list[str] - channels: list[NotificationChannel] - - -class NotificationSentEvent(BaseEvent): - event_type: Literal[EventType.NOTIFICATION_SENT] = EventType.NOTIFICATION_SENT - topic: ClassVar[KafkaTopic] = KafkaTopic.NOTIFICATION_EVENTS - notification_id: str - user_id: str - channel: NotificationChannel - sent_at: datetime - - -class NotificationDeliveredEvent(BaseEvent): - event_type: Literal[EventType.NOTIFICATION_DELIVERED] = EventType.NOTIFICATION_DELIVERED - topic: ClassVar[KafkaTopic] = KafkaTopic.NOTIFICATION_EVENTS - notification_id: str - user_id: str - channel: NotificationChannel - delivered_at: datetime - - -class NotificationFailedEvent(BaseEvent): - event_type: Literal[EventType.NOTIFICATION_FAILED] = EventType.NOTIFICATION_FAILED 
- topic: ClassVar[KafkaTopic] = KafkaTopic.NOTIFICATION_EVENTS - notification_id: str - user_id: str - channel: NotificationChannel - error: str - retry_count: int - - -class NotificationReadEvent(BaseEvent): - event_type: Literal[EventType.NOTIFICATION_READ] = EventType.NOTIFICATION_READ - topic: ClassVar[KafkaTopic] = KafkaTopic.NOTIFICATION_EVENTS - notification_id: str - user_id: str - read_at: datetime - - -class NotificationClickedEvent(BaseEvent): - event_type: Literal[EventType.NOTIFICATION_CLICKED] = EventType.NOTIFICATION_CLICKED - topic: ClassVar[KafkaTopic] = KafkaTopic.NOTIFICATION_EVENTS - notification_id: str - user_id: str - clicked_at: datetime - action: str | None = None - - -class NotificationPreferencesUpdatedEvent(BaseEvent): - event_type: Literal[EventType.NOTIFICATION_PREFERENCES_UPDATED] = EventType.NOTIFICATION_PREFERENCES_UPDATED - topic: ClassVar[KafkaTopic] = KafkaTopic.NOTIFICATION_EVENTS - user_id: str - changed_fields: list[str] = Field(default_factory=list) diff --git a/backend/app/infrastructure/kafka/events/pod.py b/backend/app/infrastructure/kafka/events/pod.py deleted file mode 100644 index b8a6138a..00000000 --- a/backend/app/infrastructure/kafka/events/pod.py +++ /dev/null @@ -1,69 +0,0 @@ -from typing import ClassVar, Literal - -from app.domain.enums.events import EventType -from app.domain.enums.kafka import KafkaTopic -from app.infrastructure.kafka.events.base import BaseEvent - - -class PodCreatedEvent(BaseEvent): - event_type: Literal[EventType.POD_CREATED] = EventType.POD_CREATED - topic: ClassVar[KafkaTopic] = KafkaTopic.POD_EVENTS - execution_id: str - pod_name: str - namespace: str - - -class PodScheduledEvent(BaseEvent): - event_type: Literal[EventType.POD_SCHEDULED] = EventType.POD_SCHEDULED - topic: ClassVar[KafkaTopic] = KafkaTopic.POD_EVENTS - execution_id: str - pod_name: str - node_name: str - - -class PodRunningEvent(BaseEvent): - event_type: Literal[EventType.POD_RUNNING] = EventType.POD_RUNNING - topic: ClassVar[KafkaTopic] = KafkaTopic.POD_STATUS_UPDATES - execution_id: str - pod_name: str - container_statuses: str # JSON serialized list of container statuses - - -class PodTerminatedEvent(BaseEvent): - event_type: Literal[EventType.POD_TERMINATED] = EventType.POD_TERMINATED - topic: ClassVar[KafkaTopic] = KafkaTopic.POD_STATUS_UPDATES - execution_id: str - pod_name: str - exit_code: int - reason: str | None = None - message: str | None = None - - -class PodSucceededEvent(BaseEvent): - event_type: Literal[EventType.POD_SUCCEEDED] = EventType.POD_SUCCEEDED - topic: ClassVar[KafkaTopic] = KafkaTopic.POD_STATUS_UPDATES - execution_id: str - pod_name: str - exit_code: int - stdout: str | None = None - stderr: str | None = None - - -class PodFailedEvent(BaseEvent): - event_type: Literal[EventType.POD_FAILED] = EventType.POD_FAILED - topic: ClassVar[KafkaTopic] = KafkaTopic.POD_STATUS_UPDATES - execution_id: str - pod_name: str - exit_code: int - reason: str | None = None - message: str | None = None - stdout: str | None = None - stderr: str | None = None - - -class PodDeletedEvent(BaseEvent): - event_type: Literal[EventType.POD_DELETED] = EventType.POD_DELETED - topic: ClassVar[KafkaTopic] = KafkaTopic.POD_EVENTS - execution_id: str - pod_name: str - reason: str | None = None diff --git a/backend/app/infrastructure/kafka/events/saga.py b/backend/app/infrastructure/kafka/events/saga.py deleted file mode 100644 index fb3b2133..00000000 --- a/backend/app/infrastructure/kafka/events/saga.py +++ /dev/null @@ -1,112 +0,0 @@ -from datetime 
import datetime -from typing import ClassVar, Literal - -from app.domain.enums.events import EventType -from app.domain.enums.kafka import KafkaTopic -from app.infrastructure.kafka.events.base import BaseEvent - - -class SagaStartedEvent(BaseEvent): - event_type: Literal[EventType.SAGA_STARTED] = EventType.SAGA_STARTED - topic: ClassVar[KafkaTopic] = KafkaTopic.SAGA_EVENTS - saga_id: str - saga_name: str - execution_id: str - initial_event_id: str - - -class SagaCompletedEvent(BaseEvent): - event_type: Literal[EventType.SAGA_COMPLETED] = EventType.SAGA_COMPLETED - topic: ClassVar[KafkaTopic] = KafkaTopic.SAGA_EVENTS - saga_id: str - saga_name: str - execution_id: str - completed_steps: list[str] - - -class SagaFailedEvent(BaseEvent): - event_type: Literal[EventType.SAGA_FAILED] = EventType.SAGA_FAILED - topic: ClassVar[KafkaTopic] = KafkaTopic.SAGA_EVENTS - saga_id: str - saga_name: str - execution_id: str - failed_step: str - error: str - - -class SagaCancelledEvent(BaseEvent): - event_type: Literal[EventType.SAGA_CANCELLED] = EventType.SAGA_CANCELLED - topic: ClassVar[KafkaTopic] = KafkaTopic.SAGA_EVENTS - saga_id: str - saga_name: str - execution_id: str - reason: str - completed_steps: list[str] - compensated_steps: list[str] - cancelled_at: datetime | None = None - cancelled_by: str | None = None - - -class SagaCompensatingEvent(BaseEvent): - event_type: Literal[EventType.SAGA_COMPENSATING] = EventType.SAGA_COMPENSATING - topic: ClassVar[KafkaTopic] = KafkaTopic.SAGA_EVENTS - saga_id: str - saga_name: str - execution_id: str - compensating_step: str - - -class SagaCompensatedEvent(BaseEvent): - event_type: Literal[EventType.SAGA_COMPENSATED] = EventType.SAGA_COMPENSATED - topic: ClassVar[KafkaTopic] = KafkaTopic.SAGA_EVENTS - saga_id: str - saga_name: str - execution_id: str - compensated_steps: list[str] - - -# Saga Command Events -class CreatePodCommandEvent(BaseEvent): - event_type: Literal[EventType.CREATE_POD_COMMAND] = EventType.CREATE_POD_COMMAND - topic: ClassVar[KafkaTopic] = KafkaTopic.SAGA_COMMANDS - saga_id: str - execution_id: str - script: str - language: str - language_version: str - runtime_image: str - runtime_command: list[str] - runtime_filename: str - timeout_seconds: int - cpu_limit: str - memory_limit: str - cpu_request: str - memory_request: str - priority: int - pod_spec: dict[str, str | int | list[str]] | None = None - - -class DeletePodCommandEvent(BaseEvent): - event_type: Literal[EventType.DELETE_POD_COMMAND] = EventType.DELETE_POD_COMMAND - topic: ClassVar[KafkaTopic] = KafkaTopic.SAGA_COMMANDS - saga_id: str - execution_id: str - reason: str - pod_name: str | None = None - namespace: str | None = None - - -class AllocateResourcesCommandEvent(BaseEvent): - event_type: Literal[EventType.ALLOCATE_RESOURCES_COMMAND] = EventType.ALLOCATE_RESOURCES_COMMAND - topic: ClassVar[KafkaTopic] = KafkaTopic.SAGA_COMMANDS - execution_id: str - cpu_request: str - memory_request: str - - -class ReleaseResourcesCommandEvent(BaseEvent): - event_type: Literal[EventType.RELEASE_RESOURCES_COMMAND] = EventType.RELEASE_RESOURCES_COMMAND - topic: ClassVar[KafkaTopic] = KafkaTopic.SAGA_COMMANDS - execution_id: str - cpu_request: str - memory_request: str diff --git a/backend/app/infrastructure/kafka/events/system.py b/backend/app/infrastructure/kafka/events/system.py deleted file mode 100644 index 36b1d341..00000000 --- a/backend/app/infrastructure/kafka/events/system.py +++ /dev/null @@ -1,123 +0,0 @@ -from typing import ClassVar, Literal - -from app.domain.enums.events import 
EventType -from app.domain.enums.kafka import KafkaTopic -from app.domain.enums.storage import StorageType -from app.infrastructure.kafka.events.base import BaseEvent - - -# Script Events -class ScriptSavedEvent(BaseEvent): - event_type: Literal[EventType.SCRIPT_SAVED] = EventType.SCRIPT_SAVED - topic: ClassVar[KafkaTopic] = KafkaTopic.SCRIPT_EVENTS - script_id: str - user_id: str - title: str - language: str - - -class ScriptDeletedEvent(BaseEvent): - event_type: Literal[EventType.SCRIPT_DELETED] = EventType.SCRIPT_DELETED - topic: ClassVar[KafkaTopic] = KafkaTopic.SCRIPT_EVENTS - script_id: str - user_id: str - deleted_by: str | None = None - - -class ScriptSharedEvent(BaseEvent): - event_type: Literal[EventType.SCRIPT_SHARED] = EventType.SCRIPT_SHARED - topic: ClassVar[KafkaTopic] = KafkaTopic.SCRIPT_EVENTS - script_id: str - shared_by: str - shared_with: list[str] - permissions: str - - -# Security Events -class SecurityViolationEvent(BaseEvent): - event_type: Literal[EventType.SECURITY_VIOLATION] = EventType.SECURITY_VIOLATION - topic: ClassVar[KafkaTopic] = KafkaTopic.SECURITY_EVENTS - user_id: str | None = None - violation_type: str - details: str - ip_address: str | None = None - - -class RateLimitExceededEvent(BaseEvent): - event_type: Literal[EventType.RATE_LIMIT_EXCEEDED] = EventType.RATE_LIMIT_EXCEEDED - topic: ClassVar[KafkaTopic] = KafkaTopic.SECURITY_EVENTS - user_id: str | None = None - endpoint: str - limit: int - window_seconds: int - - -class AuthFailedEvent(BaseEvent): - event_type: Literal[EventType.AUTH_FAILED] = EventType.AUTH_FAILED - topic: ClassVar[KafkaTopic] = KafkaTopic.SECURITY_EVENTS - username: str | None = None - reason: str - ip_address: str | None = None - - -# Resource Events -class ResourceLimitExceededEvent(BaseEvent): - event_type: Literal[EventType.RESOURCE_LIMIT_EXCEEDED] = EventType.RESOURCE_LIMIT_EXCEEDED - topic: ClassVar[KafkaTopic] = KafkaTopic.RESOURCE_EVENTS - resource_type: str - limit: int - requested: int - user_id: str | None = None - - -class QuotaExceededEvent(BaseEvent): - event_type: Literal[EventType.QUOTA_EXCEEDED] = EventType.QUOTA_EXCEEDED - topic: ClassVar[KafkaTopic] = KafkaTopic.RESOURCE_EVENTS - quota_type: str - limit: int - current_usage: int - user_id: str - - -# System Events -class SystemErrorEvent(BaseEvent): - event_type: Literal[EventType.SYSTEM_ERROR] = EventType.SYSTEM_ERROR - topic: ClassVar[KafkaTopic] = KafkaTopic.SYSTEM_EVENTS - error_type: str - message: str - service_name: str - stack_trace: str | None = None - - -class ServiceUnhealthyEvent(BaseEvent): - event_type: Literal[EventType.SERVICE_UNHEALTHY] = EventType.SERVICE_UNHEALTHY - topic: ClassVar[KafkaTopic] = KafkaTopic.SYSTEM_EVENTS - service_name: str - health_check: str - reason: str - - -class ServiceRecoveredEvent(BaseEvent): - event_type: Literal[EventType.SERVICE_RECOVERED] = EventType.SERVICE_RECOVERED - topic: ClassVar[KafkaTopic] = KafkaTopic.SYSTEM_EVENTS - service_name: str - health_check: str - downtime_seconds: int - - -# Result Events -class ResultStoredEvent(BaseEvent): - event_type: Literal[EventType.RESULT_STORED] = EventType.RESULT_STORED - topic: ClassVar[KafkaTopic] = KafkaTopic.EXECUTION_RESULTS - execution_id: str - storage_type: StorageType - storage_path: str - size_bytes: int - - -class ResultFailedEvent(BaseEvent): - event_type: Literal[EventType.RESULT_FAILED] = EventType.RESULT_FAILED - topic: ClassVar[KafkaTopic] = KafkaTopic.EXECUTION_RESULTS - execution_id: str - error: str - storage_type: StorageType | None = None diff 
--git a/backend/app/infrastructure/kafka/events/user.py b/backend/app/infrastructure/kafka/events/user.py deleted file mode 100644 index 0a6e6495..00000000 --- a/backend/app/infrastructure/kafka/events/user.py +++ /dev/null @@ -1,95 +0,0 @@ -from typing import ClassVar, Literal - -from pydantic_avro.to_avro.base import AvroBase - -from app.domain.enums.auth import LoginMethod -from app.domain.enums.events import EventType -from app.domain.enums.kafka import KafkaTopic -from app.infrastructure.kafka.events.base import BaseEvent - - -class NotificationSettingsPayload(AvroBase): - """Avro-compatible payload for notification settings changes.""" - - execution_completed: bool | None = None - execution_failed: bool | None = None - system_updates: bool | None = None - security_alerts: bool | None = None - channels: list[str] | None = None - - -class EditorSettingsPayload(AvroBase): - """Avro-compatible payload for editor settings changes.""" - - theme: str | None = None - font_size: int | None = None - tab_size: int | None = None - use_tabs: bool | None = None - word_wrap: bool | None = None - show_line_numbers: bool | None = None - - -class UserRegisteredEvent(BaseEvent): - event_type: Literal[EventType.USER_REGISTERED] = EventType.USER_REGISTERED - topic: ClassVar[KafkaTopic] = KafkaTopic.USER_EVENTS - user_id: str - username: str - email: str - - -class UserLoginEvent(BaseEvent): - event_type: Literal[EventType.USER_LOGIN] = EventType.USER_LOGIN - topic: ClassVar[KafkaTopic] = KafkaTopic.USER_EVENTS - user_id: str - login_method: LoginMethod - ip_address: str | None = None - user_agent: str | None = None - - -class UserLoggedInEvent(BaseEvent): - event_type: Literal[EventType.USER_LOGGED_IN] = EventType.USER_LOGGED_IN - topic: ClassVar[KafkaTopic] = KafkaTopic.USER_EVENTS - user_id: str - login_method: LoginMethod - ip_address: str | None = None - user_agent: str | None = None - - -class UserLoggedOutEvent(BaseEvent): - event_type: Literal[EventType.USER_LOGGED_OUT] = EventType.USER_LOGGED_OUT - topic: ClassVar[KafkaTopic] = KafkaTopic.USER_EVENTS - user_id: str - logout_reason: str | None = None - - -class UserUpdatedEvent(BaseEvent): - event_type: Literal[EventType.USER_UPDATED] = EventType.USER_UPDATED - topic: ClassVar[KafkaTopic] = KafkaTopic.USER_EVENTS - user_id: str - updated_fields: list[str] - updated_by: str | None = None - - -class UserDeletedEvent(BaseEvent): - event_type: Literal[EventType.USER_DELETED] = EventType.USER_DELETED - topic: ClassVar[KafkaTopic] = KafkaTopic.USER_EVENTS - user_id: str - deleted_by: str | None = None - reason: str | None = None - - -class UserSettingsUpdatedEvent(BaseEvent): - """Unified event for all user settings changes with typed payloads.""" - - event_type: Literal[EventType.USER_SETTINGS_UPDATED] = EventType.USER_SETTINGS_UPDATED - topic: ClassVar[KafkaTopic] = KafkaTopic.USER_SETTINGS_EVENTS - user_id: str - changed_fields: list[str] - # Typed fields for each settings category (Avro-compatible) - theme: str | None = None - timezone: str | None = None - date_format: str | None = None - time_format: str | None = None - notifications: NotificationSettingsPayload | None = None - editor: EditorSettingsPayload | None = None - reason: str | None = None diff --git a/backend/app/infrastructure/kafka/mappings.py b/backend/app/infrastructure/kafka/mappings.py index 5a056eef..764f06c0 100644 --- a/backend/app/infrastructure/kafka/mappings.py +++ b/backend/app/infrastructure/kafka/mappings.py @@ -1,164 +1,104 @@ from functools import lru_cache -from typing 
import Type +from typing import Dict, get_args, get_origin from app.domain.enums.events import EventType from app.domain.enums.kafka import KafkaTopic -from app.infrastructure.kafka.events.base import BaseEvent -from app.infrastructure.kafka.events.execution import ( - ExecutionAcceptedEvent, - ExecutionCancelledEvent, - ExecutionCompletedEvent, - ExecutionFailedEvent, - ExecutionQueuedEvent, - ExecutionRequestedEvent, - ExecutionRunningEvent, - ExecutionStartedEvent, - ExecutionTimeoutEvent, -) -from app.infrastructure.kafka.events.notification import ( - NotificationClickedEvent, - NotificationCreatedEvent, - NotificationDeliveredEvent, - NotificationFailedEvent, - NotificationPreferencesUpdatedEvent, - NotificationReadEvent, - NotificationSentEvent, -) -from app.infrastructure.kafka.events.pod import ( - PodCreatedEvent, - PodDeletedEvent, - PodFailedEvent, - PodRunningEvent, - PodScheduledEvent, - PodSucceededEvent, - PodTerminatedEvent, -) -from app.infrastructure.kafka.events.saga import ( - AllocateResourcesCommandEvent, - CreatePodCommandEvent, - DeletePodCommandEvent, - ReleaseResourcesCommandEvent, - SagaCancelledEvent, - SagaCompensatedEvent, - SagaCompensatingEvent, - SagaCompletedEvent, - SagaFailedEvent, - SagaStartedEvent, -) -from app.infrastructure.kafka.events.system import ( - AuthFailedEvent, - QuotaExceededEvent, - RateLimitExceededEvent, - ResourceLimitExceededEvent, - ResultFailedEvent, - ResultStoredEvent, - ScriptDeletedEvent, - ScriptSavedEvent, - ScriptSharedEvent, - SecurityViolationEvent, - ServiceRecoveredEvent, - ServiceUnhealthyEvent, - SystemErrorEvent, -) -from app.infrastructure.kafka.events.user import ( - UserDeletedEvent, - UserLoggedInEvent, - UserLoggedOutEvent, - UserLoginEvent, - UserRegisteredEvent, - UserSettingsUpdatedEvent, - UserUpdatedEvent, -) + +# EventType -> KafkaTopic routing +EVENT_TYPE_TO_TOPIC: Dict[EventType, KafkaTopic] = { + # Execution events + EventType.EXECUTION_REQUESTED: KafkaTopic.EXECUTION_EVENTS, + EventType.EXECUTION_ACCEPTED: KafkaTopic.EXECUTION_EVENTS, + EventType.EXECUTION_QUEUED: KafkaTopic.EXECUTION_EVENTS, + EventType.EXECUTION_STARTED: KafkaTopic.EXECUTION_EVENTS, + EventType.EXECUTION_RUNNING: KafkaTopic.EXECUTION_EVENTS, + EventType.EXECUTION_COMPLETED: KafkaTopic.EXECUTION_EVENTS, + EventType.EXECUTION_FAILED: KafkaTopic.EXECUTION_EVENTS, + EventType.EXECUTION_TIMEOUT: KafkaTopic.EXECUTION_EVENTS, + EventType.EXECUTION_CANCELLED: KafkaTopic.EXECUTION_EVENTS, + # Pod events + EventType.POD_CREATED: KafkaTopic.POD_EVENTS, + EventType.POD_SCHEDULED: KafkaTopic.POD_EVENTS, + EventType.POD_RUNNING: KafkaTopic.POD_EVENTS, + EventType.POD_SUCCEEDED: KafkaTopic.POD_EVENTS, + EventType.POD_FAILED: KafkaTopic.POD_EVENTS, + EventType.POD_TERMINATED: KafkaTopic.POD_EVENTS, + EventType.POD_DELETED: KafkaTopic.POD_EVENTS, + # Result events + EventType.RESULT_STORED: KafkaTopic.EXECUTION_RESULTS, + EventType.RESULT_FAILED: KafkaTopic.EXECUTION_RESULTS, + # User events + EventType.USER_REGISTERED: KafkaTopic.USER_EVENTS, + EventType.USER_LOGIN: KafkaTopic.USER_EVENTS, + EventType.USER_LOGGED_IN: KafkaTopic.USER_EVENTS, + EventType.USER_LOGGED_OUT: KafkaTopic.USER_EVENTS, + EventType.USER_UPDATED: KafkaTopic.USER_EVENTS, + EventType.USER_DELETED: KafkaTopic.USER_EVENTS, + EventType.USER_SETTINGS_UPDATED: KafkaTopic.USER_SETTINGS_EVENTS, + # Notification events + EventType.NOTIFICATION_CREATED: KafkaTopic.NOTIFICATION_EVENTS, + EventType.NOTIFICATION_SENT: KafkaTopic.NOTIFICATION_EVENTS, + EventType.NOTIFICATION_DELIVERED: 
KafkaTopic.NOTIFICATION_EVENTS, + EventType.NOTIFICATION_FAILED: KafkaTopic.NOTIFICATION_EVENTS, + EventType.NOTIFICATION_READ: KafkaTopic.NOTIFICATION_EVENTS, + EventType.NOTIFICATION_CLICKED: KafkaTopic.NOTIFICATION_EVENTS, + EventType.NOTIFICATION_PREFERENCES_UPDATED: KafkaTopic.NOTIFICATION_EVENTS, + # Script events + EventType.SCRIPT_SAVED: KafkaTopic.SCRIPT_EVENTS, + EventType.SCRIPT_DELETED: KafkaTopic.SCRIPT_EVENTS, + EventType.SCRIPT_SHARED: KafkaTopic.SCRIPT_EVENTS, + # Security events + EventType.SECURITY_VIOLATION: KafkaTopic.SECURITY_EVENTS, + EventType.RATE_LIMIT_EXCEEDED: KafkaTopic.SECURITY_EVENTS, + EventType.AUTH_FAILED: KafkaTopic.SECURITY_EVENTS, + # Resource events + EventType.RESOURCE_LIMIT_EXCEEDED: KafkaTopic.RESOURCE_EVENTS, + EventType.QUOTA_EXCEEDED: KafkaTopic.RESOURCE_EVENTS, + # System events + EventType.SYSTEM_ERROR: KafkaTopic.SYSTEM_EVENTS, + EventType.SERVICE_UNHEALTHY: KafkaTopic.SYSTEM_EVENTS, + EventType.SERVICE_RECOVERED: KafkaTopic.SYSTEM_EVENTS, + # Saga events + EventType.SAGA_STARTED: KafkaTopic.SAGA_EVENTS, + EventType.SAGA_COMPLETED: KafkaTopic.SAGA_EVENTS, + EventType.SAGA_FAILED: KafkaTopic.SAGA_EVENTS, + EventType.SAGA_CANCELLED: KafkaTopic.SAGA_EVENTS, + EventType.SAGA_COMPENSATING: KafkaTopic.SAGA_EVENTS, + EventType.SAGA_COMPENSATED: KafkaTopic.SAGA_EVENTS, + # Saga command events + EventType.CREATE_POD_COMMAND: KafkaTopic.SAGA_COMMANDS, + EventType.DELETE_POD_COMMAND: KafkaTopic.SAGA_COMMANDS, + EventType.ALLOCATE_RESOURCES_COMMAND: KafkaTopic.SAGA_COMMANDS, + EventType.RELEASE_RESOURCES_COMMAND: KafkaTopic.SAGA_COMMANDS, + # DLQ events + EventType.DLQ_MESSAGE_RECEIVED: KafkaTopic.DLQ_EVENTS, + EventType.DLQ_MESSAGE_RETRIED: KafkaTopic.DLQ_EVENTS, + EventType.DLQ_MESSAGE_DISCARDED: KafkaTopic.DLQ_EVENTS, +} + + +@lru_cache(maxsize=1) +def _get_event_type_to_class() -> Dict[EventType, type]: + """Build mapping from EventType to event class using DomainEvent union.""" + from app.domain.events.typed import DomainEvent + + union_type = get_args(DomainEvent)[0] + classes = list(get_args(union_type)) if get_origin(union_type) is not None else [union_type] + return {cls.model_fields["event_type"].default: cls for cls in classes} @lru_cache(maxsize=128) -def get_event_class_for_type(event_type: EventType) -> Type[BaseEvent] | None: +def get_event_class_for_type(event_type: EventType) -> type | None: """Get the event class for a given event type.""" - event_map: dict[EventType, Type[BaseEvent]] = { - # Execution events - EventType.EXECUTION_REQUESTED: ExecutionRequestedEvent, - EventType.EXECUTION_ACCEPTED: ExecutionAcceptedEvent, - EventType.EXECUTION_QUEUED: ExecutionQueuedEvent, - EventType.EXECUTION_STARTED: ExecutionStartedEvent, - EventType.EXECUTION_RUNNING: ExecutionRunningEvent, - EventType.EXECUTION_COMPLETED: ExecutionCompletedEvent, - EventType.EXECUTION_FAILED: ExecutionFailedEvent, - EventType.EXECUTION_TIMEOUT: ExecutionTimeoutEvent, - EventType.EXECUTION_CANCELLED: ExecutionCancelledEvent, - # Pod events - EventType.POD_CREATED: PodCreatedEvent, - EventType.POD_SCHEDULED: PodScheduledEvent, - EventType.POD_RUNNING: PodRunningEvent, - EventType.POD_SUCCEEDED: PodSucceededEvent, - EventType.POD_FAILED: PodFailedEvent, - EventType.POD_TERMINATED: PodTerminatedEvent, - EventType.POD_DELETED: PodDeletedEvent, - # User events - EventType.USER_REGISTERED: UserRegisteredEvent, - EventType.USER_LOGIN: UserLoginEvent, - EventType.USER_LOGGED_IN: UserLoggedInEvent, - EventType.USER_LOGGED_OUT: UserLoggedOutEvent, - EventType.USER_UPDATED: 
UserUpdatedEvent, - EventType.USER_DELETED: UserDeletedEvent, - EventType.USER_SETTINGS_UPDATED: UserSettingsUpdatedEvent, - # Notification events - EventType.NOTIFICATION_CREATED: NotificationCreatedEvent, - EventType.NOTIFICATION_SENT: NotificationSentEvent, - EventType.NOTIFICATION_DELIVERED: NotificationDeliveredEvent, - EventType.NOTIFICATION_FAILED: NotificationFailedEvent, - EventType.NOTIFICATION_READ: NotificationReadEvent, - EventType.NOTIFICATION_CLICKED: NotificationClickedEvent, - EventType.NOTIFICATION_PREFERENCES_UPDATED: NotificationPreferencesUpdatedEvent, - # Script events - EventType.SCRIPT_SAVED: ScriptSavedEvent, - EventType.SCRIPT_DELETED: ScriptDeletedEvent, - EventType.SCRIPT_SHARED: ScriptSharedEvent, - # Security events - EventType.SECURITY_VIOLATION: SecurityViolationEvent, - EventType.RATE_LIMIT_EXCEEDED: RateLimitExceededEvent, - EventType.AUTH_FAILED: AuthFailedEvent, - # Resource events - EventType.RESOURCE_LIMIT_EXCEEDED: ResourceLimitExceededEvent, - EventType.QUOTA_EXCEEDED: QuotaExceededEvent, - # System events - EventType.SYSTEM_ERROR: SystemErrorEvent, - EventType.SERVICE_UNHEALTHY: ServiceUnhealthyEvent, - EventType.SERVICE_RECOVERED: ServiceRecoveredEvent, - # Result events - EventType.RESULT_STORED: ResultStoredEvent, - EventType.RESULT_FAILED: ResultFailedEvent, - # Saga events - EventType.SAGA_STARTED: SagaStartedEvent, - EventType.SAGA_COMPLETED: SagaCompletedEvent, - EventType.SAGA_FAILED: SagaFailedEvent, - EventType.SAGA_CANCELLED: SagaCancelledEvent, - EventType.SAGA_COMPENSATING: SagaCompensatingEvent, - EventType.SAGA_COMPENSATED: SagaCompensatedEvent, - # Saga command events - EventType.CREATE_POD_COMMAND: CreatePodCommandEvent, - EventType.DELETE_POD_COMMAND: DeletePodCommandEvent, - EventType.ALLOCATE_RESOURCES_COMMAND: AllocateResourcesCommandEvent, - EventType.RELEASE_RESOURCES_COMMAND: ReleaseResourcesCommandEvent, - } - - return event_map.get(event_type) + return _get_event_type_to_class().get(event_type) @lru_cache(maxsize=128) def get_topic_for_event(event_type: EventType) -> KafkaTopic: """Get the Kafka topic for a given event type.""" - event_class = get_event_class_for_type(event_type) - if event_class: - return event_class.topic - - # Default fallback - return KafkaTopic.SYSTEM_EVENTS + return EVENT_TYPE_TO_TOPIC.get(event_type, KafkaTopic.SYSTEM_EVENTS) def get_event_types_for_topic(topic: KafkaTopic) -> list[EventType]: """Get all event types that publish to a given topic.""" - event_types = [] - for event_type in EventType: - if get_topic_for_event(event_type) == topic: - event_types.append(event_type) - return event_types + return [et for et, t in EVENT_TYPE_TO_TOPIC.items() if t == topic] diff --git a/backend/app/schemas_pydantic/dlq.py b/backend/app/schemas_pydantic/dlq.py index a9815820..78d85efe 100644 --- a/backend/app/schemas_pydantic/dlq.py +++ b/backend/app/schemas_pydantic/dlq.py @@ -3,8 +3,8 @@ from pydantic import BaseModel, ConfigDict -from app.dlq import DLQMessageStatus, RetryStrategy -from app.domain.enums.events import EventType +from app.dlq import AgeStatistics, DLQMessageStatus, EventTypeStatistic, RetryStrategy, TopicStatistic +from app.domain.events.typed import DomainEvent class DLQStats(BaseModel): @@ -12,26 +12,24 @@ class DLQStats(BaseModel): model_config = ConfigDict(from_attributes=True) - by_status: dict[str, int] - by_topic: list[dict[str, Any]] - by_event_type: list[dict[str, Any]] - age_stats: dict[str, Any] + by_status: dict[DLQMessageStatus, int] + by_topic: list[TopicStatistic] + 
by_event_type: list[EventTypeStatistic] + age_stats: AgeStatistics timestamp: datetime class DLQMessageResponse(BaseModel): - """Response model for a DLQ message.""" + """Response model for a DLQ message. Mirrors DLQMessage for direct model_validate.""" model_config = ConfigDict(from_attributes=True) - event_id: str - event_type: EventType + event: DomainEvent original_topic: str error: str retry_count: int failed_at: datetime status: DLQMessageStatus - age_seconds: float producer_id: str dlq_offset: int | None = None dlq_partition: int | None = None @@ -93,25 +91,23 @@ class DLQTopicSummaryResponse(BaseModel): class DLQMessageDetail(BaseModel): - """Detailed DLQ message response.""" + """Detailed DLQ message response. Mirrors DLQMessage for direct model_validate.""" model_config = ConfigDict(from_attributes=True) - event_id: str - event: dict[str, Any] # BaseEvent as dict - event_type: EventType + event: DomainEvent original_topic: str error: str retry_count: int failed_at: datetime status: DLQMessageStatus + producer_id: str created_at: datetime | None = None last_updated: datetime | None = None next_retry_at: datetime | None = None retried_at: datetime | None = None discarded_at: datetime | None = None discard_reason: str | None = None - producer_id: str dlq_offset: int | None = None dlq_partition: int | None = None last_error: str | None = None diff --git a/backend/app/services/coordinator/coordinator.py b/backend/app/services/coordinator/coordinator.py index ef915720..ea9b37a6 100644 --- a/backend/app/services/coordinator/coordinator.py +++ b/backend/app/services/coordinator/coordinator.py @@ -9,22 +9,22 @@ from app.core.metrics.context import get_coordinator_metrics from app.db.repositories.execution_repository import ExecutionRepository from app.domain.enums.events import EventType -from app.domain.enums.kafka import KafkaTopic +from app.domain.enums.kafka import CONSUMER_GROUP_SUBSCRIPTIONS, GroupId from app.domain.enums.storage import ExecutionErrorType -from app.events.core import ConsumerConfig, EventDispatcher, UnifiedConsumer, UnifiedProducer -from app.events.event_store import EventStore -from app.events.schema.schema_registry import ( - SchemaRegistryManager, -) -from app.infrastructure.kafka.events.execution import ( +from app.domain.events.typed import ( + CreatePodCommandEvent, + EventMetadata, ExecutionAcceptedEvent, ExecutionCancelledEvent, ExecutionCompletedEvent, ExecutionFailedEvent, ExecutionRequestedEvent, ) -from app.infrastructure.kafka.events.metadata import AvroEventMetadata as EventMetadata -from app.infrastructure.kafka.events.saga import CreatePodCommandEvent +from app.events.core import ConsumerConfig, EventDispatcher, UnifiedConsumer, UnifiedProducer +from app.events.event_store import EventStore +from app.events.schema.schema_registry import ( + SchemaRegistryManager, +) from app.services.coordinator.queue_manager import QueueManager, QueuePriority from app.services.coordinator.resource_manager import ResourceAllocation, ResourceManager from app.services.idempotency import IdempotencyManager @@ -112,9 +112,10 @@ async def _on_start(self) -> None: bootstrap_servers=self.kafka_servers, group_id=f"{self.consumer_group}.{self._settings.KAFKA_GROUP_SUFFIX}", enable_auto_commit=False, - session_timeout_ms=30000, # 30 seconds - heartbeat_interval_ms=10000, # 10 seconds (must be < session_timeout / 3) - max_poll_interval_ms=300000, # 5 minutes - max time between polls + session_timeout_ms=self._settings.KAFKA_SESSION_TIMEOUT_MS, + 
heartbeat_interval_ms=self._settings.KAFKA_HEARTBEAT_INTERVAL_MS, + max_poll_interval_ms=self._settings.KAFKA_MAX_POLL_INTERVAL_MS, + request_timeout_ms=self._settings.KAFKA_REQUEST_TIMEOUT_MS, max_poll_records=100, # Process max 100 messages at a time for flow control fetch_max_wait_ms=500, # Wait max 500ms for data (reduces latency) fetch_min_bytes=1, # Return immediately if any data available @@ -157,7 +158,7 @@ async def handle_cancelled(event: ExecutionCancelledEvent) -> None: self.logger.info("COORDINATOR: Event handlers registered with idempotency protection") - await self.idempotent_consumer.start([KafkaTopic.EXECUTION_EVENTS]) + await self.idempotent_consumer.start(list(CONSUMER_GROUP_SUBSCRIPTIONS[GroupId.EXECUTION_COORDINATOR])) # Start scheduling task self._scheduling_task = asyncio.create_task(self._scheduling_loop()) @@ -242,6 +243,10 @@ async def _handle_execution_requested(self, event: ExecutionRequestedEvent) -> N self.logger.info(f"Execution {event.execution_id} added to queue at position {position}") + # Schedule immediately if at front of queue (position 0) + if position == 0: + await self._schedule_execution(event) + except Exception as e: self.logger.error(f"Failed to handle execution request {event.execution_id}: {e}", exc_info=True) self.metrics.record_coordinator_execution_scheduled("error") @@ -384,7 +389,6 @@ async def _schedule_execution(self, event: ExecutionRequestedEvent) -> None: async def _build_command_metadata(self, request: ExecutionRequestedEvent) -> EventMetadata: """Build metadata for CreatePodCommandEvent with guaranteed user_id.""" # Prefer execution record user_id to avoid missing attribution - # Prefer execution record user_id to avoid missing attribution exec_rec = await self.execution_repository.get_execution(request.execution_id) user_id: str = exec_rec.user_id if exec_rec and exec_rec.user_id else "system" diff --git a/backend/app/services/coordinator/queue_manager.py b/backend/app/services/coordinator/queue_manager.py index e43ec861..64ba66c8 100644 --- a/backend/app/services/coordinator/queue_manager.py +++ b/backend/app/services/coordinator/queue_manager.py @@ -8,7 +8,7 @@ from typing import Any, Dict, List, Tuple from app.core.metrics.context import get_coordinator_metrics -from app.infrastructure.kafka.events import ExecutionRequestedEvent +from app.domain.events.typed import ExecutionRequestedEvent class QueuePriority(IntEnum): diff --git a/backend/app/services/event_bus.py b/backend/app/services/event_bus.py index 12136f76..455085d4 100644 --- a/backend/app/services/event_bus.py +++ b/backend/app/services/event_bus.py @@ -93,6 +93,10 @@ async def _initialize_kafka(self) -> None: auto_offset_reset="latest", enable_auto_commit=True, client_id=f"event-bus-consumer-{uuid4()}", + session_timeout_ms=self.settings.KAFKA_SESSION_TIMEOUT_MS, + heartbeat_interval_ms=self.settings.KAFKA_HEARTBEAT_INTERVAL_MS, + max_poll_interval_ms=self.settings.KAFKA_MAX_POLL_INTERVAL_MS, + request_timeout_ms=self.settings.KAFKA_REQUEST_TIMEOUT_MS, ) await self.consumer.start() diff --git a/backend/app/services/event_replay/replay_service.py b/backend/app/services/event_replay/replay_service.py index 856cdea6..ccd8eb26 100644 --- a/backend/app/services/event_replay/replay_service.py +++ b/backend/app/services/event_replay/replay_service.py @@ -13,10 +13,10 @@ from app.db.repositories.replay_repository import ReplayRepository from app.domain.admin.replay_updates import ReplaySessionUpdate from app.domain.enums.replay import ReplayStatus, ReplayTarget +from 
app.domain.events.typed import DomainEvent from app.domain.replay import ReplayConfig, ReplaySessionState from app.events.core import UnifiedProducer from app.events.event_store import EventStore -from app.infrastructure.kafka.events.base import BaseEvent from app.settings import Settings @@ -151,14 +151,14 @@ async def _handle_session_error(self, session: ReplaySessionState, error: Except self._metrics.record_replay_error(type(error).__name__) await self._update_session_in_db(session) - async def _apply_replay_delay(self, session: ReplaySessionState, event: BaseEvent) -> None: + async def _apply_replay_delay(self, session: ReplaySessionState, event: DomainEvent) -> None: if session.last_event_at and session.config.speed_multiplier < 100: time_diff = (event.timestamp - session.last_event_at).total_seconds() delay = time_diff / session.config.speed_multiplier if delay > 0: await asyncio.sleep(delay) - def _update_replay_metrics(self, session: ReplaySessionState, event: BaseEvent, success: bool) -> None: + def _update_replay_metrics(self, session: ReplaySessionState, event: DomainEvent, success: bool) -> None: if success: session.replayed_events += 1 status = "success" @@ -168,14 +168,14 @@ def _update_replay_metrics(self, session: ReplaySessionState, event: BaseEvent, self._metrics.record_event_replayed(session.config.replay_type, event.event_type, status) - async def _handle_replay_error(self, session: ReplaySessionState, event: BaseEvent, error: Exception) -> None: + async def _handle_replay_error(self, session: ReplaySessionState, event: DomainEvent, error: Exception) -> None: self.logger.error("Failed to replay event", extra={"event_id": event.event_id, "error": str(error)}) session.failed_events += 1 session.errors.append( {"timestamp": datetime.now(timezone.utc).isoformat(), "event_id": str(event.event_id), "error": str(error)} ) - async def _replay_to_kafka(self, session: ReplaySessionState, event: BaseEvent) -> bool: + async def _replay_to_kafka(self, session: ReplaySessionState, event: DomainEvent) -> bool: config = session.config if not config.preserve_timestamps: event.timestamp = datetime.now(timezone.utc) @@ -184,21 +184,21 @@ async def _replay_to_kafka(self, session: ReplaySessionState, event: BaseEvent) await self._producer.produce(event_to_produce=event) return True - async def _replay_to_callback(self, event: BaseEvent, session: ReplaySessionState) -> bool: + async def _replay_to_callback(self, event: DomainEvent, session: ReplaySessionState) -> bool: callback = self._callbacks.get(ReplayTarget.CALLBACK) if callback: await callback(event, session) return True return False - async def _replay_to_file(self, event: BaseEvent, file_path: str | None) -> bool: + async def _replay_to_file(self, event: DomainEvent, file_path: str | None) -> bool: if not file_path: self.logger.error("No target file path specified") return False await self._write_event_to_file(event, file_path) return True - async def _fetch_event_batches(self, session: ReplaySessionState) -> AsyncIterator[List[BaseEvent]]: + async def _fetch_event_batches(self, session: ReplaySessionState) -> AsyncIterator[List[DomainEvent]]: self.logger.info("Fetching events for session", extra={"session_id": session.session_id}) events_processed = 0 max_events = session.config.max_events @@ -206,7 +206,7 @@ async def _fetch_event_batches(self, session: ReplaySessionState) -> AsyncIterat async for batch_docs in self._repository.fetch_events( replay_filter=session.config.filter, batch_size=session.config.batch_size ): - batch: 
List[BaseEvent] = [] + batch: List[DomainEvent] = [] for doc in batch_docs: if max_events and events_processed >= max_events: break @@ -222,7 +222,7 @@ async def _fetch_event_batches(self, session: ReplaySessionState) -> AsyncIterat if max_events and events_processed >= max_events: break - async def _process_batch(self, session: ReplaySessionState, batch: List[BaseEvent]) -> None: + async def _process_batch(self, session: ReplaySessionState, batch: List[DomainEvent]) -> None: with trace_span( name="event_replay.process_batch", kind=SpanKind.INTERNAL, @@ -250,7 +250,7 @@ async def _process_batch(self, session: ReplaySessionState, batch: List[BaseEven session.last_event_at = event.timestamp await self._update_session_in_db(session) - async def _replay_event(self, session: ReplaySessionState, event: BaseEvent) -> bool: + async def _replay_event(self, session: ReplaySessionState, event: DomainEvent) -> bool: config = session.config attempts = config.retry_attempts if config.retry_failed else 1 @@ -278,7 +278,7 @@ async def _replay_event(self, session: ReplaySessionState, event: BaseEvent) -> return False - async def _write_event_to_file(self, event: BaseEvent, file_path: str) -> None: + async def _write_event_to_file(self, event: DomainEvent, file_path: str) -> None: if file_path not in self._file_locks: self._file_locks[file_path] = asyncio.Lock() @@ -286,7 +286,7 @@ async def _write_event_to_file(self, event: BaseEvent, file_path: str) -> None: loop = asyncio.get_running_loop() await loop.run_in_executor(None, self._write_to_file_sync, event, file_path) - def _write_to_file_sync(self, event: BaseEvent, file_path: str) -> None: + def _write_to_file_sync(self, event: DomainEvent, file_path: str) -> None: with open(file_path, "a") as f: f.write(json.dumps(event.model_dump(), default=str) + "\n") @@ -333,7 +333,9 @@ def list_sessions(self, status: ReplayStatus | None = None, limit: int = 100) -> sessions.sort(key=lambda s: s.created_at, reverse=True) return sessions[:limit] - def register_callback(self, target: ReplayTarget, callback: Callable[[BaseEvent, ReplaySessionState], Any]) -> None: + def register_callback( + self, target: ReplayTarget, callback: Callable[[DomainEvent, ReplaySessionState], Any] + ) -> None: self._callbacks[target] = callback async def cleanup_old_sessions(self, older_than_hours: int = 24) -> int: diff --git a/backend/app/services/execution_service.py b/backend/app/services/execution_service.py index 370adc90..e4455c36 100644 --- a/backend/app/services/execution_service.py +++ b/backend/app/services/execution_service.py @@ -9,6 +9,12 @@ from app.db.repositories.execution_repository import ExecutionRepository from app.domain.enums.events import EventType from app.domain.enums.execution import ExecutionStatus +from app.domain.events.typed import ( + DomainEvent, + EventMetadata, + ExecutionCancelledEvent, + ExecutionRequestedEvent, +) from app.domain.exceptions import InfrastructureError from app.domain.execution import ( DomainExecution, @@ -19,12 +25,6 @@ ) from app.events.core import UnifiedProducer from app.events.event_store import EventStore -from app.infrastructure.kafka.events.base import BaseEvent -from app.infrastructure.kafka.events.execution import ( - ExecutionCancelledEvent, - ExecutionRequestedEvent, -) -from app.infrastructure.kafka.events.metadata import AvroEventMetadata as EventMetadata from app.runtime_registry import RUNTIME_REGISTRY from app.settings import Settings @@ -288,7 +288,7 @@ async def get_execution_events( execution_id: str, event_types: 
EventFilter = None, limit: int = 100, - ) -> list[BaseEvent]: + ) -> list[DomainEvent]: """ Get all events for an execution from the event store. diff --git a/backend/app/services/idempotency/idempotency_manager.py b/backend/app/services/idempotency/idempotency_manager.py index a49a4c62..90757740 100644 --- a/backend/app/services/idempotency/idempotency_manager.py +++ b/backend/app/services/idempotency/idempotency_manager.py @@ -9,8 +9,8 @@ from pymongo.errors import DuplicateKeyError from app.core.metrics.context import get_database_metrics +from app.domain.events.typed import BaseEvent from app.domain.idempotency import IdempotencyRecord, IdempotencyStats, IdempotencyStatus -from app.infrastructure.kafka.events import BaseEvent class IdempotencyResult(BaseModel): @@ -41,7 +41,7 @@ def event_based(event: BaseEvent) -> str: @staticmethod def content_hash(event: BaseEvent, fields: set[str] | None = None) -> str: - event_dict = event.model_dump() + event_dict = event.model_dump(mode="json") event_dict.pop("event_id", None) event_dict.pop("timestamp", None) event_dict.pop("metadata", None) diff --git a/backend/app/services/idempotency/middleware.py b/backend/app/services/idempotency/middleware.py index dd410781..689897d5 100644 --- a/backend/app/services/idempotency/middleware.py +++ b/backend/app/services/idempotency/middleware.py @@ -6,8 +6,8 @@ from app.domain.enums.events import EventType from app.domain.enums.kafka import KafkaTopic +from app.domain.events.typed import DomainEvent from app.events.core import EventDispatcher, UnifiedConsumer -from app.infrastructure.kafka.events.base import BaseEvent from app.services.idempotency.idempotency_manager import IdempotencyManager @@ -16,15 +16,15 @@ class IdempotentEventHandler: def __init__( self, - handler: Callable[[BaseEvent], Awaitable[None]], + handler: Callable[[DomainEvent], Awaitable[None]], idempotency_manager: IdempotencyManager, logger: logging.Logger, key_strategy: str = "event_based", - custom_key_func: Callable[[BaseEvent], str] | None = None, + custom_key_func: Callable[[DomainEvent], str] | None = None, fields: Set[str] | None = None, ttl_seconds: int | None = None, cache_result: bool = True, - on_duplicate: Callable[[BaseEvent, Any], Any] | None = None, + on_duplicate: Callable[[DomainEvent, Any], Any] | None = None, ): self.handler = handler self.idempotency_manager = idempotency_manager @@ -36,7 +36,7 @@ def __init__( self.cache_result = cache_result self.on_duplicate = on_duplicate - async def __call__(self, event: BaseEvent) -> None: + async def __call__(self, event: DomainEvent) -> None: """Process event with idempotency check""" self.logger.info( f"IdempotentEventHandler called for event {event.event_type}, " @@ -94,15 +94,15 @@ def idempotent_handler( idempotency_manager: IdempotencyManager, logger: logging.Logger, key_strategy: str = "event_based", - custom_key_func: Callable[[BaseEvent], str] | None = None, + custom_key_func: Callable[[DomainEvent], str] | None = None, fields: Set[str] | None = None, ttl_seconds: int | None = None, cache_result: bool = True, - on_duplicate: Callable[[BaseEvent, Any], Any] | None = None, -) -> Callable[[Callable[[BaseEvent], Awaitable[None]]], Callable[[BaseEvent], Awaitable[None]]]: + on_duplicate: Callable[[DomainEvent, Any], Any] | None = None, +) -> Callable[[Callable[[DomainEvent], Awaitable[None]]], Callable[[DomainEvent], Awaitable[None]]]: """Decorator for making event handlers idempotent""" - def decorator(func: Callable[[BaseEvent], Awaitable[None]]) -> Callable[[BaseEvent], 
Awaitable[None]]: + def decorator(func: Callable[[DomainEvent], Awaitable[None]]) -> Callable[[DomainEvent], Awaitable[None]]: handler = IdempotentEventHandler( handler=func, idempotency_manager=idempotency_manager, @@ -139,7 +139,7 @@ def __init__( self.default_key_strategy = default_key_strategy self.default_ttl_seconds = default_ttl_seconds self.enable_for_all_handlers = enable_for_all_handlers - self._original_handlers: Dict[EventType, list[Callable[[BaseEvent], Awaitable[None]]]] = {} + self._original_handlers: Dict[EventType, list[Callable[[DomainEvent], Awaitable[None]]]] = {} def make_handlers_idempotent(self) -> None: """Wrap all registered handlers with idempotency""" @@ -157,7 +157,7 @@ def make_handlers_idempotent(self) -> None: # Wrap each handler for event_type, handlers in self._original_handlers.items(): - wrapped_handlers: list[Callable[[BaseEvent], Awaitable[None]]] = [] + wrapped_handlers: list[Callable[[DomainEvent], Awaitable[None]]] = [] for handler in handlers: # Wrap with idempotency - IdempotentEventHandler is callable with the right signature wrapped = IdempotentEventHandler( @@ -180,13 +180,13 @@ def make_handlers_idempotent(self) -> None: def subscribe_idempotent_handler( self, event_type: str, - handler: Callable[[BaseEvent], Awaitable[None]], + handler: Callable[[DomainEvent], Awaitable[None]], key_strategy: str | None = None, - custom_key_func: Callable[[BaseEvent], str] | None = None, + custom_key_func: Callable[[DomainEvent], str] | None = None, fields: Set[str] | None = None, ttl_seconds: int | None = None, cache_result: bool = True, - on_duplicate: Callable[[BaseEvent, Any], Any] | None = None, + on_duplicate: Callable[[DomainEvent, Any], Any] | None = None, ) -> None: """Subscribe an idempotent handler for specific event type""" # Create the idempotent handler wrapper @@ -252,7 +252,7 @@ async def async_handler(message: Any) -> Any: # Register with the dispatcher if available if self.dispatcher: # Create wrapper for EventDispatcher - async def dispatch_handler(event: BaseEvent) -> None: + async def dispatch_handler(event: DomainEvent) -> None: await idempotent_wrapper(event) self.dispatcher.register(EventType(event_type))(dispatch_handler) diff --git a/backend/app/services/k8s_worker/config.py b/backend/app/services/k8s_worker/config.py index ebc2c953..3b1831c4 100644 --- a/backend/app/services/k8s_worker/config.py +++ b/backend/app/services/k8s_worker/config.py @@ -1,14 +1,14 @@ import os from dataclasses import dataclass, field -from app.domain.enums.kafka import KafkaTopic +from app.domain.enums.kafka import CONSUMER_GROUP_SUBSCRIPTIONS, GroupId, KafkaTopic @dataclass class K8sWorkerConfig: # Kafka settings consumer_group: str = "kubernetes-worker" - topics: list[KafkaTopic] = field(default_factory=lambda: [KafkaTopic.EXECUTION_TASKS]) + topics: list[KafkaTopic] = field(default_factory=lambda: list(CONSUMER_GROUP_SUBSCRIPTIONS[GroupId.K8S_WORKER])) # Kubernetes settings namespace: str = os.environ.get("K8S_NAMESPACE", "integr8scode") diff --git a/backend/app/services/k8s_worker/pod_builder.py b/backend/app/services/k8s_worker/pod_builder.py index c4db7a48..e401be7c 100644 --- a/backend/app/services/k8s_worker/pod_builder.py +++ b/backend/app/services/k8s_worker/pod_builder.py @@ -1,6 +1,6 @@ from kubernetes import client as k8s_client -from app.infrastructure.kafka.events.saga import CreatePodCommandEvent +from app.domain.events.typed import CreatePodCommandEvent from app.services.k8s_worker.config import K8sWorkerConfig diff --git 
a/backend/app/services/k8s_worker/worker.py b/backend/app/services/k8s_worker/worker.py index 177c9e46..5a1c0ccc 100644 --- a/backend/app/services/k8s_worker/worker.py +++ b/backend/app/services/k8s_worker/worker.py @@ -12,20 +12,21 @@ from app.core.lifecycle import LifecycleEnabled from app.core.metrics import ExecutionMetrics, KubernetesMetrics from app.domain.enums.events import EventType -from app.domain.enums.kafka import KafkaTopic +from app.domain.enums.kafka import CONSUMER_GROUP_SUBSCRIPTIONS, GroupId from app.domain.enums.storage import ExecutionErrorType +from app.domain.events.typed import ( + CreatePodCommandEvent, + DeletePodCommandEvent, + DomainEvent, + ExecutionFailedEvent, + ExecutionStartedEvent, + PodCreatedEvent, +) from app.events.core import ConsumerConfig, EventDispatcher, UnifiedConsumer, UnifiedProducer from app.events.event_store import EventStore from app.events.schema.schema_registry import ( SchemaRegistryManager, ) -from app.infrastructure.kafka.events.base import BaseEvent -from app.infrastructure.kafka.events.execution import ( - ExecutionFailedEvent, - ExecutionStartedEvent, -) -from app.infrastructure.kafka.events.pod import PodCreatedEvent -from app.infrastructure.kafka.events.saga import CreatePodCommandEvent, DeletePodCommandEvent from app.runtime_registry import RUNTIME_REGISTRY from app.services.idempotency import IdempotencyManager from app.services.idempotency.middleware import IdempotentConsumerWrapper @@ -47,14 +48,14 @@ class KubernetesWorker(LifecycleEnabled): """ def __init__( - self, - config: K8sWorkerConfig, - producer: UnifiedProducer, - schema_registry_manager: SchemaRegistryManager, - settings: Settings, - event_store: EventStore, - idempotency_manager: IdempotencyManager, - logger: logging.Logger, + self, + config: K8sWorkerConfig, + producer: UnifiedProducer, + schema_registry_manager: SchemaRegistryManager, + settings: Settings, + event_store: EventStore, + idempotency_manager: IdempotencyManager, + logger: logging.Logger, ): super().__init__() self.logger = logger @@ -107,6 +108,10 @@ async def _on_start(self) -> None: bootstrap_servers=self.kafka_servers, group_id=f"{self.config.consumer_group}.{self._settings.KAFKA_GROUP_SUFFIX}", enable_auto_commit=False, + session_timeout_ms=self._settings.KAFKA_SESSION_TIMEOUT_MS, + heartbeat_interval_ms=self._settings.KAFKA_HEARTBEAT_INTERVAL_MS, + max_poll_interval_ms=self._settings.KAFKA_MAX_POLL_INTERVAL_MS, + request_timeout_ms=self._settings.KAFKA_REQUEST_TIMEOUT_MS, ) # Create dispatcher and register handlers for saga commands @@ -134,8 +139,8 @@ async def _on_start(self) -> None: enable_for_all_handlers=True, # Enable idempotency for all handlers ) - # Start the consumer with idempotency - listen to saga commands topic - await self.idempotent_consumer.start([KafkaTopic.SAGA_COMMANDS]) + # Start the consumer with idempotency - topics from centralized config + await self.idempotent_consumer.start(list(CONSUMER_GROUP_SUBSCRIPTIONS[GroupId.K8S_WORKER])) # Create daemonset for image pre-pulling asyncio.create_task(self.ensure_image_pre_puller_daemonset()) @@ -211,13 +216,13 @@ def _initialize_kubernetes_client(self) -> None: self.logger.error(f"Failed to initialize Kubernetes client: {e}") raise - async def _handle_create_pod_command_wrapper(self, event: BaseEvent) -> None: + async def _handle_create_pod_command_wrapper(self, event: DomainEvent) -> None: """Wrapper for handling CreatePodCommandEvent with type safety.""" assert isinstance(event, CreatePodCommandEvent) 
self.logger.info(f"Processing create_pod_command for execution {event.execution_id} from saga {event.saga_id}") await self._handle_create_pod_command(event) - async def _handle_delete_pod_command_wrapper(self, event: BaseEvent) -> None: + async def _handle_delete_pod_command_wrapper(self, event: DomainEvent) -> None: """Wrapper for handling DeletePodCommandEvent.""" assert isinstance(event, DeletePodCommandEvent) self.logger.info(f"Processing delete_pod_command for execution {event.execution_id} from saga {event.saga_id}") diff --git a/backend/app/services/kafka_event_service.py b/backend/app/services/kafka_event_service.py index d2ce2456..1b25a34f 100644 --- a/backend/app/services/kafka_event_service.py +++ b/backend/app/services/kafka_event_service.py @@ -11,12 +11,9 @@ from app.core.tracing.utils import inject_trace_context from app.db.repositories.event_repository import EventRepository from app.domain.enums.events import EventType -from app.domain.events import EventMetadata as DomainEventMetadata from app.domain.events import domain_event_adapter +from app.domain.events.typed import DomainEvent, EventMetadata from app.events.core import UnifiedProducer -from app.infrastructure.kafka.events.base import BaseEvent -from app.infrastructure.kafka.events.metadata import AvroEventMetadata -from app.infrastructure.kafka.mappings import get_event_class_for_type from app.settings import Settings tracer = trace.get_tracer(__name__) @@ -42,7 +39,7 @@ async def publish_event( payload: Dict[str, Any], aggregate_id: str | None, correlation_id: str | None = None, - metadata: AvroEventMetadata | None = None, + metadata: EventMetadata | None = None, ) -> str: """ Publish an event to Kafka and store an audit copy via the repository @@ -67,22 +64,17 @@ async def publish_event( if not correlation_id: correlation_id = CorrelationContext.get_correlation_id() - # Create or enrich event metadata (Avro for Kafka) - avro_metadata = metadata or AvroEventMetadata( + event_metadata = metadata or EventMetadata( service_name=self.settings.SERVICE_NAME, service_version=self.settings.SERVICE_VERSION, + correlation_id=correlation_id or str(uuid4()), ) - avro_metadata = avro_metadata.with_correlation(correlation_id or str(uuid4())) + if correlation_id and event_metadata.correlation_id != correlation_id: + event_metadata = event_metadata.model_copy(update={"correlation_id": correlation_id}) - # Create event event_id = str(uuid4()) timestamp = datetime.now(timezone.utc) - # Convert to domain metadata for storage - domain_metadata = DomainEventMetadata( - **avro_metadata.model_dump(include=set(DomainEventMetadata.model_fields.keys())) - ) - # Create typed domain event via discriminated union adapter event_data = { "event_id": event_id, @@ -90,36 +82,17 @@ async def publish_event( "event_version": "1.0", "timestamp": timestamp, "aggregate_id": aggregate_id, - "metadata": domain_metadata, + "metadata": event_metadata, **payload, } domain_event = domain_event_adapter.validate_python(event_data) - _ = await self.event_repository.store_event(domain_event) - - # Get event class and create proper event instance - event_class = get_event_class_for_type(event_type) - if not event_class: - raise ValueError(f"No event class found for event type: {event_type}") - - # Create proper Kafka event instance with all required fields - kafka_event_data = { - "event_id": domain_event.event_id, - "event_type": event_type, - "event_version": "1.0", - "timestamp": timestamp, - "aggregate_id": aggregate_id, - "metadata": avro_metadata, - 
**payload, # Include event-specific payload fields - } - - # Create the typed Kafka event instance - kafka_event = event_class(**kafka_event_data) + await self.event_repository.store_event(domain_event) - # Prepare headers (all values must be strings for UnifiedProducer) + # Prepare headers headers: Dict[str, str] = { "event_type": event_type, - "correlation_id": domain_metadata.correlation_id or "", - "service": avro_metadata.service_name, + "correlation_id": event_metadata.correlation_id or "", + "service": event_metadata.service_name, } # Add trace context @@ -132,31 +105,18 @@ async def publish_event( headers = inject_trace_context(headers) # Publish to Kafka - await self.kafka_producer.produce(event_to_produce=kafka_event, key=aggregate_id, headers=headers) - + await self.kafka_producer.produce(domain_event, aggregate_id, headers) self.metrics.record_event_published(event_type) - - # Record processing duration - duration = time.time() - start_time - self.metrics.record_event_processing_duration(duration, event_type) - - self.logger.info( - "Event published", - extra={ - "event_type": event_type, - "event_id": kafka_event.event_id, - "topic": getattr(kafka_event, "topic", "unknown"), - }, - ) - - return kafka_event.event_id + self.metrics.record_event_processing_duration(time.time() - start_time, event_type) + self.logger.info("Event published", extra={"event_type": event_type, "event_id": domain_event.event_id}) + return domain_event.event_id async def publish_execution_event( self, event_type: EventType, execution_id: str, status: str, - metadata: AvroEventMetadata | None = None, + metadata: EventMetadata | None = None, error_message: str | None = None, ) -> str: """Publish execution-related event using provided metadata (no framework coupling).""" @@ -199,7 +159,7 @@ async def publish_pod_event( execution_id: str, namespace: str = "integr8scode", status: str | None = None, - metadata: AvroEventMetadata | None = None, + metadata: EventMetadata | None = None, ) -> str: """Publish pod-related event""" payload = {"pod_name": pod_name, "execution_id": execution_id, "namespace": namespace} @@ -214,83 +174,31 @@ async def publish_pod_event( metadata=metadata, ) - async def publish_base_event(self, event: BaseEvent, key: str | None = None) -> str: - """ - Publish a pre-built BaseEvent to Kafka and store an audit copy. - - Used by PodMonitor and other components that create fully-formed events. - This ensures events are stored in the events collection AND published to Kafka. 
- - Args: - event: Pre-built BaseEvent with all fields populated - key: Optional Kafka message key (defaults to aggregate_id) - - Returns: - Event ID of published event - """ - with tracer.start_as_current_span("publish_base_event") as span: + async def publish_domain_event(self, event: DomainEvent, key: str | None = None) -> str: + """Publish a pre-built DomainEvent to Kafka and store an audit copy.""" + with tracer.start_as_current_span("publish_domain_event") as span: span.set_attribute("event.type", event.event_type) if event.aggregate_id: span.set_attribute("aggregate.id", event.aggregate_id) start_time = time.time() + await self.event_repository.store_event(event) - # Convert to domain metadata for storage - domain_metadata = DomainEventMetadata(**event.metadata.model_dump()) - - # Build payload from event attributes (exclude base fields) - base_fields = {"event_id", "event_type", "event_version", "timestamp", "aggregate_id", "metadata", "topic"} - payload = {k: v for k, v in event.model_dump().items() if k not in base_fields} - - # Create typed domain event via discriminated union adapter - event_data = { - "event_id": event.event_id, - "event_type": event.event_type, - "event_version": event.event_version, - "timestamp": event.timestamp, - "aggregate_id": event.aggregate_id, - "metadata": domain_metadata, - **payload, - } - domain_event = domain_event_adapter.validate_python(event_data) - await self.event_repository.store_event(domain_event) - - # Prepare headers headers: Dict[str, str] = { "event_type": event.event_type, "correlation_id": event.metadata.correlation_id or "", "service": event.metadata.service_name, } - - # Add trace context span_context = span.get_span_context() if span_context.is_valid: headers["trace_id"] = f"{span_context.trace_id:032x}" headers["span_id"] = f"{span_context.span_id:016x}" - headers = inject_trace_context(headers) - # Publish to Kafka - await self.kafka_producer.produce( - event_to_produce=event, - key=key or event.aggregate_id, - headers=headers, - ) - + await self.kafka_producer.produce(event, key or event.aggregate_id, headers) self.metrics.record_event_published(event.event_type) - - duration = time.time() - start_time - self.metrics.record_event_processing_duration(duration, event.event_type) - - self.logger.info( - "Base event published", - extra={ - "event_type": event.event_type, - "event_id": event.event_id, - "aggregate_id": event.aggregate_id, - }, - ) - + self.metrics.record_event_processing_duration(time.time() - start_time, event.event_type) + self.logger.info("Domain event published", extra={"event_id": event.event_id}) return event.event_id async def close(self) -> None: diff --git a/backend/app/services/notification_service.py b/backend/app/services/notification_service.py index 7a2c0a61..eb6f79ad 100644 --- a/backend/app/services/notification_service.py +++ b/backend/app/services/notification_service.py @@ -19,6 +19,12 @@ NotificationStatus, ) from app.domain.enums.user import UserRole +from app.domain.events.typed import ( + DomainEvent, + ExecutionCompletedEvent, + ExecutionFailedEvent, + ExecutionTimeoutEvent, +) from app.domain.notification import ( DomainNotification, DomainNotificationCreate, @@ -32,12 +38,6 @@ ) from app.events.core import ConsumerConfig, EventDispatcher, UnifiedConsumer from app.events.schema.schema_registry import SchemaRegistryManager -from app.infrastructure.kafka.events.base import BaseEvent -from app.infrastructure.kafka.events.execution import ( - ExecutionCompletedEvent, - ExecutionFailedEvent, - 
ExecutionTimeoutEvent, -) from app.infrastructure.kafka.mappings import get_topic_for_event from app.schemas_pydantic.sse import RedisNotificationMessage from app.services.event_bus import EventBusManager @@ -221,7 +221,11 @@ async def _subscribe_to_events(self) -> None: group_id=f"{GroupId.NOTIFICATION_SERVICE}.{self.settings.KAFKA_GROUP_SUFFIX}", max_poll_records=10, enable_auto_commit=True, - auto_offset_reset="latest", # Only process new events + auto_offset_reset="latest", + session_timeout_ms=self.settings.KAFKA_SESSION_TIMEOUT_MS, + heartbeat_interval_ms=self.settings.KAFKA_HEARTBEAT_INTERVAL_MS, + max_poll_interval_ms=self.settings.KAFKA_MAX_POLL_INTERVAL_MS, + request_timeout_ms=self.settings.KAFKA_REQUEST_TIMEOUT_MS, ) execution_results_topic = get_topic_for_event(EventType.EXECUTION_COMPLETED) @@ -327,7 +331,7 @@ async def create_notification( }, ) - asyncio.create_task(self._deliver_notification(notification)) + await self._deliver_notification(notification) return notification @@ -628,9 +632,8 @@ async def _handle_execution_completed_typed(self, event: ExecutionCompletedEvent return title = f"Execution Completed: {event.execution_id}" - body = ( - f"Your execution completed successfully. Duration: {event.resource_usage.execution_time_wall_seconds:.2f}s." - ) + duration = event.resource_usage.execution_time_wall_seconds if event.resource_usage else 0.0 + body = f"Your execution completed successfully. Duration: {duration:.2f}s." await self.create_notification( user_id=user_id, subject=title, @@ -642,7 +645,7 @@ async def _handle_execution_completed_typed(self, event: ExecutionCompletedEvent ), ) - async def _handle_execution_event(self, event: BaseEvent) -> None: + async def _handle_execution_event(self, event: DomainEvent) -> None: """Unified handler for execution result events.""" try: if isinstance(event, ExecutionCompletedEvent): diff --git a/backend/app/services/pod_monitor/event_mapper.py b/backend/app/services/pod_monitor/event_mapper.py index c608035a..c34b530f 100644 --- a/backend/app/services/pod_monitor/event_mapper.py +++ b/backend/app/services/pod_monitor/event_mapper.py @@ -8,23 +8,21 @@ from app.domain.enums.kafka import GroupId from app.domain.enums.storage import ExecutionErrorType -from app.domain.execution import ResourceUsageDomain -from app.infrastructure.kafka.events.base import BaseEvent -from app.infrastructure.kafka.events.execution import ( +from app.domain.events.typed import ( + DomainEvent, + EventMetadata, ExecutionCompletedEvent, ExecutionFailedEvent, ExecutionTimeoutEvent, -) -from app.infrastructure.kafka.events.metadata import AvroEventMetadata as EventMetadata -from app.infrastructure.kafka.events.pod import ( PodRunningEvent, PodScheduledEvent, PodTerminatedEvent, ) +from app.domain.execution import ResourceUsageDomain # Python 3.12 type aliases type PodPhase = str -type EventList = list[BaseEvent] +type EventList = list[DomainEvent] @dataclass(frozen=True) @@ -51,7 +49,7 @@ class PodLogs: class EventMapper(Protocol): """Protocol for event mapping functions""" - def __call__(self, ctx: PodContext) -> BaseEvent | None: ... + def __call__(self, ctx: PodContext) -> DomainEvent | None: ... 
class PodEventMapper: @@ -111,7 +109,7 @@ def map_pod_event(self, pod: k8s_client.V1Pod, event_type: str) -> EventList: ) # Collect events from mappers - events: list[BaseEvent] = [] + events: list[DomainEvent] = [] # Check for timeout first - if pod timed out, only return timeout event if timeout_event := self._check_timeout(ctx): @@ -271,7 +269,7 @@ def _map_completed(self, ctx: PodContext) -> ExecutionCompletedEvent | None: self.logger.info(f"POD-EVENT: mapped completed exec={ctx.execution_id} exit_code={logs.exit_code}") return evt - def _map_failed_or_completed(self, ctx: PodContext) -> BaseEvent | None: + def _map_failed_or_completed(self, ctx: PodContext) -> DomainEvent | None: """Map failed pod to either timeout, completed, or failed""" if ctx.pod.status and ctx.pod.status.reason == "DeadlineExceeded": return self._check_timeout(ctx) diff --git a/backend/app/services/pod_monitor/monitor.py b/backend/app/services/pod_monitor/monitor.py index 69028f25..f6325ab3 100644 --- a/backend/app/services/pod_monitor/monitor.py +++ b/backend/app/services/pod_monitor/monitor.py @@ -14,7 +14,7 @@ from app.core.lifecycle import LifecycleEnabled from app.core.metrics.context import get_kubernetes_metrics from app.core.utils import StringEnum -from app.infrastructure.kafka.events import BaseEvent +from app.domain.events.typed import DomainEvent from app.services.kafka_event_service import KafkaEventService from app.services.pod_monitor.config import PodMonitorConfig from app.services.pod_monitor.event_mapper import PodEventMapper @@ -323,7 +323,7 @@ async def _process_pod_event(self, event: PodEvent) -> None: self.logger.error(f"Error processing pod event: {e}", exc_info=True) self._metrics.record_pod_monitor_watch_error(ErrorType.PROCESSING_ERROR) - async def _publish_event(self, event: BaseEvent, pod: k8s_client.V1Pod) -> None: + async def _publish_event(self, event: DomainEvent, pod: k8s_client.V1Pod) -> None: """Publish event to Kafka and store in events collection.""" try: # Add correlation ID from pod labels @@ -333,7 +333,7 @@ async def _publish_event(self, event: BaseEvent, pod: k8s_client.V1Pod) -> None: execution_id = getattr(event, "execution_id", None) or event.aggregate_id key = str(execution_id or (pod.metadata.name if pod.metadata else "unknown")) - await self._kafka_event_service.publish_base_event(event=event, key=key) + await self._kafka_event_service.publish_domain_event(event=event, key=key) phase = pod.status.phase if pod.status else "Unknown" self._metrics.record_pod_monitor_event_published(event.event_type, phase) diff --git a/backend/app/services/result_processor/processor.py b/backend/app/services/result_processor/processor.py index 6a709787..530dbd15 100644 --- a/backend/app/services/result_processor/processor.py +++ b/backend/app/services/result_processor/processor.py @@ -10,22 +10,20 @@ from app.db.repositories.execution_repository import ExecutionRepository from app.domain.enums.events import EventType from app.domain.enums.execution import ExecutionStatus -from app.domain.enums.kafka import GroupId, KafkaTopic +from app.domain.enums.kafka import CONSUMER_GROUP_SUBSCRIPTIONS, GroupId, KafkaTopic from app.domain.enums.storage import ExecutionErrorType, StorageType -from app.domain.execution import ExecutionNotFoundError, ExecutionResultDomain -from app.events.core import ConsumerConfig, EventDispatcher, UnifiedConsumer, UnifiedProducer -from app.events.schema.schema_registry import SchemaRegistryManager -from app.infrastructure.kafka import BaseEvent -from 
app.infrastructure.kafka.events.execution import ( +from app.domain.events.typed import ( + DomainEvent, + EventMetadata, ExecutionCompletedEvent, ExecutionFailedEvent, ExecutionTimeoutEvent, -) -from app.infrastructure.kafka.events.metadata import AvroEventMetadata as EventMetadata -from app.infrastructure.kafka.events.system import ( ResultFailedEvent, ResultStoredEvent, ) +from app.domain.execution import ExecutionNotFoundError, ExecutionResultDomain +from app.events.core import ConsumerConfig, EventDispatcher, UnifiedConsumer, UnifiedProducer +from app.events.schema.schema_registry import SchemaRegistryManager from app.services.idempotency import IdempotencyManager from app.services.idempotency.middleware import IdempotentConsumerWrapper from app.settings import Settings @@ -46,11 +44,7 @@ class ResultProcessorConfig(BaseModel): consumer_group: GroupId = Field(default=GroupId.RESULT_PROCESSOR) topics: list[KafkaTopic] = Field( - default_factory=lambda: [ - KafkaTopic.EXECUTION_COMPLETED, - KafkaTopic.EXECUTION_FAILED, - KafkaTopic.EXECUTION_TIMEOUT, - ] + default_factory=lambda: list(CONSUMER_GROUP_SUBSCRIPTIONS[GroupId.RESULT_PROCESSOR]) ) result_topic: KafkaTopic = Field(default=KafkaTopic.EXECUTION_RESULTS) batch_size: int = Field(default=10) @@ -61,13 +55,13 @@ class ResultProcessor(LifecycleEnabled): """Service for processing execution completion events and storing results.""" def __init__( - self, - execution_repo: ExecutionRepository, - producer: UnifiedProducer, - schema_registry: SchemaRegistryManager, - settings: Settings, - idempotency_manager: IdempotencyManager, - logger: logging.Logger, + self, + execution_repo: ExecutionRepository, + producer: UnifiedProducer, + schema_registry: SchemaRegistryManager, + settings: Settings, + idempotency_manager: IdempotencyManager, + logger: logging.Logger, ) -> None: """Initialize the result processor.""" super().__init__() @@ -127,6 +121,10 @@ async def _create_consumer(self) -> IdempotentConsumerWrapper: max_poll_records=1, enable_auto_commit=True, auto_offset_reset="earliest", + session_timeout_ms=self._settings.KAFKA_SESSION_TIMEOUT_MS, + heartbeat_interval_ms=self._settings.KAFKA_HEARTBEAT_INTERVAL_MS, + max_poll_interval_ms=self._settings.KAFKA_MAX_POLL_INTERVAL_MS, + request_timeout_ms=self._settings.KAFKA_REQUEST_TIMEOUT_MS, ) # Create consumer with schema registry and dispatcher @@ -152,17 +150,17 @@ async def _create_consumer(self) -> IdempotentConsumerWrapper: await wrapper.start(self.config.topics) return wrapper - # Wrappers accepting BaseEvent to satisfy dispatcher typing + # Wrappers accepting DomainEvent to satisfy dispatcher typing - async def _handle_completed_wrapper(self, event: BaseEvent) -> None: + async def _handle_completed_wrapper(self, event: DomainEvent) -> None: assert isinstance(event, ExecutionCompletedEvent) await self._handle_completed(event) - async def _handle_failed_wrapper(self, event: BaseEvent) -> None: + async def _handle_failed_wrapper(self, event: DomainEvent) -> None: assert isinstance(event, ExecutionFailedEvent) await self._handle_failed(event) - async def _handle_timeout_wrapper(self, event: BaseEvent) -> None: + async def _handle_timeout_wrapper(self, event: DomainEvent) -> None: assert isinstance(event, ExecutionTimeoutEvent) await self._handle_timeout(event) @@ -177,20 +175,21 @@ async def _handle_completed(self, event: ExecutionCompletedEvent) -> None: # Record metrics for successful completion self._metrics.record_script_execution(ExecutionStatus.COMPLETED, lang_and_version) - 
runtime_seconds = event.resource_usage.execution_time_wall_seconds - self._metrics.record_execution_duration(runtime_seconds, lang_and_version) - - # Record memory utilization - memory_mib = event.resource_usage.peak_memory_kb / 1024 - self._metrics.record_memory_usage(memory_mib, lang_and_version) - - # Calculate and record memory utilization percentage - settings_limit = self._settings.K8S_POD_MEMORY_LIMIT - memory_limit_mib = int(settings_limit.rstrip("Mi")) # TODO: Less brittle acquisition of limit - memory_percent = (memory_mib / memory_limit_mib) * 100 - self._metrics.memory_utilization_percent.record( - memory_percent, attributes={"lang_and_version": lang_and_version} - ) + if event.resource_usage: + runtime_seconds = event.resource_usage.execution_time_wall_seconds + self._metrics.record_execution_duration(runtime_seconds, lang_and_version) + + # Record memory utilization + memory_mib = event.resource_usage.peak_memory_kb / 1024 + self._metrics.record_memory_usage(memory_mib, lang_and_version) + + # Calculate and record memory utilization percentage + settings_limit = self._settings.K8S_POD_MEMORY_LIMIT + memory_limit_mib = int(settings_limit.rstrip("Mi")) # TODO: Less brittle acquisition of limit + memory_percent = (memory_mib / memory_limit_mib) * 100 + self._metrics.memory_utilization_percent.record( + memory_percent, attributes={"lang_and_version": lang_and_version} + ) result = ExecutionResultDomain( execution_id=event.execution_id, @@ -217,7 +216,7 @@ async def _handle_failed(self, event: ExecutionFailedEvent) -> None: if exec_obj is None: raise ExecutionNotFoundError(event.execution_id) - self._metrics.record_error(event.error_type) + self._metrics.record_error(str(event.error_type) if event.error_type else "unknown") lang_and_version = f"{exec_obj.lang}-{exec_obj.lang_version}" self._metrics.record_script_execution(ExecutionStatus.FAILED, lang_and_version) diff --git a/backend/app/services/saga/execution_saga.py b/backend/app/services/saga/execution_saga.py index 4ea62100..5cc430e2 100644 --- a/backend/app/services/saga/execution_saga.py +++ b/backend/app/services/saga/execution_saga.py @@ -3,11 +3,9 @@ from app.db.repositories.resource_allocation_repository import ResourceAllocationRepository from app.domain.enums.events import EventType +from app.domain.events.typed import CreatePodCommandEvent, DeletePodCommandEvent, EventMetadata, ExecutionRequestedEvent from app.domain.saga import DomainResourceAllocationCreate from app.events.core import UnifiedProducer -from app.infrastructure.kafka.events.execution import ExecutionRequestedEvent -from app.infrastructure.kafka.events.metadata import AvroEventMetadata as EventMetadata -from app.infrastructure.kafka.events.saga import CreatePodCommandEvent, DeletePodCommandEvent from .base_saga import BaseSaga from .saga_step import CompensationStep, SagaContext, SagaStep diff --git a/backend/app/services/saga/saga_orchestrator.py b/backend/app/services/saga/saga_orchestrator.py index ad84a235..4fef4167 100644 --- a/backend/app/services/saga/saga_orchestrator.py +++ b/backend/app/services/saga/saga_orchestrator.py @@ -11,13 +11,11 @@ from app.db.repositories.resource_allocation_repository import ResourceAllocationRepository from app.db.repositories.saga_repository import SagaRepository from app.domain.enums.saga import SagaState +from app.domain.events.typed import DomainEvent, EventMetadata, SagaCancelledEvent from app.domain.saga.models import Saga, SagaConfig from app.events.core import ConsumerConfig, EventDispatcher, 
UnifiedConsumer, UnifiedProducer from app.events.event_store import EventStore from app.events.schema.schema_registry import SchemaRegistryManager -from app.infrastructure.kafka.events.base import BaseEvent -from app.infrastructure.kafka.events.metadata import AvroEventMetadata as EventMetadata -from app.infrastructure.kafka.events.saga import SagaCancelledEvent from app.infrastructure.kafka.mappings import get_topic_for_event from app.services.idempotency import IdempotentConsumerWrapper from app.services.idempotency.idempotency_manager import IdempotencyManager @@ -121,6 +119,10 @@ async def _start_consumer(self) -> None: bootstrap_servers=self._settings.KAFKA_BOOTSTRAP_SERVERS, group_id=f"saga-{self.config.name}.{self._settings.KAFKA_GROUP_SUFFIX}", enable_auto_commit=False, + session_timeout_ms=self._settings.KAFKA_SESSION_TIMEOUT_MS, + heartbeat_interval_ms=self._settings.KAFKA_HEARTBEAT_INTERVAL_MS, + max_poll_interval_ms=self._settings.KAFKA_MAX_POLL_INTERVAL_MS, + request_timeout_ms=self._settings.KAFKA_REQUEST_TIMEOUT_MS, ) dispatcher = EventDispatcher(logger=self.logger) @@ -150,7 +152,7 @@ async def _start_consumer(self) -> None: self.logger.info(f"Saga consumer started for topics: {topics}") - async def _handle_event(self, event: BaseEvent) -> None: + async def _handle_event(self, event: DomainEvent) -> None: """Handle incoming event""" self.logger.info(f"Saga orchestrator handling event: type={event.event_type}, id={event.event_id}") try: @@ -171,7 +173,7 @@ async def _handle_event(self, event: BaseEvent) -> None: self.logger.error(f"Error handling event {event.event_id}: {e}", exc_info=True) raise - def _should_trigger_saga(self, saga_class: type[BaseSaga], event: BaseEvent) -> bool: + def _should_trigger_saga(self, saga_class: type[BaseSaga], event: DomainEvent) -> bool: trigger_event_types = saga_class.get_trigger_events() should_trigger = event.event_type in trigger_event_types self.logger.debug( @@ -180,7 +182,7 @@ def _should_trigger_saga(self, saga_class: type[BaseSaga], event: BaseEvent) -> ) return should_trigger - async def _start_saga(self, saga_name: str, trigger_event: BaseEvent) -> str | None: + async def _start_saga(self, saga_name: str, trigger_event: DomainEvent) -> str | None: """Start a new saga instance""" self.logger.info(f"Starting saga {saga_name} for event {trigger_event.event_type}") saga_class = self._sagas.get(saga_name) @@ -234,7 +236,7 @@ async def _execute_saga( saga: BaseSaga, instance: Saga, context: SagaContext, - trigger_event: BaseEvent, + trigger_event: DomainEvent, ) -> None: """Execute saga steps""" tracer = get_tracer() diff --git a/backend/app/services/saga/saga_step.py b/backend/app/services/saga/saga_step.py index e03d0e22..c2c7937e 100644 --- a/backend/app/services/saga/saga_step.py +++ b/backend/app/services/saga/saga_step.py @@ -4,11 +4,11 @@ from fastapi.encoders import jsonable_encoder -from app.infrastructure.kafka.events import BaseEvent +from app.domain.events.typed import DomainEvent logger = logging.getLogger(__name__) -T = TypeVar("T", bound=BaseEvent) +T = TypeVar("T", bound=DomainEvent) class SagaContext: @@ -18,7 +18,7 @@ def __init__(self, saga_id: str, execution_id: str): self.saga_id = saga_id self.execution_id = execution_id self.data: dict[str, Any] = {} - self.events: list[BaseEvent] = [] + self.events: list[DomainEvent] = [] self.compensations: list[CompensationStep] = [] self.current_step: Optional[str] = None self.error: Optional[Exception] = None @@ -31,7 +31,7 @@ def get(self, key: str, default: Any = None) -> 
Any: """Get context data""" return self.data.get(key, default) - def add_event(self, event: BaseEvent) -> None: + def add_event(self, event: DomainEvent) -> None: """Add event to context""" self.events.append(event) diff --git a/backend/app/services/sse/kafka_redis_bridge.py b/backend/app/services/sse/kafka_redis_bridge.py index 43473556..950837ca 100644 --- a/backend/app/services/sse/kafka_redis_bridge.py +++ b/backend/app/services/sse/kafka_redis_bridge.py @@ -5,10 +5,10 @@ from app.core.lifecycle import LifecycleEnabled from app.core.metrics.events import EventMetrics from app.domain.enums.events import EventType -from app.domain.enums.kafka import KafkaTopic +from app.domain.enums.kafka import CONSUMER_GROUP_SUBSCRIPTIONS, GroupId +from app.domain.events.typed import DomainEvent from app.events.core import ConsumerConfig, EventDispatcher, UnifiedConsumer from app.events.schema.schema_registry import SchemaRegistryManager -from app.infrastructure.kafka.events.base import BaseEvent from app.services.sse.redis_bus import SSERedisBus from app.settings import Settings @@ -23,12 +23,12 @@ class SSEKafkaRedisBridge(LifecycleEnabled): """ def __init__( - self, - schema_registry: SchemaRegistryManager, - settings: Settings, - event_metrics: EventMetrics, - sse_bus: SSERedisBus, - logger: logging.Logger, + self, + schema_registry: SchemaRegistryManager, + settings: Settings, + event_metrics: EventMetrics, + sse_bus: SSERedisBus, + logger: logging.Logger, ) -> None: super().__init__() self.schema_registry = schema_registry @@ -71,9 +71,10 @@ async def _create_consumer(self, consumer_index: int) -> UnifiedConsumer: client_id=client_id, enable_auto_commit=True, auto_offset_reset="latest", - max_poll_interval_ms=300000, - session_timeout_ms=30000, - heartbeat_interval_ms=3000, + max_poll_interval_ms=self.settings.KAFKA_MAX_POLL_INTERVAL_MS, + session_timeout_ms=self.settings.KAFKA_SESSION_TIMEOUT_MS, + heartbeat_interval_ms=self.settings.KAFKA_HEARTBEAT_INTERVAL_MS, + request_timeout_ms=self.settings.KAFKA_REQUEST_TIMEOUT_MS, ) dispatcher = EventDispatcher(logger=self.logger) @@ -87,15 +88,8 @@ async def _create_consumer(self, consumer_index: int) -> UnifiedConsumer: logger=self.logger, ) - topics = [ - KafkaTopic.EXECUTION_EVENTS, - KafkaTopic.EXECUTION_COMPLETED, - KafkaTopic.EXECUTION_FAILED, - KafkaTopic.EXECUTION_TIMEOUT, - KafkaTopic.EXECUTION_RESULTS, - KafkaTopic.POD_EVENTS, - KafkaTopic.POD_STATUS_UPDATES, - ] + # Use WEBSOCKET_GATEWAY subscriptions - SSE bridge serves same purpose (real-time client delivery) + topics = list(CONSUMER_GROUP_SUBSCRIPTIONS[GroupId.WEBSOCKET_GATEWAY]) await consumer.start(topics) self.logger.info(f"Bridge consumer {consumer_index} started") @@ -122,7 +116,7 @@ def _register_routing_handlers(self, dispatcher: EventDispatcher) -> None: EventType.POD_DELETED, ] - async def route_event(event: BaseEvent) -> None: + async def route_event(event: DomainEvent) -> None: data = event.model_dump() execution_id = data.get("execution_id") if not execution_id: @@ -150,11 +144,11 @@ def get_stats(self) -> dict[str, int | bool]: def create_sse_kafka_redis_bridge( - schema_registry: SchemaRegistryManager, - settings: Settings, - event_metrics: EventMetrics, - sse_bus: SSERedisBus, - logger: logging.Logger, + schema_registry: SchemaRegistryManager, + settings: Settings, + event_metrics: EventMetrics, + sse_bus: SSERedisBus, + logger: logging.Logger, ) -> SSEKafkaRedisBridge: return SSEKafkaRedisBridge( schema_registry=schema_registry, diff --git 
a/backend/app/services/sse/redis_bus.py b/backend/app/services/sse/redis_bus.py index ce323708..3be68c2c 100644 --- a/backend/app/services/sse/redis_bus.py +++ b/backend/app/services/sse/redis_bus.py @@ -6,7 +6,7 @@ import redis.asyncio as redis from pydantic import BaseModel -from app.infrastructure.kafka.events.base import BaseEvent +from app.domain.events.typed import DomainEvent from app.schemas_pydantic.sse import RedisNotificationMessage, RedisSSEMessage T = TypeVar("T", bound=BaseModel) @@ -62,7 +62,7 @@ def _exec_channel(self, execution_id: str) -> str: def _notif_channel(self, user_id: str) -> str: return f"{self._notif_prefix}{user_id}" - async def publish_event(self, execution_id: str, event: BaseEvent) -> None: + async def publish_event(self, execution_id: str, event: DomainEvent) -> None: message = RedisSSEMessage( event_type=event.event_type, execution_id=execution_id, diff --git a/backend/app/services/sse/sse_shutdown_manager.py b/backend/app/services/sse/sse_shutdown_manager.py index 1e29b60a..86314b27 100644 --- a/backend/app/services/sse/sse_shutdown_manager.py +++ b/backend/app/services/sse/sse_shutdown_manager.py @@ -64,6 +64,10 @@ def __init__( self._shutdown_event = asyncio.Event() self._drain_complete_event = asyncio.Event() + # Phase transition events for external coordination + self.initiated_event = asyncio.Event() # Set when shutdown initiated + self.notifying_event = asyncio.Event() # Set when entering notifying phase + self.logger.info( "SSEShutdownManager initialized", extra={"drain_timeout": drain_timeout, "notification_timeout": notification_timeout}, diff --git a/backend/app/settings.py b/backend/app/settings.py index fc5efc37..44f8e2a3 100644 --- a/backend/app/settings.py +++ b/backend/app/settings.py @@ -1,5 +1,3 @@ -import os - from pydantic import Field from pydantic_settings import BaseSettings, SettingsConfigDict @@ -62,8 +60,11 @@ class Settings(BaseSettings): KAFKA_CONSUMER_GROUP_ID: str = "integr8scode-backend" KAFKA_AUTO_OFFSET_RESET: str = "earliest" KAFKA_ENABLE_AUTO_COMMIT: bool = True - KAFKA_SESSION_TIMEOUT_MS: int = 30000 + KAFKA_SESSION_TIMEOUT_MS: int = 45000 + KAFKA_HEARTBEAT_INTERVAL_MS: int = 10000 + KAFKA_MAX_POLL_INTERVAL_MS: int = 300000 KAFKA_MAX_POLL_RECORDS: int = 500 + KAFKA_REQUEST_TIMEOUT_MS: int = 40000 # SSE Configuration SSE_CONSUMER_POOL_SIZE: int = 10 # Number of consumers in the partitioned pool @@ -162,7 +163,7 @@ class Settings(BaseSettings): LOG_LEVEL: str = Field(default="DEBUG", description="Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)") model_config = SettingsConfigDict( - env_file=os.environ.get("DOTENV_PATH", ".env"), + env_file=".env", env_file_encoding="utf-8", case_sensitive=True, extra="forbid", # Raise error on extra fields diff --git a/backend/pyproject.toml b/backend/pyproject.toml index cd8dfeba..febd8c01 100644 --- a/backend/pyproject.toml +++ b/backend/pyproject.toml @@ -123,6 +123,7 @@ dependencies = [ "wrapt==1.16.0", "yarl==1.20.1", "zipp==3.20.2", + "monggregate==0.22.1", ] [build-system] @@ -215,9 +216,9 @@ log_cli_level = "ERROR" log_level = "ERROR" addopts = "--tb=short -n auto --dist=loadfile" -# pytest-env: Set DOTENV_PATH so Settings loads .env.test instead of .env +# pytest-env: Sets env vars before test execution [tool.pytest_env] -DOTENV_PATH = ".env.test" +OTEL_SDK_DISABLED = "true" # Prevents teardown delays from OTLP exporter retries # Coverage configuration [tool.coverage.run] diff --git a/backend/tests/conftest.py b/backend/tests/conftest.py index 8c5aa57a..c9eef28d 100644 --- 
a/backend/tests/conftest.py +++ b/backend/tests/conftest.py @@ -25,21 +25,15 @@ # ===== Pytest hooks ===== @pytest.hookimpl(trylast=True) -def pytest_configure(config: pytest.Config) -> None: - """Create Kafka topics once before any tests run. - - Uses trylast=True to ensure pytest-env has set DOTENV_PATH first. - Runs in master process before xdist workers spawn. - - Silently skips if Kafka is unavailable (e.g., unit tests). - """ - # Only run in master process (not in xdist workers) - if not hasattr(config, "workerinput"): - try: - asyncio.run(create_topics(Settings())) - except Exception: - # Kafka not available (unit tests) - silently skip - pass +def pytest_configure() -> None: + """Create Kafka topics once in master process before xdist workers spawn.""" + # PYTEST_XDIST_WORKER is only set in workers, not master + if os.environ.get("PYTEST_XDIST_WORKER"): + return + try: + asyncio.run(create_topics(Settings(_env_file=".env.test"))) + except Exception: + pass # Kafka unavailable (unit tests) # ===== Settings fixture ===== @@ -47,8 +41,8 @@ def pytest_configure(config: pytest.Config) -> None: def test_settings() -> Settings: """Provide test settings with per-worker isolation where needed. - pytest-env sets DOTENV_PATH=.env.test (configured in pyproject.toml). - Settings class uses this to load the correct env file via pydantic-settings. + Uses pydantic-settings _env_file parameter to load .env.test at instantiation, + overriding the class-level default of .env. What gets isolated per worker (to prevent interference): - DATABASE_NAME: Each worker gets its own MongoDB database @@ -59,17 +53,14 @@ def test_settings() -> Settings: - KAFKA_TOPIC_PREFIX: Topics created once by CI/scripts - SCHEMA_SUBJECT_PREFIX: Schemas shared across workers """ - base = Settings() # Uses DOTENV_PATH from pytest-env to load .env.test + base = Settings(_env_file=".env.test") session_id = uuid.uuid4().hex[:8] return base.model_copy( update={ - # Per-worker isolation + # Per-worker isolation for xdist - must be dynamic, can't be in .env.test "DATABASE_NAME": f"integr8scode_test_{session_id}_{_WORKER_ID}", "REDIS_DB": _WORKER_NUM, "KAFKA_GROUP_SUFFIX": f"{session_id}.{_WORKER_ID}", - # Disable telemetry in tests - "OTEL_EXPORTER_OTLP_ENDPOINT": None, - "ENABLE_TRACING": False, } ) @@ -84,11 +75,19 @@ async def app(test_settings: Settings) -> AsyncGenerator[FastAPI, None]: Uses lifespan_context to trigger startup/shutdown events, which initializes Beanie, metrics, and other services through the normal DI flow. + + Cleanup: Best-effort drop of test database. May not always succeed due to + known MongoDB driver behavior when client stays connected, but ulimits on + MongoDB container (65536) prevent file descriptor exhaustion regardless. 
""" application = create_app(settings=test_settings) async with application.router.lifespan_context(application): yield application + # Best-effort cleanup (may fail silently due to MongoDB driver behavior) + container: AsyncContainer = application.state.dishka_container + db: Database = await container.get(Database) + await db.client.drop_database(test_settings.DATABASE_NAME) @pytest_asyncio.fixture(scope="session") diff --git a/backend/tests/e2e/test_k8s_worker_create_pod.py b/backend/tests/e2e/test_k8s_worker_create_pod.py index 63c6c0ee..30c23eb2 100644 --- a/backend/tests/e2e/test_k8s_worker_create_pod.py +++ b/backend/tests/e2e/test_k8s_worker_create_pod.py @@ -3,11 +3,10 @@ import uuid import pytest +from app.domain.events.typed import CreatePodCommandEvent, EventMetadata from app.events.core import UnifiedProducer from app.events.event_store import EventStore from app.events.schema.schema_registry import SchemaRegistryManager -from app.infrastructure.kafka.events.metadata import AvroEventMetadata -from app.infrastructure.kafka.events.saga import CreatePodCommandEvent from app.services.idempotency import IdempotencyManager from app.services.k8s_worker.config import K8sWorkerConfig from app.services.k8s_worker.worker import KubernetesWorker @@ -67,7 +66,7 @@ async def test_worker_creates_configmap_and_pod( cpu_request="50m", memory_request="64Mi", priority=5, - metadata=AvroEventMetadata(service_name="tests", service_version="1", user_id="u1"), + metadata=EventMetadata(service_name="tests", service_version="1", user_id="u1"), ) # Build and create ConfigMap + Pod diff --git a/backend/tests/e2e/test_resource_cleaner_orphan.py b/backend/tests/e2e/test_resource_cleaner_orphan.py index bb75c911..cf879ed1 100644 --- a/backend/tests/e2e/test_resource_cleaner_orphan.py +++ b/backend/tests/e2e/test_resource_cleaner_orphan.py @@ -6,8 +6,6 @@ from kubernetes import client as k8s_client from kubernetes import config as k8s_config -from tests.helpers.eventually import eventually - pytestmark = [pytest.mark.e2e, pytest.mark.k8s] _test_logger = logging.getLogger("test.k8s.resource_cleaner_orphan") @@ -37,16 +35,12 @@ async def test_cleanup_orphaned_configmaps_dry_run() -> None: try: cleaner = ResourceCleaner(logger=_test_logger) - # Force as orphaned by using a large cutoff - await cleaner.cleanup_orphaned_resources(namespace=ns, max_age_hours=0, dry_run=True) - - # We expect our configmap to be a candidate; poll the response - async def _has_cm() -> None: - # If cleaner is non-deterministic across runs, re-invoke to reflect current state - res = await cleaner.cleanup_orphaned_resources(namespace=ns, max_age_hours=0, dry_run=True) - assert any(name == cm for cm in res.get("configmaps", [])) - - await eventually(_has_cm, timeout=2.0, interval=0.1) + # Force as orphaned by using a large cutoff - ConfigMap created synchronously, available now + res = await cleaner.cleanup_orphaned_resources(namespace=ns, max_age_hours=0, dry_run=True) + # ConfigMap should be detected immediately + assert any(name == cm for cm in res.get("configmaps", [])), ( + f"Expected ConfigMap '{name}' to be detected as orphan candidate" + ) finally: # Cleanup resource try: diff --git a/backend/tests/helpers/events.py b/backend/tests/helpers/events.py index b4eef7ca..055eec31 100644 --- a/backend/tests/helpers/events.py +++ b/backend/tests/helpers/events.py @@ -1,8 +1,7 @@ import uuid from typing import Iterable -from app.infrastructure.kafka.events.execution import ExecutionRequestedEvent -from 
app.infrastructure.kafka.events.metadata import AvroEventMetadata +from app.domain.events.typed import EventMetadata, ExecutionRequestedEvent def make_execution_requested_event( @@ -31,7 +30,7 @@ def make_execution_requested_event( if execution_id is None: execution_id = f"exec-{uuid.uuid4().hex[:8]}" - metadata = AvroEventMetadata(service_name=service_name, service_version=service_version, user_id=user_id) + metadata = EventMetadata(service_name=service_name, service_version=service_version, user_id=user_id) return ExecutionRequestedEvent( execution_id=execution_id, aggregate_id=execution_id, # Match production: aggregate_id == execution_id for execution events diff --git a/backend/tests/helpers/eventually.py b/backend/tests/helpers/eventually.py deleted file mode 100644 index ee5c525b..00000000 --- a/backend/tests/helpers/eventually.py +++ /dev/null @@ -1,36 +0,0 @@ -import asyncio -from collections.abc import Awaitable, Callable -from typing import TypeVar - -T = TypeVar("T") - - -async def eventually( - fn: Callable[[], Awaitable[T]], - *, - timeout: float = 10.0, - interval: float = 0.1, - exceptions: tuple[type[BaseException], ...] = (AssertionError,), -) -> T: - """Poll async `fn` until it succeeds or timeout elapses. - - Args: - fn: Async callable to poll. Retried if it raises one of `exceptions`. - timeout: Maximum time to wait in seconds. - interval: Time between retries in seconds. - exceptions: Exception types that trigger a retry. - - Returns: - The return value of `fn` on success. - - Raises: - The last exception raised by `fn` after timeout. - """ - deadline = asyncio.get_running_loop().time() + timeout - while True: - try: - return await fn() - except exceptions: - if asyncio.get_running_loop().time() >= deadline: - raise - await asyncio.sleep(interval) diff --git a/backend/tests/helpers/kafka.py b/backend/tests/helpers/kafka.py index 944eca58..182b04c4 100644 --- a/backend/tests/helpers/kafka.py +++ b/backend/tests/helpers/kafka.py @@ -1,8 +1,8 @@ from collections.abc import Awaitable, Callable import pytest +from app.domain.events.typed import DomainEvent from app.events.core import UnifiedProducer -from app.infrastructure.kafka.events.base import BaseEvent from dishka import AsyncContainer @@ -14,8 +14,8 @@ async def producer(scope: AsyncContainer) -> UnifiedProducer: @pytest.fixture(scope="function") -def send_event(producer: UnifiedProducer) -> Callable[[BaseEvent], Awaitable[None]]: - async def _send(ev: BaseEvent) -> None: +def send_event(producer: UnifiedProducer) -> Callable[[DomainEvent], Awaitable[None]]: + async def _send(ev: DomainEvent) -> None: await producer.produce(ev) return _send diff --git a/backend/tests/integration/db/repositories/test_dlq_repository.py b/backend/tests/integration/db/repositories/test_dlq_repository.py index 06bbb5f8..b016f7f3 100644 --- a/backend/tests/integration/db/repositories/test_dlq_repository.py +++ b/backend/tests/integration/db/repositories/test_dlq_repository.py @@ -23,9 +23,8 @@ async def insert_test_dlq_docs() -> None: docs = [ DLQMessageDocument( - event_id="id1", - event_type=str(EventType.USER_LOGGED_IN), event={ + "event_id": "id1", "event_type": str(EventType.USER_LOGGED_IN), "metadata": {"service_name": "svc", "service_version": "1"}, "user_id": "u1", @@ -39,9 +38,8 @@ async def insert_test_dlq_docs() -> None: producer_id="p1", ), DLQMessageDocument( - event_id="id2", - event_type=str(EventType.USER_LOGGED_IN), event={ + "event_id": "id2", "event_type": str(EventType.USER_LOGGED_IN), "metadata": {"service_name": 
"svc", "service_version": "1"}, "user_id": "u1", @@ -55,9 +53,8 @@ async def insert_test_dlq_docs() -> None: producer_id="p1", ), DLQMessageDocument( - event_id="id3", - event_type=str(EventType.EXECUTION_STARTED), event={ + "event_id": "id3", "event_type": str(EventType.EXECUTION_STARTED), "metadata": {"service_name": "svc", "service_version": "1"}, "execution_id": "x1", @@ -86,7 +83,7 @@ async def test_stats_list_get_and_updates(repo: DLQRepository) -> None: res = await repo.get_messages(limit=2) assert res.total >= 3 and len(res.messages) <= 2 msg = await repo.get_message_by_id("id1") - assert msg and msg.event_id == "id1" + assert msg and msg.event.event_id == "id1" assert await repo.mark_message_retried("id1") in (True, False) assert await repo.mark_message_discarded("id1", "r") in (True, False) diff --git a/backend/tests/integration/dlq/test_dlq_discard.py b/backend/tests/integration/dlq/test_dlq_discard.py index 0cbcd5f0..8932fa51 100644 --- a/backend/tests/integration/dlq/test_dlq_discard.py +++ b/backend/tests/integration/dlq/test_dlq_discard.py @@ -6,7 +6,6 @@ from app.db.docs import DLQMessageDocument from app.db.repositories.dlq_repository import DLQRepository from app.dlq.models import DLQMessageStatus -from app.domain.enums.events import EventType from app.domain.enums.kafka import KafkaTopic from dishka import AsyncContainer @@ -26,12 +25,13 @@ async def _create_dlq_document( event_id = str(uuid.uuid4()) event = make_execution_requested_event(execution_id=f"exec-{uuid.uuid4().hex[:8]}") + # Override event_id for test predictability + event_dict = event.model_dump() + event_dict["event_id"] = event_id now = datetime.now(timezone.utc) doc = DLQMessageDocument( - event=event.model_dump(), - event_id=event_id, - event_type=EventType.EXECUTION_REQUESTED, + event=event_dict, original_topic=str(KafkaTopic.EXECUTION_EVENTS), error="Test error", retry_count=0, @@ -60,7 +60,7 @@ async def test_dlq_repository_marks_message_discarded(scope: AsyncContainer) -> assert result is True # Verify the status changed - updated_doc = await DLQMessageDocument.find_one({"event_id": event_id}) + updated_doc = await DLQMessageDocument.find_one({"event.event_id": event_id}) assert updated_doc is not None assert updated_doc.status == DLQMessageStatus.DISCARDED assert updated_doc.discard_reason == reason @@ -96,7 +96,7 @@ async def test_dlq_discard_sets_timestamp(scope: AsyncContainer) -> None: after_discard = datetime.now(timezone.utc) # Verify timestamp is set correctly - doc = await DLQMessageDocument.find_one({"event_id": event_id}) + doc = await DLQMessageDocument.find_one({"event.event_id": event_id}) assert doc is not None assert doc.discarded_at is not None assert before_discard <= doc.discarded_at <= after_discard @@ -116,7 +116,7 @@ async def test_dlq_discard_with_custom_reason(scope: AsyncContainer) -> None: await repository.mark_message_discarded(event_id, custom_reason) # Verify the reason is stored - doc = await DLQMessageDocument.find_one({"event_id": event_id}) + doc = await DLQMessageDocument.find_one({"event.event_id": event_id}) assert doc is not None assert doc.discard_reason == custom_reason @@ -136,7 +136,7 @@ async def test_dlq_discard_from_scheduled_status(scope: AsyncContainer) -> None: assert result is True # Verify status transition - doc = await DLQMessageDocument.find_one({"event_id": event_id}) + doc = await DLQMessageDocument.find_one({"event.event_id": event_id}) assert doc is not None assert doc.status == DLQMessageStatus.DISCARDED diff --git 
a/backend/tests/integration/dlq/test_dlq_manager.py b/backend/tests/integration/dlq/test_dlq_manager.py index e760e7c4..b1f84426 100644 --- a/backend/tests/integration/dlq/test_dlq_manager.py +++ b/backend/tests/integration/dlq/test_dlq_manager.py @@ -1,19 +1,19 @@ +import asyncio import json import logging import uuid from datetime import datetime, timezone import pytest -from aiokafka import AIOKafkaProducer -from app.core.database_context import Database -from app.db.docs import DLQMessageDocument +from aiokafka import AIOKafkaConsumer, AIOKafkaProducer from app.dlq.manager import create_dlq_manager +from app.domain.enums.events import EventType from app.domain.enums.kafka import KafkaTopic +from app.domain.events.typed import DLQMessageReceivedEvent from app.events.schema.schema_registry import SchemaRegistryManager from app.settings import Settings from tests.helpers import make_execution_requested_event -from tests.helpers.eventually import eventually # xdist_group: DLQ tests share a Kafka consumer group. When running in parallel, # different workers' managers consume each other's messages and apply wrong policies. @@ -24,17 +24,44 @@ @pytest.mark.asyncio -async def test_dlq_manager_persists_in_mongo(db: Database, test_settings: Settings) -> None: +async def test_dlq_manager_persists_and_emits_event(test_settings: Settings) -> None: + """Test that DLQ manager persists messages and emits DLQMessageReceivedEvent.""" schema_registry = SchemaRegistryManager(test_settings, _test_logger) manager = create_dlq_manager(settings=test_settings, schema_registry=schema_registry, logger=_test_logger) - # Use prefix from test_settings to match what the manager uses prefix = test_settings.KAFKA_TOPIC_PREFIX - - # Use unique execution_id to avoid conflicts with parallel test workers ev = make_execution_requested_event(execution_id=f"exec-dlq-persist-{uuid.uuid4().hex[:8]}") + + # Future resolves when DLQMessageReceivedEvent is consumed + received_future: asyncio.Future[DLQMessageReceivedEvent] = asyncio.get_running_loop().create_future() + + # Create consumer for DLQ events topic + dlq_events_topic = f"{prefix}{KafkaTopic.DLQ_EVENTS}" + consumer = AIOKafkaConsumer( + dlq_events_topic, + bootstrap_servers=test_settings.KAFKA_BOOTSTRAP_SERVERS, + group_id=f"test-dlq-events.{uuid.uuid4().hex[:6]}", + auto_offset_reset="earliest", + enable_auto_commit=True, + ) + + async def consume_dlq_events() -> None: + """Consume DLQ events and set future when our event is received.""" + async for msg in consumer: + try: + event = await schema_registry.deserialize_event(msg.value, dlq_events_topic) + if ( + isinstance(event, DLQMessageReceivedEvent) + and event.dlq_event_id == ev.event_id + and not received_future.done() + ): + received_future.set_result(event) + return + except Exception as e: + _test_logger.debug(f"Error deserializing DLQ event: {e}") + payload = { - "event": ev.to_dict(), + "event": ev.model_dump(mode="json"), "original_topic": f"{prefix}{str(KafkaTopic.EXECUTION_EVENTS)}", "error": "handler failed", "retry_count": 0, @@ -42,7 +69,7 @@ async def test_dlq_manager_persists_in_mongo(db: Database, test_settings: Settin "producer_id": "tests", } - # Produce to DLQ topic using aiokafka + # Produce to DLQ topic BEFORE starting consumers (auto_offset_reset="earliest") producer = AIOKafkaProducer(bootstrap_servers=test_settings.KAFKA_BOOTSTRAP_SERVERS) await producer.start() try: @@ -54,12 +81,23 @@ async def test_dlq_manager_persists_in_mongo(db: Database, test_settings: Settin finally: await 
producer.stop() - # Run the manager briefly to consume and persist - async with manager: - - async def _exists() -> None: - doc = await DLQMessageDocument.find_one({"event_id": ev.event_id}) - assert doc is not None + # Start consumer for DLQ events + await consumer.start() + consume_task = asyncio.create_task(consume_dlq_events()) - # Poll until the document appears - await eventually(_exists, timeout=10.0, interval=0.2) + try: + # Start manager - it will consume from DLQ, persist, and emit DLQMessageReceivedEvent + async with manager: + # Await the DLQMessageReceivedEvent - true async, no polling + received = await asyncio.wait_for(received_future, timeout=15.0) + assert received.dlq_event_id == ev.event_id + assert received.event_type == EventType.DLQ_MESSAGE_RECEIVED + assert received.original_event_type == str(EventType.EXECUTION_REQUESTED) + assert received.error == "handler failed" + finally: + consume_task.cancel() + try: + await consume_task + except asyncio.CancelledError: + pass + await consumer.stop() diff --git a/backend/tests/integration/dlq/test_dlq_retry.py b/backend/tests/integration/dlq/test_dlq_retry.py index 77ad8dcf..f82765cd 100644 --- a/backend/tests/integration/dlq/test_dlq_retry.py +++ b/backend/tests/integration/dlq/test_dlq_retry.py @@ -6,7 +6,6 @@ from app.db.docs import DLQMessageDocument from app.db.repositories.dlq_repository import DLQRepository from app.dlq.models import DLQMessageStatus -from app.domain.enums.events import EventType from app.domain.enums.kafka import KafkaTopic from dishka import AsyncContainer @@ -26,12 +25,13 @@ async def _create_dlq_document( event_id = str(uuid.uuid4()) event = make_execution_requested_event(execution_id=f"exec-{uuid.uuid4().hex[:8]}") + # Override event_id for test predictability + event_dict = event.model_dump() + event_dict["event_id"] = event_id now = datetime.now(timezone.utc) doc = DLQMessageDocument( - event=event.model_dump(), - event_id=event_id, - event_type=EventType.EXECUTION_REQUESTED, + event=event_dict, original_topic=str(KafkaTopic.EXECUTION_EVENTS), error="Test error", retry_count=0, @@ -59,7 +59,7 @@ async def test_dlq_repository_marks_message_retried(scope: AsyncContainer) -> No assert result is True # Verify the status changed - updated_doc = await DLQMessageDocument.find_one({"event_id": event_id}) + updated_doc = await DLQMessageDocument.find_one({"event.event_id": event_id}) assert updated_doc is not None assert updated_doc.status == DLQMessageStatus.RETRIED assert updated_doc.retried_at is not None @@ -91,7 +91,7 @@ async def test_dlq_retry_sets_timestamp(scope: AsyncContainer) -> None: after_retry = datetime.now(timezone.utc) # Verify timestamp is set correctly - doc = await DLQMessageDocument.find_one({"event_id": event_id}) + doc = await DLQMessageDocument.find_one({"event.event_id": event_id}) assert doc is not None assert doc.retried_at is not None assert before_retry <= doc.retried_at <= after_retry @@ -112,7 +112,7 @@ async def test_dlq_retry_from_pending_status(scope: AsyncContainer) -> None: assert result is True # Verify status transition - doc = await DLQMessageDocument.find_one({"event_id": event_id}) + doc = await DLQMessageDocument.find_one({"event.event_id": event_id}) assert doc is not None assert doc.status == DLQMessageStatus.RETRIED @@ -155,7 +155,7 @@ async def test_dlq_retry_already_retried_message(scope: AsyncContainer) -> None: assert result is True # Status remains RETRIED - doc = await DLQMessageDocument.find_one({"event_id": event_id}) + doc = await 
DLQMessageDocument.find_one({"event.event_id": event_id}) assert doc is not None assert doc.status == DLQMessageStatus.RETRIED @@ -178,7 +178,7 @@ async def test_dlq_retry_discarded_message(scope: AsyncContainer) -> None: assert result is True # Status is now RETRIED (repository doesn't guard transitions) - doc = await DLQMessageDocument.find_one({"event_id": event_id}) + doc = await DLQMessageDocument.find_one({"event.event_id": event_id}) assert doc is not None assert doc.status == DLQMessageStatus.RETRIED @@ -198,7 +198,7 @@ async def test_dlq_discard_already_discarded_message(scope: AsyncContainer) -> N assert result is True # Reason is updated - doc = await DLQMessageDocument.find_one({"event_id": event_id}) + doc = await DLQMessageDocument.find_one({"event.event_id": event_id}) assert doc is not None assert doc.status == DLQMessageStatus.DISCARDED assert doc.discard_reason == new_reason @@ -219,7 +219,7 @@ async def test_dlq_discard_retried_message(scope: AsyncContainer) -> None: assert result is True # Status is now DISCARDED - doc = await DLQMessageDocument.find_one({"event_id": event_id}) + doc = await DLQMessageDocument.find_one({"event.event_id": event_id}) assert doc is not None assert doc.status == DLQMessageStatus.DISCARDED assert doc.discard_reason == reason diff --git a/backend/tests/integration/events/test_consume_roundtrip.py b/backend/tests/integration/events/test_consume_roundtrip.py index 9d007594..9812b14f 100644 --- a/backend/tests/integration/events/test_consume_roundtrip.py +++ b/backend/tests/integration/events/test_consume_roundtrip.py @@ -5,11 +5,11 @@ import pytest from app.domain.enums.events import EventType from app.domain.enums.kafka import KafkaTopic +from app.domain.events.typed import DomainEvent from app.events.core import UnifiedConsumer, UnifiedProducer from app.events.core.dispatcher import EventDispatcher from app.events.core.types import ConsumerConfig from app.events.schema.schema_registry import SchemaRegistryManager, initialize_event_schemas -from app.infrastructure.kafka.events.base import BaseEvent from app.settings import Settings from dishka import AsyncContainer @@ -37,7 +37,7 @@ async def test_produce_consume_roundtrip(scope: AsyncContainer) -> None: received = asyncio.Event() @dispatcher.register(EventType.EXECUTION_REQUESTED) - async def _handle(_event: BaseEvent) -> None: + async def _handle(_event: DomainEvent) -> None: received.set() group_id = f"test-consumer.{uuid.uuid4().hex[:6]}" diff --git a/backend/tests/integration/events/test_dlq_handler.py b/backend/tests/integration/events/test_dlq_handler.py index de3584a1..16536256 100644 --- a/backend/tests/integration/events/test_dlq_handler.py +++ b/backend/tests/integration/events/test_dlq_handler.py @@ -1,10 +1,8 @@ import logging import pytest +from app.domain.events.typed import DomainEvent, EventMetadata, SagaStartedEvent from app.events.core import UnifiedProducer, create_dlq_error_handler, create_immediate_dlq_handler -from app.infrastructure.kafka.events.base import BaseEvent -from app.infrastructure.kafka.events.metadata import AvroEventMetadata -from app.infrastructure.kafka.events.saga import SagaStartedEvent from dishka import AsyncContainer pytestmark = [pytest.mark.integration, pytest.mark.kafka] @@ -18,7 +16,7 @@ async def test_dlq_handler_with_retries(scope: AsyncContainer, monkeypatch: pyte calls: list[tuple[str | None, str, str, int]] = [] async def _record_send_to_dlq( - original_event: BaseEvent, original_topic: str, error: Exception, retry_count: int + 
original_event: DomainEvent, original_topic: str, error: Exception, retry_count: int ) -> None: calls.append((original_event.event_id, original_topic, str(error), retry_count)) @@ -29,7 +27,7 @@ async def _record_send_to_dlq( saga_name="n", execution_id="x", initial_event_id="i", - metadata=AvroEventMetadata(service_name="a", service_version="1"), + metadata=EventMetadata(service_name="a", service_version="1"), ) # Call 1 and 2 should not send to DLQ await h(RuntimeError("boom"), e) @@ -47,7 +45,7 @@ async def test_immediate_dlq_handler(scope: AsyncContainer, monkeypatch: pytest. calls: list[tuple[str | None, str, str, int]] = [] async def _record_send_to_dlq( - original_event: BaseEvent, original_topic: str, error: Exception, retry_count: int + original_event: DomainEvent, original_topic: str, error: Exception, retry_count: int ) -> None: calls.append((original_event.event_id, original_topic, str(error), retry_count)) @@ -58,7 +56,7 @@ async def _record_send_to_dlq( saga_name="n", execution_id="x", initial_event_id="i", - metadata=AvroEventMetadata(service_name="a", service_version="1"), + metadata=EventMetadata(service_name="a", service_version="1"), ) await h(RuntimeError("x"), e) assert calls and calls[0][3] == 0 diff --git a/backend/tests/integration/events/test_event_dispatcher.py b/backend/tests/integration/events/test_event_dispatcher.py index 244930f6..d5f118a3 100644 --- a/backend/tests/integration/events/test_event_dispatcher.py +++ b/backend/tests/integration/events/test_event_dispatcher.py @@ -5,11 +5,11 @@ import pytest from app.domain.enums.events import EventType from app.domain.enums.kafka import KafkaTopic +from app.domain.events.typed import DomainEvent from app.events.core import UnifiedConsumer, UnifiedProducer from app.events.core.dispatcher import EventDispatcher from app.events.core.types import ConsumerConfig from app.events.schema.schema_registry import SchemaRegistryManager, initialize_event_schemas -from app.infrastructure.kafka.events.base import BaseEvent from app.settings import Settings from dishka import AsyncContainer @@ -35,11 +35,11 @@ async def test_dispatcher_with_multiple_handlers(scope: AsyncContainer) -> None: h2_called = asyncio.Event() @dispatcher.register(EventType.EXECUTION_REQUESTED) - async def h1(_e: BaseEvent) -> None: + async def h1(_e: DomainEvent) -> None: h1_called.set() @dispatcher.register(EventType.EXECUTION_REQUESTED) - async def h2(_e: BaseEvent) -> None: + async def h2(_e: DomainEvent) -> None: h2_called.set() # Real consumer against execution-events diff --git a/backend/tests/integration/events/test_event_store.py b/backend/tests/integration/events/test_event_store.py index 6fc68869..470297c5 100644 --- a/backend/tests/integration/events/test_event_store.py +++ b/backend/tests/integration/events/test_event_store.py @@ -5,8 +5,8 @@ import pytest from app.db.docs import EventDocument from app.domain.enums.events import EventType +from app.domain.events.typed import DomainEvent from app.events.event_store import EventStore -from app.infrastructure.kafka.events.base import BaseEvent from dishka import AsyncContainer from tests.helpers import make_execution_requested_event @@ -47,7 +47,7 @@ async def test_event_store_stores_batch(scope: AsyncContainer) -> None: store: EventStore = await scope.get(EventStore) # Create multiple unique events - events: list[BaseEvent] = [ + events: list[DomainEvent] = [ make_execution_requested_event(execution_id=f"exec-batch-{uuid.uuid4().hex[:8]}") for _ in range(5) ] @@ -99,7 +99,7 @@ async def 
test_event_store_batch_handles_duplicates(scope: AsyncContainer) -> No # Create a batch with one new event and one duplicate new_event = make_execution_requested_event(execution_id=f"exec-batch-new-{uuid.uuid4().hex[:8]}") - batch: list[BaseEvent] = [new_event, event] # event is already stored + batch: list[DomainEvent] = [new_event, event] # event is already stored results = await store.store_batch(batch) @@ -133,7 +133,7 @@ async def test_event_store_retrieves_by_type(scope: AsyncContainer) -> None: # Store a few events unique_prefix = uuid.uuid4().hex[:8] - events: list[BaseEvent] = [ + events: list[DomainEvent] = [ make_execution_requested_event(execution_id=f"exec-type-{unique_prefix}-{i}") for i in range(3) ] diff --git a/backend/tests/integration/events/test_producer_roundtrip.py b/backend/tests/integration/events/test_producer_roundtrip.py index 81ef5865..18493a51 100644 --- a/backend/tests/integration/events/test_producer_roundtrip.py +++ b/backend/tests/integration/events/test_producer_roundtrip.py @@ -2,8 +2,9 @@ from uuid import uuid4 import pytest -from app.events.core import ProducerConfig, UnifiedProducer +from app.events.core import UnifiedProducer from app.events.schema.schema_registry import SchemaRegistryManager +from app.infrastructure.kafka.mappings import get_topic_for_event from app.settings import Settings from dishka import AsyncContainer @@ -20,7 +21,6 @@ async def test_unified_producer_start_produce_send_to_dlq_stop( ) -> None: schema: SchemaRegistryManager = await scope.get(SchemaRegistryManager) prod = UnifiedProducer( - ProducerConfig(bootstrap_servers=test_settings.KAFKA_BOOTSTRAP_SERVERS), schema, logger=_test_logger, settings=test_settings, @@ -31,7 +31,8 @@ async def test_unified_producer_start_produce_send_to_dlq_stop( await prod.produce(ev) # Exercise send_to_dlq path - await prod.send_to_dlq(ev, original_topic=str(ev.topic), error=RuntimeError("forced"), retry_count=1) + topic = str(get_topic_for_event(ev.event_type)) + await prod.send_to_dlq(ev, original_topic=topic, error=RuntimeError("forced"), retry_count=1) st = prod.get_status() assert st["running"] is True and st["state"] == "running" diff --git a/backend/tests/integration/events/test_schema_registry_real.py b/backend/tests/integration/events/test_schema_registry_real.py index 90647f0d..3e9da631 100644 --- a/backend/tests/integration/events/test_schema_registry_real.py +++ b/backend/tests/integration/events/test_schema_registry_real.py @@ -1,9 +1,9 @@ import logging import pytest +from app.domain.events.typed import EventMetadata, PodCreatedEvent from app.events.schema.schema_registry import SchemaRegistryManager -from app.infrastructure.kafka.events.metadata import AvroEventMetadata -from app.infrastructure.kafka.events.pod import PodCreatedEvent +from app.infrastructure.kafka.mappings import get_topic_for_event from app.settings import Settings pytestmark = [pytest.mark.integration, pytest.mark.kafka] @@ -19,9 +19,10 @@ async def test_serialize_and_deserialize_event_real_registry(test_settings: Sett execution_id="e1", pod_name="p", namespace="n", - metadata=AvroEventMetadata(service_name="s", service_version="1"), + metadata=EventMetadata(service_name="s", service_version="1"), ) data = await m.serialize_event(ev) - obj = await m.deserialize_event(data, topic=str(ev.topic)) + topic = str(get_topic_for_event(ev.event_type)) + obj = await m.deserialize_event(data, topic=topic) assert isinstance(obj, PodCreatedEvent) assert obj.namespace == "n" diff --git 
a/backend/tests/integration/events/test_schema_registry_roundtrip.py b/backend/tests/integration/events/test_schema_registry_roundtrip.py index 00cc2784..f23b2fe6 100644 --- a/backend/tests/integration/events/test_schema_registry_roundtrip.py +++ b/backend/tests/integration/events/test_schema_registry_roundtrip.py @@ -2,6 +2,7 @@ import pytest from app.events.schema.schema_registry import MAGIC_BYTE, SchemaRegistryManager +from app.infrastructure.kafka.mappings import get_topic_for_event from app.settings import Settings from dishka import AsyncContainer @@ -19,7 +20,8 @@ async def test_schema_registry_serialize_deserialize_roundtrip(scope: AsyncConta ev = make_execution_requested_event(execution_id="e-rt") data = await reg.serialize_event(ev) assert data.startswith(MAGIC_BYTE) - back = await reg.deserialize_event(data, topic=str(ev.topic)) + topic = str(get_topic_for_event(ev.event_type)) + back = await reg.deserialize_event(data, topic=topic) assert back.event_id == ev.event_id and getattr(back, "execution_id", None) == ev.execution_id # initialize_schemas should be a no-op if already initialized; call to exercise path diff --git a/backend/tests/integration/idempotency/test_consumer_idempotent.py b/backend/tests/integration/idempotency/test_consumer_idempotent.py index b98db675..5e95eadb 100644 --- a/backend/tests/integration/idempotency/test_consumer_idempotent.py +++ b/backend/tests/integration/idempotency/test_consumer_idempotent.py @@ -5,17 +5,16 @@ import pytest from app.domain.enums.events import EventType from app.domain.enums.kafka import KafkaTopic +from app.domain.events.typed import DomainEvent from app.events.core import ConsumerConfig, EventDispatcher, UnifiedConsumer, UnifiedProducer from app.events.core.dispatcher import EventDispatcher as Disp from app.events.schema.schema_registry import SchemaRegistryManager -from app.infrastructure.kafka.events.base import BaseEvent from app.services.idempotency.idempotency_manager import IdempotencyManager from app.services.idempotency.middleware import IdempotentConsumerWrapper from app.settings import Settings from dishka import AsyncContainer from tests.helpers import make_execution_requested_event -from tests.helpers.eventually import eventually # xdist_group: Kafka consumer creation can crash librdkafka when multiple workers # instantiate Consumer() objects simultaneously. Serial execution prevents this. 
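Note: the idempotency-consumer hunks below (and several later test diffs) replace the deleted `tests/helpers/eventually.py` polling helper with an `asyncio.Future` that the registered handler resolves, and they produce messages before the consumer starts so `auto_offset_reset="earliest"` picks them up. A minimal, self-contained sketch of that wait pattern, with illustrative names only (not the project's real fixtures):

```python
import asyncio

# Sketch of the Future-based wait adopted by the updated tests; `handler` and
# `example` are illustrative names, not project code.
async def example() -> None:
    loop = asyncio.get_running_loop()
    handled: asyncio.Future[str] = loop.create_future()

    async def handler(event_id: str) -> None:
        # In the real tests this is registered via @dispatcher.register(EventType...)
        if not handled.done():
            handled.set_result(event_id)

    # Simulate the consumer delivering the same event twice; an idempotency
    # wrapper would suppress the second delivery in the real test.
    await handler("evt-1")
    await handler("evt-1")

    # True async wait: resolves as soon as the first delivery lands, and raises
    # asyncio.TimeoutError instead of silently polling past the deadline.
    event_id = await asyncio.wait_for(handled, timeout=5.0)
    assert event_id == "evt-1"

asyncio.run(example())
```

Compared with a poll-and-retry loop, awaiting the future surfaces failures as a timeout immediately and removes the fixed sleep/interval tuning the old helper required.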
@@ -36,13 +35,24 @@ async def test_consumer_idempotent_wrapper_blocks_duplicates(scope: AsyncContain registry: SchemaRegistryManager = await scope.get(SchemaRegistryManager) settings: Settings = await scope.get(Settings) - # Build a dispatcher with a counter - disp: Disp = EventDispatcher(logger=_test_logger) + # Future resolves when handler processes an event - no polling needed + handled_future: asyncio.Future[None] = asyncio.get_running_loop().create_future() seen = {"n": 0} + # Build a dispatcher that signals completion via future + disp: Disp = EventDispatcher(logger=_test_logger) + @disp.register(EventType.EXECUTION_REQUESTED) - async def handle(_ev: BaseEvent) -> None: + async def handle(_ev: DomainEvent) -> None: seen["n"] += 1 + if not handled_future.done(): + handled_future.set_result(None) + + # Produce messages BEFORE starting consumer (auto_offset_reset="earliest" will read them) + execution_id = f"e-{uuid.uuid4().hex[:8]}" + ev = make_execution_requested_event(execution_id=execution_id) + await producer.produce(ev, key=execution_id) + await producer.produce(ev, key=execution_id) # Real consumer with idempotent wrapper cfg = ConsumerConfig( @@ -68,18 +78,9 @@ async def handle(_ev: BaseEvent) -> None: ) await wrapper.start([KafkaTopic.EXECUTION_EVENTS]) - # Allow time for consumer to join group and get partition assignments - await asyncio.sleep(2) try: - # Produce the same event twice (same event_id) - execution_id = f"e-{uuid.uuid4().hex[:8]}" - ev = make_execution_requested_event(execution_id=execution_id) - await producer.produce(ev, key=execution_id) - await producer.produce(ev, key=execution_id) - - async def _one() -> None: - assert seen["n"] >= 1 - - await eventually(_one, timeout=10.0, interval=0.2) + # Await the future directly - true async, no polling + await asyncio.wait_for(handled_future, timeout=10.0) + assert seen["n"] >= 1 finally: await wrapper.stop() diff --git a/backend/tests/integration/idempotency/test_decorator_idempotent.py b/backend/tests/integration/idempotency/test_decorator_idempotent.py index ffaa51ab..65e5b8b8 100644 --- a/backend/tests/integration/idempotency/test_decorator_idempotent.py +++ b/backend/tests/integration/idempotency/test_decorator_idempotent.py @@ -1,7 +1,7 @@ import logging import pytest -from app.infrastructure.kafka.events.base import BaseEvent +from app.domain.events.typed import DomainEvent from app.services.idempotency.idempotency_manager import IdempotencyManager from app.services.idempotency.middleware import idempotent_handler from dishka import AsyncContainer @@ -21,7 +21,7 @@ async def test_decorator_blocks_duplicate_event(scope: AsyncContainer) -> None: calls = {"n": 0} @idempotent_handler(idempotency_manager=idm, key_strategy="event_based", logger=_test_logger) - async def h(ev: BaseEvent) -> None: + async def h(ev: DomainEvent) -> None: calls["n"] += 1 ev = make_execution_requested_event(execution_id="exec-deco-1") @@ -37,11 +37,11 @@ async def test_decorator_custom_key_blocks(scope: AsyncContainer) -> None: calls = {"n": 0} - def fixed_key(_ev: BaseEvent) -> str: + def fixed_key(_ev: DomainEvent) -> str: return "fixed-key" @idempotent_handler(idempotency_manager=idm, key_strategy="custom", custom_key_func=fixed_key, logger=_test_logger) - async def h(ev: BaseEvent) -> None: + async def h(ev: DomainEvent) -> None: calls["n"] += 1 e1 = make_execution_requested_event(execution_id="exec-deco-2a") diff --git a/backend/tests/integration/idempotency/test_idempotency.py 
b/backend/tests/integration/idempotency/test_idempotency.py index 5011c110..cc5017e4 100644 --- a/backend/tests/integration/idempotency/test_idempotency.py +++ b/backend/tests/integration/idempotency/test_idempotency.py @@ -8,8 +8,8 @@ import pytest import redis.asyncio as redis +from app.domain.events.typed import DomainEvent from app.domain.idempotency import IdempotencyRecord, IdempotencyStatus -from app.infrastructure.kafka.events.base import BaseEvent from app.services.idempotency.idempotency_manager import IdempotencyConfig, IdempotencyManager from app.services.idempotency.middleware import IdempotentEventHandler, idempotent_handler from app.services.idempotency.redis_repository import RedisIdempotencyRepository @@ -268,9 +268,9 @@ async def manager(self, redis_client: redis.Redis) -> AsyncGenerator[Idempotency @pytest.mark.asyncio async def test_handler_processes_new_event(self, manager: IdempotencyManager) -> None: """Test that handler processes new events""" - processed_events: list[BaseEvent] = [] + processed_events: list[DomainEvent] = [] - async def actual_handler(event: BaseEvent) -> None: + async def actual_handler(event: DomainEvent) -> None: processed_events.append(event) # Create idempotent handler @@ -292,9 +292,9 @@ async def actual_handler(event: BaseEvent) -> None: @pytest.mark.asyncio async def test_handler_blocks_duplicate(self, manager: IdempotencyManager) -> None: """Test that handler blocks duplicate events""" - processed_events: list[BaseEvent] = [] + processed_events: list[DomainEvent] = [] - async def actual_handler(event: BaseEvent) -> None: + async def actual_handler(event: DomainEvent) -> None: processed_events.append(event) # Create idempotent handler @@ -317,7 +317,7 @@ async def actual_handler(event: BaseEvent) -> None: async def test_handler_with_failure(self, manager: IdempotencyManager) -> None: """Test handler marks failure on exception""" - async def failing_handler(event: BaseEvent) -> None: # noqa: ARG001 + async def failing_handler(event: DomainEvent) -> None: # noqa: ARG001 raise ValueError("Processing failed") handler = IdempotentEventHandler( @@ -343,12 +343,12 @@ async def failing_handler(event: BaseEvent) -> None: # noqa: ARG001 @pytest.mark.asyncio async def test_handler_duplicate_callback(self, manager: IdempotencyManager) -> None: """Test duplicate callback is invoked""" - duplicate_events: list[tuple[BaseEvent, Any]] = [] + duplicate_events: list[tuple[DomainEvent, Any]] = [] - async def actual_handler(event: BaseEvent) -> None: # noqa: ARG001 + async def actual_handler(event: DomainEvent) -> None: # noqa: ARG001 pass # Do nothing - async def on_duplicate(event: BaseEvent, result: Any) -> None: + async def on_duplicate(event: DomainEvent, result: Any) -> None: duplicate_events.append((event, result)) handler = IdempotentEventHandler( @@ -372,7 +372,7 @@ async def on_duplicate(event: BaseEvent, result: Any) -> None: @pytest.mark.asyncio async def test_decorator_integration(self, manager: IdempotencyManager) -> None: """Test the @idempotent_handler decorator""" - processed_events: list[BaseEvent] = [] + processed_events: list[DomainEvent] = [] @idempotent_handler( idempotency_manager=manager, @@ -380,7 +380,7 @@ async def test_decorator_integration(self, manager: IdempotencyManager) -> None: ttl_seconds=300, logger=_test_logger, ) - async def my_handler(event: BaseEvent) -> None: + async def my_handler(event: DomainEvent) -> None: processed_events.append(event) # Process same event twice @@ -406,11 +406,11 @@ async def 
test_custom_key_function(self, manager: IdempotencyManager) -> None: """Test handler with custom key function""" processed_scripts: list[str] = [] - async def process_script(event: BaseEvent) -> None: + async def process_script(event: DomainEvent) -> None: script: str = getattr(event, "script", "") processed_scripts.append(script) - def extract_script_key(event: BaseEvent) -> str: + def extract_script_key(event: DomainEvent) -> str: # Custom key based on script content only script: str = getattr(event, "script", "") return f"script:{hash(script)}" diff --git a/backend/tests/integration/idempotency/test_idempotent_handler.py b/backend/tests/integration/idempotency/test_idempotent_handler.py index 551c9d2c..c7ef5730 100644 --- a/backend/tests/integration/idempotency/test_idempotent_handler.py +++ b/backend/tests/integration/idempotency/test_idempotent_handler.py @@ -1,7 +1,7 @@ import logging import pytest -from app.infrastructure.kafka.events.base import BaseEvent +from app.domain.events.typed import DomainEvent from app.services.idempotency.idempotency_manager import IdempotencyManager from app.services.idempotency.middleware import IdempotentEventHandler from dishka import AsyncContainer @@ -19,7 +19,7 @@ async def test_idempotent_handler_blocks_duplicates(scope: AsyncContainer) -> No processed: list[str | None] = [] - async def _handler(ev: BaseEvent) -> None: + async def _handler(ev: DomainEvent) -> None: processed.append(ev.event_id) handler = IdempotentEventHandler( @@ -43,7 +43,7 @@ async def test_idempotent_handler_content_hash_blocks_same_content(scope: AsyncC processed: list[str] = [] - async def _handler(ev: BaseEvent) -> None: + async def _handler(ev: DomainEvent) -> None: processed.append(getattr(ev, "execution_id", "")) handler = IdempotentEventHandler( diff --git a/backend/tests/integration/notifications/test_notification_sse.py b/backend/tests/integration/notifications/test_notification_sse.py index b8145075..10327629 100644 --- a/backend/tests/integration/notifications/test_notification_sse.py +++ b/backend/tests/integration/notifications/test_notification_sse.py @@ -1,3 +1,4 @@ +import asyncio from uuid import uuid4 import pytest @@ -7,8 +8,6 @@ from app.services.sse.redis_bus import SSERedisBus from dishka import AsyncContainer -from tests.helpers.eventually import eventually - pytestmark = [pytest.mark.integration, pytest.mark.redis] @@ -25,6 +24,7 @@ async def test_in_app_notification_published_to_sse(scope: AsyncContainer) -> No await svc.update_subscription(user_id, NotificationChannel.IN_APP, True) # Create notification via service (IN_APP channel triggers SSE publish) + # Delivery is now awaited synchronously - message in Redis immediately await svc.create_notification( user_id=user_id, subject="Hello", @@ -34,14 +34,10 @@ async def test_in_app_notification_published_to_sse(scope: AsyncContainer) -> No channel=NotificationChannel.IN_APP, ) - # Receive published SSE payload - async def _recv() -> RedisNotificationMessage: - m = await sub.get(RedisNotificationMessage) - assert m is not None - return m - - msg = await eventually(_recv, timeout=5.0, interval=0.1) + # Await the subscription directly - true async, no polling + msg = await asyncio.wait_for(sub.get(RedisNotificationMessage), timeout=5.0) # Basic shape assertions + assert msg is not None assert msg.subject == "Hello" assert msg.body == "World" assert msg.notification_id diff --git a/backend/tests/integration/result_processor/test_result_processor.py 
b/backend/tests/integration/result_processor/test_result_processor.py index de4eaaae..08a44a37 100644 --- a/backend/tests/integration/result_processor/test_result_processor.py +++ b/backend/tests/integration/result_processor/test_result_processor.py @@ -8,22 +8,18 @@ from app.domain.enums.events import EventType from app.domain.enums.execution import ExecutionStatus from app.domain.enums.kafka import KafkaTopic +from app.domain.events.typed import EventMetadata, ExecutionCompletedEvent, ResultStoredEvent from app.domain.execution import DomainExecutionCreate from app.domain.execution.models import ResourceUsageDomain from app.events.core import UnifiedConsumer, UnifiedProducer from app.events.core.dispatcher import EventDispatcher from app.events.core.types import ConsumerConfig from app.events.schema.schema_registry import SchemaRegistryManager, initialize_event_schemas -from app.infrastructure.kafka.events.execution import ExecutionCompletedEvent -from app.infrastructure.kafka.events.metadata import AvroEventMetadata -from app.infrastructure.kafka.events.system import ResultStoredEvent from app.services.idempotency import IdempotencyManager from app.services.result_processor.processor import ResultProcessor from app.settings import Settings from dishka import AsyncContainer -from tests.helpers.eventually import eventually - # xdist_group: Kafka consumer creation can crash librdkafka when multiple workers # instantiate Consumer() objects simultaneously. Serial execution prevents this. pytestmark = [ @@ -74,8 +70,9 @@ async def test_result_processor_persists_and_emits(scope: AsyncContainer) -> Non stored_received = asyncio.Event() @dispatcher.register(EventType.RESULT_STORED) - async def _stored(_event: ResultStoredEvent) -> None: - stored_received.set() + async def _stored(event: ResultStoredEvent) -> None: + if event.execution_id == execution_id: + stored_received.set() group_id = f"rp-test.{uuid.uuid4().hex[:6]}" cconf = ConsumerConfig( @@ -91,36 +88,37 @@ async def _stored(_event: ResultStoredEvent) -> None: settings=settings, logger=_test_logger, ) + + # Produce the event BEFORE starting consumers (auto_offset_reset="earliest" will read it) + usage = ResourceUsageDomain( + execution_time_wall_seconds=0.5, + cpu_time_jiffies=100, + clk_tck_hertz=100, + peak_memory_kb=1024, + ) + evt = ExecutionCompletedEvent( + execution_id=execution_id, + exit_code=0, + stdout="hello", + stderr="", + resource_usage=usage, + metadata=EventMetadata(service_name="tests", service_version="1.0.0"), + ) + await producer.produce(evt, key=execution_id) + + # Start consumers after producing await stored_consumer.start([KafkaTopic.EXECUTION_RESULTS]) try: async with processor: - # Emit a completed event - usage = ResourceUsageDomain( - execution_time_wall_seconds=0.5, - cpu_time_jiffies=100, - clk_tck_hertz=100, - peak_memory_kb=1024, + # Await the ResultStoredEvent - signals that processing is complete + await asyncio.wait_for(stored_received.wait(), timeout=12.0) + + # Now verify DB persistence - should be done since event was emitted + doc = await db.get_collection("executions").find_one({"execution_id": execution_id}) + assert doc is not None, f"Execution {execution_id} not found in DB after ResultStoredEvent" + assert doc.get("status") == ExecutionStatus.COMPLETED, ( + f"Expected COMPLETED status, got {doc.get('status')}" ) - evt = ExecutionCompletedEvent( - execution_id=execution_id, - exit_code=0, - stdout="hello", - stderr="", - resource_usage=usage, - metadata=AvroEventMetadata(service_name="tests", 
service_version="1.0.0"), - ) - await producer.produce(evt, key=execution_id) - - # Wait for DB persistence (event-driven polling) - async def _persisted() -> None: - doc = await db.get_collection("executions").find_one({"execution_id": execution_id}) - assert doc is not None - assert doc.get("status") == ExecutionStatus.COMPLETED - - await eventually(_persisted, timeout=12.0, interval=0.2) - - # Wait for result stored event - await asyncio.wait_for(stored_received.wait(), timeout=10.0) finally: await stored_consumer.stop() diff --git a/backend/tests/integration/services/coordinator/test_execution_coordinator.py b/backend/tests/integration/services/coordinator/test_execution_coordinator.py index c00ff263..c3d3ed61 100644 --- a/backend/tests/integration/services/coordinator/test_execution_coordinator.py +++ b/backend/tests/integration/services/coordinator/test_execution_coordinator.py @@ -2,7 +2,6 @@ from app.services.coordinator.coordinator import ExecutionCoordinator from dishka import AsyncContainer from tests.helpers import make_execution_requested_event -from tests.helpers.eventually import eventually pytestmark = pytest.mark.integration @@ -12,10 +11,8 @@ async def test_handle_requested_and_schedule(scope: AsyncContainer) -> None: coord: ExecutionCoordinator = await scope.get(ExecutionCoordinator) ev = make_execution_requested_event(execution_id="e-real-1") + # Handler now schedules immediately - no polling needed await coord._handle_execution_requested(ev) # noqa: SLF001 - # Coordinator's background loop schedules executions automatically - async def is_active() -> None: - assert "e-real-1" in coord._active_executions # noqa: SLF001 - - await eventually(is_active, timeout=2.0, interval=0.05) + # Execution should be active immediately after handler returns + assert "e-real-1" in coord._active_executions # noqa: SLF001 diff --git a/backend/tests/integration/services/events/test_event_bus.py b/backend/tests/integration/services/events/test_event_bus.py index 189448c5..6f17670b 100644 --- a/backend/tests/integration/services/events/test_event_bus.py +++ b/backend/tests/integration/services/events/test_event_bus.py @@ -1,3 +1,4 @@ +import asyncio from datetime import datetime, timezone from uuid import uuid4 @@ -7,7 +8,6 @@ from app.services.event_bus import EventBusEvent, EventBusManager from app.settings import Settings from dishka import AsyncContainer -from tests.helpers.eventually import eventually pytestmark = pytest.mark.integration @@ -18,15 +18,16 @@ async def test_event_bus_publish_subscribe(scope: AsyncContainer, test_settings: manager: EventBusManager = await scope.get(EventBusManager) bus = await manager.get_event_bus() - received: list[EventBusEvent] = [] + # Future resolves when handler receives the event - no polling needed + received_future: asyncio.Future[EventBusEvent] = asyncio.get_running_loop().create_future() async def handler(event: EventBusEvent) -> None: - received.append(event) + if not received_future.done(): + received_future.set_result(event) await bus.subscribe("test.*", handler) - # EventBus filters self-published messages (designed for cross-instance communication). - # Simulate a message from another instance by producing directly to Kafka. 
+ # Simulate message from another instance by producing directly to Kafka event = EventBusEvent( id=str(uuid4()), event_type="test.created", @@ -42,12 +43,11 @@ async def handler(event: EventBusEvent) -> None: topic=topic, value=event.model_dump_json().encode("utf-8"), key=b"test.created", - headers=[("source_instance", b"other-instance")], # Different instance + headers=[("source_instance", b"other-instance")], ) finally: await producer.stop() - async def _received() -> None: - assert any(e.event_type == "test.created" for e in received) - - await eventually(_received, timeout=5.0, interval=0.1) + # Await the future directly - true async, no polling + received = await asyncio.wait_for(received_future, timeout=10.0) + assert received.event_type == "test.created" diff --git a/backend/tests/integration/services/sse/test_partitioned_event_router.py b/backend/tests/integration/services/sse/test_partitioned_event_router.py index cc8ab4b8..15b0ec63 100644 --- a/backend/tests/integration/services/sse/test_partitioned_event_router.py +++ b/backend/tests/integration/services/sse/test_partitioned_event_router.py @@ -1,3 +1,4 @@ +import asyncio import logging from uuid import uuid4 @@ -12,7 +13,6 @@ from app.settings import Settings from tests.helpers import make_execution_requested_event -from tests.helpers.eventually import eventually pytestmark = [pytest.mark.integration, pytest.mark.redis] @@ -46,12 +46,9 @@ async def test_router_bridges_to_redis(redis_client: redis.Redis, test_settings: handler = disp.get_handlers(ev.event_type)[0] await handler(ev) - async def _recv() -> RedisSSEMessage: - m = await subscription.get(RedisSSEMessage) - assert m is not None - return m - - msg = await eventually(_recv, timeout=2.0, interval=0.05) + # Await the subscription directly - true async, no polling + msg = await asyncio.wait_for(subscription.get(RedisSSEMessage), timeout=2.0) + assert msg is not None assert str(msg.event_type) == str(ev.event_type) diff --git a/backend/tests/integration/services/sse/test_redis_bus.py b/backend/tests/integration/services/sse/test_redis_bus.py index c2148c7c..b22c5c1e 100644 --- a/backend/tests/integration/services/sse/test_redis_bus.py +++ b/backend/tests/integration/services/sse/test_redis_bus.py @@ -1,15 +1,13 @@ import asyncio import logging from datetime import datetime, timezone -from typing import Any, ClassVar, cast +from typing import Any, cast import pytest import redis.asyncio as redis_async from app.domain.enums.events import EventType -from app.domain.enums.kafka import KafkaTopic from app.domain.enums.notification import NotificationSeverity, NotificationStatus -from app.infrastructure.kafka.events import BaseEvent -from app.infrastructure.kafka.events.metadata import AvroEventMetadata +from app.domain.events.typed import EventMetadata, ExecutionCompletedEvent from app.schemas_pydantic.sse import RedisNotificationMessage, RedisSSEMessage from app.services.sse.redis_bus import SSERedisBus @@ -18,16 +16,6 @@ _test_logger = logging.getLogger("test.services.sse.redis_bus") -class _DummyEvent(BaseEvent): - """Dummy event for testing.""" - execution_id: str = "" - status: str | None = None - topic: ClassVar[KafkaTopic] = KafkaTopic.EXECUTION_EVENTS - - def model_dump(self, **kwargs: object) -> dict[str, Any]: - return {"execution_id": self.execution_id, "status": self.status} - - class _FakePubSub: def __init__(self) -> None: self.subscribed: set[str] = set() @@ -72,8 +60,8 @@ def pubsub(self) -> _FakePubSub: return self._pubsub -def _make_metadata() -> 
AvroEventMetadata: - return AvroEventMetadata(service_name="test", service_version="1.0") +def _make_metadata() -> EventMetadata: + return EventMetadata(service_name="test", service_version="1.0") @pytest.mark.asyncio @@ -87,11 +75,12 @@ async def test_publish_and_subscribe_round_trip() -> None: assert "sse:exec:exec-1" in r._pubsub.subscribed # Publish event - evt = _DummyEvent( - event_type=EventType.EXECUTION_COMPLETED, - metadata=_make_metadata(), + evt = ExecutionCompletedEvent( execution_id="exec-1", - status="completed" + exit_code=0, + stdout="", + stderr="", + metadata=_make_metadata(), ) await bus.publish_event("exec-1", evt) assert r.published, "nothing published" diff --git a/backend/tests/integration/test_dlq_routes.py b/backend/tests/integration/test_dlq_routes.py index 8d29b929..ef59fd82 100644 --- a/backend/tests/integration/test_dlq_routes.py +++ b/backend/tests/integration/test_dlq_routes.py @@ -2,7 +2,7 @@ from typing import TypedDict import pytest -from app.dlq import DLQMessageStatus +from app.dlq import AgeStatistics, DLQMessageStatus, EventTypeStatistic, TopicStatistic from app.schemas_pydantic.dlq import ( DLQBatchRetryResponse, DLQMessageDetail, @@ -47,39 +47,33 @@ async def test_get_dlq_statistics(self, test_user: AsyncClient) -> None: stats_data = response.json() stats = DLQStats(**stats_data) - # Verify structure + # Verify structure - using typed models assert isinstance(stats.by_status, dict) assert isinstance(stats.by_topic, list) assert isinstance(stats.by_event_type, list) - assert isinstance(stats.age_stats, dict) + assert isinstance(stats.age_stats, AgeStatistics) assert stats.timestamp is not None - # Check status breakdown - for status in ["pending", "retrying", "failed", "discarded"]: + # Check status breakdown - iterate over actual enum values + for status in DLQMessageStatus: if status in stats.by_status: assert isinstance(stats.by_status[status], int) assert stats.by_status[status] >= 0 - # Check topic stats + # Check topic stats - now typed as TopicStatistic for topic_stat in stats.by_topic: - assert "topic" in topic_stat - assert "count" in topic_stat - assert isinstance(topic_stat["count"], int) - assert topic_stat["count"] >= 0 + assert isinstance(topic_stat, TopicStatistic) + assert topic_stat.count >= 0 - # Check event type stats + # Check event type stats - now typed as EventTypeStatistic for event_type_stat in stats.by_event_type: - assert "event_type" in event_type_stat - assert "count" in event_type_stat - assert isinstance(event_type_stat["count"], int) - assert event_type_stat["count"] >= 0 - - # Check age stats - if stats.age_stats: - for key in ["min", "max", "avg", "median"]: - if key in stats.age_stats: - assert isinstance(stats.age_stats[key], (int, float)) - assert stats.age_stats[key] >= 0 + assert isinstance(event_type_stat, EventTypeStatistic) + assert event_type_stat.count >= 0 + + # Check age stats - now typed as AgeStatistics + assert stats.age_stats.min_age_seconds >= 0 + assert stats.age_stats.max_age_seconds >= 0 + assert stats.age_stats.avg_age_seconds >= 0 @pytest.mark.asyncio async def test_list_dlq_messages(self, test_user: AsyncClient) -> None: @@ -102,17 +96,13 @@ async def test_list_dlq_messages(self, test_user: AsyncClient) -> None: # If there are messages, validate their structure for message in messages_response.messages: assert isinstance(message, DLQMessageResponse) - assert message.event_id is not None - assert message.event_type is not None + assert message.event.event_id is not None + assert 
message.event.event_type is not None assert message.original_topic is not None assert message.retry_count >= 0 assert message.failed_at is not None assert message.status in DLQMessageStatus.__members__.values() - # Check age_seconds is reasonable - if message.age_seconds is not None: - assert message.age_seconds >= 0 - @pytest.mark.asyncio async def test_filter_dlq_messages_by_status(self, test_user: AsyncClient) -> None: """Test filtering DLQ messages by status.""" @@ -162,11 +152,10 @@ async def test_get_single_dlq_message_detail(self, test_user: AsyncClient) -> No detail_data = detail_response.json() message_detail = DLQMessageDetail(**detail_data) - # Verify all fields are present - assert message_detail.event_id == event_id + # Verify all fields are present - event is DomainEvent with event_id/event_type assert message_detail.event is not None - assert isinstance(message_detail.event, dict) - assert message_detail.event_type is not None + assert message_detail.event.event_id == event_id + assert message_detail.event.event_type is not None assert message_detail.original_topic is not None assert message_detail.error is not None assert message_detail.retry_count >= 0 @@ -197,7 +186,7 @@ async def test_get_nonexistent_dlq_message(self, test_user: AsyncClient) -> None @pytest.mark.asyncio async def test_set_retry_policy( - self, test_user: AsyncClient, test_settings: Settings + self, test_user: AsyncClient, test_settings: Settings ) -> None: """Test setting a retry policy for a topic.""" # Set retry policy @@ -351,8 +340,8 @@ async def test_dlq_message_pagination(self, test_user: AsyncClient) -> None: # Messages should be different if page1.messages and page2.messages: - page1_ids = {msg.event_id for msg in page1.messages} - page2_ids = {msg.event_id for msg in page2.messages} + page1_ids = {msg.event.event_id for msg in page1.messages} + page2_ids = {msg.event.event_id for msg in page2.messages} # Should have no overlap assert len(page1_ids.intersection(page2_ids)) == 0 diff --git a/backend/tests/integration/test_notifications_routes.py b/backend/tests/integration/test_notifications_routes.py index 5eea763c..bac015cc 100644 --- a/backend/tests/integration/test_notifications_routes.py +++ b/backend/tests/integration/test_notifications_routes.py @@ -15,8 +15,6 @@ from dishka import AsyncContainer from httpx import AsyncClient -from tests.helpers.eventually import eventually - @pytest.mark.integration class TestNotificationRoutes: @@ -150,6 +148,7 @@ async def test_mark_all_notifications_as_read( user_id = me_response.json()["user_id"] notification_service = await scope.get(NotificationService) + # Delivery is now awaited synchronously - notification available immediately await notification_service.create_notification( user_id=user_id, subject="Test notification", @@ -159,18 +158,11 @@ async def test_mark_all_notifications_as_read( channel=NotificationChannel.IN_APP, ) - # Wait for async delivery to complete (create_notification uses asyncio.create_task) - async def _has_unread() -> None: - resp = await test_user.get("/api/v1/notifications/unread-count") - assert resp.status_code == 200 - assert resp.json()["unread_count"] >= 1 - - await eventually(_has_unread, timeout=5.0, interval=0.1) - - # Get initial unread count (guaranteed >= 1 now) + # Get initial unread count (notification created synchronously, available now) initial_response = await test_user.get("/api/v1/notifications/unread-count") assert initial_response.status_code == 200 initial_count = initial_response.json()["unread_count"] 
+ assert initial_count >= 1, "Expected at least one unread notification after create" # Mark all as read mark_all_response = await test_user.post("/api/v1/notifications/mark-all-read") diff --git a/backend/tests/integration/test_replay_routes.py b/backend/tests/integration/test_replay_routes.py index 219e6697..7ef221a5 100644 --- a/backend/tests/integration/test_replay_routes.py +++ b/backend/tests/integration/test_replay_routes.py @@ -9,8 +9,6 @@ from app.schemas_pydantic.replay_models import ReplaySession from httpx import AsyncClient -from tests.helpers.eventually import eventually - @pytest.mark.integration class TestReplayRoutes: @@ -389,16 +387,14 @@ async def test_replay_session_progress_tracking(self, test_admin: AsyncClient) - # Start the session await test_admin.post(f"/api/v1/replay/sessions/{session_id}/start") - # Poll progress without fixed sleeps - async def _check_progress_once() -> None: - detail_response = await test_admin.get(f"/api/v1/replay/sessions/{session_id}") - assert detail_response.status_code == 200 - session_data = detail_response.json() - session = ReplaySession(**session_data) - if session.replayed_events is not None and session.total_events is not None: - assert 0 <= session.replayed_events <= session.total_events - if session.total_events > 0: - progress = (session.replayed_events / session.total_events) * 100 - assert 0.0 <= progress <= 100.0 - - await eventually(_check_progress_once, timeout=5.0, interval=0.5) + # Check progress immediately - session state available right after start + detail_response = await test_admin.get(f"/api/v1/replay/sessions/{session_id}") + assert detail_response.status_code == 200 + session_data = detail_response.json() + session = ReplaySession(**session_data) + # Validate progress fields are present and valid + if session.replayed_events is not None and session.total_events is not None: + assert 0 <= session.replayed_events <= session.total_events + if session.total_events > 0: + progress = (session.replayed_events / session.total_events) * 100 + assert 0.0 <= progress <= 100.0 diff --git a/backend/tests/integration/test_sse_routes.py b/backend/tests/integration/test_sse_routes.py index 9339219e..9a902a54 100644 --- a/backend/tests/integration/test_sse_routes.py +++ b/backend/tests/integration/test_sse_routes.py @@ -10,8 +10,7 @@ from app.domain.enums.events import EventType from app.domain.enums.notification import NotificationSeverity, NotificationStatus from app.domain.enums.sse import SSEControlEvent, SSENotificationEvent -from app.infrastructure.kafka.events.metadata import AvroEventMetadata -from app.infrastructure.kafka.events.pod import PodCreatedEvent +from app.domain.events.typed import EventMetadata, PodCreatedEvent from app.schemas_pydantic.sse import ( RedisNotificationMessage, RedisSSEMessage, @@ -69,7 +68,7 @@ async def test_publish_wraps_event_in_redis_message(self, scope: AsyncContainer) execution_id=exec_id, pod_name="test-pod", namespace="test-ns", - metadata=AvroEventMetadata(service_name="test", service_version="1.0"), + metadata=EventMetadata(service_name="test", service_version="1.0"), ) await bus.publish_event(exec_id, event) @@ -96,7 +95,7 @@ async def test_channel_isolation(self, scope: AsyncContainer) -> None: execution_id=exec_a, pod_name="pod-a", namespace="default", - metadata=AvroEventMetadata(service_name="test", service_version="1"), + metadata=EventMetadata(service_name="test", service_version="1"), ) await bus.publish_event(exec_a, event) diff --git 
a/backend/tests/integration/test_user_settings_routes.py b/backend/tests/integration/test_user_settings_routes.py index 024e01ce..9338346f 100644 --- a/backend/tests/integration/test_user_settings_routes.py +++ b/backend/tests/integration/test_user_settings_routes.py @@ -5,8 +5,6 @@ from app.schemas_pydantic.user_settings import SettingsHistoryResponse, UserSettings from httpx import AsyncClient -from tests.helpers.eventually import eventually - class _NotificationSettings(TypedDict): execution_completed: bool @@ -301,16 +299,7 @@ async def test_restore_settings_to_previous_point(self, test_user: AsyncClient) new_theme = "dark" if original_theme != "dark" else "light" await test_user.put("/api/v1/user/settings/theme", json={"theme": new_theme}) - # Ensure restore point is distinct by checking time monotonicity - prev = datetime.now(timezone.utc) - - async def _tick() -> None: - now = datetime.now(timezone.utc) - assert (now - prev).total_seconds() >= 0 - - await eventually(_tick, timeout=0.5, interval=0.05) - - # Get restore point (before the change) + # Get restore point - timestamps are monotonic by definition restore_point = datetime.now(timezone.utc).isoformat() # Make another change diff --git a/backend/tests/unit/domain/events/test_event_schema_coverage.py b/backend/tests/unit/domain/events/test_event_schema_coverage.py index 46a23a98..dd150e3e 100644 --- a/backend/tests/unit/domain/events/test_event_schema_coverage.py +++ b/backend/tests/unit/domain/events/test_event_schema_coverage.py @@ -3,7 +3,7 @@ This test ensures that: 1. Every EventType has a corresponding domain event class (in DomainEvent union) -2. Every EventType has a corresponding Kafka event class (BaseEvent subclass) +2. Every EventType has a corresponding Kafka event class (DomainEvent subclass) 3. No orphan event classes exist (classes without matching EventType) Run this test to catch missing event implementations early. 
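Note: the schema-coverage hunks that follow iterate `BaseEvent.__subclasses__()` and read each class's declared `event_type` to verify every `EventType` has an implementation. A rough sketch of that subclass-coverage idea under assumed, simplified models (the real `BaseEvent`/`DomainEvent` union lives in `app.domain.events.typed`):

```python
from enum import Enum
from pydantic import BaseModel

# Assumed, simplified stand-ins for the project's event models, used only to
# show the shape of the subclass-coverage check; not the real classes.
class EventType(str, Enum):
    EXECUTION_REQUESTED = "execution_requested"
    EXECUTION_COMPLETED = "execution_completed"

class BaseEvent(BaseModel):
    event_type: EventType

class ExecutionRequestedEvent(BaseEvent):
    event_type: EventType = EventType.EXECUTION_REQUESTED

def covered_event_types() -> set[EventType]:
    # Walk direct subclasses and read the declared default of `event_type`,
    # mirroring the model_fields lookup used in the coverage test.
    found: set[EventType] = set()
    for cls in BaseEvent.__subclasses__():
        default = cls.model_fields["event_type"].default
        if isinstance(default, EventType):
            found.add(default)
    return found

missing = set(EventType) - covered_event_types()
assert missing == {EventType.EXECUTION_COMPLETED}  # no class declares it here
```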
@@ -12,10 +12,8 @@ from typing import get_args from app.domain.enums.events import EventType -from app.domain.events.typed import BaseEvent as DomainBaseEvent -from app.domain.events.typed import DomainEvent, domain_event_adapter +from app.domain.events.typed import BaseEvent, DomainEvent, domain_event_adapter from app.events.schema.schema_registry import _get_event_type_to_class_mapping -from app.infrastructure.kafka.events.base import BaseEvent as KafkaBaseEvent def get_domain_event_classes() -> dict[EventType, type]: @@ -35,10 +33,10 @@ def get_domain_event_classes() -> dict[EventType, type]: else: event_classes = [] - # Fallback: iterate through all DomainBaseEvent subclasses + # Fallback: iterate through all BaseEvent subclasses if not event_classes: event_classes = [] - for cls in DomainBaseEvent.__subclasses__(): + for cls in BaseEvent.__subclasses__(): if hasattr(cls, "model_fields") and "event_type" in cls.model_fields: event_classes.append(cls) @@ -52,7 +50,7 @@ def get_domain_event_classes() -> dict[EventType, type]: def get_kafka_event_classes() -> dict[EventType, type]: - """Extract EventType -> class mapping from Kafka BaseEvent subclasses.""" + """Extract EventType -> class mapping from Kafka DomainEvent subclasses.""" return _get_event_type_to_class_mapping() @@ -108,7 +106,7 @@ def test_no_orphan_domain_event_classes(self) -> None: """All domain event classes must have a corresponding EventType.""" orphans: list[str] = [] - for cls in DomainBaseEvent.__subclasses__(): + for cls in BaseEvent.__subclasses__(): # Skip test fixtures/mocks (private classes starting with _) if cls.__name__.startswith("_"): continue @@ -128,7 +126,7 @@ def test_no_orphan_kafka_event_classes(self) -> None: """All Kafka event classes must have a corresponding EventType.""" orphans: list[str] = [] - for cls in KafkaBaseEvent.__subclasses__(): + for cls in BaseEvent.__subclasses__(): # Skip test fixtures/mocks (private classes starting with _) if cls.__name__.startswith("_"): continue diff --git a/backend/tests/unit/events/core/test_consumer_config.py b/backend/tests/unit/events/core/test_consumer_config.py deleted file mode 100644 index 99e1a6bf..00000000 --- a/backend/tests/unit/events/core/test_consumer_config.py +++ /dev/null @@ -1,19 +0,0 @@ - -from app.events.core.types import ConsumerConfig, ProducerConfig - - -def test_producer_config_mapping() -> None: - cfg = ProducerConfig(bootstrap_servers="localhost:9092", client_id="cid") - m = cfg.to_producer_config() - assert m["bootstrap.servers"] == "localhost:9092" - assert m["client.id"] == "cid" - assert m["compression.type"] == "gzip" - - -def test_consumer_config_mapping() -> None: - cfg = ConsumerConfig(bootstrap_servers="localhost:9092", group_id="gid", client_id="cid") - m = cfg.to_consumer_config() - assert m["bootstrap.servers"] == "localhost:9092" - assert m["group.id"] == "gid" - assert m["client.id"] == "cid" - assert m["auto.offset.reset"] == "earliest" diff --git a/backend/tests/unit/events/test_event_dispatcher.py b/backend/tests/unit/events/test_event_dispatcher.py index 344a5a9f..6bda67e8 100644 --- a/backend/tests/unit/events/test_event_dispatcher.py +++ b/backend/tests/unit/events/test_event_dispatcher.py @@ -1,19 +1,19 @@ import logging from app.domain.enums.events import EventType +from app.domain.events.typed import DomainEvent from app.events.core import EventDispatcher -from app.infrastructure.kafka.events.base import BaseEvent from tests.helpers import make_execution_requested_event _test_logger = 
logging.getLogger("test.events.event_dispatcher") -def make_event() -> BaseEvent: +def make_event() -> DomainEvent: return make_execution_requested_event(execution_id="e1") -async def _async_noop(_: BaseEvent) -> None: +async def _async_noop(_: DomainEvent) -> None: return None @@ -34,7 +34,7 @@ def test_decorator_registration() -> None: disp = EventDispatcher(logger=_test_logger) @disp.register(EventType.EXECUTION_REQUESTED) - async def handler(ev: BaseEvent) -> None: # noqa: ARG001 + async def handler(ev: DomainEvent) -> None: # noqa: ARG001 return None assert len(disp.get_handlers(EventType.EXECUTION_REQUESTED)) == 1 @@ -45,7 +45,7 @@ async def test_dispatch_metrics_processed_and_skipped() -> None: called = {"n": 0} @disp.register(EventType.EXECUTION_REQUESTED) - async def handler(_: BaseEvent) -> None: + async def handler(_: DomainEvent) -> None: called["n"] += 1 await disp.dispatch(make_event()) diff --git a/backend/tests/unit/events/test_mappings_and_types.py b/backend/tests/unit/events/test_mappings_and_types.py index 62477f63..cdbefb24 100644 --- a/backend/tests/unit/events/test_mappings_and_types.py +++ b/backend/tests/unit/events/test_mappings_and_types.py @@ -1,6 +1,5 @@ from app.domain.enums.events import EventType from app.domain.enums.kafka import KafkaTopic -from app.events.core import ConsumerConfig, ProducerConfig from app.infrastructure.kafka.mappings import ( get_event_class_for_type, get_event_types_for_topic, @@ -8,51 +7,6 @@ ) -def test_producer_config_mapping() -> None: - cfg = ProducerConfig( - bootstrap_servers="kafka:29092", - client_id="cid", - batch_size=123, - linger_ms=7, - compression_type="gzip", - request_timeout_ms=1111, - retries=2, - enable_idempotence=True, - acks="all", - max_in_flight_requests_per_connection=3, - ) - conf = cfg.to_producer_config() - assert conf["bootstrap.servers"] == "kafka:29092" - assert conf["client.id"] == "cid" - assert conf["batch.size"] == 123 - assert conf["linger.ms"] == 7 - assert conf["compression.type"] == "gzip" - assert conf["enable.idempotence"] is True - - -def test_consumer_config_mapping() -> None: - cfg = ConsumerConfig( - bootstrap_servers="kafka:29092", - group_id="g", - client_id="c", - auto_offset_reset="latest", - enable_auto_commit=False, - session_timeout_ms=12345, - heartbeat_interval_ms=999, - max_poll_interval_ms=555000, - fetch_min_bytes=10, - fetch_max_wait_ms=777, - statistics_interval_ms=60000, - ) - conf = cfg.to_consumer_config() - assert conf["bootstrap.servers"] == "kafka:29092" - assert conf["group.id"] == "g" - assert conf["client.id"] == "c" - assert conf["auto.offset.reset"] == "latest" - assert conf["enable.auto.commit"] is False - assert conf["fetch.wait.max.ms"] == 777 - - def test_event_mappings_topics() -> None: # A few spot checks assert get_topic_for_event(EventType.EXECUTION_REQUESTED) == KafkaTopic.EXECUTION_EVENTS diff --git a/backend/tests/unit/events/test_metadata_model.py b/backend/tests/unit/events/test_metadata_model.py index 71440ce7..f237a263 100644 --- a/backend/tests/unit/events/test_metadata_model.py +++ b/backend/tests/unit/events/test_metadata_model.py @@ -1,22 +1,20 @@ -from app.infrastructure.kafka.events.metadata import AvroEventMetadata +from app.domain.events.typed import EventMetadata -def test_with_correlation() -> None: - m = AvroEventMetadata(service_name="svc", service_version="1") - m2 = m.with_correlation("cid") - assert m2.correlation_id == "cid" - assert m2.service_name == m.service_name # preserves other fields +def test_metadata_creation() -> None: + m = 
EventMetadata(service_name="svc", service_version="1") + assert m.service_name == "svc" + assert m.service_version == "1" + assert m.correlation_id # auto-generated -def test_with_user() -> None: - m = AvroEventMetadata(service_name="svc", service_version="1") - m2 = m.with_user("u1") - assert m2.user_id == "u1" +def test_metadata_with_user() -> None: + m = EventMetadata(service_name="svc", service_version="1", user_id="u1") + assert m.user_id == "u1" -def test_ensure_correlation_id() -> None: - m = AvroEventMetadata(service_name="svc", service_version="1") - # ensure_correlation_id returns self if correlation_id already present - same = m.ensure_correlation_id() - assert same.correlation_id == m.correlation_id - assert m.ensure_correlation_id().correlation_id +def test_metadata_copy_with_correlation() -> None: + m = EventMetadata(service_name="svc", service_version="1") + m2 = m.model_copy(update={"correlation_id": "cid"}) + assert m2.correlation_id == "cid" + assert m2.service_name == m.service_name diff --git a/backend/tests/unit/events/test_schema_registry_manager.py b/backend/tests/unit/events/test_schema_registry_manager.py index 9a5c0a2e..6819237a 100644 --- a/backend/tests/unit/events/test_schema_registry_manager.py +++ b/backend/tests/unit/events/test_schema_registry_manager.py @@ -1,8 +1,8 @@ import logging import pytest +from app.domain.events.typed import ExecutionRequestedEvent from app.events.schema.schema_registry import SchemaRegistryManager -from app.infrastructure.kafka.events.execution import ExecutionRequestedEvent from app.settings import Settings _test_logger = logging.getLogger("test.events.schema_registry_manager") diff --git a/backend/tests/unit/services/coordinator/test_queue_manager.py b/backend/tests/unit/services/coordinator/test_queue_manager.py index b5b88220..b3b87dee 100644 --- a/backend/tests/unit/services/coordinator/test_queue_manager.py +++ b/backend/tests/unit/services/coordinator/test_queue_manager.py @@ -1,7 +1,7 @@ import logging import pytest -from app.infrastructure.kafka.events.execution import ExecutionRequestedEvent +from app.domain.events.typed import ExecutionRequestedEvent from app.services.coordinator.queue_manager import QueueManager, QueuePriority from tests.helpers import make_execution_requested_event diff --git a/backend/tests/unit/services/idempotency/test_idempotency_manager.py b/backend/tests/unit/services/idempotency/test_idempotency_manager.py index 62227363..102aa56c 100644 --- a/backend/tests/unit/services/idempotency/test_idempotency_manager.py +++ b/backend/tests/unit/services/idempotency/test_idempotency_manager.py @@ -2,7 +2,7 @@ from unittest.mock import MagicMock import pytest -from app.infrastructure.kafka.events.base import BaseEvent +from app.domain.events.typed import BaseEvent from app.services.idempotency.idempotency_manager import ( IdempotencyConfig, IdempotencyKeyStrategy, diff --git a/backend/tests/unit/services/idempotency/test_middleware.py b/backend/tests/unit/services/idempotency/test_middleware.py index 4b1125e0..e3f69ece 100644 --- a/backend/tests/unit/services/idempotency/test_middleware.py +++ b/backend/tests/unit/services/idempotency/test_middleware.py @@ -2,8 +2,8 @@ from unittest.mock import AsyncMock, MagicMock import pytest +from app.domain.events.typed import DomainEvent from app.domain.idempotency import IdempotencyStatus -from app.infrastructure.kafka.events.base import BaseEvent from app.services.idempotency.idempotency_manager import IdempotencyManager, IdempotencyResult from 
app.services.idempotency.middleware import ( IdempotentEventHandler, @@ -28,7 +28,7 @@ def mock_handler(self) -> AsyncMock: @pytest.fixture def event(self) -> MagicMock: - event = MagicMock(spec=BaseEvent) + event = MagicMock(spec=DomainEvent) event.event_type = "test.event" event.event_id = "event-123" return event diff --git a/backend/tests/unit/services/pod_monitor/test_event_mapper.py b/backend/tests/unit/services/pod_monitor/test_event_mapper.py index 8d848937..2314de3b 100644 --- a/backend/tests/unit/services/pod_monitor/test_event_mapper.py +++ b/backend/tests/unit/services/pod_monitor/test_event_mapper.py @@ -4,13 +4,13 @@ import pytest from app.domain.enums.events import EventType from app.domain.enums.storage import ExecutionErrorType -from app.infrastructure.kafka.events.execution import ( +from app.domain.events.typed import ( + EventMetadata, ExecutionCompletedEvent, ExecutionFailedEvent, ExecutionTimeoutEvent, + PodRunningEvent, ) -from app.infrastructure.kafka.events.metadata import AvroEventMetadata -from app.infrastructure.kafka.events.pod import PodRunningEvent from app.services.pod_monitor.event_mapper import PodContext, PodEventMapper from tests.helpers.k8s_fakes import ( @@ -31,7 +31,7 @@ def _ctx(pod: Pod, event_type: str = "ADDED") -> PodContext: return PodContext( pod=pod, execution_id="e1", - metadata=AvroEventMetadata(service_name="t", service_version="1"), + metadata=EventMetadata(service_name="t", service_version="1"), phase=pod.status.phase or "", event_type=event_type, ) diff --git a/backend/tests/unit/services/pod_monitor/test_monitor.py b/backend/tests/unit/services/pod_monitor/test_monitor.py index 6f33d44b..ec60121a 100644 --- a/backend/tests/unit/services/pod_monitor/test_monitor.py +++ b/backend/tests/unit/services/pod_monitor/test_monitor.py @@ -8,12 +8,9 @@ from app.core import k8s_clients as k8s_clients_module from app.core.k8s_clients import K8sClients from app.db.repositories.event_repository import EventRepository -from app.domain.events import DomainEvent +from app.domain.events.typed import DomainEvent, EventMetadata, ExecutionCompletedEvent, ExecutionStartedEvent from app.domain.execution.models import ResourceUsageDomain from app.events.core import UnifiedProducer -from app.infrastructure.kafka.events.base import BaseEvent -from app.infrastructure.kafka.events.execution import ExecutionCompletedEvent, ExecutionStartedEvent -from app.infrastructure.kafka.events.metadata import AvroEventMetadata from app.services.kafka_event_service import KafkaEventService from app.services.pod_monitor.config import PodMonitorConfig from app.services.pod_monitor.event_mapper import PodEventMapper @@ -63,11 +60,11 @@ class FakeUnifiedProducer(UnifiedProducer): def __init__(self) -> None: # Don't call super().__init__ - we don't need real Kafka - self.produced_events: list[tuple[BaseEvent, str | None]] = [] + self.produced_events: list[tuple[DomainEvent, str | None]] = [] self.logger = _test_logger async def produce( - self, event_to_produce: BaseEvent, key: str | None = None, headers: dict[str, str] | None = None + self, event_to_produce: DomainEvent, key: str | None = None, headers: dict[str, str] | None = None ) -> None: self.produced_events.append((event_to_produce, key)) @@ -399,7 +396,7 @@ async def test_publish_event_full_flow() -> None: aggregate_id="exec1", exit_code=0, resource_usage=ResourceUsageDomain(), - metadata=AvroEventMetadata(service_name="test", service_version="1.0"), + metadata=EventMetadata(service_name="test", service_version="1.0"), ) pod 
= make_pod(name="test-pod", phase="Succeeded", labels={"execution-id": "exec1"}) @@ -415,7 +412,7 @@ async def test_publish_event_exception_handling() -> None: class FailingProducer(FakeUnifiedProducer): async def produce( - self, event_to_produce: BaseEvent, key: str | None = None, headers: dict[str, str] | None = None + self, event_to_produce: DomainEvent, key: str | None = None, headers: dict[str, str] | None = None ) -> None: raise RuntimeError("Publish failed") @@ -434,7 +431,7 @@ async def produce( event = ExecutionStartedEvent( execution_id="exec1", pod_name="test-pod", - metadata=AvroEventMetadata(service_name="test", service_version="1.0"), + metadata=EventMetadata(service_name="test", service_version="1.0"), ) # Use pod with no metadata to exercise edge case diff --git a/backend/tests/unit/services/result_processor/test_processor.py b/backend/tests/unit/services/result_processor/test_processor.py index 79410f7e..f78cc3bc 100644 --- a/backend/tests/unit/services/result_processor/test_processor.py +++ b/backend/tests/unit/services/result_processor/test_processor.py @@ -3,7 +3,7 @@ import pytest from app.domain.enums.events import EventType -from app.domain.enums.kafka import GroupId, KafkaTopic +from app.domain.enums.kafka import CONSUMER_GROUP_SUBSCRIPTIONS, GroupId, KafkaTopic from app.services.result_processor.processor import ResultProcessor, ResultProcessorConfig pytestmark = pytest.mark.unit @@ -15,9 +15,9 @@ class TestResultProcessorConfig: def test_default_values(self) -> None: config = ResultProcessorConfig() assert config.consumer_group == GroupId.RESULT_PROCESSOR - assert KafkaTopic.EXECUTION_COMPLETED in config.topics - assert KafkaTopic.EXECUTION_FAILED in config.topics - assert KafkaTopic.EXECUTION_TIMEOUT in config.topics + # Topics should match centralized CONSUMER_GROUP_SUBSCRIPTIONS mapping + assert set(config.topics) == CONSUMER_GROUP_SUBSCRIPTIONS[GroupId.RESULT_PROCESSOR] + assert KafkaTopic.EXECUTION_EVENTS in config.topics assert config.result_topic == KafkaTopic.EXECUTION_RESULTS assert config.batch_size == 10 assert config.processing_timeout == 300 diff --git a/backend/tests/unit/services/saga/test_execution_saga_steps.py b/backend/tests/unit/services/saga/test_execution_saga_steps.py index 47327538..8c235076 100644 --- a/backend/tests/unit/services/saga/test_execution_saga_steps.py +++ b/backend/tests/unit/services/saga/test_execution_saga_steps.py @@ -1,9 +1,8 @@ import pytest from app.db.repositories.resource_allocation_repository import ResourceAllocationRepository +from app.domain.events.typed import DomainEvent, ExecutionRequestedEvent from app.domain.saga import DomainResourceAllocation, DomainResourceAllocationCreate from app.events.core import UnifiedProducer -from app.infrastructure.kafka.events import BaseEvent -from app.infrastructure.kafka.events.execution import ExecutionRequestedEvent from app.services.saga.execution_saga import ( AllocateResourcesStep, CreatePodStep, @@ -114,9 +113,9 @@ class _FakeProducer(UnifiedProducer): """Fake UnifiedProducer for testing.""" def __init__(self) -> None: - self.events: list[BaseEvent] = [] + self.events: list[DomainEvent] = [] - async def produce(self, event_to_produce: BaseEvent, key: str | None = None, + async def produce(self, event_to_produce: DomainEvent, key: str | None = None, headers: dict[str, str] | None = None) -> None: self.events.append(event_to_produce) diff --git a/backend/tests/unit/services/saga/test_saga_comprehensive.py b/backend/tests/unit/services/saga/test_saga_comprehensive.py index 
4c7c48f1..14bd756a 100644 --- a/backend/tests/unit/services/saga/test_saga_comprehensive.py +++ b/backend/tests/unit/services/saga/test_saga_comprehensive.py @@ -8,9 +8,8 @@ import pytest from app.domain.enums.events import EventType from app.domain.enums.saga import SagaState +from app.domain.events.typed import DomainEvent, ExecutionRequestedEvent from app.domain.saga.models import Saga -from app.infrastructure.kafka.events.base import BaseEvent -from app.infrastructure.kafka.events.execution import ExecutionRequestedEvent from app.services.saga.execution_saga import ExecutionSaga from app.services.saga.saga_step import CompensationStep, SagaContext, SagaStep @@ -24,12 +23,12 @@ async def compensate(self, context: SagaContext) -> bool: # noqa: ARG002 return True -class _Step(SagaStep[BaseEvent]): +class _Step(SagaStep[DomainEvent]): def __init__(self, name: str, ok: bool = True) -> None: super().__init__(name) self._ok = ok - async def execute(self, context: SagaContext, event: BaseEvent) -> bool: # noqa: ARG002 + async def execute(self, context: SagaContext, event: DomainEvent) -> bool: # noqa: ARG002 return self._ok def get_compensation(self) -> CompensationStep: diff --git a/backend/tests/unit/services/saga/test_saga_orchestrator_unit.py b/backend/tests/unit/services/saga/test_saga_orchestrator_unit.py index 77ed1084..b8e24fb1 100644 --- a/backend/tests/unit/services/saga/test_saga_orchestrator_unit.py +++ b/backend/tests/unit/services/saga/test_saga_orchestrator_unit.py @@ -1,48 +1,29 @@ import logging -from datetime import datetime, timezone -from typing import ClassVar from unittest.mock import MagicMock import pytest from app.db.repositories.resource_allocation_repository import ResourceAllocationRepository from app.db.repositories.saga_repository import SagaRepository from app.domain.enums.events import EventType -from app.domain.enums.kafka import KafkaTopic from app.domain.enums.saga import SagaState +from app.domain.events.typed import DomainEvent, ExecutionRequestedEvent from app.domain.saga.models import Saga, SagaConfig from app.events.core import UnifiedProducer from app.events.event_store import EventStore from app.events.schema.schema_registry import SchemaRegistryManager -from app.infrastructure.kafka.events import BaseEvent -from app.infrastructure.kafka.events.metadata import AvroEventMetadata from app.services.idempotency.idempotency_manager import IdempotencyManager from app.services.saga.base_saga import BaseSaga from app.services.saga.saga_orchestrator import SagaOrchestrator from app.services.saga.saga_step import CompensationStep, SagaContext, SagaStep from app.settings import Settings -from pydantic import Field + +from tests.helpers import make_execution_requested_event pytestmark = pytest.mark.unit _test_logger = logging.getLogger("test.services.saga.orchestrator") -class _FakeEvent(BaseEvent): - """Fake event for testing that extends BaseEvent. - - Note: event_type has no default to avoid polluting the global event type mapping - (which is built from BaseEvent subclasses with default event_type values). 
- """ - - event_type: EventType # No default - set explicitly in _make_event() - execution_id: str = "" - topic: ClassVar[KafkaTopic] = KafkaTopic.EXECUTION_EVENTS - metadata: AvroEventMetadata = Field(default_factory=lambda: AvroEventMetadata( - service_name="test", service_version="1.0", correlation_id="test-corr-id" - )) - timestamp: datetime = Field(default_factory=lambda: datetime.now(timezone.utc)) - - class _FakeRepo(SagaRepository): """Fake SagaRepository for testing.""" @@ -65,7 +46,7 @@ def __init__(self) -> None: pass # Skip parent __init__ async def produce( - self, event_to_produce: BaseEvent, key: str | None = None, headers: dict[str, str] | None = None + self, event_to_produce: DomainEvent, key: str | None = None, headers: dict[str, str] | None = None ) -> None: return None @@ -94,11 +75,11 @@ def __init__(self) -> None: pass # No special attributes needed -class _StepOK(SagaStep[_FakeEvent]): +class _StepOK(SagaStep[ExecutionRequestedEvent]): def __init__(self) -> None: super().__init__("ok") - async def execute(self, context: SagaContext, event: _FakeEvent) -> bool: + async def execute(self, context: SagaContext, event: ExecutionRequestedEvent) -> bool: return True def get_compensation(self) -> CompensationStep | None: @@ -114,7 +95,7 @@ def get_name(cls) -> str: def get_trigger_events(cls) -> list[EventType]: return [EventType.EXECUTION_REQUESTED] - def get_steps(self) -> list[SagaStep[_FakeEvent]]: + def get_steps(self) -> list[SagaStep[ExecutionRequestedEvent]]: return [_StepOK()] @@ -132,17 +113,13 @@ def _orch() -> SagaOrchestrator: ) -def _make_event(et: EventType, execution_id: str) -> _FakeEvent: - return _FakeEvent(event_type=et, execution_id=execution_id) - - @pytest.mark.asyncio async def test_min_success_flow() -> None: orch = _orch() orch.register_saga(_Saga) # Set orchestrator running state via lifecycle property orch._lifecycle_started = True - await orch._handle_event(_make_event(EventType.EXECUTION_REQUESTED, "e")) + await orch._handle_event(make_execution_requested_event(execution_id="e")) # basic sanity; deep behavior covered by integration assert orch.is_running is True @@ -162,9 +139,9 @@ async def test_should_trigger_and_existing_short_circuit() -> None: logger=_test_logger, ) orch.register_saga(_Saga) - assert orch._should_trigger_saga(_Saga, _make_event(EventType.EXECUTION_REQUESTED, "e")) is True + assert orch._should_trigger_saga(_Saga, make_execution_requested_event(execution_id="e")) is True # Existing short-circuit returns existing ID s = Saga(saga_id="sX", saga_name="s", execution_id="e", state=SagaState.RUNNING) fake_repo.existing[("e", "s")] = s - sid = await orch._start_saga("s", _make_event(EventType.EXECUTION_REQUESTED, "e")) + sid = await orch._start_saga("s", make_execution_requested_event(execution_id="e")) assert sid == "sX" diff --git a/backend/tests/unit/services/saga/test_saga_step_and_base.py b/backend/tests/unit/services/saga/test_saga_step_and_base.py index 693832d5..d56acab6 100644 --- a/backend/tests/unit/services/saga/test_saga_step_and_base.py +++ b/backend/tests/unit/services/saga/test_saga_step_and_base.py @@ -3,8 +3,7 @@ import pytest from app.domain.enums.events import EventType -from app.infrastructure.kafka.events import BaseEvent -from app.infrastructure.kafka.events.metadata import AvroEventMetadata +from app.domain.events.typed import EventMetadata, SystemErrorEvent from app.services.saga.base_saga import BaseSaga from app.services.saga.saga_step import CompensationStep, SagaContext, SagaStep @@ -43,12 +42,13 @@ 
async def compensate(self, context: SagaContext) -> bool: # noqa: ARG002 @pytest.mark.asyncio async def test_context_adders() -> None: - class E(BaseEvent): - event_type: EventType = EventType.SYSTEM_ERROR - topic = None # type: ignore[assignment] - ctx = SagaContext("s1", "e1") - evt = E(metadata=AvroEventMetadata(service_name="t", service_version="1")) + evt = SystemErrorEvent( + error_type="test_error", + message="test", + service_name="test_service", + metadata=EventMetadata(service_name="t", service_version="1"), + ) ctx.add_event(evt) assert len(ctx.events) == 1 comp = _DummyComp() @@ -74,15 +74,15 @@ def get_name(cls) -> str: def get_trigger_events(cls) -> list[EventType]: return [] - def get_steps(self) -> list[SagaStep[BaseEvent]]: + def get_steps(self) -> list[SagaStep[SystemErrorEvent]]: return [] Dummy().bind_dependencies() def test_saga_step_str_and_can_execute() -> None: - class S(SagaStep[BaseEvent]): - async def execute(self, context: SagaContext, event: BaseEvent) -> bool: + class S(SagaStep[SystemErrorEvent]): + async def execute(self, context: SagaContext, event: SystemErrorEvent) -> bool: return True def get_compensation(self) -> CompensationStep | None: @@ -91,4 +91,4 @@ def get_compensation(self) -> CompensationStep | None: s = S("nm") assert str(s) == "SagaStep(nm)" # can_execute default True - assert asyncio.run(s.can_execute(SagaContext("s", "e"), MagicMock(spec=BaseEvent))) is True + assert asyncio.run(s.can_execute(SagaContext("s", "e"), MagicMock(spec=SystemErrorEvent))) is True diff --git a/backend/tests/unit/services/sse/test_kafka_redis_bridge.py b/backend/tests/unit/services/sse/test_kafka_redis_bridge.py index 48f1b936..6e78449b 100644 --- a/backend/tests/unit/services/sse/test_kafka_redis_bridge.py +++ b/backend/tests/unit/services/sse/test_kafka_redis_bridge.py @@ -1,15 +1,12 @@ import logging -from typing import ClassVar from unittest.mock import MagicMock import pytest from app.core.metrics.events import EventMetrics from app.domain.enums.events import EventType -from app.domain.enums.kafka import KafkaTopic +from app.domain.events.typed import DomainEvent, EventMetadata, ExecutionStartedEvent from app.events.core import EventDispatcher from app.events.schema.schema_registry import SchemaRegistryManager -from app.infrastructure.kafka.events import BaseEvent -from app.infrastructure.kafka.events.metadata import AvroEventMetadata from app.services.sse.kafka_redis_bridge import SSEKafkaRedisBridge from app.services.sse.redis_bus import SSERedisBus from app.settings import Settings @@ -23,23 +20,14 @@ class _FakeBus(SSERedisBus): """Fake SSERedisBus for testing.""" def __init__(self) -> None: - self.published: list[tuple[str, BaseEvent]] = [] + self.published: list[tuple[str, DomainEvent]] = [] - async def publish_event(self, execution_id: str, event: BaseEvent) -> None: + async def publish_event(self, execution_id: str, event: DomainEvent) -> None: self.published.append((execution_id, event)) -def _make_metadata() -> AvroEventMetadata: - return AvroEventMetadata(service_name="test", service_version="1.0") - - -class _DummyEvent(BaseEvent): - """Dummy event for testing.""" - execution_id: str | None = None - topic: ClassVar[KafkaTopic] = KafkaTopic.EXECUTION_EVENTS - - def model_dump(self, **kwargs: object) -> dict[str, str | None]: - return {"execution_id": self.execution_id} +def _make_metadata() -> EventMetadata: + return EventMetadata(service_name="test", service_version="1.0") @pytest.mark.asyncio @@ -63,13 +51,13 @@ async def 
test_register_and_route_events_without_kafka() -> None: handlers = disp.get_handlers(EventType.EXECUTION_STARTED) assert len(handlers) > 0 - # Event without execution_id is ignored + # Event with empty execution_id is ignored h = handlers[0] - await h(_DummyEvent(event_type=EventType.EXECUTION_STARTED, metadata=_make_metadata(), execution_id=None)) + await h(ExecutionStartedEvent(execution_id="", pod_name="p", metadata=_make_metadata())) assert fake_bus.published == [] # Proper event is published - await h(_DummyEvent(event_type=EventType.EXECUTION_STARTED, metadata=_make_metadata(), execution_id="exec-123")) + await h(ExecutionStartedEvent(execution_id="exec-123", pod_name="p", metadata=_make_metadata())) assert fake_bus.published and fake_bus.published[-1][0] == "exec-123" s = bridge.get_stats() diff --git a/backend/tests/unit/services/sse/test_sse_shutdown_manager.py b/backend/tests/unit/services/sse/test_sse_shutdown_manager.py index 46d28026..43d3e61c 100644 --- a/backend/tests/unit/services/sse/test_sse_shutdown_manager.py +++ b/backend/tests/unit/services/sse/test_sse_shutdown_manager.py @@ -5,8 +5,6 @@ from app.core.lifecycle import LifecycleEnabled from app.services.sse.sse_shutdown_manager import SSEShutdownManager -from tests.helpers.eventually import eventually - pytestmark = pytest.mark.unit _test_logger = logging.getLogger("test.services.sse.sse_shutdown_manager") @@ -34,14 +32,11 @@ async def test_register_unregister_and_shutdown_flow() -> None: e2 = await mgr.register_connection("exec-1", "c2") assert e1 is not None and e2 is not None - # Start shutdown concurrently - task = asyncio.create_task(mgr.initiate_shutdown()) - - # Wait until manager enters NOTIFYING phase (event-driven) - async def _is_notifying() -> bool: - return mgr.get_shutdown_status().phase == "notifying" + # Start shutdown - it will block waiting for connections to drain + shutdown_task = asyncio.create_task(mgr.initiate_shutdown()) - await eventually(_is_notifying, timeout=1.0, interval=0.02) + # Give shutdown task a chance to start and enter drain phase + await asyncio.sleep(0) # Yield control once # Simulate clients acknowledging and disconnecting e1.set() @@ -49,27 +44,31 @@ async def _is_notifying() -> bool: e2.set() await mgr.unregister_connection("exec-1", "c2") - await task + # Now shutdown can complete + await shutdown_task assert mgr.get_shutdown_status().complete is True @pytest.mark.asyncio async def test_reject_new_connection_during_shutdown() -> None: - mgr = SSEShutdownManager(drain_timeout=0.1, notification_timeout=0.01, force_close_timeout=0.01, + mgr = SSEShutdownManager(drain_timeout=0.5, notification_timeout=0.01, force_close_timeout=0.01, logger=_test_logger) - # Pre-register one active connection to reflect realistic state + # Pre-register one active connection - shutdown will block waiting for it e = await mgr.register_connection("e", "c0") assert e is not None - # Start shutdown and wait until initiated - t = asyncio.create_task(mgr.initiate_shutdown()) + # Start shutdown task - it sets _shutdown_initiated immediately then blocks on drain + shutdown_task = asyncio.create_task(mgr.initiate_shutdown()) - async def _initiated() -> None: - assert mgr.is_shutting_down() is True + # Yield control so shutdown task can start and set _shutdown_initiated + await asyncio.sleep(0) - await eventually(_initiated, timeout=1.0, interval=0.02) - - # New registrations rejected once shutdown initiated + # Shutdown is now initiated (blocking on drain), new registrations should be rejected + assert 
mgr.is_shutting_down() is True denied = await mgr.register_connection("e", "c1") assert denied is None - await t + + # Clean up - disconnect the blocking connection so shutdown can complete + e.set() + await mgr.unregister_connection("e", "c0") + await shutdown_task diff --git a/backend/tests/unit/services/test_pod_builder.py b/backend/tests/unit/services/test_pod_builder.py index 45c267e7..6742073b 100644 --- a/backend/tests/unit/services/test_pod_builder.py +++ b/backend/tests/unit/services/test_pod_builder.py @@ -1,8 +1,7 @@ from uuid import uuid4 import pytest -from app.infrastructure.kafka.events.metadata import AvroEventMetadata -from app.infrastructure.kafka.events.saga import CreatePodCommandEvent +from app.domain.events.typed import CreatePodCommandEvent, EventMetadata from app.services.k8s_worker.config import K8sWorkerConfig from app.services.k8s_worker.pod_builder import PodBuilder from kubernetes import client as k8s_client @@ -40,7 +39,7 @@ def create_pod_command(self) -> CreatePodCommandEvent: cpu_limit="1000m", memory_limit="1Gi", priority=5, - metadata=AvroEventMetadata( + metadata=EventMetadata( user_id=str(uuid4()), correlation_id=str(uuid4()), service_name="test-service", @@ -154,7 +153,7 @@ def test_container_resources_defaults( cpu_limit="", memory_limit="", priority=5, - metadata=AvroEventMetadata( + metadata=EventMetadata( service_name="svc", service_version="1", user_id=str(uuid4()), @@ -287,7 +286,7 @@ def test_pod_timeout_default( cpu_limit="500m", memory_limit="512Mi", priority=5, - metadata=AvroEventMetadata(user_id=str(uuid4()), service_name="t", service_version="1") + metadata=EventMetadata(user_id=str(uuid4()), service_name="t", service_version="1") ) pod = pod_builder.build_pod_manifest(command) @@ -344,7 +343,7 @@ def test_pod_labels_truncation( cpu_request="50m", memory_request="64Mi", priority=5, - metadata=AvroEventMetadata( + metadata=EventMetadata( service_name="svc", service_version="1", user_id=long_id, @@ -401,7 +400,7 @@ def test_different_languages( cpu_limit="200m", memory_limit="256Mi", priority=5, - metadata=AvroEventMetadata(user_id=str(uuid4()), service_name="t", service_version="1") + metadata=EventMetadata(user_id=str(uuid4()), service_name="t", service_version="1") ) pod = pod_builder.build_pod_manifest(cmd) diff --git a/backend/uv.lock b/backend/uv.lock index 1206ff4a..29f26cae 100644 --- a/backend/uv.lock +++ b/backend/uv.lock @@ -1068,6 +1068,7 @@ dependencies = [ { name = "markdown-it-py" }, { name = "markupsafe" }, { name = "mdurl" }, + { name = "monggregate" }, { name = "msgpack" }, { name = "multidict" }, { name = "oauthlib" }, @@ -1210,6 +1211,7 @@ requires-dist = [ { name = "markdown-it-py", specifier = "==4.0.0" }, { name = "markupsafe", specifier = "==3.0.2" }, { name = "mdurl", specifier = "==0.1.2" }, + { name = "monggregate", specifier = "==0.22.1" }, { name = "msgpack", specifier = "==1.1.0" }, { name = "multidict", specifier = "==6.7.0" }, { name = "oauthlib", specifier = "==3.2.2" }, @@ -1580,6 +1582,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979, upload-time = "2022-08-14T12:40:09.779Z" }, ] +[[package]] +name = "monggregate" +version = "0.22.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pydantic" }, + { name = "pyhumps" }, + { name = "typing-extensions" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/e2/8c/9627d0d3a569a2e1c6657dcd27ff7593c33f07efc5c6d206c7bf27b58c78/monggregate-0.22.1.tar.gz", hash = "sha256:6c36b5cea8ec51bd4a36e25f149d6f4b424060b9a8e023ef909884ae500b8c3a", size = 97860, upload-time = "2025-08-24T15:00:57.315Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/38/77/2f37358731fdf228379fb9bdc0c736371691c0493075c393ee431e42b908/monggregate-0.22.1-py3-none-any.whl", hash = "sha256:4eef7839109ce4b1bb1172b6643fa22e2dc284a45e645ea55fd4efd848aedfb2", size = 169108, upload-time = "2025-08-24T15:00:55.959Z" }, +] + [[package]] name = "msgpack" version = "1.1.0" @@ -2418,6 +2434,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c7/21/705964c7812476f378728bdf590ca4b771ec72385c533964653c68e86bdc/pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b", size = 1225217, upload-time = "2025-06-21T13:39:07.939Z" }, ] +[[package]] +name = "pyhumps" +version = "3.8.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/c4/83/fa6f8fb7accb21f39e8f2b6a18f76f6d90626bdb0a5e5448e5cc9b8ab014/pyhumps-3.8.0.tar.gz", hash = "sha256:498026258f7ee1a8e447c2e28526c0bea9407f9a59c03260aee4bd6c04d681a3", size = 9018, upload-time = "2022-10-21T10:38:59.496Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9e/11/a1938340ecb32d71e47ad4914843775011e6e9da59ba1229f181fef3119e/pyhumps-3.8.0-py3-none-any.whl", hash = "sha256:060e1954d9069f428232a1adda165db0b9d8dfdce1d265d36df7fbff540acfd6", size = 6095, upload-time = "2022-10-21T10:38:58.231Z" }, +] + [[package]] name = "pyjwt" version = "2.9.0" diff --git a/backend/workers/dlq_processor.py b/backend/workers/dlq_processor.py index b03d3dc1..97598539 100644 --- a/backend/workers/dlq_processor.py +++ b/backend/workers/dlq_processor.py @@ -2,7 +2,7 @@ import logging import signal from contextlib import AsyncExitStack -from typing import Optional +from datetime import datetime, timezone from app.core.container import create_dlq_processor_container from app.core.database_context import Database @@ -67,42 +67,19 @@ def filter_test_events(message: DLQMessage) -> bool: def filter_old_messages(message: DLQMessage) -> bool: max_age_days = 7 - return message.age_seconds < (max_age_days * 24 * 3600) + age_seconds = (datetime.now(timezone.utc) - message.failed_at).total_seconds() + return age_seconds < (max_age_days * 24 * 3600) manager.add_filter(filter_old_messages) -def _configure_callbacks(manager: DLQManager, testing: bool, logger: logging.Logger) -> None: - async def log_before_retry(message: DLQMessage) -> None: - logger.info( - f"Retrying message {message.event_id} (type: {message.event_type}, " - f"topic: {message.original_topic}, retry: {message.retry_count + 1})" - ) - - manager.add_callback("before_retry", log_before_retry) - - async def log_after_retry(message: DLQMessage, success: bool, error: Optional[Exception] = None) -> None: - if success: - logger.info(f"Successfully retried message {message.event_id} to topic {message.original_topic}") - else: - logger.error(f"Failed to retry message {message.event_id}: {error}") - - manager.add_callback("after_retry", log_after_retry) - - async def alert_on_discard(message: DLQMessage, reason: str) -> None: - logger.warning( - f"Message {message.event_id} discarded! 
Type: {message.event_type}, Topic: {message.original_topic}, " - f"Reason: {reason}, Original error: {message.error}" - ) - if not testing: - pass - - manager.add_callback("on_discard", alert_on_discard) - - async def main(settings: Settings) -> None: - """Run the DLQ processor.""" + """Run the DLQ processor. + DLQ lifecycle events (received, retried, discarded) are emitted to the + dlq_events Kafka topic for external observability. Logging is handled + internally by the DLQ manager. + """ container = create_dlq_processor_container(settings) logger = await container.get(logging.Logger) logger.info("Starting DLQ Processor with DI container...") @@ -114,7 +91,6 @@ async def main(settings: Settings) -> None: _configure_retry_policies(manager, logger) _configure_filters(manager, testing=settings.TESTING, logger=logger) - _configure_callbacks(manager, testing=settings.TESTING, logger=logger) stop_event = asyncio.Event() loop = asyncio.get_running_loop() diff --git a/docker-compose.yaml b/docker-compose.yaml index f68ec656..9611c4e8 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -47,6 +47,10 @@ services: networks: - app-network container_name: mongo + ulimits: + nofile: + soft: 65536 + hard: 65536 healthcheck: test: echo 'db.runCommand("ping").ok' | mongosh localhost/integr8scode -u ${MONGO_ROOT_USER:-root} -p ${MONGO_ROOT_PASSWORD:-rootpassword} --authenticationDatabase admin --quiet interval: 10s diff --git a/docs/architecture/event-system-design.md b/docs/architecture/event-system-design.md index 604a0f7e..5e1c2787 100644 --- a/docs/architecture/event-system-design.md +++ b/docs/architecture/event-system-design.md @@ -1,10 +1,10 @@ # Event system design -This document explains how events flow through the system, why there are multiple event representations, and how they work together. If you've looked at the codebase and wondered why we have both domain events and Kafka events that look almost identical, this is where that question gets answered. +This document explains how events flow through the system and how domain events are both stored and serialized for Kafka transport. -## The three layers +## The unified event model -Events in Integr8sCode exist in three forms: +Events in Integr8sCode use a unified design where domain events are directly Avro-serializable: ```mermaid graph LR @@ -13,50 +13,40 @@ graph LR end subgraph "Domain Layer" - DE[Domain Events
typed.py] + DE[Domain Events
typed.py
extends AvroBase] end - subgraph "Infrastructure Layer" - KE[Kafka Events
kafka/events/] + subgraph "Infrastructure" + M[Mappings
kafka/mappings.py] end ET --> DE - ET --> KE - DE -.->|"same event_type"| KE + DE --> M + M --> Kafka[(Kafka Topics)] + DE --> MongoDB[(MongoDB)] ``` -The `EventType` enum defines all possible event types as strings. Domain events are Pydantic models used for storage in MongoDB and deserialization from the event store. Kafka events are Avro-compatible models used for serialization to Kafka topics. Both reference the same `EventType` values, ensuring consistency. +The `EventType` enum defines all possible event types as strings. Domain events are Pydantic models that extend `AvroBase` (from `pydantic-avro`), making them both usable for MongoDB storage and Avro-serializable for Kafka. The mappings module routes events to the correct Kafka topics. -This might look like unnecessary duplication, but it's actually a deliberate architectural choice rooted in Domain-Driven Design. +This design eliminates duplication between "domain events" and "Kafka events" by making the domain event the single source of truth. -## Why two event classes? +## Why a unified model? -In DDD terminology, what we call "domain events" and "Kafka events" map to two different concepts: domain events and integration events. +Earlier designs maintained separate domain and Kafka event classes, arguing that domain events shouldn't know about infrastructure concerns. In practice, this created: -Domain events are internal to the bounded context. They carry whatever information the domain needs, including storage-related fields like `stored_at` and `ttl_expires_at`. These events get stored in MongoDB and replayed during event sourcing operations. +- Duplicate class definitions for every event type +- Transformation logic between layers +- Risk of drift when fields changed +- Extra maintenance burden -Integration events cross bounded context boundaries. They flow through Kafka to other services or workers. They need to be serializable to Avro, which means they can't contain arbitrary Python objects. They carry routing information like the `topic` ClassVar. +The unified approach addresses these issues: -```mermaid -graph TB - subgraph "Bounded Context: Backend" - API[API Handler] --> DS[Domain Service] - DS --> DomainEvent[Domain Event] - DomainEvent --> MongoDB[(MongoDB)] - DomainEvent --> Transform[Transform] - Transform --> KafkaEvent[Kafka Event] - end - - KafkaEvent --> Kafka[(Kafka)] - - subgraph "Other Contexts" - Kafka --> Worker1[Saga Orchestrator] - Kafka --> Worker2[Pod Monitor] - Kafka --> Worker3[Result Processor] - end -``` +- **Single definition**: Each event is defined once in `domain/events/typed.py` +- **Avro-compatible**: `BaseEvent` extends `AvroBase`, enabling automatic schema generation +- **Storage-ready**: Events include storage fields (`stored_at`, `ttl_expires_at`) that MongoDB uses +- **Topic routing**: The `EVENT_TYPE_TO_TOPIC` mapping in `infrastructure/kafka/mappings.py` handles routing -The transformation between domain and Kafka events happens in `KafkaEventService`. When you call `publish_event()`, the service stores the domain event in MongoDB and publishes the corresponding Kafka event to the appropriate topic. +Infrastructure concerns (Kafka topics) are kept separate through the mappings module rather than embedded in event classes. ## How discriminated unions work @@ -102,59 +92,107 @@ sequenceDiagram This approach is more performant than trying each union member until one validates. The discriminator tells Pydantic exactly which class to use. 
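To make the mechanism concrete, here is a minimal, self-contained sketch (assuming Pydantic v2; the class names are illustrative, and the project's real union and adapter live in `domain/events/typed.py`):

```python
from typing import Annotated, Literal, Union

from pydantic import BaseModel, Field, TypeAdapter


class ExecutionRequested(BaseModel):
    event_type: Literal["execution_requested"] = "execution_requested"
    execution_id: str


class ExecutionCompleted(BaseModel):
    event_type: Literal["execution_completed"] = "execution_completed"
    execution_id: str
    exit_code: int


# The discriminator field tells Pydantic which class to instantiate directly,
# instead of trying each union member until one validates.
Event = Annotated[Union[ExecutionRequested, ExecutionCompleted], Field(discriminator="event_type")]
adapter: TypeAdapter[Event] = TypeAdapter(Event)

doc = {"event_type": "execution_completed", "execution_id": "e1", "exit_code": 0}
event = adapter.validate_python(doc)
assert isinstance(event, ExecutionCompleted)
```

The project's `domain_event_adapter` plays the role of `adapter` here, selecting the concrete event class from the stored `event_type` value.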
-## Keeping things in sync - -With three representations of each event, there's a risk of drift. You might add a new `EventType` value but forget to create the corresponding domain or Kafka event class. Or you might create a Kafka event but forget to add it to the `DomainEvent` union. +## BaseEvent and AvroBase -The `test_event_schema_coverage.py` test suite catches these problems: +The `BaseEvent` class provides common fields for all events and inherits from `AvroBase` for Avro schema generation: ```python ---8<-- "backend/tests/unit/domain/events/test_event_schema_coverage.py:59:72" +class BaseEvent(AvroBase): + """Base fields for all domain events.""" + model_config = ConfigDict(from_attributes=True) + + event_id: str = Field(default_factory=lambda: str(uuid4())) + event_type: EventType + event_version: str = "1.0" + timestamp: datetime = Field(default_factory=...) + aggregate_id: str | None = None + metadata: EventMetadata + stored_at: datetime = Field(default_factory=...) + ttl_expires_at: datetime = Field(default_factory=...) ``` -The test runs in CI and fails if any `EventType` value lacks a corresponding event class. It also checks the reverse: that no orphan event classes exist without matching enum values. +The `AvroBase` inheritance enables: +- Automatic Avro schema generation via `BaseEvent.avro_schema()` +- Serialization through the Schema Registry +- Forward compatibility checking -When adding a new event type, the workflow is: +## Topic routing -1. Add the value to `EventType` enum -2. Create the domain event class in `typed.py` -3. Add it to the `DomainEvent` union -4. Create the Kafka event class in `kafka/events/` -5. Export it from `kafka/events/__init__.py` - -If you miss a step, the test tells you exactly what's missing. +Events are routed to Kafka topics through the `EVENT_TYPE_TO_TOPIC` mapping: -## The Avro connection +```python +# infrastructure/kafka/mappings.py +EVENT_TYPE_TO_TOPIC: Dict[EventType, KafkaTopic] = { + EventType.EXECUTION_REQUESTED: KafkaTopic.EXECUTION_EVENTS, + EventType.EXECUTION_COMPLETED: KafkaTopic.EXECUTION_EVENTS, + EventType.POD_CREATED: KafkaTopic.EXECUTION_EVENTS, + EventType.SAGA_STARTED: KafkaTopic.SAGA_EVENTS, + # ... all event types +} +``` -Kafka events inherit from `AvroBase` (via `pydantic-avro`), which enables automatic Avro schema generation. The schema registry stores these schemas and validates that producers and consumers agree on the format. +Helper functions provide type-safe access: ```python ---8<-- "backend/app/infrastructure/kafka/events/base.py:13:27" +def get_topic_for_event(event_type: EventType) -> KafkaTopic: + return EVENT_TYPE_TO_TOPIC.get(event_type, KafkaTopic.SYSTEM_EVENTS) + +def get_event_class_for_type(event_type: EventType) -> type | None: + return _get_event_type_to_class().get(event_type) ``` -Each Kafka event class also declares its target topic as a class variable. The producer uses this to route events to the correct topic without external mapping tables. +## Keeping things in sync + +With the unified model, there's less risk of drift since each event is defined once. The `test_event_schema_coverage.py` test suite validates: -## Why not just one event class? +1. Every `EventType` has a corresponding domain event class +2. Every event class has a valid `event_type` default +3. The `DomainEvent` union includes all event types +4. No orphan classes exist without matching enum values -You could theoretically use the same class for both domain and Kafka purposes. 
The domain-specific fields (`stored_at`, `ttl_expires_at`) could be excluded from Avro serialization with `exclude=True`. The `topic` ClassVar wouldn't serialize anyway. +When adding a new event type: -This is a valid simplification if your domain and integration events have identical payloads. But there are reasons to keep them separate: +1. Add the value to `EventType` enum +2. Create the event class in `typed.py` with the correct `event_type` default +3. Add it to the `DomainEvent` union +4. Add the topic mapping in `infrastructure/kafka/mappings.py` -The domain layer shouldn't know about Kafka topics. Adding `topic: ClassVar[KafkaTopic]` to a domain event couples it to infrastructure concerns. DDD purists would argue this violates the dependency rule. +If you miss a step, the test tells you exactly what's missing. -Avro has constraints that don't apply to MongoDB. Avro schemas don't support arbitrary nested dicts, certain datetime formats, or MongoDB-specific types like ObjectId. Keeping Kafka events separate means you can optimize them for wire format without affecting domain logic. +## Event flow + +```mermaid +graph TB + subgraph "Bounded Context: Backend" + API[API Handler] --> DS[Domain Service] + DS --> DomainEvent[Domain Event] + DomainEvent --> MongoDB[(MongoDB)] + DomainEvent --> Producer[UnifiedProducer] + end + + Producer --> Kafka[(Kafka)] + + subgraph "Consumers" + Kafka --> Worker1[Saga Orchestrator] + Kafka --> Worker2[Pod Monitor] + Kafka --> Worker3[Result Processor] + end +``` -The two layers can evolve independently. If you need to change how events are stored in MongoDB, you don't have to worry about breaking Kafka consumers. If you need to add a field to Kafka events for a new consumer, you can do so without touching the domain layer. +When publishing events, the `UnifiedProducer`: +1. Looks up the topic via `EVENT_TYPE_TO_TOPIC` +2. Serializes the event using the Schema Registry +3. Publishes to Kafka -That said, if your events are simple and you want less code to maintain, unifying them is a reasonable choice. The current architecture prioritizes separation of concerns over minimizing duplication. +The producer handles both storage in MongoDB and publishing to Kafka in a single flow. 
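A hedged sketch of what a call site can look like; the event fields mirror shapes used in the test suite, while the helper name `publish_started` and the metadata values are illustrative only:

```python
from app.domain.events.typed import EventMetadata, ExecutionStartedEvent
from app.events.core import UnifiedProducer
from app.infrastructure.kafka.mappings import get_topic_for_event


async def publish_started(producer: UnifiedProducer, execution_id: str, pod_name: str) -> None:
    event = ExecutionStartedEvent(
        execution_id=execution_id,
        pod_name=pod_name,
        metadata=EventMetadata(service_name="api", service_version="1.0"),
    )
    # Shown for illustration: routing is resolved from the event type, not
    # hard-coded on the event class. The producer performs this lookup itself.
    topic = get_topic_for_event(event.event_type)
    # The producer Avro-encodes the event via the Schema Registry before publishing.
    await producer.produce(event, key=execution_id, headers={"routed-to": str(topic)})
```

The explicit `headers` argument is only there to show the optional parameter; in normal use the topic lookup and serialization stay entirely inside the producer.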
## Key files | File | Purpose | |------|---------| | [`domain/enums/events.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/domain/enums/events.py) | `EventType` enum with all event type values | -| [`domain/events/typed.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/domain/events/typed.py) | Domain event classes and `DomainEvent` union | -| [`infrastructure/kafka/events/`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/infrastructure/kafka/events/) | Kafka event classes organized by domain | +| [`domain/events/typed.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/domain/events/typed.py) | All domain event classes and `DomainEvent` union | +| [`infrastructure/kafka/mappings.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/infrastructure/kafka/mappings.py) | Event-to-topic routing and helper functions | | [`services/kafka_event_service.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/services/kafka_event_service.py) | Publishes events to both MongoDB and Kafka | | [`tests/unit/domain/events/test_event_schema_coverage.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/tests/unit/domain/events/test_event_schema_coverage.py) | Validates correspondence between enum and event classes | diff --git a/docs/architecture/kafka-topic-architecture.md b/docs/architecture/kafka-topic-architecture.md index c2e00f90..ac948c3c 100644 --- a/docs/architecture/kafka-topic-architecture.md +++ b/docs/architecture/kafka-topic-architecture.md @@ -147,9 +147,9 @@ Admins can: Key files: -- `infrastructure/kafka/events/*` — Pydantic event models -- `infrastructure/kafka/mappings.py` — event to topic mapping +- `domain/events/typed.py` — all Pydantic event models (extends `AvroBase` for Avro serialization) +- `infrastructure/kafka/mappings.py` — event-to-topic routing and helper functions - `events/schema/schema_registry.py` — schema manager - `events/core/{producer,consumer,dispatcher}.py` — unified Kafka plumbing -All events are Pydantic models with *strict typing*. The mappings module routes each event type to its destination topic. Schema Registry integration ensures producers and consumers agree on structure, catching incompatible changes *before* runtime failures. The unified producer and consumer classes handle serialization, error handling, and observability. +All events are Pydantic models with *strict typing* that extend `AvroBase` for Avro schema generation. The mappings module routes each event type to its destination topic via `EVENT_TYPE_TO_TOPIC`. Schema Registry integration ensures producers and consumers agree on structure, catching incompatible changes *before* runtime failures. The unified producer and consumer classes handle serialization, error handling, and observability. diff --git a/docs/architecture/model-conversion.md b/docs/architecture/model-conversion.md index 11fd380e..a9a1e03d 100644 --- a/docs/architecture/model-conversion.md +++ b/docs/architecture/model-conversion.md @@ -25,7 +25,7 @@ graph TB end subgraph "Infrastructure Layer" - INF["Pydantic/ODM
app/db/docs/
app/infrastructure/kafka/events/"] + INF["Pydantic/ODM
app/db/docs/
app/domain/events/typed.py"] end API <--> SVC diff --git a/docs/architecture/user-settings-events.md b/docs/architecture/user-settings-events.md index 857fda7d..8a6b9378 100644 --- a/docs/architecture/user-settings-events.md +++ b/docs/architecture/user-settings-events.md @@ -9,7 +9,7 @@ All user settings changes emit a single `USER_SETTINGS_UPDATED` event type. Ther notifications, or editor settings. This eliminates branching in both publishing and consuming code. ```python ---8<-- "backend/app/infrastructure/kafka/events/user.py:72:86" +--8<-- "backend/app/domain/events/typed.py:UserSettingsUpdatedEvent" ``` The `changed_fields` list identifies which settings changed. Typed fields (`theme`, `notifications`, `editor`, etc.) @@ -91,7 +91,7 @@ The `get_settings_history` method returns a list of changes extracted from event | [`services/user_settings_service.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/services/user_settings_service.py) | Settings service with caching and event sourcing | | [`services/event_bus.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/services/event_bus.py) | Cross-instance event distribution | | [`domain/user/settings_models.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/domain/user/settings_models.py) | `DomainUserSettings`, `DomainUserSettingsUpdate` dataclasses | -| [`infrastructure/kafka/events/user.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/infrastructure/kafka/events/user.py) | `UserSettingsUpdatedEvent` definition | +| [`domain/events/typed.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/domain/events/typed.py) | `UserSettingsUpdatedEvent` definition | | [`db/repositories/user_settings_repository.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/db/repositories/user_settings_repository.py) | Snapshot and event queries | ## Related docs diff --git a/docs/components/schema-manager.md b/docs/components/schema-manager.md index abf3a928..d6c05c7a 100644 --- a/docs/components/schema-manager.md +++ b/docs/components/schema-manager.md @@ -18,7 +18,7 @@ Repositories don't create their own indexes — they only read and write. This s The `SchemaRegistryManager` class in `app/events/schema/schema_registry.py` handles Avro serialization for Kafka events. All registry operations are async and must be awaited. The manager connects to a Confluent Schema Registry and registers schemas for all event types at startup via `await initialize_schemas()`. -Each event class (subclass of `BaseEvent`) generates its own Avro schema from Pydantic model definitions. The manager registers these schemas with subjects named after the class (like `ExecutionRequestedEvent-value`) and sets FORWARD compatibility, meaning new schemas can add fields but not remove required ones. This allows producers to be upgraded before consumers without breaking deserialization. +All event classes in `domain/events/typed.py` extend `AvroBase` (from `pydantic-avro`), enabling automatic Avro schema generation. The manager registers these schemas with subjects named after the class (like `ExecutionRequestedEvent-value`) and sets FORWARD compatibility, meaning new schemas can add fields but not remove required ones. This allows producers to be upgraded before consumers without breaking deserialization. Serialization and deserialization are async — `await serialize_event(event)` and `await deserialize_event(data, topic)` must be awaited. 
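For example, a minimal usage sketch (the surrounding variable names are illustrative and error handling is omitted):

```python
# `registry` is a SchemaRegistryManager obtained from the DI container, `event` is
# any domain event from domain/events/typed.py, and `topic` is its destination topic.
payload = await registry.serialize_event(event)
restored = await registry.deserialize_event(payload, topic)
assert restored.event_id == event.event_id
```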
The wire format follows Confluent conventions: a magic byte, four-byte schema id, then the Avro binary payload. The underlying `python-schema-registry-client` library handles schema registration caching internally. The manager maintains a bidirectional cache between schema ids and Python event classes for deserialization. When deserializing, it reads the schema id from the message header, looks up the corresponding event class, deserializes the Avro payload to a dict, and hydrates the Pydantic model. @@ -40,4 +40,6 @@ For Kafka schemas, the registry keeps all versions. If you break compatibility a |--------------------------------------------------------------------------------------------------------------------------------|----------------------------| | [`schema_manager.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/db/schema/schema_manager.py) | MongoDB migrations | | [`schema_registry.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/events/schema/schema_registry.py) | Kafka Avro serialization | +| [`typed.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/domain/events/typed.py) | Domain events (extend AvroBase) | +| [`mappings.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/infrastructure/kafka/mappings.py) | Event-to-topic routing | | [`dishka_lifespan.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/dishka_lifespan.py) | Startup initialization | diff --git a/docs/reference/environment-variables.md b/docs/reference/environment-variables.md index c4c054a5..75b32d9d 100644 --- a/docs/reference/environment-variables.md +++ b/docs/reference/environment-variables.md @@ -43,20 +43,23 @@ Complete reference of all environment variables used by the Integr8sCode backend ## Kafka Configuration -| Variable | Default | Description | -|----------------------------|-------------------------------|-------------------------------------| -| `KAFKA_BOOTSTRAP_SERVERS` | `kafka:29092` | Kafka broker addresses | -| `SCHEMA_REGISTRY_URL` | `http://schema-registry:8081` | Schema Registry URL | -| `SCHEMA_REGISTRY_AUTH` | `""` | Registry auth (`username:password`) | -| `ENABLE_EVENT_STREAMING` | `false` | Enable Kafka event streaming | -| `EVENT_RETENTION_DAYS` | `30` | Event retention period | -| `KAFKA_TOPIC_PREFIX` | `pref` | Topic name prefix | -| `KAFKA_GROUP_SUFFIX` | `suff` | Consumer group suffix | -| `KAFKA_CONSUMER_GROUP_ID` | `integr8scode-backend` | Default consumer group | -| `KAFKA_AUTO_OFFSET_RESET` | `earliest` | Offset reset policy | -| `KAFKA_ENABLE_AUTO_COMMIT` | `true` | Auto-commit offsets | -| `KAFKA_SESSION_TIMEOUT_MS` | `30000` | Session timeout | -| `KAFKA_MAX_POLL_RECORDS` | `500` | Max poll batch size | +| Variable | Default | Description | +|-------------------------------|-------------------------------|----------------------------------------| +| `KAFKA_BOOTSTRAP_SERVERS` | `kafka:29092` | Kafka broker addresses | +| `SCHEMA_REGISTRY_URL` | `http://schema-registry:8081` | Schema Registry URL | +| `SCHEMA_REGISTRY_AUTH` | `""` | Registry auth (`username:password`) | +| `ENABLE_EVENT_STREAMING` | `false` | Enable Kafka event streaming | +| `EVENT_RETENTION_DAYS` | `30` | Event retention period | +| `KAFKA_TOPIC_PREFIX` | `pref` | Topic name prefix | +| `KAFKA_GROUP_SUFFIX` | `suff` | Consumer group suffix | +| `KAFKA_CONSUMER_GROUP_ID` | `integr8scode-backend` | Default consumer group | +| `KAFKA_AUTO_OFFSET_RESET` | `earliest` | Offset reset policy | +| `KAFKA_ENABLE_AUTO_COMMIT` 
| `true` | Auto-commit offsets | +| `KAFKA_SESSION_TIMEOUT_MS` | `45000` | Session timeout (ms) | +| `KAFKA_HEARTBEAT_INTERVAL_MS` | `10000` | Heartbeat interval (ms) | +| `KAFKA_MAX_POLL_INTERVAL_MS` | `300000` | Max poll interval (ms) | +| `KAFKA_REQUEST_TIMEOUT_MS` | `40000` | Request timeout for broker calls (ms) | +| `KAFKA_MAX_POLL_RECORDS` | `500` | Max poll batch size | ## Redis Configuration diff --git a/docs/testing/kafka-test-stability.md b/docs/testing/kafka-test-stability.md index 24c89b2f..99d17fdb 100644 --- a/docs/testing/kafka-test-stability.md +++ b/docs/testing/kafka-test-stability.md @@ -21,7 +21,8 @@ failures that pass on retry. ## The fix Serialize `Producer` initialization using a global threading lock. In -[`app/events/core/producer.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/events/core/producer.py): +[ +`app/events/core/producer.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/events/core/producer.py): ```python --8<-- "backend/app/events/core/producer.py:22:24" @@ -34,11 +35,11 @@ overhead in production (producers are created once at startup) while eliminating These GitHub issues document the underlying problem: -| Issue | Description | -|-------|-------------| -| [confluent-kafka-python#1797](https://github.com/confluentinc/confluent-kafka-python/issues/1797) | Segfaults in multithreaded/asyncio pytest environments | +| Issue | Description | +|---------------------------------------------------------------------------------------------------|---------------------------------------------------------| +| [confluent-kafka-python#1797](https://github.com/confluentinc/confluent-kafka-python/issues/1797) | Segfaults in multithreaded/asyncio pytest environments | | [confluent-kafka-python#1761](https://github.com/confluentinc/confluent-kafka-python/issues/1761) | Segfault on garbage collection in multithreaded context | -| [librdkafka#3608](https://github.com/confluentinc/librdkafka/issues/3608) | Crash in `rd_kafka_broker_destroy_final` | +| [librdkafka#3608](https://github.com/confluentinc/librdkafka/issues/3608) | Crash in `rd_kafka_broker_destroy_final` | ## Alternative approaches @@ -66,3 +67,68 @@ If you still encounter issues: yield p await p.stop() # Always clean up ``` + +## Consumer teardown delays + +### The problem + +Test teardown taking 40+ seconds with errors like: + +```text +ERROR aiokafka.consumer.group_coordinator: Error sending LeaveGroupRequest to node 1 [[Error 7] RequestTimedOutError] +``` + +This happens when `consumer.stop()` sends a `LeaveGroupRequest` to the Kafka coordinator, but the request times out. + +### Root cause + +The `request_timeout_ms` parameter in aiokafka defaults to **40000ms** (40 seconds). When the Kafka coordinator is slow +or +unresponsive during test teardown, the consumer waits the full timeout before giving up. + +See [aiokafka#773](https://github.com/aio-libs/aiokafka/issues/773) for details on consumer stop delays. 
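To make the delay visible in isolation, a rough sketch along these lines (topic, group id, and broker address are placeholders) times `consumer.stop()` directly; with a slow or unreachable coordinator the elapsed time approaches the default `request_timeout_ms` of 40 seconds:

```python
# Rough sketch for observing the teardown delay described above.
# Topic, group id, and bootstrap address are placeholders, not project values.
import asyncio
import time

from aiokafka import AIOKafkaConsumer


async def time_consumer_stop() -> None:
    consumer = AIOKafkaConsumer(
        "demo-topic",
        bootstrap_servers="localhost:9092",
        group_id="teardown-demo",
        request_timeout_ms=40000,  # aiokafka default; stop() may block up to this long
    )
    await consumer.start()
    started = time.monotonic()
    await consumer.stop()  # sends LeaveGroupRequest to the coordinator
    print(f"consumer.stop() took {time.monotonic() - started:.1f}s")


asyncio.run(time_consumer_stop())
```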
+ +### The fix + +Configure shorter timeouts in `.env.test`: + +```bash +# Reduce consumer pool and timeouts for faster test startup/teardown +# https://github.com/aio-libs/aiokafka/issues/773 +SSE_CONSUMER_POOL_SIZE=1 +KAFKA_SESSION_TIMEOUT_MS=6000 +KAFKA_HEARTBEAT_INTERVAL_MS=2000 +KAFKA_REQUEST_TIMEOUT_MS=5000 +``` + +All consumers must pass these settings to `AIOKafkaConsumer`: + +```python +consumer = AIOKafkaConsumer( + *topics, + bootstrap_servers=settings.KAFKA_BOOTSTRAP_SERVERS, + session_timeout_ms=settings.KAFKA_SESSION_TIMEOUT_MS, + heartbeat_interval_ms=settings.KAFKA_HEARTBEAT_INTERVAL_MS, + request_timeout_ms=settings.KAFKA_REQUEST_TIMEOUT_MS, +) +``` + +### Results + +| Metric | Before | After | +|-----------------|--------|--------| +| Teardown time | 40s | <1s | +| Total test time | 70s | 20-35s | + +### Key timeouts explained + +| Setting | Default | Test Value | Purpose | +|-------------------------------|---------|------------|----------------------------------------------------| +| `KAFKA_SESSION_TIMEOUT_MS` | 45000 | 6000 | Time before broker considers consumer dead | +| `KAFKA_HEARTBEAT_INTERVAL_MS` | 10000 | 2000 | Frequency of heartbeats to coordinator | +| `KAFKA_REQUEST_TIMEOUT_MS` | 40000 | 5000 | Timeout for broker requests (including LeaveGroup) | +| `SSE_CONSUMER_POOL_SIZE` | 10 | 1 | Number of SSE consumers (fewer = faster startup) | + +!!! note "Timeout constraints" + `request_timeout_ms` must be less than `session_timeout_ms`. The test values (5000 < 6000) and + production defaults (40000 < 45000) satisfy this constraint.
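The relationships in that note can be checked up front. Below is a small, hypothetical sanity check (not a helper from the codebase) that encodes the constraint stated above plus the usual heartbeat-versus-session guidance; it passes for both the production defaults and the test values:

```python
# Hypothetical sanity check for the timeout relationships described above.
from dataclasses import dataclass


@dataclass
class KafkaTimeouts:
    session_timeout_ms: int = 45000
    heartbeat_interval_ms: int = 10000
    request_timeout_ms: int = 40000


def validate_kafka_timeouts(t: KafkaTimeouts) -> None:
    # Heartbeats should fire well within the session timeout (no more than ~1/3 of it).
    assert t.heartbeat_interval_ms * 3 <= t.session_timeout_ms, "heartbeat interval too long"
    # Per the note above, broker requests should time out before the session does.
    assert t.request_timeout_ms < t.session_timeout_ms, "request timeout must stay below session timeout"


validate_kafka_timeouts(KafkaTimeouts())                  # production defaults: 40000 < 45000
validate_kafka_timeouts(KafkaTimeouts(6000, 2000, 5000))  # test values: 5000 < 6000
```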