diff --git a/.github/workflows/backend-ci.yml b/.github/workflows/backend-ci.yml index 565b9eba..68d6a392 100644 --- a/.github/workflows/backend-ci.yml +++ b/.github/workflows/backend-ci.yml @@ -97,7 +97,7 @@ jobs: SCHEMA_SUBJECT_PREFIX: "ci.${{ github.run_id }}." run: | cd backend - uv run pytest tests/integration -v --cov=app --cov-branch --cov-report=xml --cov-report=term + uv run pytest tests/integration -v -rs --cov=app --cov-branch --cov-report=xml --cov-report=term - name: Upload coverage to Codecov uses: codecov/codecov-action@v5 diff --git a/.github/workflows/mypy.yml b/.github/workflows/mypy.yml index 4d814177..d4752b08 100644 --- a/.github/workflows/mypy.yml +++ b/.github/workflows/mypy.yml @@ -31,4 +31,4 @@ jobs: SECRET_KEY: ${{ secrets.TEST_SECRET_KEY }} run: | cd backend - uv run mypy --config-file pyproject.toml . + uv run mypy --config-file pyproject.toml --strict . diff --git a/backend/app/api/dependencies.py b/backend/app/api/dependencies.py index 47c3c7ee..d8189e1f 100644 --- a/backend/app/api/dependencies.py +++ b/backend/app/api/dependencies.py @@ -7,18 +7,12 @@ @inject -async def current_user( - request: Request, - auth_service: FromDishka[AuthService] -) -> UserResponse: +async def current_user(request: Request, auth_service: FromDishka[AuthService]) -> UserResponse: """Get authenticated user.""" return await auth_service.get_current_user(request) @inject -async def admin_user( - request: Request, - auth_service: FromDishka[AuthService] -) -> UserResponse: +async def admin_user(request: Request, auth_service: FromDishka[AuthService]) -> UserResponse: """Get authenticated admin user.""" return await auth_service.get_admin(request) diff --git a/backend/app/api/routes/admin/events.py b/backend/app/api/routes/admin/events.py index 591a4050..5af09a0d 100644 --- a/backend/app/api/routes/admin/events.py +++ b/backend/app/api/routes/admin/events.py @@ -33,18 +33,12 @@ from app.services.admin import AdminEventsService router = APIRouter( - prefix="/admin/events", - tags=["admin-events"], - route_class=DishkaRoute, - dependencies=[Depends(admin_user)] + prefix="/admin/events", tags=["admin-events"], route_class=DishkaRoute, dependencies=[Depends(admin_user)] ) @router.post("/browse") -async def browse_events( - request: EventBrowseRequest, - service: FromDishka[AdminEventsService] -) -> EventBrowseResponse: +async def browse_events(request: EventBrowseRequest, service: FromDishka[AdminEventsService]) -> EventBrowseResponse: try: event_filter = EventFilterMapper.from_admin_pydantic(request.filters) @@ -53,7 +47,7 @@ async def browse_events( skip=request.skip, limit=request.limit, sort_by=request.sort_by, - sort_order=request.sort_order + sort_order=request.sort_order, ) event_mapper = EventMapper() @@ -61,7 +55,7 @@ async def browse_events( events=[jsonable_encoder(event_mapper.to_dict(event)) for event in result.events], total=result.total, skip=result.skip, - limit=result.limit + limit=result.limit, ) except Exception as e: @@ -70,8 +64,8 @@ async def browse_events( @router.get("/stats") async def get_event_stats( - service: FromDishka[AdminEventsService], - hours: int = Query(default=24, le=168), + service: FromDishka[AdminEventsService], + hours: int = Query(default=24, le=168), ) -> EventStatsResponse: try: stats = await service.get_event_stats(hours=hours) @@ -82,11 +76,71 @@ async def get_event_stats( raise HTTPException(status_code=500, detail=str(e)) +@router.get("/export/csv") +async def export_events_csv( + service: FromDishka[AdminEventsService], + event_types: 
list[EventType] | None = Query(None, description="Event types (repeat param for multiple)"), + start_time: datetime | None = Query(None, description="Start time"), + end_time: datetime | None = Query(None, description="End time"), + limit: int = Query(default=10000, le=50000), +) -> StreamingResponse: + try: + export_filter = EventFilterMapper.from_admin_pydantic( + AdminEventFilter( + event_types=event_types, + start_time=start_time, + end_time=end_time, + ) + ) + result = await service.export_events_csv_content(filter=export_filter, limit=limit) + return StreamingResponse( + iter([result.content]), + media_type=result.media_type, + headers={"Content-Disposition": f"attachment; filename={result.file_name}"}, + ) + + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) + + +@router.get("/export/json") +async def export_events_json( + service: FromDishka[AdminEventsService], + event_types: list[EventType] | None = Query(None, description="Event types (repeat param for multiple)"), + aggregate_id: str | None = Query(None, description="Aggregate ID filter"), + correlation_id: str | None = Query(None, description="Correlation ID filter"), + user_id: str | None = Query(None, description="User ID filter"), + service_name: str | None = Query(None, description="Service name filter"), + start_time: datetime | None = Query(None, description="Start time"), + end_time: datetime | None = Query(None, description="End time"), + limit: int = Query(default=10000, le=50000), +) -> StreamingResponse: + """Export events as JSON with comprehensive filtering.""" + try: + export_filter = EventFilterMapper.from_admin_pydantic( + AdminEventFilter( + event_types=event_types, + aggregate_id=aggregate_id, + correlation_id=correlation_id, + user_id=user_id, + service_name=service_name, + start_time=start_time, + end_time=end_time, + ) + ) + result = await service.export_events_json_content(filter=export_filter, limit=limit) + return StreamingResponse( + iter([result.content]), + media_type=result.media_type, + headers={"Content-Disposition": f"attachment; filename={result.file_name}"}, + ) + + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) + + @router.get("/{event_id}") -async def get_event_detail( - event_id: str, - service: FromDishka[AdminEventsService] -) -> EventDetailResponse: +async def get_event_detail(event_id: str, service: FromDishka[AdminEventsService]) -> EventDetailResponse: try: result = await service.get_event_detail(event_id) @@ -98,7 +152,7 @@ async def get_event_detail( return EventDetailResponse( event=serialized_result["event"], related_events=serialized_result["related_events"], - timeline=serialized_result["timeline"] + timeline=serialized_result["timeline"], ) except HTTPException: @@ -109,9 +163,7 @@ async def get_event_detail( @router.post("/replay") async def replay_events( - request: EventReplayRequest, - background_tasks: BackgroundTasks, - service: FromDishka[AdminEventsService] + request: EventReplayRequest, background_tasks: BackgroundTasks, service: FromDishka[AdminEventsService] ) -> EventReplayResponse: try: replay_correlation_id = f"replay_{CorrelationContext.get_correlation_id()}" @@ -150,10 +202,7 @@ async def replay_events( @router.get("/replay/{session_id}/status") -async def get_replay_status( - session_id: str, - service: FromDishka[AdminEventsService] -) -> EventReplayStatusResponse: +async def get_replay_status(session_id: str, service: FromDishka[AdminEventsService]) -> EventReplayStatusResponse: try: status = await 
service.get_replay_status(session_id) @@ -171,84 +220,16 @@ async def get_replay_status( @router.delete("/{event_id}") async def delete_event( - event_id: str, - admin: Annotated[UserResponse, Depends(admin_user)], - service: FromDishka[AdminEventsService] + event_id: str, admin: Annotated[UserResponse, Depends(admin_user)], service: FromDishka[AdminEventsService] ) -> EventDeleteResponse: try: deleted = await service.delete_event(event_id=event_id, deleted_by=admin.email) if not deleted: raise HTTPException(status_code=500, detail="Failed to delete event") - return EventDeleteResponse( - message="Event deleted and archived", - event_id=event_id - ) + return EventDeleteResponse(message="Event deleted and archived", event_id=event_id) except HTTPException: raise except Exception as e: raise HTTPException(status_code=500, detail=str(e)) - - -@router.get("/export/csv") -async def export_events_csv( - service: FromDishka[AdminEventsService], - event_types: list[EventType] | None = Query(None, description="Event types (repeat param for multiple)"), - start_time: datetime | None = Query(None, description="Start time"), - end_time: datetime | None = Query(None, description="End time"), - limit: int = Query(default=10000, le=50000), -) -> StreamingResponse: - try: - export_filter = EventFilterMapper.from_admin_pydantic( - AdminEventFilter( - event_types=event_types, - start_time=start_time, - end_time=end_time, - ) - ) - result = await service.export_events_csv_content(filter=export_filter, limit=limit) - return StreamingResponse( - iter([result.content]), - media_type=result.media_type, - headers={"Content-Disposition": f"attachment; filename={result.filename}"}, - ) - - except Exception as e: - raise HTTPException(status_code=500, detail=str(e)) - - -@router.get("/export/json") -async def export_events_json( - service: FromDishka[AdminEventsService], - event_types: list[EventType] | None = Query(None, description="Event types (repeat param for multiple)"), - aggregate_id: str | None = Query(None, description="Aggregate ID filter"), - correlation_id: str | None = Query(None, description="Correlation ID filter"), - user_id: str | None = Query(None, description="User ID filter"), - service_name: str | None = Query(None, description="Service name filter"), - start_time: datetime | None = Query(None, description="Start time"), - end_time: datetime | None = Query(None, description="End time"), - limit: int = Query(default=10000, le=50000), -) -> StreamingResponse: - """Export events as JSON with comprehensive filtering.""" - try: - export_filter = EventFilterMapper.from_admin_pydantic( - AdminEventFilter( - event_types=event_types, - aggregate_id=aggregate_id, - correlation_id=correlation_id, - user_id=user_id, - service_name=service_name, - start_time=start_time, - end_time=end_time, - ) - ) - result = await service.export_events_json_content(filter=export_filter, limit=limit) - return StreamingResponse( - iter([result.content]), - media_type=result.media_type, - headers={"Content-Disposition": f"attachment; filename={result.filename}"}, - ) - - except Exception as e: - raise HTTPException(status_code=500, detail=str(e)) diff --git a/backend/app/api/routes/admin/settings.py b/backend/app/api/routes/admin/settings.py index e254b6f5..87ea4cf5 100644 --- a/backend/app/api/routes/admin/settings.py +++ b/backend/app/api/routes/admin/settings.py @@ -12,17 +12,14 @@ from app.services.admin import AdminSettingsService router = APIRouter( - prefix="/admin/settings", - tags=["admin", "settings"], - 
route_class=DishkaRoute, - dependencies=[Depends(admin_user)] + prefix="/admin/settings", tags=["admin", "settings"], route_class=DishkaRoute, dependencies=[Depends(admin_user)] ) @router.get("/", response_model=SystemSettings) async def get_system_settings( - admin: Annotated[UserResponse, Depends(admin_user)], - service: FromDishka[AdminSettingsService], + admin: Annotated[UserResponse, Depends(admin_user)], + service: FromDishka[AdminSettingsService], ) -> SystemSettings: try: domain_settings = await service.get_system_settings(admin.username) @@ -35,18 +32,15 @@ async def get_system_settings( @router.put("/", response_model=SystemSettings) async def update_system_settings( - admin: Annotated[UserResponse, Depends(admin_user)], - settings: SystemSettings, - service: FromDishka[AdminSettingsService], + admin: Annotated[UserResponse, Depends(admin_user)], + settings: SystemSettings, + service: FromDishka[AdminSettingsService], ) -> SystemSettings: try: settings_mapper = SettingsMapper() domain_settings = settings_mapper.system_settings_from_pydantic(settings.model_dump()) except (ValueError, ValidationError, KeyError) as e: - raise HTTPException( - status_code=422, - detail=f"Invalid settings: {str(e)}" - ) + raise HTTPException(status_code=422, detail=f"Invalid settings: {str(e)}") except Exception: raise HTTPException(status_code=400, detail="Invalid settings format") @@ -68,8 +62,8 @@ async def update_system_settings( @router.post("/reset", response_model=SystemSettings) async def reset_system_settings( - admin: Annotated[UserResponse, Depends(admin_user)], - service: FromDishka[AdminSettingsService], + admin: Annotated[UserResponse, Depends(admin_user)], + service: FromDishka[AdminSettingsService], ) -> SystemSettings: try: reset_domain_settings = await service.reset_system_settings(admin.username, admin.user_id) diff --git a/backend/app/api/routes/admin/users.py b/backend/app/api/routes/admin/users.py index b10216d8..c630a274 100644 --- a/backend/app/api/routes/admin/users.py +++ b/backend/app/api/routes/admin/users.py @@ -6,41 +6,39 @@ from app.api.dependencies import admin_user from app.db.repositories.admin.admin_user_repository import AdminUserRepository +from app.domain.enums.user import UserRole from app.domain.rate_limit import UserRateLimit -from app.domain.user import ( - UserUpdate as DomainUserUpdate, -) +from app.domain.user import UserUpdate as DomainUserUpdate from app.infrastructure.mappers import AdminOverviewApiMapper, UserMapper from app.schemas_pydantic.admin_user_overview import AdminUserOverview from app.schemas_pydantic.user import ( + DeleteUserResponse, MessageResponse, PasswordResetRequest, + RateLimitUpdateResponse, UserCreate, UserListResponse, + UserRateLimitsResponse, UserResponse, - UserRole, UserUpdate, ) from app.services.admin import AdminUserService from app.services.rate_limit_service import RateLimitService router = APIRouter( - prefix="/admin/users", - tags=["admin", "users"], - route_class=DishkaRoute, - dependencies=[Depends(admin_user)] + prefix="/admin/users", tags=["admin", "users"], route_class=DishkaRoute, dependencies=[Depends(admin_user)] ) @router.get("/", response_model=UserListResponse) async def list_users( - admin: Annotated[UserResponse, Depends(admin_user)], - admin_user_service: FromDishka[AdminUserService], - rate_limit_service: FromDishka[RateLimitService], - limit: int = Query(default=100, le=1000), - offset: int = Query(default=0, ge=0), - search: str | None = None, - role: UserRole | None = None, + admin: 
Annotated[UserResponse, Depends(admin_user)], + admin_user_service: FromDishka[AdminUserService], + rate_limit_service: FromDishka[RateLimitService], + limit: int = Query(default=100, le=1000), + offset: int = Query(default=0, ge=0), + search: str | None = None, + role: UserRole | None = None, ) -> UserListResponse: result = await admin_user_service.list_users( admin_username=admin.username, @@ -72,9 +70,9 @@ async def list_users( @router.post("/", response_model=UserResponse) async def create_user( - admin: Annotated[UserResponse, Depends(admin_user)], - user_data: UserCreate, - admin_user_service: FromDishka[AdminUserService], + admin: Annotated[UserResponse, Depends(admin_user)], + user_data: UserCreate, + admin_user_service: FromDishka[AdminUserService], ) -> UserResponse: """Create a new user (admin only).""" # Delegate to service; map known validation error to 400 @@ -88,9 +86,9 @@ async def create_user( @router.get("/{user_id}", response_model=UserResponse) async def get_user( - admin: Annotated[UserResponse, Depends(admin_user)], - user_id: str, - admin_user_service: FromDishka[AdminUserService], + admin: Annotated[UserResponse, Depends(admin_user)], + user_id: str, + admin_user_service: FromDishka[AdminUserService], ) -> UserResponse: user = await admin_user_service.get_user(admin_username=admin.username, user_id=user_id) if not user: @@ -102,9 +100,9 @@ async def get_user( @router.get("/{user_id}/overview", response_model=AdminUserOverview) async def get_user_overview( - admin: Annotated[UserResponse, Depends(admin_user)], - user_id: str, - admin_user_service: FromDishka[AdminUserService], + admin: Annotated[UserResponse, Depends(admin_user)], + user_id: str, + admin_user_service: FromDishka[AdminUserService], ) -> AdminUserOverview: # Service raises ValueError if not found -> map to 404 try: @@ -117,11 +115,11 @@ async def get_user_overview( @router.put("/{user_id}", response_model=UserResponse) async def update_user( - admin: Annotated[UserResponse, Depends(admin_user)], - user_id: str, - user_update: UserUpdate, - user_repo: FromDishka[AdminUserRepository], - admin_user_service: FromDishka[AdminUserService], + admin: Annotated[UserResponse, Depends(admin_user)], + user_id: str, + user_update: UserUpdate, + user_repo: FromDishka[AdminUserRepository], + admin_user_service: FromDishka[AdminUserService], ) -> UserResponse: # Get existing user (explicit 404), then update existing_user = await user_repo.get_user_by_id(user_id) @@ -147,13 +145,13 @@ async def update_user( return UserResponse(**user_mapper.to_response_dict(updated_user)) -@router.delete("/{user_id}") +@router.delete("/{user_id}", response_model=DeleteUserResponse) async def delete_user( - admin: Annotated[UserResponse, Depends(admin_user)], - user_id: str, - admin_user_service: FromDishka[AdminUserService], - cascade: bool = Query(default=True, description="Cascade delete user's data"), -) -> dict: + admin: Annotated[UserResponse, Depends(admin_user)], + user_id: str, + admin_user_service: FromDishka[AdminUserService], + cascade: bool = Query(default=True, description="Cascade delete user's data"), +) -> DeleteUserResponse: # Prevent self-deletion; delegate to service if admin.user_id == user_id: raise HTTPException(status_code=400, detail="Cannot delete your own account") @@ -164,15 +162,15 @@ async def delete_user( if deleted_counts.get("user", 0) == 0: raise HTTPException(status_code=500, detail="Failed to delete user") - return {"message": f"User {user_id} deleted successfully", "deleted_counts": deleted_counts} + 
return DeleteUserResponse(message=f"User {user_id} deleted successfully", deleted_counts=deleted_counts) @router.post("/{user_id}/reset-password", response_model=MessageResponse) async def reset_user_password( - admin: Annotated[UserResponse, Depends(admin_user)], - admin_user_service: FromDishka[AdminUserService], - user_id: str, - password_request: PasswordResetRequest, + admin: Annotated[UserResponse, Depends(admin_user)], + admin_user_service: FromDishka[AdminUserService], + user_id: str, + password_request: PasswordResetRequest, ) -> MessageResponse: success = await admin_user_service.reset_user_password( admin_username=admin.username, user_id=user_id, new_password=password_request.new_password @@ -182,32 +180,34 @@ async def reset_user_password( return MessageResponse(message=f"Password reset successfully for user {user_id}") -@router.get("/{user_id}/rate-limits") +@router.get("/{user_id}/rate-limits", response_model=UserRateLimitsResponse) async def get_user_rate_limits( - admin: Annotated[UserResponse, Depends(admin_user)], - admin_user_service: FromDishka[AdminUserService], - user_id: str, -) -> dict: - return await admin_user_service.get_user_rate_limits(admin_username=admin.username, user_id=user_id) + admin: Annotated[UserResponse, Depends(admin_user)], + admin_user_service: FromDishka[AdminUserService], + user_id: str, +) -> UserRateLimitsResponse: + result = await admin_user_service.get_user_rate_limits(admin_username=admin.username, user_id=user_id) + return UserRateLimitsResponse.model_validate(result) -@router.put("/{user_id}/rate-limits") +@router.put("/{user_id}/rate-limits", response_model=RateLimitUpdateResponse) async def update_user_rate_limits( - admin: Annotated[UserResponse, Depends(admin_user)], - admin_user_service: FromDishka[AdminUserService], - user_id: str, - rate_limit_config: UserRateLimit, -) -> dict: - return await admin_user_service.update_user_rate_limits( + admin: Annotated[UserResponse, Depends(admin_user)], + admin_user_service: FromDishka[AdminUserService], + user_id: str, + rate_limit_config: UserRateLimit, +) -> RateLimitUpdateResponse: + result = await admin_user_service.update_user_rate_limits( admin_username=admin.username, user_id=user_id, config=rate_limit_config ) + return RateLimitUpdateResponse.model_validate(result) @router.post("/{user_id}/rate-limits/reset") async def reset_user_rate_limits( - admin: Annotated[UserResponse, Depends(admin_user)], - admin_user_service: FromDishka[AdminUserService], - user_id: str, + admin: Annotated[UserResponse, Depends(admin_user)], + admin_user_service: FromDishka[AdminUserService], + user_id: str, ) -> MessageResponse: await admin_user_service.reset_user_rate_limits(admin_username=admin.username, user_id=user_id) return MessageResponse(message=f"Rate limits reset successfully for user {user_id}") diff --git a/backend/app/api/routes/auth.py b/backend/app/api/routes/auth.py index e898caad..da0eb586 100644 --- a/backend/app/api/routes/auth.py +++ b/backend/app/api/routes/auth.py @@ -21,17 +21,15 @@ from app.services.auth_service import AuthService from app.settings import get_settings -router = APIRouter(prefix="/auth", - tags=["authentication"], - route_class=DishkaRoute) +router = APIRouter(prefix="/auth", tags=["authentication"], route_class=DishkaRoute) @router.post("/login", response_model=LoginResponse) async def login( - request: Request, - response: Response, - user_repo: FromDishka[UserRepository], - form_data: OAuth2PasswordRequestForm = Depends(), + request: Request, + response: Response, + 
user_repo: FromDishka[UserRepository], + form_data: OAuth2PasswordRequestForm = Depends(), ) -> LoginResponse: logger.info( "Login attempt", @@ -88,9 +86,7 @@ async def login( ) access_token_expires = timedelta(minutes=settings.ACCESS_TOKEN_EXPIRE_MINUTES) - access_token = security_service.create_access_token( - data={"sub": user.username}, expires_delta=access_token_expires - ) + access_token = security_service.create_access_token(data={"sub": user.username}, expires_delta=access_token_expires) csrf_token = security_service.generate_csrf_token() @@ -121,15 +117,15 @@ async def login( message="Login successful", username=user.username, role="admin" if user.is_superuser else "user", - csrf_token=csrf_token + csrf_token=csrf_token, ) @router.post("/register", response_model=UserResponse) async def register( - request: Request, - user: UserCreate, - user_repo: FromDishka[UserRepository], + request: Request, + user: UserCreate, + user_repo: FromDishka[UserRepository], ) -> UserResponse: logger.info( "Registration attempt", @@ -198,19 +194,19 @@ async def register( "error_type": type(e).__name__, "error_detail": str(e), }, - exc_info=True + exc_info=True, ) raise HTTPException(status_code=500, detail="Error creating user") from e @router.get("/me", response_model=UserResponse) async def get_current_user_profile( - request: Request, - response: Response, - auth_service: FromDishka[AuthService], + request: Request, + response: Response, + auth_service: FromDishka[AuthService], ) -> UserResponse: current_user = await auth_service.get_current_user(request) - + logger.info( "User profile request", extra={ @@ -219,18 +215,18 @@ async def get_current_user_profile( "endpoint": "/me", }, ) - + # Set cache control headers response.headers["Cache-Control"] = "no-store" response.headers["Pragma"] = "no-cache" - + return current_user @router.get("/verify-token", response_model=TokenValidationResponse) async def verify_token( - request: Request, - auth_service: FromDishka[AuthService], + request: Request, + auth_service: FromDishka[AuthService], ) -> TokenValidationResponse: current_user = await auth_service.get_current_user(request) logger.info( @@ -258,7 +254,7 @@ async def verify_token( valid=True, username=current_user.username, role="admin" if current_user.is_superuser else "user", - csrf_token=csrf_token + csrf_token=csrf_token, ) except Exception as e: @@ -278,12 +274,10 @@ async def verify_token( ) from e - - @router.post("/logout", response_model=MessageResponse) async def logout( - request: Request, - response: Response, + request: Request, + response: Response, ) -> MessageResponse: logger.info( "Logout attempt", diff --git a/backend/app/api/routes/dlq.py b/backend/app/api/routes/dlq.py index beb422d0..5da8e683 100644 --- a/backend/app/api/routes/dlq.py +++ b/backend/app/api/routes/dlq.py @@ -9,12 +9,12 @@ from app.db.repositories.dlq_repository import DLQRepository from app.dlq import RetryPolicy from app.dlq.manager import DLQManager +from app.dlq.models import DLQMessageStatus from app.schemas_pydantic.dlq import ( DLQBatchRetryResponse, DLQMessageDetail, DLQMessageResponse, DLQMessagesResponse, - DLQMessageStatus, DLQStats, DLQTopicSummaryResponse, ManualRetryRequest, @@ -23,17 +23,12 @@ from app.schemas_pydantic.user import MessageResponse router = APIRouter( - prefix="/dlq", - tags=["Dead Letter Queue"], - route_class=DishkaRoute, - dependencies=[Depends(current_user)] + prefix="/dlq", tags=["Dead Letter Queue"], route_class=DishkaRoute, dependencies=[Depends(current_user)] ) 
@router.get("/stats", response_model=DLQStats) -async def get_dlq_statistics( - repository: FromDishka[DLQRepository] -) -> DLQStats: +async def get_dlq_statistics(repository: FromDishka[DLQRepository]) -> DLQStats: stats = await repository.get_dlq_stats() return DLQStats( by_status=stats.by_status, @@ -43,26 +38,24 @@ async def get_dlq_statistics( "min_age": stats.age_stats.min_age_seconds, "max_age": stats.age_stats.max_age_seconds, "avg_age": stats.age_stats.avg_age_seconds, - } if stats.age_stats else {}, + } + if stats.age_stats + else {}, timestamp=stats.timestamp, ) @router.get("/messages", response_model=DLQMessagesResponse) async def get_dlq_messages( - repository: FromDishka[DLQRepository], - status: DLQMessageStatus | None = Query(None), - topic: str | None = None, - event_type: str | None = None, - limit: int = Query(50, ge=1, le=1000), - offset: int = Query(0, ge=0) + repository: FromDishka[DLQRepository], + status: DLQMessageStatus | None = Query(None), + topic: str | None = None, + event_type: str | None = None, + limit: int = Query(50, ge=1, le=1000), + offset: int = Query(0, ge=0), ) -> DLQMessagesResponse: result = await repository.get_messages( - status=status, - topic=topic, - event_type=event_type, - limit=limit, - offset=offset + status=status, topic=topic, event_type=event_type, limit=limit, offset=offset ) # Convert domain messages to response models @@ -81,25 +74,17 @@ async def get_dlq_messages( "dlq_offset": msg.dlq_offset, "dlq_partition": msg.dlq_partition, "last_error": msg.last_error, - "next_retry_at": msg.next_retry_at - } + "next_retry_at": msg.next_retry_at, + }, ) for msg in result.messages ] - return DLQMessagesResponse( - messages=messages, - total=result.total, - offset=result.offset, - limit=result.limit - ) + return DLQMessagesResponse(messages=messages, total=result.total, offset=result.offset, limit=result.limit) @router.get("/messages/{event_id}", response_model=DLQMessageDetail) -async def get_dlq_message( - event_id: str, - repository: FromDishka[DLQRepository] -) -> DLQMessageDetail: +async def get_dlq_message(event_id: str, repository: FromDishka[DLQRepository]) -> DLQMessageDetail: message = await repository.get_message_by_id(event_id) if not message: raise HTTPException(status_code=404, detail="Message not found") @@ -122,53 +107,48 @@ async def get_dlq_message( producer_id=message.producer_id, dlq_offset=message.dlq_offset, dlq_partition=message.dlq_partition, - last_error=message.last_error + last_error=message.last_error, ) @router.post("/retry", response_model=DLQBatchRetryResponse) async def retry_dlq_messages( - retry_request: ManualRetryRequest, - repository: FromDishka[DLQRepository], - dlq_manager: FromDishka[DLQManager] + retry_request: ManualRetryRequest, repository: FromDishka[DLQRepository], dlq_manager: FromDishka[DLQManager] ) -> DLQBatchRetryResponse: result = await repository.retry_messages_batch(retry_request.event_ids, dlq_manager) return DLQBatchRetryResponse( total=result.total, successful=result.successful, failed=result.failed, - details=[{"event_id": d.event_id, "status": d.status, **({"error": d.error} if d.error else {})} for d in - result.details], + details=[ + {"event_id": d.event_id, "status": d.status, **({"error": d.error} if d.error else {})} + for d in result.details + ], ) @router.post("/retry-policy", response_model=MessageResponse) -async def set_retry_policy( - policy_request: RetryPolicyRequest, - dlq_manager: FromDishka[DLQManager] -) -> MessageResponse: +async def set_retry_policy(policy_request: 
RetryPolicyRequest, dlq_manager: FromDishka[DLQManager]) -> MessageResponse: policy = RetryPolicy( topic=policy_request.topic, strategy=policy_request.strategy, max_retries=policy_request.max_retries, base_delay_seconds=policy_request.base_delay_seconds, max_delay_seconds=policy_request.max_delay_seconds, - retry_multiplier=policy_request.retry_multiplier + retry_multiplier=policy_request.retry_multiplier, ) dlq_manager.set_retry_policy(policy_request.topic, policy) - return MessageResponse( - message=f"Retry policy set for topic {policy_request.topic}" - ) + return MessageResponse(message=f"Retry policy set for topic {policy_request.topic}") @router.delete("/messages/{event_id}", response_model=MessageResponse) async def discard_dlq_message( - event_id: str, - repository: FromDishka[DLQRepository], - dlq_manager: FromDishka[DLQManager], - reason: str = Query(..., description="Reason for discarding") + event_id: str, + repository: FromDishka[DLQRepository], + dlq_manager: FromDishka[DLQManager], + reason: str = Query(..., description="Reason for discarding"), ) -> MessageResponse: message_data = await repository.get_message_by_id(event_id) if not message_data: @@ -180,9 +160,7 @@ async def discard_dlq_message( @router.get("/topics", response_model=List[DLQTopicSummaryResponse]) -async def get_dlq_topics( - repository: FromDishka[DLQRepository] -) -> List[DLQTopicSummaryResponse]: +async def get_dlq_topics(repository: FromDishka[DLQRepository]) -> List[DLQTopicSummaryResponse]: topics = await repository.get_topics_summary() return [ DLQTopicSummaryResponse( @@ -192,7 +170,7 @@ async def get_dlq_topics( oldest_message=topic.oldest_message, newest_message=topic.newest_message, avg_retry_count=topic.avg_retry_count, - max_retry_count=topic.max_retry_count + max_retry_count=topic.max_retry_count, ) for topic in topics ] diff --git a/backend/app/api/routes/events.py b/backend/app/api/routes/events.py index c9f73531..ad053753 100644 --- a/backend/app/api/routes/events.py +++ b/backend/app/api/routes/events.py @@ -10,6 +10,7 @@ from app.core.correlation import CorrelationContext from app.core.logging import logger from app.core.utils import get_client_ip +from app.domain.enums.common import SortOrder from app.domain.events.event_models import EventFilter from app.infrastructure.kafka.events.metadata import EventMetadata from app.infrastructure.mappers import EventMapper, EventStatisticsMapper @@ -23,35 +24,28 @@ PublishEventRequest, PublishEventResponse, ReplayAggregateResponse, - SortOrder, ) from app.schemas_pydantic.user import UserResponse from app.services.event_service import EventService from app.services.kafka_event_service import KafkaEventService from app.settings import get_settings -router = APIRouter(prefix="/events", - tags=["events"], - route_class=DishkaRoute) +router = APIRouter(prefix="/events", tags=["events"], route_class=DishkaRoute) -@router.get("/executions/{execution_id}/events", - response_model=EventListResponse) +@router.get("/executions/{execution_id}/events", response_model=EventListResponse) async def get_execution_events( - execution_id: str, - current_user: Annotated[UserResponse, Depends(current_user)], - event_service: FromDishka[EventService], - include_system_events: bool = Query( - False, - description="Include system-generated events" - ) + execution_id: str, + current_user: Annotated[UserResponse, Depends(current_user)], + event_service: FromDishka[EventService], + include_system_events: bool = Query(False, description="Include system-generated events"), ) -> 
EventListResponse: mapper = EventMapper() events = await event_service.get_execution_events( execution_id=execution_id, user_id=current_user.user_id, user_role=current_user.role, - include_system_events=include_system_events + include_system_events=include_system_events, ) if events is None: @@ -59,25 +53,19 @@ async def get_execution_events( event_responses = [EventResponse(**mapper.to_dict(event)) for event in events] - return EventListResponse( - events=event_responses, - total=len(event_responses), - limit=1000, - skip=0, - has_more=False - ) + return EventListResponse(events=event_responses, total=len(event_responses), limit=1000, skip=0, has_more=False) @router.get("/user", response_model=EventListResponse) async def get_user_events( - current_user: Annotated[UserResponse, Depends(current_user)], - event_service: FromDishka[EventService], - event_types: List[str] | None = Query(None), - start_time: datetime | None = Query(None), - end_time: datetime | None = Query(None), - limit: int = Query(100, ge=1, le=1000), - skip: int = Query(0, ge=0), - sort_order: SortOrder = Query(SortOrder.DESC) + current_user: Annotated[UserResponse, Depends(current_user)], + event_service: FromDishka[EventService], + event_types: List[str] | None = Query(None), + start_time: datetime | None = Query(None), + end_time: datetime | None = Query(None), + limit: int = Query(100, ge=1, le=1000), + skip: int = Query(0, ge=0), + sort_order: SortOrder = Query(SortOrder.DESC), ) -> EventListResponse: """Get events for the current user""" mapper = EventMapper() @@ -88,25 +76,21 @@ async def get_user_events( end_time=end_time, limit=limit, skip=skip, - sort_order=sort_order + sort_order=sort_order, ) event_responses = [EventResponse(**mapper.to_dict(event)) for event in result.events] return EventListResponse( - events=event_responses, - total=result.total, - limit=limit, - skip=skip, - has_more=result.has_more + events=event_responses, total=result.total, limit=limit, skip=skip, has_more=result.has_more ) @router.post("/query", response_model=EventListResponse) async def query_events( - current_user: Annotated[UserResponse, Depends(current_user)], - filter_request: EventFilterRequest, - event_service: FromDishka[EventService], + current_user: Annotated[UserResponse, Depends(current_user)], + filter_request: EventFilterRequest, + event_service: FromDishka[EventService], ) -> EventListResponse: mapper = EventMapper() event_filter = EventFilter( @@ -127,35 +111,25 @@ async def query_events( sort_by=filter_request.sort_by, sort_order=filter_request.sort_order, limit=filter_request.limit, - skip=filter_request.skip + skip=filter_request.skip, ) if result is None: - raise HTTPException( - status_code=403, - detail="Cannot query other users' events" - ) + raise HTTPException(status_code=403, detail="Cannot query other users' events") event_responses = [EventResponse(**mapper.to_dict(event)) for event in result.events] return EventListResponse( - events=event_responses, - total=result.total, - limit=result.limit, - skip=result.skip, - has_more=result.has_more + events=event_responses, total=result.total, limit=result.limit, skip=result.skip, has_more=result.has_more ) @router.get("/correlation/{correlation_id}", response_model=EventListResponse) async def get_events_by_correlation( - correlation_id: str, - current_user: Annotated[UserResponse, Depends(current_user)], - event_service: FromDishka[EventService], - include_all_users: bool = Query( - False, - description="Include events from all users (admin only)" - ), - limit: 
int = Query(100, ge=1, le=1000) + correlation_id: str, + current_user: Annotated[UserResponse, Depends(current_user)], + event_service: FromDishka[EventService], + include_all_users: bool = Query(False, description="Include events from all users (admin only)"), + limit: int = Query(100, ge=1, le=1000), ) -> EventListResponse: mapper = EventMapper() events = await event_service.get_events_by_correlation( @@ -163,72 +137,45 @@ async def get_events_by_correlation( user_id=current_user.user_id, user_role=current_user.role, include_all_users=include_all_users, - limit=limit + limit=limit, ) event_responses = [EventResponse(**mapper.to_dict(event)) for event in events] - return EventListResponse( - events=event_responses, - total=len(event_responses), - limit=limit, - skip=0, - has_more=False - ) + return EventListResponse(events=event_responses, total=len(event_responses), limit=limit, skip=0, has_more=False) @router.get("/current-request", response_model=EventListResponse) async def get_current_request_events( - current_user: Annotated[UserResponse, Depends(current_user)], - event_service: FromDishka[EventService], - limit: int = Query(100, ge=1, le=1000), + current_user: Annotated[UserResponse, Depends(current_user)], + event_service: FromDishka[EventService], + limit: int = Query(100, ge=1, le=1000), ) -> EventListResponse: mapper = EventMapper() correlation_id = CorrelationContext.get_correlation_id() if not correlation_id: - return EventListResponse( - events=[], - total=0, - limit=limit, - skip=0, - has_more=False - ) + return EventListResponse(events=[], total=0, limit=limit, skip=0, has_more=False) events = await event_service.get_events_by_correlation( correlation_id=correlation_id, user_id=current_user.user_id, user_role=current_user.role, include_all_users=False, - limit=limit + limit=limit, ) event_responses = [EventResponse(**mapper.to_dict(event)) for event in events] - return EventListResponse( - events=event_responses, - total=len(event_responses), - limit=limit, - skip=0, - has_more=False - ) + return EventListResponse(events=event_responses, total=len(event_responses), limit=limit, skip=0, has_more=False) @router.get("/statistics", response_model=EventStatistics) async def get_event_statistics( - current_user: Annotated[UserResponse, Depends(current_user)], - event_service: FromDishka[EventService], - start_time: datetime | None = Query( - None, - description="Start time for statistics (defaults to 24 hours ago)" - ), - end_time: datetime | None = Query( - None, - description="End time for statistics (defaults to now)" - ), - include_all_users: bool = Query( - False, - description="Include stats from all users (admin only)" - ), + current_user: Annotated[UserResponse, Depends(current_user)], + event_service: FromDishka[EventService], + start_time: datetime | None = Query(None, description="Start time for statistics (defaults to 24 hours ago)"), + end_time: datetime | None = Query(None, description="End time for statistics (defaults to now)"), + include_all_users: bool = Query(False, description="Include stats from all users (admin only)"), ) -> EventStatistics: if not start_time: start_time = datetime.now(timezone.utc) - timedelta(days=1) # 24 hours ago @@ -240,7 +187,7 @@ async def get_event_statistics( user_role=current_user.role, start_time=start_time, end_time=end_time, - include_all_users=include_all_users + include_all_users=include_all_users, ) stats_mapper = EventStatisticsMapper() @@ -249,17 +196,11 @@ async def get_event_statistics( @router.get("/{event_id}", 
response_model=EventResponse) async def get_event( - event_id: str, - current_user: Annotated[UserResponse, Depends(current_user)], - event_service: FromDishka[EventService] + event_id: str, current_user: Annotated[UserResponse, Depends(current_user)], event_service: FromDishka[EventService] ) -> EventResponse: """Get a specific event by ID""" mapper = EventMapper() - event = await event_service.get_event( - event_id=event_id, - user_id=current_user.user_id, - user_role=current_user.role - ) + event = await event_service.get_event(event_id=event_id, user_id=current_user.user_id, user_role=current_user.role) if event is None: raise HTTPException(status_code=404, detail="Event not found") return EventResponse(**mapper.to_dict(event)) @@ -267,10 +208,10 @@ async def get_event( @router.post("/publish", response_model=PublishEventResponse) async def publish_custom_event( - admin: Annotated[UserResponse, Depends(admin_user)], - event_request: PublishEventRequest, - request: Request, - event_service: FromDishka[KafkaEventService] + admin: Annotated[UserResponse, Depends(admin_user)], + event_request: PublishEventRequest, + request: Request, + event_service: FromDishka[KafkaEventService], ) -> PublishEventResponse: settings = get_settings() base_meta = EventMetadata( @@ -292,24 +233,20 @@ async def publish_custom_event( metadata=base_meta, ) - return PublishEventResponse( - event_id=event_id, - status="published", - timestamp=datetime.now(timezone.utc) - ) + return PublishEventResponse(event_id=event_id, status="published", timestamp=datetime.now(timezone.utc)) @router.post("/aggregate", response_model=List[Dict[str, Any]]) async def aggregate_events( - current_user: Annotated[UserResponse, Depends(current_user)], - aggregation: EventAggregationRequest, - event_service: FromDishka[EventService], + current_user: Annotated[UserResponse, Depends(current_user)], + aggregation: EventAggregationRequest, + event_service: FromDishka[EventService], ) -> List[Dict[str, Any]]: result = await event_service.aggregate_events( user_id=current_user.user_id, user_role=current_user.role, pipeline=aggregation.pipeline, - limit=aggregation.limit + limit=aggregation.limit, ) return result.results @@ -317,26 +254,19 @@ async def aggregate_events( @router.get("/types/list", response_model=List[str]) async def list_event_types( - current_user: Annotated[UserResponse, Depends(current_user)], - event_service: FromDishka[EventService] + current_user: Annotated[UserResponse, Depends(current_user)], event_service: FromDishka[EventService] ) -> List[str]: - event_types = await event_service.list_event_types( - user_id=current_user.user_id, - user_role=current_user.role - ) + event_types = await event_service.list_event_types(user_id=current_user.user_id, user_role=current_user.role) return event_types @router.delete("/{event_id}", response_model=DeleteEventResponse) async def delete_event( - event_id: str, - admin: Annotated[UserResponse, Depends(admin_user)], - event_service: FromDishka[EventService], + event_id: str, + admin: Annotated[UserResponse, Depends(admin_user)], + event_service: FromDishka[EventService], ) -> DeleteEventResponse: - result = await event_service.delete_event_with_archival( - event_id=event_id, - deleted_by=str(admin.email) - ) + result = await event_service.delete_event_with_archival(event_id=event_id, deleted_by=str(admin.email)) if result is None: raise HTTPException(status_code=404, detail="Event not found") @@ -346,38 +276,27 @@ async def delete_event( extra={ "event_type": result.event_type, 
"aggregate_id": result.aggregate_id, - "correlation_id": result.correlation_id - } + "correlation_id": result.correlation_id, + }, ) return DeleteEventResponse( - message="Event deleted and archived", - event_id=event_id, - deleted_at=datetime.now(timezone.utc) + message="Event deleted and archived", event_id=event_id, deleted_at=datetime.now(timezone.utc) ) @router.post("/replay/{aggregate_id}", response_model=ReplayAggregateResponse) async def replay_aggregate_events( - aggregate_id: str, - admin: Annotated[UserResponse, Depends(admin_user)], - event_service: FromDishka[EventService], - kafka_event_service: FromDishka[KafkaEventService], - target_service: str | None = Query( - None, - description="Service to replay events to" - ), - dry_run: bool = Query( - True, - description="If true, only show what would be replayed" - ), + aggregate_id: str, + admin: Annotated[UserResponse, Depends(admin_user)], + event_service: FromDishka[EventService], + kafka_event_service: FromDishka[KafkaEventService], + target_service: str | None = Query(None, description="Service to replay events to"), + dry_run: bool = Query(True, description="If true, only show what would be replayed"), ) -> ReplayAggregateResponse: replay_info = await event_service.get_aggregate_replay_info(aggregate_id) if not replay_info: - raise HTTPException( - status_code=404, - detail=f"No events found for aggregate {aggregate_id}" - ) + raise HTTPException(status_code=404, detail=f"No events found for aggregate {aggregate_id}") if dry_run: return ReplayAggregateResponse( @@ -386,7 +305,7 @@ async def replay_aggregate_events( event_count=replay_info.event_count, event_types=replay_info.event_types, start_time=replay_info.start_time, - end_time=replay_info.end_time + end_time=replay_info.end_time, ) # Perform actual replay @@ -420,5 +339,5 @@ async def replay_aggregate_events( dry_run=False, aggregate_id=aggregate_id, replayed_count=replayed_count, - replay_correlation_id=replay_correlation_id + replay_correlation_id=replay_correlation_id, ) diff --git a/backend/app/api/routes/execution.py b/backend/app/api/routes/execution.py index 8218b2cc..ef3e9a45 100644 --- a/backend/app/api/routes/execution.py +++ b/backend/app/api/routes/execution.py @@ -30,6 +30,7 @@ ExecutionResponse, ExecutionResult, ResourceLimits, + ResourceUsage, RetryExecutionRequest, ) from app.schemas_pydantic.user import UserResponse @@ -44,9 +45,9 @@ @inject async def get_execution_with_access( - execution_id: Annotated[str, Path()], - current_user: Annotated[UserResponse, Depends(current_user)], - execution_service: FromDishka[ExecutionService], + execution_id: Annotated[str, Path()], + current_user: Annotated[UserResponse, Depends(current_user)], + execution_service: FromDishka[ExecutionService], ) -> ExecutionInDB: domain_exec = await execution_service.get_execution_result(execution_id) @@ -56,10 +57,17 @@ async def get_execution_with_access( # Map domain to Pydantic for dependency consumer ru = None if domain_exec.resource_usage is not None: - ru = domain_exec.resource_usage.to_dict() + ru = ResourceUsage(**vars(domain_exec.resource_usage)) # Map error_type to public ErrorType in API model via mapper rules - error_type = (ErrorType.SCRIPT_ERROR if domain_exec.error_type == ExecutionErrorType.SCRIPT_ERROR - else ErrorType.SYSTEM_ERROR) if domain_exec.error_type is not None else None + error_type = ( + ( + ErrorType.SCRIPT_ERROR + if domain_exec.error_type == ExecutionErrorType.SCRIPT_ERROR + else ErrorType.SYSTEM_ERROR + ) + if domain_exec.error_type is not None + 
else None + ) return ExecutionInDB( execution_id=domain_exec.execution_id, script=domain_exec.script, @@ -79,12 +87,12 @@ async def get_execution_with_access( @router.post("/execute", response_model=ExecutionResponse) async def create_execution( - request: Request, - current_user: Annotated[UserResponse, Depends(current_user)], - execution: ExecutionRequest, - execution_service: FromDishka[ExecutionService], - idempotency_manager: FromDishka[IdempotencyManager], - idempotency_key: Annotated[str | None, Header(alias="Idempotency-Key")] = None, + request: Request, + current_user: Annotated[UserResponse, Depends(current_user)], + execution: ExecutionRequest, + execution_service: FromDishka[ExecutionService], + idempotency_manager: FromDishka[IdempotencyManager], + idempotency_key: Annotated[str | None, Header(alias="Idempotency-Key")] = None, ) -> ExecutionResponse: add_span_attributes( **{ @@ -107,11 +115,8 @@ async def create_execution( event_type=EventType.EXECUTION_REQUESTED, timestamp=datetime.now(timezone.utc), metadata=EventMetadata( - user_id=current_user.user_id, - correlation_id=str(uuid4()), - service_name="api", - service_version="1.0.0" - ) + user_id=current_user.user_id, correlation_id=str(uuid4()), service_name="api", service_version="1.0.0" + ), ) # Check for duplicate request using custom key @@ -119,7 +124,7 @@ async def create_execution( event=pseudo_event, key_strategy="custom", custom_key=f"http:{current_user.user_id}:{idempotency_key}", - ttl_seconds=86400 # 24 hours TTL for HTTP idempotency + ttl_seconds=86400, # 24 hours TTL for HTTP idempotency ) if idempotency_result.is_duplicate: @@ -149,7 +154,7 @@ async def create_execution( event=pseudo_event, cached_json=response_model.model_dump_json(), key_strategy="custom", - custom_key=f"http:{current_user.user_id}:{idempotency_key}" + custom_key=f"http:{current_user.user_id}:{idempotency_key}", ) return ExecutionApiMapper.to_response(exec_result) @@ -161,7 +166,7 @@ async def create_execution( event=pseudo_event, error=str(e), key_strategy="custom", - custom_key=f"http:{current_user.user_id}:{idempotency_key}" + custom_key=f"http:{current_user.user_id}:{idempotency_key}", ) raise HTTPException(status_code=e.status_code, detail=e.detail) from e except Exception as e: @@ -171,36 +176,30 @@ async def create_execution( event=pseudo_event, error=str(e), key_strategy="custom", - custom_key=f"http:{current_user.user_id}:{idempotency_key}" + custom_key=f"http:{current_user.user_id}:{idempotency_key}", ) - raise HTTPException( - status_code=500, - detail="Internal server error during script execution" - ) from e + raise HTTPException(status_code=500, detail="Internal server error during script execution") from e @router.get("/result/{execution_id}", response_model=ExecutionResult) async def get_result( - execution: Annotated[ExecutionInDB, Depends(get_execution_with_access)], + execution: Annotated[ExecutionInDB, Depends(get_execution_with_access)], ) -> ExecutionResult: return ExecutionResult.model_validate(execution) @router.post("/{execution_id}/cancel", response_model=CancelResponse) async def cancel_execution( - execution: Annotated[ExecutionInDB, Depends(get_execution_with_access)], - current_user: Annotated[UserResponse, Depends(current_user)], - cancel_request: CancelExecutionRequest, - event_service: FromDishka[KafkaEventService], + execution: Annotated[ExecutionInDB, Depends(get_execution_with_access)], + current_user: Annotated[UserResponse, Depends(current_user)], + cancel_request: CancelExecutionRequest, + 
event_service: FromDishka[KafkaEventService], ) -> CancelResponse: # Handle terminal states terminal_states = [ExecutionStatus.COMPLETED, ExecutionStatus.FAILED, ExecutionStatus.TIMEOUT] if execution.status in terminal_states: - raise HTTPException( - status_code=400, - detail=f"Cannot cancel execution in {str(execution.status)} state" - ) + raise HTTPException(status_code=400, detail=f"Cannot cancel execution in {str(execution.status)} state") # Handle idempotency - if already cancelled, return success if execution.status == ExecutionStatus.CANCELLED: @@ -208,7 +207,7 @@ async def cancel_execution( execution_id=execution.execution_id, status="already_cancelled", message="Execution was already cancelled", - event_id="-1" # exact event_id unknown + event_id="-1", # exact event_id unknown ) settings = get_settings() @@ -234,25 +233,22 @@ async def cancel_execution( execution_id=execution.execution_id, status="cancellation_requested", message="Cancellation request submitted", - event_id=event_id + event_id=event_id, ) @router.post("/{execution_id}/retry", response_model=ExecutionResponse) async def retry_execution( - original_execution: Annotated[ExecutionInDB, Depends(get_execution_with_access)], - current_user: Annotated[UserResponse, Depends(current_user)], - retry_request: RetryExecutionRequest, - request: Request, - execution_service: FromDishka[ExecutionService], + original_execution: Annotated[ExecutionInDB, Depends(get_execution_with_access)], + current_user: Annotated[UserResponse, Depends(current_user)], + retry_request: RetryExecutionRequest, + request: Request, + execution_service: FromDishka[ExecutionService], ) -> ExecutionResponse: """Retry a failed or completed execution.""" if original_execution.status in [ExecutionStatus.RUNNING, ExecutionStatus.QUEUED]: - raise HTTPException( - status_code=400, - detail=f"Cannot retry execution in {original_execution.status} state" - ) + raise HTTPException(status_code=400, detail=f"Cannot retry execution in {original_execution.status} state") # Convert UserResponse to User object client_ip = get_client_ip(request) @@ -268,15 +264,12 @@ async def retry_execution( return ExecutionApiMapper.to_response(new_result) -@router.get("/executions/{execution_id}/events", - response_model=list[ExecutionEventResponse]) +@router.get("/executions/{execution_id}/events", response_model=list[ExecutionEventResponse]) async def get_execution_events( - execution: Annotated[ExecutionInDB, Depends(get_execution_with_access)], - event_service: FromDishka[EventService], - event_types: str | None = Query( - None, description="Comma-separated event types to filter" - ), - limit: int = Query(100, ge=1, le=1000), + execution: Annotated[ExecutionInDB, Depends(get_execution_with_access)], + event_service: FromDishka[EventService], + event_types: str | None = Query(None, description="Comma-separated event types to filter"), + limit: int = Query(100, ge=1, le=1000), ) -> list[ExecutionEventResponse]: """Get all events for an execution.""" event_type_list = None @@ -284,17 +277,12 @@ async def get_execution_events( event_type_list = [t.strip() for t in event_types.split(",")] events = await event_service.get_events_by_aggregate( - aggregate_id=execution.execution_id, - event_types=event_type_list, - limit=limit + aggregate_id=execution.execution_id, event_types=event_type_list, limit=limit ) return [ ExecutionEventResponse( - event_id=event.event_id, - event_type=event.event_type, - timestamp=event.timestamp, - payload=event.payload + event_id=event.event_id, 
event_type=event.event_type, timestamp=event.timestamp, payload=event.payload ) for event in events ] @@ -302,14 +290,14 @@ async def get_execution_events( @router.get("/user/executions", response_model=ExecutionListResponse) async def get_user_executions( - current_user: Annotated[UserResponse, Depends(current_user)], - execution_service: FromDishka[ExecutionService], - status: ExecutionStatus | None = Query(None), - lang: str | None = Query(None), - start_time: datetime | None = Query(None), - end_time: datetime | None = Query(None), - limit: int = Query(50, ge=1, le=200), - skip: int = Query(0, ge=0), + current_user: Annotated[UserResponse, Depends(current_user)], + execution_service: FromDishka[ExecutionService], + status: ExecutionStatus | None = Query(None), + lang: str | None = Query(None), + start_time: datetime | None = Query(None), + end_time: datetime | None = Query(None), + limit: int = Query(50, ge=1, le=200), + skip: int = Query(0, ge=0), ) -> ExecutionListResponse: """Get executions for the current user.""" @@ -320,31 +308,23 @@ async def get_user_executions( start_time=start_time, end_time=end_time, limit=limit, - skip=skip + skip=skip, ) total_count = await execution_service.count_user_executions( - user_id=current_user.user_id, - status=status, - lang=lang, - start_time=start_time, - end_time=end_time + user_id=current_user.user_id, status=status, lang=lang, start_time=start_time, end_time=end_time ) execution_results = [ExecutionApiMapper.to_result(e) for e in executions] return ExecutionListResponse( - executions=execution_results, - total=total_count, - limit=limit, - skip=skip, - has_more=(skip + limit) < total_count + executions=execution_results, total=total_count, limit=limit, skip=skip, has_more=(skip + limit) < total_count ) @router.get("/example-scripts", response_model=ExampleScripts) async def get_example_scripts( - execution_service: FromDishka[ExecutionService], + execution_service: FromDishka[ExecutionService], ) -> ExampleScripts: scripts = await execution_service.get_example_scripts() return ExampleScripts(scripts=scripts) @@ -352,26 +332,21 @@ async def get_example_scripts( @router.get("/k8s-limits", response_model=ResourceLimits) async def get_k8s_resource_limits( - execution_service: FromDishka[ExecutionService], + execution_service: FromDishka[ExecutionService], ) -> ResourceLimits: try: limits = await execution_service.get_k8s_resource_limits() - return ResourceLimits(**limits) + return ResourceLimits(**vars(limits)) except Exception as e: - raise HTTPException( - status_code=500, detail="Failed to retrieve resource limits" - ) from e + raise HTTPException(status_code=500, detail="Failed to retrieve resource limits") from e @router.delete("/{execution_id}", response_model=DeleteResponse) async def delete_execution( - execution_id: str, - admin: Annotated[UserResponse, Depends(admin_user)], - execution_service: FromDishka[ExecutionService], + execution_id: str, + admin: Annotated[UserResponse, Depends(admin_user)], + execution_service: FromDishka[ExecutionService], ) -> DeleteResponse: """Delete an execution and its associated data (admin only).""" await execution_service.delete_execution(execution_id) - return DeleteResponse( - message="Execution deleted successfully", - execution_id=execution_id - ) + return DeleteResponse(message="Execution deleted successfully", execution_id=execution_id) diff --git a/backend/app/api/routes/grafana_alerts.py b/backend/app/api/routes/grafana_alerts.py index 8a8614e6..8a0684ce 100644 --- 
a/backend/app/api/routes/grafana_alerts.py +++ b/backend/app/api/routes/grafana_alerts.py @@ -16,9 +16,7 @@ async def receive_grafana_alerts( ) -> AlertResponse: correlation_id = CorrelationContext.get_correlation_id() - processed_count, errors = await processor.process_webhook( - webhook_payload, correlation_id - ) + processed_count, errors = await processor.process_webhook(webhook_payload, correlation_id) alerts_count = len(webhook_payload.alerts or []) diff --git a/backend/app/api/routes/health.py b/backend/app/api/routes/health.py index b4b9a0fa..8bf19ebd 100644 --- a/backend/app/api/routes/health.py +++ b/backend/app/api/routes/health.py @@ -1,30 +1,45 @@ import time from datetime import datetime, timezone -from typing import Any from dishka.integrations.fastapi import DishkaRoute from fastapi import APIRouter +from pydantic import BaseModel, Field router = APIRouter(prefix="/health", tags=["Health"], route_class=DishkaRoute) _START_TIME = time.time() -@router.get("/live") -async def liveness() -> dict[str, Any]: +class LivenessResponse(BaseModel): + """Response model for liveness probe.""" + + status: str = Field(description="Health status") + uptime_seconds: int = Field(description="Server uptime in seconds") + timestamp: str = Field(description="ISO timestamp of health check") + + +class ReadinessResponse(BaseModel): + """Response model for readiness probe.""" + + status: str = Field(description="Readiness status") + uptime_seconds: int = Field(description="Server uptime in seconds") + + +@router.get("/live", response_model=LivenessResponse) +async def liveness() -> LivenessResponse: """Basic liveness probe. Does not touch external deps.""" - return { - "status": "ok", - "uptime_seconds": int(time.time() - _START_TIME), - "timestamp": datetime.now(timezone.utc).isoformat(), - } + return LivenessResponse( + status="ok", + uptime_seconds=int(time.time() - _START_TIME), + timestamp=datetime.now(timezone.utc).isoformat(), + ) -@router.get("/ready") -async def readiness() -> dict[str, Any]: +@router.get("/ready", response_model=ReadinessResponse) +async def readiness() -> ReadinessResponse: """Simple readiness probe. Extend with dependency checks if needed.""" # Keep it simple and fast. Add checks (DB ping, Kafka ping) when desired. 
- return { - "status": "ok", - "uptime_seconds": int(time.time() - _START_TIME), - } + return ReadinessResponse( + status="ok", + uptime_seconds=int(time.time() - _START_TIME), + ) diff --git a/backend/app/api/routes/notifications.py b/backend/app/api/routes/notifications.py index 2550a36c..b8829cf1 100644 --- a/backend/app/api/routes/notifications.py +++ b/backend/app/api/routes/notifications.py @@ -2,12 +2,11 @@ from dishka.integrations.fastapi import DishkaRoute from fastapi import APIRouter, Query, Request, Response +from app.domain.enums.notification import NotificationChannel, NotificationStatus from app.infrastructure.mappers import NotificationApiMapper from app.schemas_pydantic.notification import ( DeleteNotificationResponse, - NotificationChannel, NotificationListResponse, - NotificationStatus, NotificationSubscription, SubscriptionsResponse, SubscriptionUpdate, @@ -21,15 +20,15 @@ @router.get("", response_model=NotificationListResponse) async def get_notifications( - request: Request, - notification_service: FromDishka[NotificationService], - auth_service: FromDishka[AuthService], - status: NotificationStatus | None = Query(None), - include_tags: list[str] | None = Query(None, description="Only notifications with any of these tags"), - exclude_tags: list[str] | None = Query(None, description="Exclude notifications with any of these tags"), - tag_prefix: str | None = Query(None, description="Only notifications having a tag starting with this prefix"), - limit: int = Query(50, ge=1, le=100), - offset: int = Query(0, ge=0), + request: Request, + notification_service: FromDishka[NotificationService], + auth_service: FromDishka[AuthService], + status: NotificationStatus | None = Query(None), + include_tags: list[str] | None = Query(None, description="Only notifications with any of these tags"), + exclude_tags: list[str] | None = Query(None, description="Exclude notifications with any of these tags"), + tag_prefix: str | None = Query(None, description="Only notifications having a tag starting with this prefix"), + limit: int = Query(50, ge=1, le=100), + offset: int = Query(0, ge=0), ) -> NotificationListResponse: current_user = await auth_service.get_current_user(request) result = await notification_service.list_notifications( @@ -46,25 +45,20 @@ async def get_notifications( @router.put("/{notification_id}/read", status_code=204) async def mark_notification_read( - notification_id: str, - notification_service: FromDishka[NotificationService], - request: Request, - auth_service: FromDishka[AuthService] + notification_id: str, + notification_service: FromDishka[NotificationService], + request: Request, + auth_service: FromDishka[AuthService], ) -> Response: current_user = await auth_service.get_current_user(request) - _ = await notification_service.mark_as_read( - notification_id=notification_id, - user_id=current_user.user_id - ) + _ = await notification_service.mark_as_read(notification_id=notification_id, user_id=current_user.user_id) return Response(status_code=204) @router.post("/mark-all-read", status_code=204) async def mark_all_read( - notification_service: FromDishka[NotificationService], - request: Request, - auth_service: FromDishka[AuthService] + notification_service: FromDishka[NotificationService], request: Request, auth_service: FromDishka[AuthService] ) -> Response: current_user = await auth_service.get_current_user(request) """Mark all notifications as read""" @@ -74,9 +68,7 @@ async def mark_all_read( @router.get("/subscriptions", response_model=SubscriptionsResponse) 
async def get_subscriptions( - notification_service: FromDishka[NotificationService], - request: Request, - auth_service: FromDishka[AuthService] + notification_service: FromDishka[NotificationService], request: Request, auth_service: FromDishka[AuthService] ) -> SubscriptionsResponse: current_user = await auth_service.get_current_user(request) subscriptions_dict = await notification_service.get_subscriptions(current_user.user_id) @@ -85,11 +77,11 @@ async def get_subscriptions( @router.put("/subscriptions/{channel}", response_model=NotificationSubscription) async def update_subscription( - channel: NotificationChannel, - subscription: SubscriptionUpdate, - notification_service: FromDishka[NotificationService], - request: Request, - auth_service: FromDishka[AuthService] + channel: NotificationChannel, + subscription: SubscriptionUpdate, + notification_service: FromDishka[NotificationService], + request: Request, + auth_service: FromDishka[AuthService], ) -> NotificationSubscription: current_user = await auth_service.get_current_user(request) updated_sub = await notification_service.update_subscription( @@ -107,9 +99,7 @@ async def update_subscription( @router.get("/unread-count", response_model=UnreadCountResponse) async def get_unread_count( - notification_service: FromDishka[NotificationService], - request: Request, - auth_service: FromDishka[AuthService] + notification_service: FromDishka[NotificationService], request: Request, auth_service: FromDishka[AuthService] ) -> UnreadCountResponse: current_user = await auth_service.get_current_user(request) count = await notification_service.get_unread_count(current_user.user_id) @@ -119,15 +109,12 @@ async def get_unread_count( @router.delete("/{notification_id}", response_model=DeleteNotificationResponse) async def delete_notification( - notification_id: str, - notification_service: FromDishka[NotificationService], - request: Request, - auth_service: FromDishka[AuthService] + notification_id: str, + notification_service: FromDishka[NotificationService], + request: Request, + auth_service: FromDishka[AuthService], ) -> DeleteNotificationResponse: current_user = await auth_service.get_current_user(request) """Delete a notification""" - _ = await notification_service.delete_notification( - user_id=current_user.user_id, - notification_id=notification_id - ) + _ = await notification_service.delete_notification(user_id=current_user.user_id, notification_id=notification_id) return DeleteNotificationResponse(message="Notification deleted") diff --git a/backend/app/api/routes/replay.py b/backend/app/api/routes/replay.py index d53795aa..628ef531 100644 --- a/backend/app/api/routes/replay.py +++ b/backend/app/api/routes/replay.py @@ -14,16 +14,13 @@ from app.schemas_pydantic.replay_models import ReplaySession from app.services.replay_service import ReplayService -router = APIRouter(prefix="/replay", - tags=["Event Replay"], - route_class=DishkaRoute, - dependencies=[Depends(admin_user)]) +router = APIRouter(prefix="/replay", tags=["Event Replay"], route_class=DishkaRoute, dependencies=[Depends(admin_user)]) @router.post("/sessions", response_model=ReplayResponse) async def create_replay_session( - replay_request: ReplayRequest, - service: FromDishka[ReplayService], + replay_request: ReplayRequest, + service: FromDishka[ReplayService], ) -> ReplayResponse: cfg = ReplayApiMapper.request_to_config(replay_request) result = await service.create_session_from_config(cfg) @@ -32,8 +29,8 @@ async def create_replay_session( 
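# --- Illustrative sketch, not part of this patch ------------------------------
# The notification listing endpoint above declares list-valued query parameters
# (include_tags / exclude_tags), which FastAPI fills from repeated query params.
# A minimal client call with httpx; the /notifications prefix, base URL, cookie
# auth, and the "unread" status value are assumptions for illustration.
import asyncio

import httpx


async def list_alert_notifications(cookies: dict[str, str]) -> None:
    async with httpx.AsyncClient(base_url="http://localhost:8000/api/v1", cookies=cookies) as client:
        resp = await client.get(
            "/notifications",
            params=[
                ("status", "unread"),        # assumed NotificationStatus value
                ("include_tags", "alerts"),  # repeated -> list[str] on the server
                ("include_tags", "grafana"),
                ("limit", "20"),
            ],
        )
        resp.raise_for_status()
        print(resp.json())


# asyncio.run(list_alert_notifications(cookies={"session": "..."}))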
@router.post("/sessions/{session_id}/start", response_model=ReplayResponse) async def start_replay_session( - session_id: str, - service: FromDishka[ReplayService], + session_id: str, + service: FromDishka[ReplayService], ) -> ReplayResponse: result = await service.start_session(session_id) return ReplayApiMapper.op_to_response(result.session_id, result.status, result.message) @@ -41,54 +38,45 @@ async def start_replay_session( @router.post("/sessions/{session_id}/pause", response_model=ReplayResponse) async def pause_replay_session( - session_id: str, - service: FromDishka[ReplayService], + session_id: str, + service: FromDishka[ReplayService], ) -> ReplayResponse: result = await service.pause_session(session_id) return ReplayApiMapper.op_to_response(result.session_id, result.status, result.message) @router.post("/sessions/{session_id}/resume", response_model=ReplayResponse) -async def resume_replay_session( - session_id: str, - service: FromDishka[ReplayService] -) -> ReplayResponse: +async def resume_replay_session(session_id: str, service: FromDishka[ReplayService]) -> ReplayResponse: result = await service.resume_session(session_id) return ReplayApiMapper.op_to_response(result.session_id, result.status, result.message) @router.post("/sessions/{session_id}/cancel", response_model=ReplayResponse) -async def cancel_replay_session( - session_id: str, - service: FromDishka[ReplayService] -) -> ReplayResponse: +async def cancel_replay_session(session_id: str, service: FromDishka[ReplayService]) -> ReplayResponse: result = await service.cancel_session(session_id) return ReplayApiMapper.op_to_response(result.session_id, result.status, result.message) @router.get("/sessions", response_model=list[SessionSummary]) async def list_replay_sessions( - service: FromDishka[ReplayService], - status: ReplayStatus | None = Query(None), - limit: int = Query(100, ge=1, le=1000), + service: FromDishka[ReplayService], + status: ReplayStatus | None = Query(None), + limit: int = Query(100, ge=1, le=1000), ) -> list[SessionSummary]: states = service.list_sessions(status=status, limit=limit) return [ReplayApiMapper.session_to_summary(s) for s in states] @router.get("/sessions/{session_id}", response_model=ReplaySession) -async def get_replay_session( - session_id: str, - service: FromDishka[ReplayService] -) -> ReplaySession: +async def get_replay_session(session_id: str, service: FromDishka[ReplayService]) -> ReplaySession: state = service.get_session(session_id) return ReplayApiMapper.session_to_response(state) @router.post("/cleanup", response_model=CleanupResponse) async def cleanup_old_sessions( - service: FromDishka[ReplayService], - older_than_hours: int = Query(24, ge=1), + service: FromDishka[ReplayService], + older_than_hours: int = Query(24, ge=1), ) -> CleanupResponse: result = await service.cleanup_old_sessions(older_than_hours) return ReplayApiMapper.cleanup_to_response(result.removed_sessions, result.message) diff --git a/backend/app/api/routes/saga.py b/backend/app/api/routes/saga.py index 30089720..5d2e4a0c 100644 --- a/backend/app/api/routes/saga.py +++ b/backend/app/api/routes/saga.py @@ -23,22 +23,22 @@ @router.get("/{saga_id}", response_model=SagaStatusResponse) async def get_saga_status( - saga_id: str, - request: Request, - saga_service: FromDishka[SagaService], - auth_service: FromDishka[AuthService], + saga_id: str, + request: Request, + saga_service: FromDishka[SagaService], + auth_service: FromDishka[AuthService], ) -> SagaStatusResponse: """Get saga status by ID. 
- + Args: saga_id: The saga identifier request: FastAPI request object saga_service: Saga service from DI auth_service: Auth service from DI - + Returns: Saga status response - + Raises: HTTPException: 404 if saga not found, 403 if access denied """ @@ -53,24 +53,24 @@ async def get_saga_status( @router.get("/execution/{execution_id}", response_model=SagaListResponse) async def get_execution_sagas( - execution_id: str, - request: Request, - saga_service: FromDishka[SagaService], - auth_service: FromDishka[AuthService], - state: SagaState | None = Query(None, description="Filter by saga state"), + execution_id: str, + request: Request, + saga_service: FromDishka[SagaService], + auth_service: FromDishka[AuthService], + state: SagaState | None = Query(None, description="Filter by saga state"), ) -> SagaListResponse: """Get all sagas for an execution. - + Args: execution_id: The execution identifier request: FastAPI request object saga_service: Saga service from DI auth_service: Auth service from DI state: Optional state filter - + Returns: List of sagas for the execution - + Raises: HTTPException: 403 if access denied """ @@ -86,15 +86,15 @@ async def get_execution_sagas( @router.get("/", response_model=SagaListResponse) async def list_sagas( - request: Request, - saga_service: FromDishka[SagaService], - auth_service: FromDishka[AuthService], - state: SagaState | None = Query(None, description="Filter by saga state"), - limit: int = Query(100, ge=1, le=1000), - offset: int = Query(0, ge=0), + request: Request, + saga_service: FromDishka[SagaService], + auth_service: FromDishka[AuthService], + state: SagaState | None = Query(None, description="Filter by saga state"), + limit: int = Query(100, ge=1, le=1000), + offset: int = Query(0, ge=0), ) -> SagaListResponse: """List sagas accessible by the current user. - + Args: request: FastAPI request object saga_service: Saga service from DI @@ -102,7 +102,7 @@ async def list_sagas( state: Optional state filter limit: Maximum number of results offset: Number of results to skip - + Returns: Paginated list of sagas """ @@ -110,12 +110,7 @@ async def list_sagas( service_user = User.from_response(current_user) domain_user = AdminUserMapper.from_pydantic_service_user(service_user) - result = await saga_service.list_user_sagas( - domain_user, - state, - limit, - offset - ) + result = await saga_service.list_user_sagas(domain_user, state, limit, offset) mapper = SagaResponseMapper() saga_responses = mapper.list_to_responses(result.sagas) return SagaListResponse(sagas=saga_responses, total=result.total) @@ -123,22 +118,22 @@ async def list_sagas( @router.post("/{saga_id}/cancel", response_model=SagaCancellationResponse) async def cancel_saga( - saga_id: str, - request: Request, - saga_service: FromDishka[SagaService], - auth_service: FromDishka[AuthService], + saga_id: str, + request: Request, + saga_service: FromDishka[SagaService], + auth_service: FromDishka[AuthService], ) -> SagaCancellationResponse: """Cancel a running saga. 
- + Args: saga_id: The saga identifier request: FastAPI request object saga_service: Saga service from DI auth_service: Auth service from DI - + Returns: Cancellation response with success status - + Raises: HTTPException: 404 if not found, 403 if denied, 400 if invalid state """ @@ -150,9 +145,6 @@ async def cancel_saga( return SagaCancellationResponse( success=success, - message=( - "Saga cancelled successfully" if success - else "Failed to cancel saga" - ), - saga_id=saga_id + message=("Saga cancelled successfully" if success else "Failed to cancel saga"), + saga_id=saga_id, ) diff --git a/backend/app/api/routes/saved_scripts.py b/backend/app/api/routes/saved_scripts.py index 67689ff2..12aff22e 100644 --- a/backend/app/api/routes/saved_scripts.py +++ b/backend/app/api/routes/saved_scripts.py @@ -15,25 +15,22 @@ @router.post("/scripts", response_model=SavedScriptResponse) async def create_saved_script( - request: Request, - saved_script: SavedScriptCreateRequest, - saved_script_service: FromDishka[SavedScriptService], - auth_service: FromDishka[AuthService], + request: Request, + saved_script: SavedScriptCreateRequest, + saved_script_service: FromDishka[SavedScriptService], + auth_service: FromDishka[AuthService], ) -> SavedScriptResponse: current_user = await auth_service.get_current_user(request) create = SavedScriptApiMapper.request_to_create(saved_script) - domain = await saved_script_service.create_saved_script( - create, - current_user.user_id - ) + domain = await saved_script_service.create_saved_script(create, current_user.user_id) return SavedScriptApiMapper.to_response(domain) @router.get("/scripts", response_model=list[SavedScriptResponse]) async def list_saved_scripts( - request: Request, - saved_script_service: FromDishka[SavedScriptService], - auth_service: FromDishka[AuthService], + request: Request, + saved_script_service: FromDishka[SavedScriptService], + auth_service: FromDishka[AuthService], ) -> list[SavedScriptResponse]: current_user = await auth_service.get_current_user(request) items = await saved_script_service.list_saved_scripts(current_user.user_id) @@ -42,50 +39,40 @@ async def list_saved_scripts( @router.get("/scripts/{script_id}", response_model=SavedScriptResponse) async def get_saved_script( - request: Request, - script_id: str, - saved_script_service: FromDishka[SavedScriptService], - auth_service: FromDishka[AuthService], + request: Request, + script_id: str, + saved_script_service: FromDishka[SavedScriptService], + auth_service: FromDishka[AuthService], ) -> SavedScriptResponse: current_user = await auth_service.get_current_user(request) - domain = await saved_script_service.get_saved_script( - script_id, - current_user.user_id - ) + domain = await saved_script_service.get_saved_script(script_id, current_user.user_id) return SavedScriptApiMapper.to_response(domain) @router.put("/scripts/{script_id}", response_model=SavedScriptResponse) async def update_saved_script( - request: Request, - script_id: str, - script_update: SavedScriptCreateRequest, - saved_script_service: FromDishka[SavedScriptService], - auth_service: FromDishka[AuthService], + request: Request, + script_id: str, + script_update: SavedScriptCreateRequest, + saved_script_service: FromDishka[SavedScriptService], + auth_service: FromDishka[AuthService], ) -> SavedScriptResponse: current_user = await auth_service.get_current_user(request) update_data = SavedScriptApiMapper.request_to_update(script_update) - domain = await saved_script_service.update_saved_script( - script_id, - 
current_user.user_id, - update_data - ) + domain = await saved_script_service.update_saved_script(script_id, current_user.user_id, update_data) return SavedScriptApiMapper.to_response(domain) @router.delete("/scripts/{script_id}", status_code=204) async def delete_saved_script( - request: Request, - script_id: str, - saved_script_service: FromDishka[SavedScriptService], - auth_service: FromDishka[AuthService], + request: Request, + script_id: str, + saved_script_service: FromDishka[SavedScriptService], + auth_service: FromDishka[AuthService], ) -> None: current_user = await auth_service.get_current_user(request) - await saved_script_service.delete_saved_script( - script_id, - current_user.user_id - ) + await saved_script_service.delete_saved_script(script_id, current_user.user_id) return None diff --git a/backend/app/api/routes/sse.py b/backend/app/api/routes/sse.py index b51865ad..cf410880 100644 --- a/backend/app/api/routes/sse.py +++ b/backend/app/api/routes/sse.py @@ -4,56 +4,42 @@ from sse_starlette.sse import EventSourceResponse from app.domain.sse import SSEHealthDomain -from app.schemas_pydantic.sse import SSEHealthResponse +from app.schemas_pydantic.sse import ShutdownStatusResponse, SSEHealthResponse from app.services.auth_service import AuthService from app.services.sse.sse_service import SSEService -router = APIRouter( - prefix="/events", - tags=["sse"], - route_class=DishkaRoute -) +router = APIRouter(prefix="/events", tags=["sse"], route_class=DishkaRoute) @router.get("/notifications/stream") async def notification_stream( - request: Request, - sse_service: FromDishka[SSEService], - auth_service: FromDishka[AuthService], + request: Request, + sse_service: FromDishka[SSEService], + auth_service: FromDishka[AuthService], ) -> EventSourceResponse: """Stream notifications for authenticated user.""" current_user = await auth_service.get_current_user(request) - return EventSourceResponse( - sse_service.create_notification_stream( - user_id=current_user.user_id - ) - ) + return EventSourceResponse(sse_service.create_notification_stream(user_id=current_user.user_id)) @router.get("/executions/{execution_id}") async def execution_events( - execution_id: str, - request: Request, - sse_service: FromDishka[SSEService], - auth_service: FromDishka[AuthService] + execution_id: str, request: Request, sse_service: FromDishka[SSEService], auth_service: FromDishka[AuthService] ) -> EventSourceResponse: """Stream events for specific execution.""" current_user = await auth_service.get_current_user(request) return EventSourceResponse( - sse_service.create_execution_stream( - execution_id=execution_id, - user_id=current_user.user_id - ) + sse_service.create_execution_stream(execution_id=execution_id, user_id=current_user.user_id) ) @router.get("/health", response_model=SSEHealthResponse) async def sse_health( - request: Request, - sse_service: FromDishka[SSEService], - auth_service: FromDishka[AuthService], + request: Request, + sse_service: FromDishka[SSEService], + auth_service: FromDishka[AuthService], ) -> SSEHealthResponse: """Get SSE service health status.""" _ = await auth_service.get_current_user(request) @@ -65,6 +51,6 @@ async def sse_health( active_executions=domain.active_executions, active_consumers=domain.active_consumers, max_connections_per_user=domain.max_connections_per_user, - shutdown=domain.shutdown, + shutdown=ShutdownStatusResponse(**vars(domain.shutdown)), timestamp=domain.timestamp, ) diff --git a/backend/app/api/routes/user_settings.py 
b/backend/app/api/routes/user_settings.py index ef323ad0..e1ebf295 100644 --- a/backend/app/api/routes/user_settings.py +++ b/backend/app/api/routes/user_settings.py @@ -18,15 +18,13 @@ ) from app.services.user_settings_service import UserSettingsService -router = APIRouter(prefix="/user/settings", - tags=["user-settings"], - route_class=DishkaRoute) +router = APIRouter(prefix="/user/settings", tags=["user-settings"], route_class=DishkaRoute) @router.get("/", response_model=UserSettings) async def get_user_settings( - current_user: Annotated[UserResponse, Depends(current_user)], - settings_service: FromDishka[UserSettingsService], + current_user: Annotated[UserResponse, Depends(current_user)], + settings_service: FromDishka[UserSettingsService], ) -> UserSettings: domain = await settings_service.get_user_settings(current_user.user_id) return UserSettingsApiMapper.to_api_settings(domain) @@ -34,9 +32,9 @@ async def get_user_settings( @router.put("/", response_model=UserSettings) async def update_user_settings( - current_user: Annotated[UserResponse, Depends(current_user)], - updates: UserSettingsUpdate, - settings_service: FromDishka[UserSettingsService], + current_user: Annotated[UserResponse, Depends(current_user)], + updates: UserSettingsUpdate, + settings_service: FromDishka[UserSettingsService], ) -> UserSettings: domain_updates = UserSettingsApiMapper.to_domain_update(updates) domain = await settings_service.update_user_settings(current_user.user_id, domain_updates) @@ -45,9 +43,9 @@ async def update_user_settings( @router.put("/theme", response_model=UserSettings) async def update_theme( - current_user: Annotated[UserResponse, Depends(current_user)], - update_request: ThemeUpdateRequest, - settings_service: FromDishka[UserSettingsService], + current_user: Annotated[UserResponse, Depends(current_user)], + update_request: ThemeUpdateRequest, + settings_service: FromDishka[UserSettingsService], ) -> UserSettings: domain = await settings_service.update_theme(current_user.user_id, update_request.theme) return UserSettingsApiMapper.to_api_settings(domain) @@ -55,9 +53,9 @@ async def update_theme( @router.put("/notifications", response_model=UserSettings) async def update_notification_settings( - current_user: Annotated[UserResponse, Depends(current_user)], - notifications: NotificationSettings, - settings_service: FromDishka[UserSettingsService], + current_user: Annotated[UserResponse, Depends(current_user)], + notifications: NotificationSettings, + settings_service: FromDishka[UserSettingsService], ) -> UserSettings: domain = await settings_service.update_notification_settings( current_user.user_id, @@ -68,9 +66,9 @@ async def update_notification_settings( @router.put("/editor", response_model=UserSettings) async def update_editor_settings( - current_user: Annotated[UserResponse, Depends(current_user)], - editor: EditorSettings, - settings_service: FromDishka[UserSettingsService], + current_user: Annotated[UserResponse, Depends(current_user)], + editor: EditorSettings, + settings_service: FromDishka[UserSettingsService], ) -> UserSettings: domain = await settings_service.update_editor_settings( current_user.user_id, @@ -81,9 +79,9 @@ async def update_editor_settings( @router.get("/history", response_model=SettingsHistoryResponse) async def get_settings_history( - current_user: Annotated[UserResponse, Depends(current_user)], - settings_service: FromDishka[UserSettingsService], - limit: int = 50, + current_user: Annotated[UserResponse, Depends(current_user)], + settings_service: 
FromDishka[UserSettingsService], + limit: int = 50, ) -> SettingsHistoryResponse: history = await settings_service.get_settings_history(current_user.user_id, limit=limit) return UserSettingsApiMapper.history_to_api(history) @@ -91,9 +89,9 @@ async def get_settings_history( @router.post("/restore", response_model=UserSettings) async def restore_settings( - current_user: Annotated[UserResponse, Depends(current_user)], - restore_request: RestoreSettingsRequest, - settings_service: FromDishka[UserSettingsService], + current_user: Annotated[UserResponse, Depends(current_user)], + restore_request: RestoreSettingsRequest, + settings_service: FromDishka[UserSettingsService], ) -> UserSettings: domain = await settings_service.restore_settings_to_point(current_user.user_id, restore_request.timestamp) return UserSettingsApiMapper.to_api_settings(domain) @@ -101,10 +99,10 @@ async def restore_settings( @router.put("/custom/{key}") async def update_custom_setting( - current_user: Annotated[UserResponse, Depends(current_user)], - key: str, - value: dict[str, object], - settings_service: FromDishka[UserSettingsService], + current_user: Annotated[UserResponse, Depends(current_user)], + key: str, + value: dict[str, object], + settings_service: FromDishka[UserSettingsService], ) -> UserSettings: domain = await settings_service.update_custom_setting(current_user.user_id, key, value) return UserSettingsApiMapper.to_api_settings(domain) diff --git a/backend/app/core/adaptive_sampling.py b/backend/app/core/adaptive_sampling.py index ecb2700e..26e27883 100644 --- a/backend/app/core/adaptive_sampling.py +++ b/backend/app/core/adaptive_sampling.py @@ -21,17 +21,17 @@ class AdaptiveSampler(Sampler): """ def __init__( - self, - base_rate: float = 0.1, - min_rate: float = 0.01, - max_rate: float = 1.0, - error_rate_threshold: float = 0.05, - high_traffic_threshold: int = 1000, - adjustment_interval: int = 60, + self, + base_rate: float = 0.1, + min_rate: float = 0.01, + max_rate: float = 1.0, + error_rate_threshold: float = 0.05, + high_traffic_threshold: int = 1000, + adjustment_interval: int = 60, ): """ Initialize adaptive sampler - + Args: base_rate: Base sampling rate (default 10%) min_rate: Minimum sampling rate (default 1%) @@ -67,19 +67,17 @@ def __init__( self._adjustment_thread = threading.Thread(target=self._adjustment_loop, daemon=True) self._adjustment_thread.start() - logging.getLogger("integr8scode").info( - f"Adaptive sampler initialized with base rate: {base_rate}" - ) + logging.getLogger("integr8scode").info(f"Adaptive sampler initialized with base rate: {base_rate}") def should_sample( - self, - parent_context: Context | None, - trace_id: int, - name: str, - kind: SpanKind | None = None, - attributes: Attributes | None = None, - links: Sequence[Link] | None = None, - trace_state: TraceState | None = None, + self, + parent_context: Context | None, + trace_id: int, + name: str, + kind: SpanKind | None = None, + attributes: Attributes | None = None, + links: Sequence[Link] | None = None, + trace_state: TraceState | None = None, ) -> SamplingResult: """Determine if a span should be sampled""" # Get parent trace state @@ -92,15 +90,10 @@ def should_sample( if parent_span_context.trace_flags.sampled: if parent_trace_state is not None: return SamplingResult( - decision=Decision.RECORD_AND_SAMPLE, - attributes=attributes, - trace_state=parent_trace_state + decision=Decision.RECORD_AND_SAMPLE, attributes=attributes, trace_state=parent_trace_state ) else: - return SamplingResult( - 
decision=Decision.RECORD_AND_SAMPLE, - attributes=attributes - ) + return SamplingResult(decision=Decision.RECORD_AND_SAMPLE, attributes=attributes) # Track request self._track_request() @@ -110,15 +103,10 @@ def should_sample( self._track_error() if parent_trace_state is not None: return SamplingResult( - decision=Decision.RECORD_AND_SAMPLE, - attributes=attributes, - trace_state=parent_trace_state + decision=Decision.RECORD_AND_SAMPLE, attributes=attributes, trace_state=parent_trace_state ) else: - return SamplingResult( - decision=Decision.RECORD_AND_SAMPLE, - attributes=attributes - ) + return SamplingResult(decision=Decision.RECORD_AND_SAMPLE, attributes=attributes) # Apply current sampling rate using integer arithmetic to avoid precision issues # Use trace ID for deterministic sampling @@ -134,12 +122,12 @@ def should_sample( return SamplingResult( decision=Decision.RECORD_AND_SAMPLE if should_sample else Decision.DROP, attributes=attributes if should_sample else None, - trace_state=parent_trace_state + trace_state=parent_trace_state, ) else: return SamplingResult( decision=Decision.RECORD_AND_SAMPLE if should_sample else Decision.DROP, - attributes=attributes if should_sample else None + attributes=attributes if should_sample else None, ) def get_description(self) -> str: @@ -211,8 +199,7 @@ def _adjust_sampling_rate(self) -> None: error_multiplier: float = min(10.0, 1 + (error_rate / self.error_rate_threshold)) new_rate = min(self.max_rate, self.base_rate * error_multiplier) logging.getLogger("integr8scode").warning( - f"High error rate detected ({error_rate:.1%}), " - f"increasing sampling to {new_rate:.1%}" + f"High error rate detected ({error_rate:.1%}), increasing sampling to {new_rate:.1%}" ) # Decrease sampling during high traffic @@ -221,17 +208,14 @@ def _adjust_sampling_rate(self) -> None: traffic_divisor = request_rate / self.high_traffic_threshold new_rate = max(self.min_rate, self.base_rate / traffic_divisor) logging.getLogger("integr8scode").info( - f"High traffic detected ({request_rate} req/min), " - f"decreasing sampling to {new_rate:.1%}" + f"High traffic detected ({request_rate} req/min), decreasing sampling to {new_rate:.1%}" ) # Apply gradual changes if new_rate != self._current_rate: # Smooth transitions change_rate = 0.5 # Adjust 50% towards target - self._current_rate = ( - self._current_rate + (new_rate - self._current_rate) * change_rate - ) + self._current_rate = self._current_rate + (new_rate - self._current_rate) * change_rate logging.getLogger("integr8scode").info( f"Adjusted sampling rate to {self._current_rate:.1%} " @@ -246,9 +230,7 @@ def _adjustment_loop(self) -> None: try: self._adjust_sampling_rate() except Exception as e: - logging.getLogger("integr8scode").error( - f"Error adjusting sampling rate: {e}" - ) + logging.getLogger("integr8scode").error(f"Error adjusting sampling rate: {e}") def shutdown(self) -> None: """Shutdown the sampler""" @@ -268,5 +250,5 @@ def create_adaptive_sampler(settings: Any | None = None) -> AdaptiveSampler: max_rate=1.0, error_rate_threshold=0.05, # 5% error rate high_traffic_threshold=1000, # 1000 requests per minute - adjustment_interval=60 # Adjust every minute + adjustment_interval=60, # Adjust every minute ) diff --git a/backend/app/core/correlation.py b/backend/app/core/correlation.py index 6dd452fd..ea62ac34 100644 --- a/backend/app/core/correlation.py +++ b/backend/app/core/correlation.py @@ -54,7 +54,7 @@ async def __call__(self, scope: Scope, receive: Receive, send: Send) -> None: # Try to get correlation 
ID from headers headers = dict(scope["headers"]) correlation_id = None - + for header_name in [b"x-correlation-id", b"x-request-id"]: if header_name in headers: correlation_id = headers[header_name].decode("latin-1") @@ -63,20 +63,18 @@ async def __call__(self, scope: Scope, receive: Receive, send: Send) -> None: # Generate correlation ID if not provided if not correlation_id: correlation_id = CorrelationContext.generate_correlation_id() - + # Set correlation ID correlation_id = CorrelationContext.set_correlation_id(correlation_id) # Set request metadata client = scope.get("client") client_ip = client[0] if client else None - + metadata = { "method": scope["method"], "path": scope["path"], - "client": { - "host": client_ip - } if client_ip else None + "client": {"host": client_ip} if client_ip else None, } CorrelationContext.set_request_metadata(metadata) @@ -89,6 +87,6 @@ async def send_wrapper(message: Message) -> None: # Process request await self.app(scope, receive, send_wrapper) - + # Clear context after request CorrelationContext.clear() diff --git a/backend/app/core/database_context.py b/backend/app/core/database_context.py index a8b53e9c..d7c40631 100644 --- a/backend/app/core/database_context.py +++ b/backend/app/core/database_context.py @@ -7,6 +7,8 @@ from motor.motor_asyncio import ( AsyncIOMotorClient, AsyncIOMotorClientSession, + AsyncIOMotorCollection, + AsyncIOMotorCursor, AsyncIOMotorDatabase, ) from pymongo.errors import ServerSelectionTimeoutError @@ -14,12 +16,16 @@ from app.core.logging import logger # Python 3.12 type aliases using the new 'type' statement -type DBClient = AsyncIOMotorClient[Any] -type Database = AsyncIOMotorDatabase[Any] +# MongoDocument represents the raw document type returned by Motor operations +type MongoDocument = dict[str, Any] +type DBClient = AsyncIOMotorClient[MongoDocument] +type Database = AsyncIOMotorDatabase[MongoDocument] +type Collection = AsyncIOMotorCollection[MongoDocument] +type Cursor = AsyncIOMotorCursor[MongoDocument] type DBSession = AsyncIOMotorClientSession # Type variable for generic database provider -T = TypeVar('T') +T = TypeVar("T") class DatabaseError(Exception): @@ -28,11 +34,13 @@ class DatabaseError(Exception): class DatabaseNotInitializedError(DatabaseError): """Raised when attempting to use database before initialization.""" + pass class DatabaseAlreadyInitializedError(DatabaseError): """Raised when attempting to initialize an already initialized database.""" + pass @@ -46,13 +54,12 @@ class DatabaseConfig: min_pool_size: int = 10 retry_writes: bool = True retry_reads: bool = True - write_concern: str = 'majority' + write_concern: str = "majority" journal: bool = True @runtime_checkable class DatabaseProvider(Protocol): - @property def client(self) -> DBClient: """Get the MongoDB client.""" @@ -78,7 +85,7 @@ def session(self) -> AsyncContextManager[DBSession]: class AsyncDatabaseConnection: - __slots__ = ('_client', '_database', '_db_name', '_config') + __slots__ = ("_client", "_database", "_db_name", "_config") def __init__(self, config: DatabaseConfig) -> None: self._config = config @@ -89,7 +96,7 @@ def __init__(self, config: DatabaseConfig) -> None: async def connect(self) -> None: """ Establish connection to MongoDB. 
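# --- Illustrative sketch, not part of this patch ------------------------------
# database_context.py above leans on two newer typing features: PEP 695 `type`
# aliases (so each Motor generic is spelled once) and a @runtime_checkable
# Protocol, which lets callers verify a provider structurally. A self-contained
# toy version of the same pattern; the names here are illustrative only.
from typing import Any, Protocol, runtime_checkable

type Document = dict[str, Any]  # PEP 695 alias, Python 3.12+


@runtime_checkable
class DocumentStore(Protocol):
    def get(self, key: str) -> Document | None: ...


class InMemoryStore:
    def __init__(self) -> None:
        self._docs: dict[str, Document] = {}

    def get(self, key: str) -> Document | None:
        return self._docs.get(key)


store = InMemoryStore()
assert isinstance(store, DocumentStore)  # structural check enabled by @runtime_checkable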
- + Raises: DatabaseAlreadyInitializedError: If already connected ServerSelectionTimeoutError: If cannot connect to MongoDB @@ -101,8 +108,8 @@ async def connect(self) -> None: # Always explicitly bind to current event loop for consistency import asyncio - - client: AsyncIOMotorClient = AsyncIOMotorClient( + + client: DBClient = AsyncIOMotorClient( self._config.mongodb_url, serverSelectionTimeoutMS=self._config.server_selection_timeout_ms, connectTimeoutMS=self._config.connect_timeout_ms, @@ -112,12 +119,12 @@ async def connect(self) -> None: retryReads=self._config.retry_reads, w=self._config.write_concern, journal=self._config.journal, - io_loop=asyncio.get_running_loop() # Always bind to current loop + io_loop=asyncio.get_running_loop(), # Always bind to current loop ) # Verify connection try: - await client.admin.command('ping') + await client.admin.command("ping") logger.info("Successfully connected to MongoDB") except ServerSelectionTimeoutError as e: logger.error(f"Failed to connect to MongoDB: {e}") @@ -157,10 +164,10 @@ def is_connected(self) -> bool: async def session(self) -> AsyncIterator[DBSession]: """ Create a database session for transactions. - + Yields: Database session for use in transactions - + Example: async with connection.session() as session: await collection.insert_one(doc, session=session) @@ -172,8 +179,9 @@ async def session(self) -> AsyncIterator[DBSession]: class ContextualDatabaseProvider(DatabaseProvider): def __init__(self) -> None: - self._connection_var: contextvars.ContextVar[AsyncDatabaseConnection | None] = \ - contextvars.ContextVar('db_connection', default=None) + self._connection_var: contextvars.ContextVar[AsyncDatabaseConnection | None] = contextvars.ContextVar( + "db_connection", default=None + ) def set_connection(self, connection: AsyncDatabaseConnection) -> None: self._connection_var.set(connection) @@ -186,8 +194,7 @@ def _connection(self) -> AsyncDatabaseConnection: connection = self._connection_var.get() if connection is None: raise DatabaseNotInitializedError( - "No database connection in current context. " - "Ensure connection is set in the request lifecycle." + "No database connection in current context. Ensure connection is set in the request lifecycle." ) return connection @@ -211,28 +218,21 @@ def session(self) -> AsyncContextManager[DBSession]: return self._connection.session() - - - class DatabaseConnectionPool: def __init__(self) -> None: self._connections: dict[str, AsyncDatabaseConnection] = {} - async def create_connection( - self, - key: str, - config: DatabaseConfig - ) -> AsyncDatabaseConnection: + async def create_connection(self, key: str, config: DatabaseConfig) -> AsyncDatabaseConnection: """ Create and store a new database connection. - + Args: key: Unique identifier for this connection config: Database configuration - + Returns: The created connection - + Raises: DatabaseAlreadyInitializedError: If key already exists """ @@ -247,7 +247,7 @@ async def create_connection( def get_connection(self, key: str) -> AsyncDatabaseConnection: """ Get a connection by key. 
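# --- Illustrative sketch, not part of this patch ------------------------------
# How the pieces in database_context.py above compose: a DatabaseConfig drives
# AsyncDatabaseConnection.connect(), and the contextvar-backed
# ContextualDatabaseProvider makes that connection visible to request-scoped
# code. The db_name field, the provider's `database` property, and the Mongo URL
# are assumptions here; connect(), set_connection(), and session() appear in the
# patch itself.
import asyncio

from app.core.database_context import (
    AsyncDatabaseConnection,
    ContextualDatabaseProvider,
    DatabaseConfig,
)


async def main() -> None:
    config = DatabaseConfig(mongodb_url="mongodb://localhost:27017", db_name="integr8scode")
    connection = AsyncDatabaseConnection(config)
    await connection.connect()  # binds to the running loop and pings the server

    provider = ContextualDatabaseProvider()
    provider.set_connection(connection)  # stored in a ContextVar for this context

    db = provider.database                      # Database == AsyncIOMotorDatabase[MongoDocument]
    print(await db["executions"].find_one({}))  # plain Motor usage from here on

    async with provider.session() as session:   # transactions require a replica set
        await db["executions"].find_one({}, session=session)


asyncio.run(main())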
- + Raises: KeyError: If connection not found """ diff --git a/backend/app/core/dishka_lifespan.py b/backend/app/core/dishka_lifespan.py index 6da1fe82..60017e52 100644 --- a/backend/app/core/dishka_lifespan.py +++ b/backend/app/core/dishka_lifespan.py @@ -4,8 +4,8 @@ import redis.asyncio as redis from dishka import AsyncContainer from fastapi import FastAPI -from motor.motor_asyncio import AsyncIOMotorDatabase +from app.core.database_context import Database from app.core.logging import logger from app.core.startup import initialize_metrics_context, initialize_rate_limits from app.core.tracing import init_tracing @@ -20,7 +20,7 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]: """ Application lifespan with dishka dependency injection. - + This is much cleaner than the old lifespan.py: - No dependency_overrides - No manual service management @@ -44,18 +44,18 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]: service_version=settings.TRACING_SERVICE_VERSION, sampling_rate=settings.TRACING_SAMPLING_RATE, enable_console_exporter=settings.TESTING, - adaptive_sampling=settings.TRACING_ADAPTIVE_SAMPLING + adaptive_sampling=settings.TRACING_ADAPTIVE_SAMPLING, ) if instrumentation_report.has_failures(): logger.warning( "Some instrumentation libraries failed to initialize", - extra={"instrumentation_summary": instrumentation_report.get_summary()} + extra={"instrumentation_summary": instrumentation_report.get_summary()}, ) else: logger.info( "Distributed tracing initialized successfully", - extra={"instrumentation_summary": instrumentation_report.get_summary()} + extra={"instrumentation_summary": instrumentation_report.get_summary()}, ) # Initialize schema registry once at startup @@ -64,7 +64,7 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]: await initialize_event_schemas(schema_registry) # Initialize database schema at application scope using app-scoped DB - database = await container.get(AsyncIOMotorDatabase) + database = await container.get(Database) schema_manager = SchemaManager(database) await schema_manager.apply_all() logger.info("Database schema ensured by SchemaManager") diff --git a/backend/app/core/exceptions/handlers.py b/backend/app/core/exceptions/handlers.py index 4ad68656..0f57f97b 100644 --- a/backend/app/core/exceptions/handlers.py +++ b/backend/app/core/exceptions/handlers.py @@ -11,46 +11,34 @@ def configure_exception_handlers(app: FastAPI) -> None: @app.exception_handler(IntegrationException) - async def integration_exception_handler( - request: Request, exc: IntegrationException - ) -> JSONResponse: + async def integration_exception_handler(request: Request, exc: IntegrationException) -> JSONResponse: return JSONResponse( status_code=exc.status_code, content={"detail": exc.detail}, ) @app.exception_handler(AuthenticationError) - async def authentication_error_handler( - request: Request, exc: AuthenticationError - ) -> JSONResponse: + async def authentication_error_handler(request: Request, exc: AuthenticationError) -> JSONResponse: return JSONResponse( status_code=401, content={"detail": exc.detail}, ) @app.exception_handler(ServiceError) - async def service_error_handler( - request: Request, exc: ServiceError - ) -> JSONResponse: + async def service_error_handler(request: Request, exc: ServiceError) -> JSONResponse: return JSONResponse( status_code=exc.status_code, content={"detail": exc.message}, ) @app.exception_handler(SagaNotFoundError) - async def saga_not_found_handler( - request: Request, exc: SagaNotFoundError - ) 
-> JSONResponse: + async def saga_not_found_handler(request: Request, exc: SagaNotFoundError) -> JSONResponse: return JSONResponse(status_code=404, content={"detail": "Saga not found"}) @app.exception_handler(SagaAccessDeniedError) - async def saga_access_denied_handler( - request: Request, exc: SagaAccessDeniedError - ) -> JSONResponse: + async def saga_access_denied_handler(request: Request, exc: SagaAccessDeniedError) -> JSONResponse: return JSONResponse(status_code=403, content={"detail": "Access denied"}) @app.exception_handler(SagaInvalidStateError) - async def saga_invalid_state_handler( - request: Request, exc: SagaInvalidStateError - ) -> JSONResponse: + async def saga_invalid_state_handler(request: Request, exc: SagaInvalidStateError) -> JSONResponse: return JSONResponse(status_code=400, content={"detail": str(exc)}) diff --git a/backend/app/core/k8s_clients.py b/backend/app/core/k8s_clients.py index 82f4b69d..ba953f0d 100644 --- a/backend/app/core/k8s_clients.py +++ b/backend/app/core/k8s_clients.py @@ -39,4 +39,3 @@ def close_k8s_clients(clients: K8sClients) -> None: close = getattr(clients.api_client, "close", None) if callable(close): close() - diff --git a/backend/app/core/logging.py b/backend/app/core/logging.py index 6ece3d5d..99a0b5c3 100644 --- a/backend/app/core/logging.py +++ b/backend/app/core/logging.py @@ -9,14 +9,10 @@ from app.settings import get_settings -correlation_id_context: contextvars.ContextVar[str | None] = contextvars.ContextVar( - 'correlation_id', - default=None -) +correlation_id_context: contextvars.ContextVar[str | None] = contextvars.ContextVar("correlation_id", default=None) request_metadata_context: contextvars.ContextVar[Dict[str, Any] | None] = contextvars.ContextVar( - 'request_metadata', - default=None + "request_metadata", default=None ) @@ -43,18 +39,20 @@ def _sanitize_sensitive_data(self, data: str) -> str: # Mask API keys, tokens, and similar sensitive data patterns = [ # API keys and tokens - (r'(["\']?(?:api[_-]?)?(?:key|token|secret|password|passwd|pwd)["\']?\s*[:=]\s*["\']?)([^"\']+)(["\']?)', - r'\1***API_KEY_OR_TOKEN_REDACTED***\3'), + ( + r'(["\']?(?:api[_-]?)?(?:key|token|secret|password|passwd|pwd)["\']?\s*[:=]\s*["\']?)([^"\']+)(["\']?)', + r"\1***API_KEY_OR_TOKEN_REDACTED***\3", + ), # Bearer tokens - (r'(Bearer\s+)([A-Za-z0-9\-_]+)', r'\1***BEARER_TOKEN_REDACTED***'), + (r"(Bearer\s+)([A-Za-z0-9\-_]+)", r"\1***BEARER_TOKEN_REDACTED***"), # JWT tokens - (r'(eyJ[A-Za-z0-9\-_]+\.eyJ[A-Za-z0-9\-_]+\.[A-Za-z0-9\-_]+)', r'***JWT_REDACTED***'), + (r"(eyJ[A-Za-z0-9\-_]+\.eyJ[A-Za-z0-9\-_]+\.[A-Za-z0-9\-_]+)", r"***JWT_REDACTED***"), # MongoDB URLs with credentials - (r'(mongodb(?:\+srv)?://[^:]+:)([^@]+)(@)', r'\1***MONGODB_REDACTED***\3'), + (r"(mongodb(?:\+srv)?://[^:]+:)([^@]+)(@)", r"\1***MONGODB_REDACTED***\3"), # Generic URLs with credentials - (r'(https?://[^:]+:)([^@]+)(@)', r'\1***URL_CREDS_REDACTED***\3'), + (r"(https?://[^:]+:)([^@]+)(@)", r"\1***URL_CREDS_REDACTED***\3"), # Email addresses (optional - uncomment if needed) - (r'([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})', r'***EMAIL_REDACTED***'), + (r"([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})", r"***EMAIL_REDACTED***"), ] for pattern, replacement in patterns: @@ -73,31 +71,31 @@ def format(self, record: logging.LogRecord) -> str: "message": message, } - if hasattr(record, 'correlation_id'): - log_data['correlation_id'] = record.correlation_id + if hasattr(record, "correlation_id"): + log_data["correlation_id"] = record.correlation_id - if hasattr(record, 
'request_method'): - log_data['request_method'] = record.request_method + if hasattr(record, "request_method"): + log_data["request_method"] = record.request_method - if hasattr(record, 'request_path'): - log_data['request_path'] = record.request_path + if hasattr(record, "request_path"): + log_data["request_path"] = record.request_path - if hasattr(record, 'client_host'): - log_data['client_host'] = record.client_host + if hasattr(record, "client_host"): + log_data["client_host"] = record.client_host # OpenTelemetry trace context (hexadecimal ids) - if hasattr(record, 'trace_id'): - log_data['trace_id'] = record.trace_id - if hasattr(record, 'span_id'): - log_data['span_id'] = record.span_id + if hasattr(record, "trace_id"): + log_data["trace_id"] = record.trace_id + if hasattr(record, "span_id"): + log_data["span_id"] = record.span_id if record.exc_info: exc_text = self.formatException(record.exc_info) - log_data['exc_info'] = self._sanitize_sensitive_data(exc_text) + log_data["exc_info"] = self._sanitize_sensitive_data(exc_text) - if hasattr(record, 'stack_info') and record.stack_info: + if hasattr(record, "stack_info") and record.stack_info: stack_text = self.formatStack(record.stack_info) - log_data['stack_info'] = self._sanitize_sensitive_data(stack_text) + log_data["stack_info"] = self._sanitize_sensitive_data(stack_text) return json.dumps(log_data, ensure_ascii=False) @@ -123,8 +121,8 @@ def filter(self, record: logging.LogRecord) -> bool: if span and span.is_recording(): span_context = span.get_span_context() if span_context.is_valid: - trace_id = format(span_context.trace_id, '032x') - span_id = format(span_context.span_id, '016x') + trace_id = format(span_context.trace_id, "032x") + span_id = format(span_context.span_id, "016x") if trace_id: record.trace_id = trace_id if span_id: diff --git a/backend/app/core/metrics/base.py b/backend/app/core/metrics/base.py index ed7624d7..03899642 100644 --- a/backend/app/core/metrics/base.py +++ b/backend/app/core/metrics/base.py @@ -22,7 +22,7 @@ class MetricsConfig: class BaseMetrics: def __init__(self, meter_name: str | None = None): """Initialize base metrics with its own meter. - + Args: meter_name: Optional name for the meter. Defaults to class name. """ @@ -31,21 +31,21 @@ def __init__(self, meter_name: str | None = None): config = MetricsConfig( service_name=settings.TRACING_SERVICE_NAME or "integr8scode-backend", service_version="1.0.0", - otlp_endpoint=settings.OTEL_EXPORTER_OTLP_ENDPOINT + otlp_endpoint=settings.OTEL_EXPORTER_OTLP_ENDPOINT, ) - + # Each collector creates its own independent meter meter_name = meter_name or self.__class__.__name__ self._meter = self._create_meter(config, meter_name) self._create_instruments() - + def _create_meter(self, config: MetricsConfig, meter_name: str) -> Meter: """Create a new meter instance for this collector. 
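# --- Illustrative sketch, not part of this patch ------------------------------
# What the redaction patterns added in app/core/logging.py above do to a log
# line before it is emitted. This re-applies two of the patterns verbatim to a
# sample string; the formatter itself runs the full list in
# _sanitize_sensitive_data().
import re

SAMPLE = "auth=Bearer abc123def456 uri=mongodb://svc_user:hunter2@mongo:27017/app"

PATTERNS = [
    (r"(Bearer\s+)([A-Za-z0-9\-_]+)", r"\1***BEARER_TOKEN_REDACTED***"),
    (r"(mongodb(?:\+srv)?://[^:]+:)([^@]+)(@)", r"\1***MONGODB_REDACTED***\3"),
]

line = SAMPLE
for pattern, replacement in PATTERNS:
    line = re.sub(pattern, replacement, line)

print(line)
# auth=Bearer ***BEARER_TOKEN_REDACTED*** uri=mongodb://svc_user:***MONGODB_REDACTED***@mongo:27017/app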
- + Args: config: Metrics configuration meter_name: Name for this meter - + Returns: A new meter instance """ @@ -54,11 +54,9 @@ def _create_meter(self, config: MetricsConfig, meter_name: str) -> Meter: if not settings.ENABLE_TRACING or not config.otlp_endpoint: return NoOpMeterProvider().get_meter(meter_name) - resource = Resource.create({ - "service.name": config.service_name, - "service.version": config.service_version, - "meter.name": meter_name - }) + resource = Resource.create( + {"service.name": config.service_name, "service.version": config.service_version, "meter.name": meter_name} + ) reader = PeriodicExportingMetricReader( exporter=OTLPMetricExporter(endpoint=config.otlp_endpoint), @@ -70,11 +68,11 @@ def _create_meter(self, config: MetricsConfig, meter_name: str) -> Meter: # Return a meter from this provider return meter_provider.get_meter(meter_name) - + def _create_instruments(self) -> None: """Create metric instruments. Override in subclasses.""" pass - + def close(self) -> None: """Close the metrics collector and clean up resources.""" # Subclasses can override if they need cleanup diff --git a/backend/app/core/metrics/connections.py b/backend/app/core/metrics/connections.py index ef818fc4..3ca3c04f 100644 --- a/backend/app/core/metrics/connections.py +++ b/backend/app/core/metrics/connections.py @@ -3,97 +3,70 @@ class ConnectionMetrics(BaseMetrics): """Metrics for SSE connections and event bus.""" - + def _create_instruments(self) -> None: self.sse_active_connections = self._meter.create_up_down_counter( - name="sse.connections.active", - description="Number of active Server-Sent Events connections", - unit="1" + name="sse.connections.active", description="Number of active Server-Sent Events connections", unit="1" ) - + self.sse_messages_sent = self._meter.create_counter( - name="sse.messages.sent.total", - description="Total number of SSE messages sent", - unit="1" + name="sse.messages.sent.total", description="Total number of SSE messages sent", unit="1" ) - + self.sse_connection_duration = self._meter.create_histogram( - name="sse.connection.duration", - description="Duration of SSE connections in seconds", - unit="s" + name="sse.connection.duration", description="Duration of SSE connections in seconds", unit="s" ) - + self.sse_draining_connections = self._meter.create_up_down_counter( name="sse.connections.draining", description="Number of SSE connections being drained during shutdown", - unit="1" + unit="1", ) - + self.sse_shutdown_duration = self._meter.create_histogram( - name="sse.shutdown.duration", - description="Time taken for SSE shutdown phases in seconds", - unit="s" + name="sse.shutdown.duration", description="Time taken for SSE shutdown phases in seconds", unit="s" ) - + # Event bus metrics self.event_bus_subscribers = self._meter.create_up_down_counter( - name="event.bus.subscribers", - description="Number of active event bus subscribers by pattern", - unit="1" + name="event.bus.subscribers", description="Number of active event bus subscribers by pattern", unit="1" ) - + self.event_bus_subscriptions = self._meter.create_up_down_counter( - name="event.bus.subscriptions.total", - description="Total number of event bus subscriptions", - unit="1" + name="event.bus.subscriptions.total", description="Total number of event bus subscriptions", unit="1" ) - + def increment_sse_connections(self, endpoint: str = "default") -> None: self.sse_active_connections.add(1, attributes={"endpoint": endpoint}) - + def decrement_sse_connections(self, endpoint: str = "default") 
-> None: self.sse_active_connections.add(-1, attributes={"endpoint": endpoint}) - + def record_sse_message_sent(self, endpoint: str, event_type: str) -> None: - self.sse_messages_sent.add( - 1, - attributes={"endpoint": endpoint, "event_type": event_type} - ) - + self.sse_messages_sent.add(1, attributes={"endpoint": endpoint, "event_type": event_type}) + def record_sse_connection_duration(self, duration_seconds: float, endpoint: str) -> None: - self.sse_connection_duration.record( - duration_seconds, - attributes={"endpoint": endpoint} - ) - + self.sse_connection_duration.record(duration_seconds, attributes={"endpoint": endpoint}) + def update_sse_draining_connections(self, delta: int) -> None: self.sse_draining_connections.add(delta) - + def record_sse_shutdown_duration(self, duration_seconds: float, phase: str) -> None: - self.sse_shutdown_duration.record( - duration_seconds, - attributes={"phase": phase} - ) - + self.sse_shutdown_duration.record(duration_seconds, attributes={"phase": phase}) + def update_sse_shutdown_duration(self, duration_seconds: float, phase: str) -> None: - self.sse_shutdown_duration.record( - duration_seconds, - attributes={"phase": phase} - ) - + self.sse_shutdown_duration.record(duration_seconds, attributes={"phase": phase}) + def increment_event_bus_subscriptions(self) -> None: self.event_bus_subscriptions.add(1) - + def decrement_event_bus_subscriptions(self, count: int = 1) -> None: self.event_bus_subscriptions.add(-count) - + def update_event_bus_subscribers(self, count: int, pattern: str) -> None: """Update the count of event bus subscribers for a specific pattern.""" # This tracks the current number of subscribers for a pattern # We need to track the delta from the previous value # Since we can't store state in metrics, we record the absolute value # The metric system will handle the up/down nature - self.event_bus_subscribers.add( - count, - attributes={"pattern": pattern} - ) + self.event_bus_subscribers.add(count, attributes={"pattern": pattern}) diff --git a/backend/app/core/metrics/context.py b/backend/app/core/metrics/context.py index d231d472..1f382cbe 100644 --- a/backend/app/core/metrics/context.py +++ b/backend/app/core/metrics/context.py @@ -18,13 +18,13 @@ ) # Type variable for generic metrics -T = TypeVar('T') +T = TypeVar("T") class MetricsContextVar(Generic[T]): """ A wrapper around contextvars.ContextVar for type-safe metrics access. - + This class ensures that each metric type has its own context variable and provides a clean interface for getting and setting metrics. """ @@ -32,24 +32,23 @@ class MetricsContextVar(Generic[T]): def __init__(self, name: str, metric_class: Type[T]) -> None: """ Initialize a metrics context variable. - + Args: name: Name for the context variable (for debugging) metric_class: The class of the metric this context holds """ - self._context_var: contextvars.ContextVar[Optional[T]] = \ - contextvars.ContextVar(f'metrics_{name}', default=None) + self._context_var: contextvars.ContextVar[Optional[T]] = contextvars.ContextVar(f"metrics_{name}", default=None) self._metric_class = metric_class self._name = name def get(self) -> T: """ Get the metric from context, creating it if necessary. - + This method implements lazy initialization - if no metric exists in the current context, it creates one. This is useful for testing and standalone scripts where the context might not be initialized. 
- + Returns: The metric instance for the current context """ @@ -64,10 +63,10 @@ def get(self) -> T: def set(self, metric: T) -> contextvars.Token[Optional[T]]: """ Set the metric in the current context. - + Args: metric: The metric instance to set - + Returns: A token that can be used to reset the context """ @@ -84,24 +83,24 @@ def is_set(self) -> bool: # Create module-level context variables for each metric type # These are singletons that live for the lifetime of the application -_connection_ctx = MetricsContextVar('connection', ConnectionMetrics) -_coordinator_ctx = MetricsContextVar('coordinator', CoordinatorMetrics) -_database_ctx = MetricsContextVar('database', DatabaseMetrics) -_dlq_ctx = MetricsContextVar('dlq', DLQMetrics) -_event_ctx = MetricsContextVar('event', EventMetrics) -_execution_ctx = MetricsContextVar('execution', ExecutionMetrics) -_health_ctx = MetricsContextVar('health', HealthMetrics) -_kubernetes_ctx = MetricsContextVar('kubernetes', KubernetesMetrics) -_notification_ctx = MetricsContextVar('notification', NotificationMetrics) -_rate_limit_ctx = MetricsContextVar('rate_limit', RateLimitMetrics) -_replay_ctx = MetricsContextVar('replay', ReplayMetrics) -_security_ctx = MetricsContextVar('security', SecurityMetrics) +_connection_ctx = MetricsContextVar("connection", ConnectionMetrics) +_coordinator_ctx = MetricsContextVar("coordinator", CoordinatorMetrics) +_database_ctx = MetricsContextVar("database", DatabaseMetrics) +_dlq_ctx = MetricsContextVar("dlq", DLQMetrics) +_event_ctx = MetricsContextVar("event", EventMetrics) +_execution_ctx = MetricsContextVar("execution", ExecutionMetrics) +_health_ctx = MetricsContextVar("health", HealthMetrics) +_kubernetes_ctx = MetricsContextVar("kubernetes", KubernetesMetrics) +_notification_ctx = MetricsContextVar("notification", NotificationMetrics) +_rate_limit_ctx = MetricsContextVar("rate_limit", RateLimitMetrics) +_replay_ctx = MetricsContextVar("replay", ReplayMetrics) +_security_ctx = MetricsContextVar("security", SecurityMetrics) class MetricsContext: """ Central manager for all metrics contexts. - + This class provides a unified interface for managing all metric types in the application. It handles initialization at startup and provides access methods for each metric type. @@ -111,39 +110,39 @@ class MetricsContext: def initialize_all(cls, **metrics: Any) -> None: """ Initialize all metrics contexts at application startup. - + This should be called once during application initialization, typically in the startup sequence after dependency injection has created the metric instances. 
- + Args: **metrics: Keyword arguments mapping metric names to instances e.g., event=EventMetrics(), connection=ConnectionMetrics() """ for name, metric_instance in metrics.items(): - if name == 'connection': + if name == "connection": _connection_ctx.set(metric_instance) - elif name == 'coordinator': + elif name == "coordinator": _coordinator_ctx.set(metric_instance) - elif name == 'database': + elif name == "database": _database_ctx.set(metric_instance) - elif name == 'dlq': + elif name == "dlq": _dlq_ctx.set(metric_instance) - elif name == 'event': + elif name == "event": _event_ctx.set(metric_instance) - elif name == 'execution': + elif name == "execution": _execution_ctx.set(metric_instance) - elif name == 'health': + elif name == "health": _health_ctx.set(metric_instance) - elif name == 'kubernetes': + elif name == "kubernetes": _kubernetes_ctx.set(metric_instance) - elif name == 'notification': + elif name == "notification": _notification_ctx.set(metric_instance) - elif name == 'rate_limit': + elif name == "rate_limit": _rate_limit_ctx.set(metric_instance) - elif name == 'replay': + elif name == "replay": _replay_ctx.set(metric_instance) - elif name == 'security': + elif name == "security": _security_ctx.set(metric_instance) else: logger.warning(f"Unknown metric type: {name}") @@ -154,7 +153,7 @@ def initialize_all(cls, **metrics: Any) -> None: def reset_all(cls) -> None: """ Reset all metrics contexts. - + This is primarily useful for testing to ensure a clean state between test cases. """ @@ -224,6 +223,7 @@ def get_security_metrics(cls) -> SecurityMetrics: # Convenience functions for direct access with proper type annotations # Import types with forward references to avoid circular imports + def get_connection_metrics() -> ConnectionMetrics: return MetricsContext.get_connection_metrics() diff --git a/backend/app/core/metrics/coordinator.py b/backend/app/core/metrics/coordinator.py index 1977b747..9e06ca6a 100644 --- a/backend/app/core/metrics/coordinator.py +++ b/backend/app/core/metrics/coordinator.py @@ -3,254 +3,178 @@ class CoordinatorMetrics(BaseMetrics): """Metrics for coordinator and scheduling operations.""" - + def _create_instruments(self) -> None: # Coordinator processing metrics self.coordinator_processing_time = self._meter.create_histogram( name="coordinator.processing.time", description="Time spent processing execution events in seconds", - unit="s" + unit="s", ) - + self.coordinator_scheduling_duration = self._meter.create_histogram( - name="coordinator.scheduling.duration", - description="Time spent scheduling executions in seconds", - unit="s" + name="coordinator.scheduling.duration", description="Time spent scheduling executions in seconds", unit="s" ) - + self.coordinator_active_executions = self._meter.create_up_down_counter( name="coordinator.executions.active", description="Number of active executions managed by coordinator", - unit="1" + unit="1", ) - + # Queue management metrics self.coordinator_queue_time = self._meter.create_histogram( name="coordinator.queue.wait_time", description="Time spent waiting in coordinator queue by priority", - unit="s" + unit="s", ) - + self.coordinator_queue_operations = self._meter.create_counter( - name="coordinator.queue.operations.total", - description="Total queue operations (add/remove)", - unit="1" + name="coordinator.queue.operations.total", description="Total queue operations (add/remove)", unit="1" ) # Execution-only request queue depth (authoritative, maintained by coordinator) self.execution_request_queue_depth 
= self._meter.create_up_down_counter( name="execution.queue.depth", description="Depth of user execution requests queued (excludes replays and non-request events)", - unit="1" + unit="1", ) - + # Scheduling metrics self.coordinator_executions_scheduled = self._meter.create_counter( - name="coordinator.executions.scheduled.total", - description="Total number of executions scheduled", - unit="1" + name="coordinator.executions.scheduled.total", description="Total number of executions scheduled", unit="1" ) - + # Rate limiting metrics self.coordinator_rate_limited = self._meter.create_counter( - name="coordinator.rate_limited.total", - description="Total number of rate-limited requests", - unit="1" + name="coordinator.rate_limited.total", description="Total number of rate-limited requests", unit="1" ) - + self.coordinator_rate_limit_wait_time = self._meter.create_histogram( - name="coordinator.rate_limit.wait_time", - description="Time clients wait due to rate limiting", - unit="s" + name="coordinator.rate_limit.wait_time", description="Time clients wait due to rate limiting", unit="s" ) - + # Resource management metrics self.coordinator_resource_allocations = self._meter.create_counter( - name="coordinator.resource.allocations.total", - description="Total number of resource allocations", - unit="1" + name="coordinator.resource.allocations.total", description="Total number of resource allocations", unit="1" ) - + self.coordinator_resource_utilization = self._meter.create_up_down_counter( - name="coordinator.resource.utilization", - description="Current resource utilization", - unit="1" + name="coordinator.resource.utilization", description="Current resource utilization", unit="1" ) - + # Scheduling decision metrics self.coordinator_scheduling_decisions = self._meter.create_counter( - name="coordinator.scheduling.decisions.total", - description="Total scheduling decisions made", - unit="1" + name="coordinator.scheduling.decisions.total", description="Total scheduling decisions made", unit="1" ) - + def record_coordinator_processing_time(self, duration_seconds: float) -> None: self.coordinator_processing_time.record(duration_seconds) - + def record_scheduling_duration(self, duration_seconds: float) -> None: self.coordinator_scheduling_duration.record(duration_seconds) - + def update_active_executions_gauge(self, count: int) -> None: """Update the count of active executions (absolute value).""" # Reset to 0 then set to new value (for gauge-like behavior) # This is a workaround since we're using up_down_counter - current_val = getattr(self, '_active_executions_current', 0) + current_val = getattr(self, "_active_executions_current", 0) delta = count - current_val if delta != 0: self.coordinator_active_executions.add(delta) self._active_executions_current = count - + def record_coordinator_queue_time(self, wait_seconds: float, priority: str) -> None: - self.coordinator_queue_time.record( - wait_seconds, - attributes={"priority": priority} - ) - + self.coordinator_queue_time.record(wait_seconds, attributes={"priority": priority}) + def record_coordinator_execution_scheduled(self, status: str) -> None: - self.coordinator_executions_scheduled.add( - 1, - attributes={"status": status} - ) - + self.coordinator_executions_scheduled.add(1, attributes={"status": status}) + def record_coordinator_scheduling_duration(self, duration_seconds: float) -> None: self.coordinator_scheduling_duration.record(duration_seconds) - + def update_coordinator_active_executions(self, count: int) -> None: 
self.update_active_executions_gauge(count) - + def record_queue_wait_time_by_priority(self, wait_seconds: float, priority: str, queue_name: str) -> None: - self.coordinator_queue_time.record( - wait_seconds, - attributes={ - "priority": priority, - "queue": queue_name - } - ) - + self.coordinator_queue_time.record(wait_seconds, attributes={"priority": priority, "queue": queue_name}) + # Removed legacy coordinator.queue.size; use execution.queue.depth instead def update_execution_request_queue_size(self, size: int) -> None: """Update the execution-only request queue depth (absolute value).""" - key = '_exec_request_queue_size' + key = "_exec_request_queue_size" current_val = getattr(self, key, 0) delta = size - current_val if delta != 0: self.execution_request_queue_depth.add(delta) setattr(self, key, size) - + def record_rate_limited(self, limit_type: str, user_id: str) -> None: - self.coordinator_rate_limited.add( - 1, - attributes={ - "limit_type": limit_type, - "user_id": user_id - } - ) - + self.coordinator_rate_limited.add(1, attributes={"limit_type": limit_type, "user_id": user_id}) + def update_rate_limit_wait_time(self, limit_type: str, user_id: str, wait_seconds: float) -> None: self.coordinator_rate_limit_wait_time.record( - wait_seconds, - attributes={ - "limit_type": limit_type, - "user_id": user_id - } + wait_seconds, attributes={"limit_type": limit_type, "user_id": user_id} ) - + def record_resource_allocation(self, resource_type: str, amount: float, execution_id: str) -> None: self.coordinator_resource_allocations.add( - 1, - attributes={ - "resource_type": resource_type, - "execution_id": execution_id - } + 1, attributes={"resource_type": resource_type, "execution_id": execution_id} ) - + # Update gauge for current allocation - key = f'_resource_{resource_type}' + key = f"_resource_{resource_type}" current_val = getattr(self, key, 0.0) new_val = current_val + amount setattr(self, key, new_val) - + def record_resource_release(self, resource_type: str, amount: float, execution_id: str) -> None: self.coordinator_resource_allocations.add( - -1, - attributes={ - "resource_type": resource_type, - "execution_id": execution_id - } + -1, attributes={"resource_type": resource_type, "execution_id": execution_id} ) - + # Update gauge for current allocation - key = f'_resource_{resource_type}' + key = f"_resource_{resource_type}" current_val = getattr(self, key, 0.0) new_val = max(0.0, current_val - amount) setattr(self, key, new_val) - + def update_resource_usage(self, resource_type: str, usage_percent: float) -> None: # Record as a gauge-like metric - key = f'_resource_usage_{resource_type}' + key = f"_resource_usage_{resource_type}" current_val = getattr(self, key, 0.0) delta = usage_percent - current_val if delta != 0: - self.coordinator_resource_utilization.add( - delta, - attributes={"resource_type": resource_type} - ) + self.coordinator_resource_utilization.add(delta, attributes={"resource_type": resource_type}) setattr(self, key, usage_percent) - + def record_scheduling_decision(self, decision: str, reason: str) -> None: - self.coordinator_scheduling_decisions.add( - 1, - attributes={ - "decision": decision, - "reason": reason - } - ) - + self.coordinator_scheduling_decisions.add(1, attributes={"decision": decision, "reason": reason}) + def record_queue_reordering(self, queue_name: str, items_moved: int) -> None: - self.coordinator_queue_operations.add( - 1, - attributes={ - "operation": "reorder", - "queue": queue_name - } - ) - + self.coordinator_queue_operations.add(1, 
attributes={"operation": "reorder", "queue": queue_name}) + # Record the number of items moved as a histogram self.coordinator_queue_time.record( - float(items_moved), - attributes={ - "priority": "reordered", - "queue": queue_name - } + float(items_moved), attributes={"priority": "reordered", "queue": queue_name} ) - + def record_priority_change(self, execution_id: str, old_priority: str, new_priority: str) -> None: self.coordinator_scheduling_decisions.add( - 1, - attributes={ - "decision": "priority_change", - "reason": f"{old_priority}_to_{new_priority}" - } + 1, attributes={"decision": "priority_change", "reason": f"{old_priority}_to_{new_priority}"} ) - + def update_rate_limiter_tokens(self, limit_type: str, tokens: int) -> None: # Track tokens as gauge-like metric - key = f'_rate_limiter_{limit_type}' + key = f"_rate_limiter_{limit_type}" current_val = getattr(self, key, 0) delta = tokens - current_val if delta != 0: - self.coordinator_resource_utilization.add( - delta, - attributes={"resource_type": f"rate_limit_{limit_type}"} - ) + self.coordinator_resource_utilization.add(delta, attributes={"resource_type": f"rate_limit_{limit_type}"}) setattr(self, key, tokens) - + def record_rate_limit_reset(self, limit_type: str, user_id: str) -> None: self.coordinator_scheduling_decisions.add( - 1, - attributes={ - "decision": "rate_limit_reset", - "reason": f"{limit_type}_for_{user_id}" - } + 1, attributes={"decision": "rate_limit_reset", "reason": f"{limit_type}_for_{user_id}"} ) diff --git a/backend/app/core/metrics/database.py b/backend/app/core/metrics/database.py index b892ed67..22ef837f 100644 --- a/backend/app/core/metrics/database.py +++ b/backend/app/core/metrics/database.py @@ -7,188 +7,107 @@ class DatabaseMetrics(BaseMetrics): def _create_instruments(self) -> None: # MongoDB operation metrics self.mongodb_event_operations = self._meter.create_counter( - name="mongodb.event.operations.total", - description="Total MongoDB operations for events", - unit="1" + name="mongodb.event.operations.total", description="Total MongoDB operations for events", unit="1" ) self.mongodb_event_query_duration = self._meter.create_histogram( - name="mongodb.event.query.duration", - description="Duration of MongoDB event queries in seconds", - unit="s" + name="mongodb.event.query.duration", description="Duration of MongoDB event queries in seconds", unit="s" ) # Event store specific metrics self.event_store_operations = self._meter.create_counter( - name="event.store.operations.total", - description="Total event store operations", - unit="1" + name="event.store.operations.total", description="Total event store operations", unit="1" ) self.event_store_failures = self._meter.create_counter( - name="event.store.failures.total", - description="Total event store operation failures", - unit="1" + name="event.store.failures.total", description="Total event store operation failures", unit="1" ) # Idempotency metrics self.idempotency_cache_hits = self._meter.create_counter( - name="idempotency.cache.hits.total", - description="Total idempotency cache hits", - unit="1" + name="idempotency.cache.hits.total", description="Total idempotency cache hits", unit="1" ) self.idempotency_cache_misses = self._meter.create_counter( - name="idempotency.cache.misses.total", - description="Total idempotency cache misses", - unit="1" + name="idempotency.cache.misses.total", description="Total idempotency cache misses", unit="1" ) self.idempotency_duplicates_blocked = self._meter.create_counter( 
name="idempotency.duplicates.blocked.total", description="Total duplicate operations blocked by idempotency", - unit="1" + unit="1", ) self.idempotency_processing_duration = self._meter.create_histogram( - name="idempotency.processing.duration", - description="Duration of idempotency checks in seconds", - unit="s" + name="idempotency.processing.duration", description="Duration of idempotency checks in seconds", unit="s" ) self.idempotency_keys_active = self._meter.create_up_down_counter( - name="idempotency.keys.active", - description="Number of active idempotency keys", - unit="1" + name="idempotency.keys.active", description="Number of active idempotency keys", unit="1" ) # Database connection metrics self.database_connections_active = self._meter.create_up_down_counter( - name="database.connections.active", - description="Number of active database connections", - unit="1" + name="database.connections.active", description="Number of active database connections", unit="1" ) self.database_connection_errors = self._meter.create_counter( - name="database.connection.errors.total", - description="Total database connection errors", - unit="1" + name="database.connection.errors.total", description="Total database connection errors", unit="1" ) def record_mongodb_operation(self, operation: str, status: str) -> None: - self.mongodb_event_operations.add( - 1, - attributes={ - "operation": operation, - "status": status - } - ) + self.mongodb_event_operations.add(1, attributes={"operation": operation, "status": status}) def record_mongodb_query_duration(self, duration_seconds: float, operation: str) -> None: - self.mongodb_event_query_duration.record( - duration_seconds, - attributes={"operation": operation} - ) + self.mongodb_event_query_duration.record(duration_seconds, attributes={"operation": operation}) def record_event_store_duration(self, duration_seconds: float, operation: str, collection: str) -> None: self.mongodb_event_query_duration.record( - duration_seconds, - attributes={ - "operation": f"store_{operation}", - "collection": collection - } + duration_seconds, attributes={"operation": f"store_{operation}", "collection": collection} ) # Also record in event store specific counter - self.event_store_operations.add( - 1, - attributes={ - "operation": operation, - "collection": collection - } - ) + self.event_store_operations.add(1, attributes={"operation": operation, "collection": collection}) def record_event_query_duration(self, duration_seconds: float, operation: str, collection: str) -> None: self.mongodb_event_query_duration.record( - duration_seconds, - attributes={ - "operation": f"query_{operation}", - "collection": collection - } + duration_seconds, attributes={"operation": f"query_{operation}", "collection": collection} ) def record_event_store_failed(self, event_type: str, error_type: str) -> None: - self.event_store_failures.add( - 1, - attributes={ - "event_type": event_type, - "error_type": error_type - } - ) + self.event_store_failures.add(1, attributes={"event_type": event_type, "error_type": error_type}) def record_idempotency_cache_hit(self, event_type: str, operation: str) -> None: - self.idempotency_cache_hits.add( - 1, - attributes={ - "event_type": event_type, - "operation": operation - } - ) + self.idempotency_cache_hits.add(1, attributes={"event_type": event_type, "operation": operation}) def record_idempotency_cache_miss(self, event_type: str, operation: str) -> None: - self.idempotency_cache_misses.add( - 1, - attributes={ - "event_type": event_type, - "operation": 
operation - } - ) + self.idempotency_cache_misses.add(1, attributes={"event_type": event_type, "operation": operation}) def record_idempotency_duplicate_blocked(self, event_type: str) -> None: - self.idempotency_duplicates_blocked.add( - 1, - attributes={"event_type": event_type} - ) + self.idempotency_duplicates_blocked.add(1, attributes={"event_type": event_type}) def record_idempotency_processing_duration(self, duration_seconds: float, operation: str) -> None: - self.idempotency_processing_duration.record( - duration_seconds, - attributes={"operation": operation} - ) + self.idempotency_processing_duration.record(duration_seconds, attributes={"operation": operation}) def update_idempotency_keys_active(self, count: int, prefix: str) -> None: # Track the delta for gauge-like behavior - key = f'_idempotency_keys_{prefix}' + key = f"_idempotency_keys_{prefix}" current_val = getattr(self, key, 0) delta = count - current_val if delta != 0: - self.idempotency_keys_active.add( - delta, - attributes={"key_prefix": prefix} - ) + self.idempotency_keys_active.add(delta, attributes={"key_prefix": prefix}) setattr(self, key, count) def record_idempotent_event_processed(self, event_type: str, result: str) -> None: self.event_store_operations.add( - 1, - attributes={ - "operation": "idempotent_process", - "event_type": event_type, - "result": result - } + 1, attributes={"operation": "idempotent_process", "event_type": event_type, "result": result} ) def record_idempotent_processing_duration(self, duration_seconds: float, event_type: str) -> None: - self.idempotency_processing_duration.record( - duration_seconds, - attributes={"event_type": event_type} - ) + self.idempotency_processing_duration.record(duration_seconds, attributes={"event_type": event_type}) def update_database_connections(self, delta: int) -> None: self.database_connections_active.add(delta) def record_database_connection_error(self, error_type: str) -> None: - self.database_connection_errors.add( - 1, - attributes={"error_type": error_type} - ) + self.database_connection_errors.add(1, attributes={"error_type": error_type}) diff --git a/backend/app/core/metrics/dlq.py b/backend/app/core/metrics/dlq.py index b905ee1d..be8b988e 100644 --- a/backend/app/core/metrics/dlq.py +++ b/backend/app/core/metrics/dlq.py @@ -7,108 +7,68 @@ class DLQMetrics(BaseMetrics): def _create_instruments(self) -> None: # DLQ message metrics self.dlq_messages_received = self._meter.create_counter( - name="dlq.messages.received.total", - description="Total number of messages received in DLQ", - unit="1" + name="dlq.messages.received.total", description="Total number of messages received in DLQ", unit="1" ) self.dlq_messages_retried = self._meter.create_counter( - name="dlq.messages.retried.total", - description="Total number of DLQ messages retried", - unit="1" + name="dlq.messages.retried.total", description="Total number of DLQ messages retried", unit="1" ) self.dlq_messages_discarded = self._meter.create_counter( - name="dlq.messages.discarded.total", - description="Total number of DLQ messages discarded", - unit="1" + name="dlq.messages.discarded.total", description="Total number of DLQ messages discarded", unit="1" ) # DLQ processing metrics self.dlq_processing_duration = self._meter.create_histogram( - name="dlq.processing.duration", - description="Time spent processing DLQ messages in seconds", - unit="s" + name="dlq.processing.duration", description="Time spent processing DLQ messages in seconds", unit="s" ) self.dlq_message_age = 
self._meter.create_histogram( - name="dlq.message.age", - description="Age of messages in DLQ in seconds", - unit="s" + name="dlq.message.age", description="Age of messages in DLQ in seconds", unit="s" ) # DLQ queue metrics self.dlq_queue_size = self._meter.create_up_down_counter( - name="dlq.queue.size", - description="Current size of DLQ by topic", - unit="1" + name="dlq.queue.size", description="Current size of DLQ by topic", unit="1" ) self.dlq_retry_attempts = self._meter.create_histogram( - name="dlq.retry.attempts", - description="Number of retry attempts for DLQ messages", - unit="1" + name="dlq.retry.attempts", description="Number of retry attempts for DLQ messages", unit="1" ) # DLQ error metrics self.dlq_processing_errors = self._meter.create_counter( - name="dlq.processing.errors.total", - description="Total number of DLQ processing errors", - unit="1" + name="dlq.processing.errors.total", description="Total number of DLQ processing errors", unit="1" ) # DLQ throughput metrics self.dlq_throughput_rate = self._meter.create_histogram( - name="dlq.throughput.rate", - description="Messages processed per second from DLQ", - unit="msg/s" + name="dlq.throughput.rate", description="Messages processed per second from DLQ", unit="msg/s" ) def record_dlq_message_received(self, original_topic: str, event_type: str) -> None: - self.dlq_messages_received.add( - 1, - attributes={ - "original_topic": original_topic, - "event_type": event_type - } - ) + self.dlq_messages_received.add(1, attributes={"original_topic": original_topic, "event_type": event_type}) def record_dlq_message_retried(self, original_topic: str, event_type: str, result: str) -> None: self.dlq_messages_retried.add( - 1, - attributes={ - "original_topic": original_topic, - "event_type": event_type, - "result": result - } + 1, attributes={"original_topic": original_topic, "event_type": event_type, "result": result} ) def record_dlq_message_discarded(self, original_topic: str, event_type: str, reason: str) -> None: self.dlq_messages_discarded.add( - 1, - attributes={ - "original_topic": original_topic, - "event_type": event_type, - "reason": reason - } + 1, attributes={"original_topic": original_topic, "event_type": event_type, "reason": reason} ) def record_dlq_processing_duration(self, duration_seconds: float, operation: str) -> None: - self.dlq_processing_duration.record( - duration_seconds, - attributes={"operation": operation} - ) + self.dlq_processing_duration.record(duration_seconds, attributes={"operation": operation}) def update_dlq_queue_size(self, original_topic: str, size: int) -> None: # Track the delta for gauge-like behavior - key = f'_dlq_size_{original_topic}' + key = f"_dlq_size_{original_topic}" current_val = getattr(self, key, 0) delta = size - current_val if delta != 0: - self.dlq_queue_size.add( - delta, - attributes={"original_topic": original_topic} - ) + self.dlq_queue_size.add(delta, attributes={"original_topic": original_topic}) setattr(self, key, size) def record_dlq_message_age(self, age_seconds: float) -> None: @@ -116,37 +76,19 @@ def record_dlq_message_age(self, age_seconds: float) -> None: def record_dlq_retry_attempt(self, original_topic: str, event_type: str, attempt_number: int) -> None: self.dlq_retry_attempts.record( - attempt_number, - attributes={ - "original_topic": original_topic, - "event_type": event_type - } + attempt_number, attributes={"original_topic": original_topic, "event_type": event_type} ) def record_dlq_processing_error(self, original_topic: str, event_type: str, error_type: 
str) -> None: self.dlq_processing_errors.add( - 1, - attributes={ - "original_topic": original_topic, - "event_type": event_type, - "error_type": error_type - } + 1, attributes={"original_topic": original_topic, "event_type": event_type, "error_type": error_type} ) def record_dlq_throughput(self, messages_per_second: float, original_topic: str) -> None: - self.dlq_throughput_rate.record( - messages_per_second, - attributes={"original_topic": original_topic} - ) + self.dlq_throughput_rate.record(messages_per_second, attributes={"original_topic": original_topic}) def increment_dlq_queue_size(self, original_topic: str) -> None: - self.dlq_queue_size.add( - 1, - attributes={"original_topic": original_topic} - ) + self.dlq_queue_size.add(1, attributes={"original_topic": original_topic}) def decrement_dlq_queue_size(self, original_topic: str) -> None: - self.dlq_queue_size.add( - -1, - attributes={"original_topic": original_topic} - ) + self.dlq_queue_size.add(-1, attributes={"original_topic": original_topic}) diff --git a/backend/app/core/metrics/events.py b/backend/app/core/metrics/events.py index 0846028d..f74e94b6 100644 --- a/backend/app/core/metrics/events.py +++ b/backend/app/core/metrics/events.py @@ -1,318 +1,209 @@ - from app.core.metrics.base import BaseMetrics class EventMetrics(BaseMetrics): """Metrics for event processing and Kafka. - + This class tracks metrics related to event processing, event buffers, and Kafka message production/consumption. It's now accessed through the contextvars-based MetricsContext system rather than a singleton. - + Usage: from app.core.metrics.context import get_event_metrics - + metrics = get_event_metrics() metrics.record_event_published("execution.requested") - + The metrics instance is managed by the MetricsContext and is available throughout the application without needing to pass it through layers. 
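A slightly larger consumer-side sketch (illustrative only; the topic, consumer-group, and timing values below are made up):

    metrics = get_event_metrics()
    metrics.record_kafka_message_consumed("execution-events", "execution-worker")
    metrics.record_processing_duration(0.042, "execution-events", "execution.requested", "execution-worker")
    metrics.record_kafka_consumer_lag(7, "execution-events", "execution-worker", partition=3)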
""" - + def _create_instruments(self) -> None: # Core event metrics self.event_published = self._meter.create_counter( - name="events.published.total", - description="Total number of events published", - unit="1" + name="events.published.total", description="Total number of events published", unit="1" ) - + self.event_processing_duration = self._meter.create_histogram( - name="event.processing.duration", - description="Time spent processing events in seconds", - unit="s" + name="event.processing.duration", description="Time spent processing events in seconds", unit="s" ) - + self.event_processing_errors = self._meter.create_counter( - name="event.processing.errors.total", - description="Total number of event processing errors", - unit="1" + name="event.processing.errors.total", description="Total number of event processing errors", unit="1" ) - + # Event bus metrics self.event_bus_queue_size = self._meter.create_up_down_counter( - name="event.bus.queue.size", - description="Size of event bus message queue", - unit="1" + name="event.bus.queue.size", description="Size of event bus message queue", unit="1" ) - + # Pod event metrics self.pod_event_published = self._meter.create_counter( - name="pod.events.published.total", - description="Total number of pod events published", - unit="1" + name="pod.events.published.total", description="Total number of pod events published", unit="1" ) - + # Event replay metrics self.event_replay_operations = self._meter.create_counter( - name="event.replay.operations.total", - description="Total number of event replay operations", - unit="1" + name="event.replay.operations.total", description="Total number of event replay operations", unit="1" ) - - + # Event buffer metrics self.event_buffer_size = self._meter.create_up_down_counter( - name="event.buffer.size", - description="Current number of events in buffer", - unit="1" + name="event.buffer.size", description="Current number of events in buffer", unit="1" ) - + self.event_buffer_dropped = self._meter.create_counter( - name="event.buffer.dropped.total", - description="Total number of events dropped from buffer", - unit="1" + name="event.buffer.dropped.total", description="Total number of events dropped from buffer", unit="1" ) - + self.event_buffer_processed = self._meter.create_counter( - name="event.buffer.processed.total", - description="Total number of events processed from buffer", - unit="1" + name="event.buffer.processed.total", description="Total number of events processed from buffer", unit="1" ) - + self.event_buffer_latency = self._meter.create_histogram( - name="event.buffer.latency", - description="Time between event creation and processing in seconds", - unit="s" + name="event.buffer.latency", description="Time between event creation and processing in seconds", unit="s" ) - + self.event_buffer_backpressure = self._meter.create_up_down_counter( name="event.buffer.backpressure.active", description="Whether backpressure is currently active (1=active, 0=inactive)", - unit="1" + unit="1", ) - + self.event_buffer_memory_usage = self._meter.create_histogram( - name="event.buffer.memory.usage", - description="Memory usage of event buffer in MB", - unit="MB" + name="event.buffer.memory.usage", description="Memory usage of event buffer in MB", unit="MB" ) - + # Kafka-specific metrics self.kafka_messages_produced = self._meter.create_counter( - name="kafka.messages.produced.total", - description="Total number of messages produced to Kafka", - unit="1" + name="kafka.messages.produced.total", 
description="Total number of messages produced to Kafka", unit="1" ) - + self.kafka_messages_consumed = self._meter.create_counter( - name="kafka.messages.consumed.total", - description="Total number of messages consumed from Kafka", - unit="1" + name="kafka.messages.consumed.total", description="Total number of messages consumed from Kafka", unit="1" ) - + self.kafka_consumer_lag = self._meter.create_histogram( - name="kafka.consumer.lag", - description="Consumer lag in number of messages", - unit="1" + name="kafka.consumer.lag", description="Consumer lag in number of messages", unit="1" ) - + self.kafka_production_errors = self._meter.create_counter( - name="kafka.production.errors.total", - description="Total number of Kafka production errors", - unit="1" + name="kafka.production.errors.total", description="Total number of Kafka production errors", unit="1" ) - + self.kafka_consumption_errors = self._meter.create_counter( - name="kafka.consumption.errors.total", - description="Total number of Kafka consumption errors", - unit="1" + name="kafka.consumption.errors.total", description="Total number of Kafka consumption errors", unit="1" ) - + def record_event_published(self, event_type: str, event_category: str | None = None) -> None: """ Record that an event was published. - + Args: event_type: Full event type (e.g., "execution.requested") event_category: Event category (e.g., "execution"). If None, extracted from event_type. """ if event_category is None: # Extract category from event type (e.g., "execution" from "execution.requested") - event_category = event_type.split('.')[0] if '.' in event_type else event_type - - self.event_published.add( - 1, - attributes={ - "event_type": event_type, - "event_category": event_category - } - ) - + event_category = event_type.split(".")[0] if "." 
in event_type else event_type + + self.event_published.add(1, attributes={"event_type": event_type, "event_category": event_category}) + def record_event_processing_duration(self, duration_seconds: float, event_type: str) -> None: - self.event_processing_duration.record( - duration_seconds, - attributes={"event_type": event_type} - ) - + self.event_processing_duration.record(duration_seconds, attributes={"event_type": event_type}) + def record_pod_event_published(self, event_type: str) -> None: - self.pod_event_published.add( - 1, - attributes={"event_type": event_type} - ) - + self.pod_event_published.add(1, attributes={"event_type": event_type}) + def record_event_replay_operation(self, operation: str, status: str) -> None: - self.event_replay_operations.add( - 1, - attributes={ - "operation": operation, - "status": status - } - ) - - + self.event_replay_operations.add(1, attributes={"operation": operation, "status": status}) + def update_event_buffer_size(self, delta: int) -> None: self.event_buffer_size.add(delta) - + def record_event_buffer_dropped(self) -> None: self.event_buffer_dropped.add(1) - + def record_event_buffer_processed(self) -> None: self.event_buffer_processed.add(1) - + def record_event_buffer_latency(self, latency_seconds: float) -> None: self.event_buffer_latency.record(latency_seconds) - + def set_event_buffer_backpressure(self, active: bool) -> None: self.event_buffer_backpressure.add(-1 if not active else 0) self.event_buffer_backpressure.add(1 if active else 0) - + def record_event_buffer_memory_usage(self, memory_mb: float) -> None: self.event_buffer_memory_usage.record(memory_mb) - + def record_event_stored(self, event_type: str, collection: str) -> None: - self.event_published.add( - 1, - attributes={ - "event_type": event_type, - "aggregate_type": collection - } - ) - - def record_events_processing_failed(self, topic: str, event_type: str, consumer_group: str, - error_type: str) -> None: + self.event_published.add(1, attributes={"event_type": event_type, "aggregate_type": collection}) + + def record_events_processing_failed( + self, topic: str, event_type: str, consumer_group: str, error_type: str + ) -> None: self.event_processing_errors.add( 1, attributes={ "topic": topic, "event_type": event_type, "consumer_group": consumer_group, - "error_type": error_type - } + "error_type": error_type, + }, ) - + def record_event_store_duration(self, duration: float, operation: str, collection: str) -> None: """Record event store operation duration.""" - self.event_processing_duration.record( - duration, - attributes={ - "operation": operation, - "collection": collection - } - ) - + self.event_processing_duration.record(duration, attributes={"operation": operation, "collection": collection}) + def record_event_store_failed(self, event_type: str, error_type: str) -> None: """Record event store failure.""" self.event_processing_errors.add( - 1, - attributes={ - "event_type": event_type, - "error_type": error_type, - "operation": "store" - } + 1, attributes={"event_type": event_type, "error_type": error_type, "operation": "store"} ) - + def record_event_query_duration(self, duration: float, query_type: str, collection: str) -> None: """Record event query duration.""" self.event_processing_duration.record( - duration, - attributes={ - "operation": f"query_{query_type}", - "collection": collection - } + duration, attributes={"operation": f"query_{query_type}", "collection": collection} ) - - def record_processing_duration(self, duration_seconds: float, topic: str, 
event_type: str, - consumer_group: str) -> None: + + def record_processing_duration( + self, duration_seconds: float, topic: str, event_type: str, consumer_group: str + ) -> None: self.event_processing_duration.record( - duration_seconds, - attributes={ - "topic": topic, - "event_type": event_type, - "consumer_group": consumer_group - } + duration_seconds, attributes={"topic": topic, "event_type": event_type, "consumer_group": consumer_group} ) - + def record_kafka_message_produced(self, topic: str, partition: int = -1) -> None: self.kafka_messages_produced.add( - 1, - attributes={ - "topic": topic, - "partition": str(partition) if partition >= 0 else "auto" - } + 1, attributes={"topic": topic, "partition": str(partition) if partition >= 0 else "auto"} ) - + def record_kafka_message_consumed(self, topic: str, consumer_group: str) -> None: - self.kafka_messages_consumed.add( - 1, - attributes={ - "topic": topic, - "consumer_group": consumer_group - } - ) - + self.kafka_messages_consumed.add(1, attributes={"topic": topic, "consumer_group": consumer_group}) + def record_kafka_consumer_lag(self, lag: int, topic: str, consumer_group: str, partition: int) -> None: self.kafka_consumer_lag.record( - lag, - attributes={ - "topic": topic, - "consumer_group": consumer_group, - "partition": str(partition) - } + lag, attributes={"topic": topic, "consumer_group": consumer_group, "partition": str(partition)} ) - + def record_kafka_production_error(self, topic: str, error_type: str) -> None: - self.kafka_production_errors.add( - 1, - attributes={ - "topic": topic, - "error_type": error_type - } - ) - + self.kafka_production_errors.add(1, attributes={"topic": topic, "error_type": error_type}) + def record_kafka_consumption_error(self, topic: str, consumer_group: str, error_type: str) -> None: self.kafka_consumption_errors.add( - 1, - attributes={ - "topic": topic, - "consumer_group": consumer_group, - "error_type": error_type - } + 1, attributes={"topic": topic, "consumer_group": consumer_group, "error_type": error_type} ) - + def update_event_bus_queue_size(self, delta: int, queue_name: str = "default") -> None: - self.event_bus_queue_size.add( - delta, - attributes={"queue": queue_name} - ) - + self.event_bus_queue_size.add(delta, attributes={"queue": queue_name}) + def set_event_bus_queue_size(self, size: int, queue_name: str = "default") -> None: - key = f'_event_bus_size_{queue_name}' + key = f"_event_bus_size_{queue_name}" current_val = getattr(self, key, 0) delta = size - current_val if delta != 0: - self.event_bus_queue_size.add( - delta, - attributes={"queue": queue_name} - ) + self.event_bus_queue_size.add(delta, attributes={"queue": queue_name}) setattr(self, key, size) diff --git a/backend/app/core/metrics/execution.py b/backend/app/core/metrics/execution.py index 78265375..a2f3b74a 100644 --- a/backend/app/core/metrics/execution.py +++ b/backend/app/core/metrics/execution.py @@ -7,91 +7,66 @@ class ExecutionMetrics(BaseMetrics): def _create_instruments(self) -> None: self.script_executions = self._meter.create_counter( - name="script.executions.total", - description="Total number of script executions", - unit="1" + name="script.executions.total", description="Total number of script executions", unit="1" ) self.execution_events = self._meter.create_observable_gauge( name="script.execution.events", description="Instantaneous execution events (1 when execution starts, 0 otherwise)", - unit="1" + unit="1", ) self.execution_duration = self._meter.create_histogram( - 
name="script.execution.duration", - description="Time spent executing scripts in seconds", - unit="s" + name="script.execution.duration", description="Time spent executing scripts in seconds", unit="s" ) self.active_executions = self._meter.create_up_down_counter( - name="script.executions.active", - description="Number of currently running script executions", - unit="1" + name="script.executions.active", description="Number of currently running script executions", unit="1" ) self.memory_usage = self._meter.create_histogram( - name="script.memory.usage", - description="Memory usage per script execution in MiB", - unit="MiB" + name="script.memory.usage", description="Memory usage per script execution in MiB", unit="MiB" ) self.cpu_utilization = self._meter.create_histogram( name="script.cpu.utilization", description="CPU utilization in millicores per script execution", - unit="millicores" + unit="millicores", ) self.memory_utilization_percent = self._meter.create_histogram( name="script.memory.utilization.percent", description="Memory utilization as percentage of available memory", - unit="%" + unit="%", ) self.error_counter = self._meter.create_counter( - name="script.errors.total", - description="Total number of script errors by type", - unit="1" + name="script.errors.total", description="Total number of script errors by type", unit="1" ) self.executions_assigned = self._meter.create_counter( - name="executions.assigned.total", - description="Total number of executions assigned to workers", - unit="1" + name="executions.assigned.total", description="Total number of executions assigned to workers", unit="1" ) self.executions_queued = self._meter.create_counter( - name="executions.queued.total", - description="Total number of executions queued", - unit="1" + name="executions.queued.total", description="Total number of executions queued", unit="1" ) self.queue_depth = self._meter.create_up_down_counter( - name="execution.queue.depth", - description="Current number of executions waiting in queue", - unit="1" + name="execution.queue.depth", description="Current number of executions waiting in queue", unit="1" ) self.queue_wait_time = self._meter.create_histogram( name="execution.queue.wait_time", description="Time spent waiting in queue before execution starts in seconds", - unit="s" + unit="s", ) def record_script_execution(self, status: ExecutionStatus, lang_and_version: str) -> None: - self.script_executions.add( - 1, - attributes={ - "status": status, - "lang_and_version": lang_and_version - } - ) + self.script_executions.add(1, attributes={"status": status, "lang_and_version": lang_and_version}) def record_execution_duration(self, duration_seconds: float, lang_and_version: str) -> None: - self.execution_duration.record( - duration_seconds, - attributes={"lang_and_version": lang_and_version} - ) + self.execution_duration.record(duration_seconds, attributes={"lang_and_version": lang_and_version}) def increment_active_executions(self) -> None: self.active_executions.add(1) @@ -100,10 +75,7 @@ def decrement_active_executions(self) -> None: self.active_executions.add(-1) def record_memory_usage(self, memory_mib: float, lang_and_version: str) -> None: - self.memory_usage.record( - memory_mib, - attributes={"lang_and_version": lang_and_version} - ) + self.memory_usage.record(memory_mib, attributes={"lang_and_version": lang_and_version}) def record_error(self, error_type: str) -> None: self.error_counter.add(1, attributes={"error_type": error_type}) @@ -112,10 +84,7 @@ def 
update_queue_depth(self, delta: int) -> None: self.queue_depth.add(delta) def record_queue_wait_time(self, wait_seconds: float, lang_and_version: str) -> None: - self.queue_wait_time.record( - wait_seconds, - attributes={"lang_and_version": lang_and_version} - ) + self.queue_wait_time.record(wait_seconds, attributes={"lang_and_version": lang_and_version}) def record_execution_assigned(self) -> None: self.executions_assigned.add(1) diff --git a/backend/app/core/metrics/health.py b/backend/app/core/metrics/health.py index a20676ed..eb26af27 100644 --- a/backend/app/core/metrics/health.py +++ b/backend/app/core/metrics/health.py @@ -3,176 +3,107 @@ class HealthMetrics(BaseMetrics): """Metrics for health checks.""" - + def _create_instruments(self) -> None: # Core health check metrics - simple histogram to track latest value self.health_check_status = self._meter.create_histogram( - name="health.check.status", - description="Health check status (1=healthy, 0=unhealthy)", - unit="1" + name="health.check.status", description="Health check status (1=healthy, 0=unhealthy)", unit="1" ) - + self.health_check_duration = self._meter.create_histogram( - name="health.check.duration", - description="Time taken to perform health check in seconds", - unit="s" + name="health.check.duration", description="Time taken to perform health check in seconds", unit="s" ) - + self.health_check_failures = self._meter.create_counter( - name="health.check.failures.total", - description="Total number of health check failures", - unit="1" + name="health.check.failures.total", description="Total number of health check failures", unit="1" ) - + # Service health metrics self.service_health_status = self._meter.create_histogram( - name="service.health.status", - description="Service health status by service name", - unit="1" + name="service.health.status", description="Service health status by service name", unit="1" ) - + self.service_health_score = self._meter.create_histogram( - name="service.health.score", - description="Overall health score for a service (0-100)", - unit="%" + name="service.health.score", description="Overall health score for a service (0-100)", unit="%" ) - + # Liveness and readiness specific metrics self.liveness_check_status = self._meter.create_histogram( - name="liveness.check.status", - description="Liveness check status (1=alive, 0=dead)", - unit="1" + name="liveness.check.status", description="Liveness check status (1=alive, 0=dead)", unit="1" ) - + self.readiness_check_status = self._meter.create_histogram( - name="readiness.check.status", - description="Readiness check status (1=ready, 0=not ready)", - unit="1" + name="readiness.check.status", description="Readiness check status (1=ready, 0=not ready)", unit="1" ) - + # Dependency health metrics self.dependency_health_status = self._meter.create_histogram( - name="dependency.health.status", - description="Health status of external dependencies", - unit="1" + name="dependency.health.status", description="Health status of external dependencies", unit="1" ) - + self.dependency_response_time = self._meter.create_histogram( - name="dependency.response.time", - description="Response time for dependency health checks", - unit="s" + name="dependency.response.time", description="Response time for dependency health checks", unit="s" ) - + # Health check execution metrics self.health_checks_executed = self._meter.create_counter( - name="health.checks.executed.total", - description="Total number of health checks executed", - unit="1" + 
name="health.checks.executed.total", description="Total number of health checks executed", unit="1" ) - + self.health_check_timeouts = self._meter.create_counter( - name="health.check.timeouts.total", - description="Total number of health check timeouts", - unit="1" + name="health.check.timeouts.total", description="Total number of health check timeouts", unit="1" ) - + # Component health metrics self.component_health_status = self._meter.create_histogram( - name="component.health.status", - description="Health status of system components", - unit="1" + name="component.health.status", description="Health status of system components", unit="1" ) - + def record_health_check_duration(self, duration_seconds: float, check_type: str, check_name: str) -> None: self.health_check_duration.record( - duration_seconds, - attributes={ - "check_type": check_type, - "check_name": check_name - } + duration_seconds, attributes={"check_type": check_type, "check_name": check_name} ) - + # Also increment execution counter - self.health_checks_executed.add( - 1, - attributes={ - "check_type": check_type, - "check_name": check_name - } - ) - + self.health_checks_executed.add(1, attributes={"check_type": check_type, "check_name": check_name}) + def record_health_check_failure(self, check_type: str, check_name: str, failure_type: str) -> None: self.health_check_failures.add( - 1, - attributes={ - "check_type": check_type, - "check_name": check_name, - "failure_type": failure_type - } + 1, attributes={"check_type": check_type, "check_name": check_name, "failure_type": failure_type} ) - + def update_health_check_status(self, status_value: int, check_type: str, check_name: str) -> None: # Just record the current status value - self.health_check_status.record( - status_value, - attributes={"check_type": check_type, "check_name": check_name} - ) - + self.health_check_status.record(status_value, attributes={"check_type": check_type, "check_name": check_name}) + def record_health_status(self, service_name: str, status: str) -> None: # Map status to numeric value status_value = 1 if status.lower() in ["healthy", "ok", "up"] else 0 # Record the current status - self.service_health_status.record( - status_value, - attributes={"service": service_name} - ) - + self.service_health_status.record(status_value, attributes={"service": service_name}) + def record_service_health_score(self, service_name: str, score: float) -> None: - self.service_health_score.record( - score, - attributes={"service": service_name} - ) - + self.service_health_score.record(score, attributes={"service": service_name}) + def update_liveness_status(self, is_alive: bool, component: str = "default") -> None: status_value = 1 if is_alive else 0 - self.liveness_check_status.record( - status_value, - attributes={"component": component} - ) - + self.liveness_check_status.record(status_value, attributes={"component": component}) + def update_readiness_status(self, is_ready: bool, component: str = "default") -> None: status_value = 1 if is_ready else 0 - self.readiness_check_status.record( - status_value, - attributes={"component": component} - ) - + self.readiness_check_status.record(status_value, attributes={"component": component}) + def record_dependency_health(self, dependency_name: str, is_healthy: bool, response_time: float) -> None: # Update health status status_value = 1 if is_healthy else 0 - self.dependency_health_status.record( - status_value, - attributes={"dependency": dependency_name} - ) - + self.dependency_health_status.record(status_value, 
attributes={"dependency": dependency_name}) + # Record response time - self.dependency_response_time.record( - response_time, - attributes={"dependency": dependency_name} - ) - + self.dependency_response_time.record(response_time, attributes={"dependency": dependency_name}) + def record_health_check_timeout(self, check_type: str, check_name: str) -> None: - self.health_check_timeouts.add( - 1, - attributes={ - "check_type": check_type, - "check_name": check_name - } - ) - + self.health_check_timeouts.add(1, attributes={"check_type": check_type, "check_name": check_name}) + def update_component_health(self, component_name: str, is_healthy: bool) -> None: status_value = 1 if is_healthy else 0 - self.component_health_status.record( - status_value, - attributes={"component": component_name} - ) + self.component_health_status.record(status_value, attributes={"component": component_name}) diff --git a/backend/app/core/metrics/kubernetes.py b/backend/app/core/metrics/kubernetes.py index dd6d59f1..06d45bec 100644 --- a/backend/app/core/metrics/kubernetes.py +++ b/backend/app/core/metrics/kubernetes.py @@ -7,142 +7,100 @@ class KubernetesMetrics(BaseMetrics): def _create_instruments(self) -> None: # Pod creation metrics self.pod_creations = self._meter.create_counter( - name="pod.creations.total", - description="Total number of pod creations", - unit="1" + name="pod.creations.total", description="Total number of pod creations", unit="1" ) self.pod_creation_failures = self._meter.create_counter( - name="pod.creation.failures.total", - description="Total number of pod creation failures by reason", - unit="1" + name="pod.creation.failures.total", description="Total number of pod creation failures by reason", unit="1" ) self.pod_creation_duration = self._meter.create_histogram( - name="pod.creation.duration", - description="Time taken to create pods in seconds", - unit="s" + name="pod.creation.duration", description="Time taken to create pods in seconds", unit="s" ) self.active_pod_creations = self._meter.create_up_down_counter( - name="pod.creations.active", - description="Number of pods currently being created", - unit="1" + name="pod.creations.active", description="Number of pods currently being created", unit="1" ) # Pod lifecycle metrics self.pod_phase_transitions = self._meter.create_counter( - name="pod.phase.transitions.total", - description="Total number of pod phase transitions", - unit="1" + name="pod.phase.transitions.total", description="Total number of pod phase transitions", unit="1" ) self.pod_lifetime = self._meter.create_histogram( - name="pod.lifetime", - description="Total lifetime of pods in seconds", - unit="s" + name="pod.lifetime", description="Total lifetime of pods in seconds", unit="s" ) self.pods_by_phase = self._meter.create_up_down_counter( - name="pods.by.phase", - description="Current number of pods by phase", - unit="1" + name="pods.by.phase", description="Current number of pods by phase", unit="1" ) # ConfigMap and NetworkPolicy metrics self.config_maps_created = self._meter.create_counter( - name="configmaps.created.total", - description="Total number of ConfigMaps created", - unit="1" + name="configmaps.created.total", description="Total number of ConfigMaps created", unit="1" ) self.network_policies_created = self._meter.create_counter( - name="networkpolicies.created.total", - description="Total number of NetworkPolicies created", - unit="1" + name="networkpolicies.created.total", description="Total number of NetworkPolicies created", unit="1" ) # Pod monitor 
metrics self.pod_monitor_events = self._meter.create_counter( - name="pod.monitor.events.total", - description="Total number of pod monitor events", - unit="1" + name="pod.monitor.events.total", description="Total number of pod monitor events", unit="1" ) self.pod_monitor_processing_duration = self._meter.create_histogram( name="pod.monitor.processing.duration", description="Time spent processing pod monitor events in seconds", - unit="s" + unit="s", ) self.pod_monitor_reconciliations = self._meter.create_counter( name="pod.monitor.reconciliations.total", description="Total number of pod monitor reconciliations", - unit="1" + unit="1", ) self.pod_monitor_watch_errors = self._meter.create_counter( - name="pod.monitor.watch.errors.total", - description="Total number of pod monitor watch errors", - unit="1" + name="pod.monitor.watch.errors.total", description="Total number of pod monitor watch errors", unit="1" ) self.pod_monitor_watch_reconnects = self._meter.create_counter( name="pod.monitor.watch.reconnects.total", description="Total number of pod monitor watch reconnects", - unit="1" + unit="1", ) self.pods_monitored = self._meter.create_up_down_counter( - name="pods.monitored", - description="Number of pods currently being monitored", - unit="1" + name="pods.monitored", description="Number of pods currently being monitored", unit="1" ) # Resource metrics self.pod_resource_requests = self._meter.create_histogram( - name="pod.resource.requests", - description="Pod resource requests", - unit="1" + name="pod.resource.requests", description="Pod resource requests", unit="1" ) self.pod_resource_limits = self._meter.create_histogram( - name="pod.resource.limits", - description="Pod resource limits", - unit="1" + name="pod.resource.limits", description="Pod resource limits", unit="1" ) # Node metrics self.pods_per_node = self._meter.create_histogram( - name="pods.per.node", - description="Number of pods per node", - unit="1" + name="pods.per.node", description="Number of pods per node", unit="1" ) def record_pod_creation_failure(self, failure_reason: str) -> None: - self.pod_creation_failures.add( - 1, - attributes={"failure_reason": failure_reason} - ) + self.pod_creation_failures.add(1, attributes={"failure_reason": failure_reason}) def record_pod_created(self, status: str, language: str) -> None: - self.pod_creations.add( - 1, - attributes={ - "status": status, - "language": language - } - ) + self.pod_creations.add(1, attributes={"status": status, "language": language}) def record_pod_creation_duration(self, duration_seconds: float, language: str) -> None: - self.pod_creation_duration.record( - duration_seconds, - attributes={"language": language} - ) + self.pod_creation_duration.record(duration_seconds, attributes={"language": language}) def update_active_pod_creations(self, count: int) -> None: # Track the delta for gauge-like behavior - key = '_active_pod_creations' + key = "_active_pod_creations" current_val = getattr(self, key, 0) delta = count - current_val if delta != 0: @@ -156,10 +114,7 @@ def decrement_active_pod_creations(self) -> None: self.active_pod_creations.add(-1) def record_config_map_created(self, status: str) -> None: - self.config_maps_created.add( - 1, - attributes={"status": status} - ) + self.config_maps_created.add(1, attributes={"status": status}) def record_k8s_pod_created(self, status: str, language: str) -> None: self.record_pod_created(status, language) @@ -171,10 +126,7 @@ def record_k8s_config_map_created(self, status: str) -> None: 
self.record_config_map_created(status) def record_k8s_network_policy_created(self, status: str) -> None: - self.network_policies_created.add( - 1, - attributes={"status": status} - ) + self.network_policies_created.add(1, attributes={"status": status}) def update_k8s_active_creations(self, count: int) -> None: self.update_active_pod_creations(count) @@ -183,35 +135,20 @@ def increment_pod_monitor_watch_reconnects(self) -> None: self.pod_monitor_watch_reconnects.add(1) def record_pod_monitor_event_processing_duration(self, duration_seconds: float, event_type: str) -> None: - self.pod_monitor_processing_duration.record( - duration_seconds, - attributes={"event_type": event_type} - ) + self.pod_monitor_processing_duration.record(duration_seconds, attributes={"event_type": event_type}) def record_pod_monitor_event_published(self, event_type: str, pod_phase: str) -> None: - self.pod_monitor_events.add( - 1, - attributes={ - "event_type": event_type, - "pod_phase": pod_phase - } - ) + self.pod_monitor_events.add(1, attributes={"event_type": event_type, "pod_phase": pod_phase}) def record_pod_monitor_reconciliation_run(self, status: str) -> None: - self.pod_monitor_reconciliations.add( - 1, - attributes={"status": status} - ) + self.pod_monitor_reconciliations.add(1, attributes={"status": status}) def record_pod_monitor_watch_error(self, error_type: str) -> None: - self.pod_monitor_watch_errors.add( - 1, - attributes={"error_type": error_type} - ) + self.pod_monitor_watch_errors.add(1, attributes={"error_type": error_type}) def update_pod_monitor_pods_watched(self, count: int) -> None: # Track the delta for gauge-like behavior - key = '_pods_monitored' + key = "_pods_monitored" current_val = getattr(self, key, 0) delta = count - current_val if delta != 0: @@ -220,55 +157,26 @@ def update_pod_monitor_pods_watched(self, count: int) -> None: def record_pod_phase_transition(self, from_phase: str, to_phase: str, pod_name: str) -> None: self.pod_phase_transitions.add( - 1, - attributes={ - "from_phase": from_phase, - "to_phase": to_phase, - "pod_name": pod_name - } + 1, attributes={"from_phase": from_phase, "to_phase": to_phase, "pod_name": pod_name} ) def record_pod_lifetime(self, lifetime_seconds: float, final_phase: str, language: str) -> None: - self.pod_lifetime.record( - lifetime_seconds, - attributes={ - "final_phase": final_phase, - "language": language - } - ) + self.pod_lifetime.record(lifetime_seconds, attributes={"final_phase": final_phase, "language": language}) def update_pods_by_phase(self, phase: str, count: int) -> None: # Track the delta for gauge-like behavior - key = f'_pods_phase_{phase}' + key = f"_pods_phase_{phase}" current_val = getattr(self, key, 0) delta = count - current_val if delta != 0: - self.pods_by_phase.add( - delta, - attributes={"phase": phase} - ) + self.pods_by_phase.add(delta, attributes={"phase": phase}) setattr(self, key, count) def record_pod_resource_request(self, resource_type: str, value: float, language: str) -> None: - self.pod_resource_requests.record( - value, - attributes={ - "resource_type": resource_type, - "language": language - } - ) + self.pod_resource_requests.record(value, attributes={"resource_type": resource_type, "language": language}) def record_pod_resource_limit(self, resource_type: str, value: float, language: str) -> None: - self.pod_resource_limits.record( - value, - attributes={ - "resource_type": resource_type, - "language": language - } - ) + self.pod_resource_limits.record(value, attributes={"resource_type": resource_type, 
"language": language}) def record_pods_per_node(self, node_name: str, pod_count: int) -> None: - self.pods_per_node.record( - pod_count, - attributes={"node_name": node_name} - ) + self.pods_per_node.record(pod_count, attributes={"node_name": node_name}) diff --git a/backend/app/core/metrics/notifications.py b/backend/app/core/metrics/notifications.py index 9797c270..1610e659 100644 --- a/backend/app/core/metrics/notifications.py +++ b/backend/app/core/metrics/notifications.py @@ -3,427 +3,275 @@ class NotificationMetrics(BaseMetrics): """Metrics for notifications.""" - + def _create_instruments(self) -> None: # Core notification metrics self.notifications_sent = self._meter.create_counter( - name="notifications.sent.total", - description="Total number of notifications sent", - unit="1" + name="notifications.sent.total", description="Total number of notifications sent", unit="1" ) - + self.notifications_failed = self._meter.create_counter( - name="notifications.failed.total", - description="Total number of failed notifications", - unit="1" + name="notifications.failed.total", description="Total number of failed notifications", unit="1" ) - + self.notification_delivery_time = self._meter.create_histogram( - name="notification.delivery.time", - description="Time taken to deliver notifications in seconds", - unit="s" + name="notification.delivery.time", description="Time taken to deliver notifications in seconds", unit="s" ) - + # Channel-specific metrics self.notifications_by_channel = self._meter.create_counter( - name="notifications.by.channel.total", - description="Total notifications sent by channel", - unit="1" + name="notifications.by.channel.total", description="Total notifications sent by channel", unit="1" ) - + self.channel_delivery_time = self._meter.create_histogram( - name="notification.channel.delivery.time", - description="Delivery time by channel in seconds", - unit="s" + name="notification.channel.delivery.time", description="Delivery time by channel in seconds", unit="s" ) - + self.channel_failures = self._meter.create_counter( - name="notification.channel.failures.total", - description="Total failures by channel", - unit="1" + name="notification.channel.failures.total", description="Total failures by channel", unit="1" ) - + # Severity metrics self.notifications_by_severity = self._meter.create_counter( - name="notifications.by.severity.total", - description="Total notifications by severity level", - unit="1" + name="notifications.by.severity.total", description="Total notifications by severity level", unit="1" ) - + # Status tracking self.notification_status_changes = self._meter.create_counter( - name="notification.status.changes.total", - description="Total notification status changes", - unit="1" + name="notification.status.changes.total", description="Total notification status changes", unit="1" ) - + self.notifications_pending = self._meter.create_up_down_counter( - name="notifications.pending", - description="Number of pending notifications", - unit="1" + name="notifications.pending", description="Number of pending notifications", unit="1" ) - + self.notifications_queued = self._meter.create_up_down_counter( - name="notifications.queued", - description="Number of queued notifications", - unit="1" + name="notifications.queued", description="Number of queued notifications", unit="1" ) - + # User engagement metrics self.notifications_read = self._meter.create_counter( - name="notifications.read.total", - description="Total notifications read by users", - unit="1" + 
name="notifications.read.total", description="Total notifications read by users", unit="1" ) - + self.notifications_clicked = self._meter.create_counter( - name="notifications.clicked.total", - description="Total notifications clicked by users", - unit="1" + name="notifications.clicked.total", description="Total notifications clicked by users", unit="1" ) - + self.time_to_read = self._meter.create_histogram( - name="notification.time.to.read", - description="Time between notification sent and read in seconds", - unit="s" + name="notification.time.to.read", description="Time between notification sent and read in seconds", unit="s" ) - + self.unread_count = self._meter.create_up_down_counter( - name="notifications.unread.count", - description="Current unread notifications per user", - unit="1" + name="notifications.unread.count", description="Current unread notifications per user", unit="1" ) - + # Throttling metrics self.notifications_throttled = self._meter.create_counter( - name="notifications.throttled.total", - description="Total notifications throttled", - unit="1" + name="notifications.throttled.total", description="Total notifications throttled", unit="1" ) - + self.throttle_window_hits = self._meter.create_counter( name="notification.throttle.window.hits.total", description="Number of times throttle window was hit", - unit="1" + unit="1", ) - + # Retry metrics self.notification_retries = self._meter.create_counter( - name="notification.retries.total", - description="Total notification retry attempts", - unit="1" + name="notification.retries.total", description="Total notification retry attempts", unit="1" ) - + self.retry_success_rate = self._meter.create_histogram( - name="notification.retry.success.rate", - description="Success rate of retried notifications", - unit="%" + name="notification.retry.success.rate", description="Success rate of retried notifications", unit="%" ) - + # Batch processing metrics self.batch_notifications_processed = self._meter.create_counter( - name="notification.batch.processed.total", - description="Total notifications processed in batches", - unit="1" + name="notification.batch.processed.total", description="Total notifications processed in batches", unit="1" ) - + self.batch_processing_time = self._meter.create_histogram( name="notification.batch.processing.time", description="Time to process notification batch in seconds", - unit="s" + unit="s", ) - + self.batch_size = self._meter.create_histogram( - name="notification.batch.size", - description="Size of notification batches", - unit="1" + name="notification.batch.size", description="Size of notification batches", unit="1" ) - + # Template rendering metrics self.template_render_time = self._meter.create_histogram( name="notification.template.render.time", description="Time to render notification template in seconds", - unit="s" + unit="s", ) - + self.template_render_errors = self._meter.create_counter( - name="notification.template.render.errors.total", - description="Total template rendering errors", - unit="1" + name="notification.template.render.errors.total", description="Total template rendering errors", unit="1" ) - + # Webhook-specific metrics self.webhook_delivery_time = self._meter.create_histogram( name="notification.webhook.delivery.time", description="Time to deliver webhook notifications in seconds", - unit="s" + unit="s", ) - + self.webhook_response_status = self._meter.create_counter( - name="notification.webhook.response.status.total", - description="Webhook response status codes", - 
unit="1" + name="notification.webhook.response.status.total", description="Webhook response status codes", unit="1" ) - + # Slack-specific metrics self.slack_delivery_time = self._meter.create_histogram( name="notification.slack.delivery.time", description="Time to deliver Slack notifications in seconds", - unit="s" + unit="s", ) - + self.slack_api_errors = self._meter.create_counter( - name="notification.slack.api.errors.total", - description="Total Slack API errors", - unit="1" + name="notification.slack.api.errors.total", description="Total Slack API errors", unit="1" ) - + # Subscription metrics self.subscriptions_active = self._meter.create_up_down_counter( name="notification.subscriptions.active", description="Number of active notification subscriptions", - unit="1" + unit="1", ) - + self.subscription_changes = self._meter.create_counter( - name="notification.subscription.changes.total", - description="Total subscription changes", - unit="1" - ) - - def record_notification_sent(self, notification_type: str, channel: str = "in_app", - severity: str = "medium") -> None: - self.notifications_sent.add( - 1, - attributes={"category": notification_type} + name="notification.subscription.changes.total", description="Total subscription changes", unit="1" ) - - self.notifications_by_channel.add( - 1, - attributes={ - "channel": channel, - "category": notification_type - } - ) - - self.notifications_by_severity.add( - 1, - attributes={ - "severity": severity, - "category": notification_type - } - ) - + + def record_notification_sent( + self, notification_type: str, channel: str = "in_app", severity: str = "medium" + ) -> None: + self.notifications_sent.add(1, attributes={"category": notification_type}) + + self.notifications_by_channel.add(1, attributes={"channel": channel, "category": notification_type}) + + self.notifications_by_severity.add(1, attributes={"severity": severity, "category": notification_type}) + def record_notification_failed(self, notification_type: str, error: str, channel: str = "in_app") -> None: - self.notifications_failed.add( - 1, - attributes={ - "category": notification_type, - "error": error - } - ) - - self.channel_failures.add( - 1, - attributes={ - "channel": channel, - "error": error - } - ) - - def record_notification_delivery_time(self, duration_seconds: float, notification_type: str, - channel: str = "in_app") -> None: - self.notification_delivery_time.record( - duration_seconds, - attributes={"category": notification_type} - ) - + self.notifications_failed.add(1, attributes={"category": notification_type, "error": error}) + + self.channel_failures.add(1, attributes={"channel": channel, "error": error}) + + def record_notification_delivery_time( + self, duration_seconds: float, notification_type: str, channel: str = "in_app" + ) -> None: + self.notification_delivery_time.record(duration_seconds, attributes={"category": notification_type}) + self.channel_delivery_time.record( - duration_seconds, - attributes={ - "channel": channel, - "category": notification_type - } + duration_seconds, attributes={"channel": channel, "category": notification_type} ) - + def record_notification_status_change(self, notification_id: str, from_status: str, to_status: str) -> None: - self.notification_status_changes.add( - 1, - attributes={ - "from_status": from_status, - "to_status": to_status - } - ) - + self.notification_status_changes.add(1, attributes={"from_status": from_status, "to_status": to_status}) + # Update pending/queued counters if from_status == "pending": 
self.notifications_pending.add(-1) if to_status == "pending": self.notifications_pending.add(1) - + if from_status == "queued": self.notifications_queued.add(-1) if to_status == "queued": self.notifications_queued.add(1) - + def record_notification_read(self, notification_type: str, time_to_read_seconds: float) -> None: - self.notifications_read.add( - 1, - attributes={"category": notification_type} - ) - - self.time_to_read.record( - time_to_read_seconds, - attributes={"category": notification_type} - ) - + self.notifications_read.add(1, attributes={"category": notification_type}) + + self.time_to_read.record(time_to_read_seconds, attributes={"category": notification_type}) + def record_notification_clicked(self, notification_type: str) -> None: - self.notifications_clicked.add( - 1, - attributes={"category": notification_type} - ) - + self.notifications_clicked.add(1, attributes={"category": notification_type}) + def update_unread_count(self, user_id: str, count: int) -> None: # Track the delta for gauge-like behavior - key = f'_unread_{user_id}' + key = f"_unread_{user_id}" current_val = getattr(self, key, 0) delta = count - current_val if delta != 0: - self.unread_count.add( - delta, - attributes={"user_id": user_id} - ) + self.unread_count.add(delta, attributes={"user_id": user_id}) setattr(self, key, count) - + def record_notification_throttled(self, notification_type: str, user_id: str) -> None: - self.notifications_throttled.add( - 1, - attributes={ - "category": notification_type, - "user_id": user_id - } - ) - + self.notifications_throttled.add(1, attributes={"category": notification_type, "user_id": user_id}) + def record_throttle_window_hit(self, user_id: str) -> None: - self.throttle_window_hits.add( - 1, - attributes={"user_id": user_id} - ) - + self.throttle_window_hits.add(1, attributes={"user_id": user_id}) + def record_notification_retry(self, notification_type: str, attempt_number: int, success: bool) -> None: self.notification_retries.add( - 1, - attributes={ - "category": notification_type, - "attempt": str(attempt_number), - "success": str(success) - } + 1, attributes={"category": notification_type, "attempt": str(attempt_number), "success": str(success)} ) - + if attempt_number > 1: # Only record retry success rate for actual retries - self.retry_success_rate.record( - 100.0 if success else 0.0, - attributes={"category": notification_type} - ) - - def record_batch_processed(self, batch_size_count: int, processing_time_seconds: float, - notification_type: str = "mixed") -> None: - self.batch_notifications_processed.add( - batch_size_count, - attributes={"category": notification_type} - ) - - self.batch_processing_time.record( - processing_time_seconds, - attributes={"category": notification_type} - ) - - self.batch_size.record( - batch_size_count, - attributes={"category": notification_type} - ) - + self.retry_success_rate.record(100.0 if success else 0.0, attributes={"category": notification_type}) + + def record_batch_processed( + self, batch_size_count: int, processing_time_seconds: float, notification_type: str = "mixed" + ) -> None: + self.batch_notifications_processed.add(batch_size_count, attributes={"category": notification_type}) + + self.batch_processing_time.record(processing_time_seconds, attributes={"category": notification_type}) + + self.batch_size.record(batch_size_count, attributes={"category": notification_type}) + def record_template_render(self, duration_seconds: float, template_name: str, success: bool) -> None: self.template_render_time.record( - 
duration_seconds, - attributes={ - "template": template_name, - "success": str(success) - } + duration_seconds, attributes={"template": template_name, "success": str(success)} ) - + if not success: - self.template_render_errors.add( - 1, - attributes={"template": template_name} - ) - + self.template_render_errors.add(1, attributes={"template": template_name}) + def record_webhook_delivery(self, duration_seconds: float, status_code: int, url_pattern: str) -> None: self.webhook_delivery_time.record( - duration_seconds, - attributes={ - "status_code": str(status_code), - "url_pattern": url_pattern - } - ) - - self.webhook_response_status.add( - 1, - attributes={ - "status_code": str(status_code), - "url_pattern": url_pattern - } - ) - - def record_slack_delivery(self, duration_seconds: float, channel: str, success: bool, - error_type: str | None = None) -> None: - self.slack_delivery_time.record( - duration_seconds, - attributes={ - "channel": channel, - "success": str(success) - } + duration_seconds, attributes={"status_code": str(status_code), "url_pattern": url_pattern} ) - + + self.webhook_response_status.add(1, attributes={"status_code": str(status_code), "url_pattern": url_pattern}) + + def record_slack_delivery( + self, duration_seconds: float, channel: str, success: bool, error_type: str | None = None + ) -> None: + self.slack_delivery_time.record(duration_seconds, attributes={"channel": channel, "success": str(success)}) + if not success and error_type: - self.slack_api_errors.add( - 1, - attributes={ - "error_type": error_type, - "channel": channel - } - ) - + self.slack_api_errors.add(1, attributes={"error_type": error_type, "channel": channel}) + def update_active_subscriptions(self, user_id: str, count: int) -> None: # Track the delta for gauge-like behavior - key = f'_subscriptions_{user_id}' + key = f"_subscriptions_{user_id}" current_val = getattr(self, key, 0) delta = count - current_val if delta != 0: - self.subscriptions_active.add( - delta, - attributes={"user_id": user_id} - ) + self.subscriptions_active.add(delta, attributes={"user_id": user_id}) setattr(self, key, count) - + def record_subscription_change(self, user_id: str, notification_type: str, action: str) -> None: self.subscription_changes.add( 1, attributes={ "user_id": user_id, "category": notification_type, - "action": action # "subscribe" or "unsubscribe" - } + "action": action, # "subscribe" or "unsubscribe" + }, ) - + def increment_pending_notifications(self) -> None: self.notifications_pending.add(1) - + def decrement_pending_notifications(self) -> None: self.notifications_pending.add(-1) - + def increment_queued_notifications(self) -> None: self.notifications_queued.add(1) - + def decrement_queued_notifications(self) -> None: self.notifications_queued.add(-1) diff --git a/backend/app/core/metrics/rate_limit.py b/backend/app/core/metrics/rate_limit.py index 89665023..a904ce79 100644 --- a/backend/app/core/metrics/rate_limit.py +++ b/backend/app/core/metrics/rate_limit.py @@ -26,7 +26,7 @@ def _create_instruments(self) -> None: description="Number of bypassed rate limit checks", unit="1", ) - + # Performance metrics self.check_duration = self._meter.create_histogram( name="rate_limit.check.duration", @@ -43,7 +43,7 @@ def _create_instruments(self) -> None: description="Time to execute rate limit algorithm", unit="ms", ) - + # Usage metrics self.remaining = self._meter.create_histogram( name="rate_limit.remaining", @@ -60,7 +60,7 @@ def _create_instruments(self) -> None: description="Size of rate limit 
window", unit="s", ) - + # Configuration metrics - using histograms to record absolute values # We record the current value, and Grafana queries the latest self.active_rules = self._meter.create_histogram( @@ -78,7 +78,7 @@ def _create_instruments(self) -> None: description="Number of users with rate limit bypass", unit="1", ) - + # Token bucket specific metrics self.token_bucket_tokens = self._meter.create_histogram( name="rate_limit.token_bucket.tokens", @@ -90,7 +90,7 @@ def _create_instruments(self) -> None: description="Token bucket refill rate", unit="tokens/s", ) - + # Error metrics self.redis_errors = self._meter.create_counter( name="rate_limit.redis.errors.total", @@ -102,6 +102,6 @@ def _create_instruments(self) -> None: description="Number of configuration load errors", unit="1", ) - + # Authenticated vs anonymous checks can be derived from labels on requests_total # No separate ip/user counters to avoid duplication and complexity. diff --git a/backend/app/core/metrics/replay.py b/backend/app/core/metrics/replay.py index 00ccfaca..fc5beae9 100644 --- a/backend/app/core/metrics/replay.py +++ b/backend/app/core/metrics/replay.py @@ -3,290 +3,178 @@ class ReplayMetrics(BaseMetrics): """Metrics for event replay operations.""" - + def _create_instruments(self) -> None: # Core replay metrics self.replay_sessions_created = self._meter.create_counter( - name="replay.sessions.created.total", - description="Total number of replay sessions created", - unit="1" + name="replay.sessions.created.total", description="Total number of replay sessions created", unit="1" ) - + self.replay_sessions_active = self._meter.create_up_down_counter( - name="replay.sessions.active", - description="Number of currently active replay sessions", - unit="1" + name="replay.sessions.active", description="Number of currently active replay sessions", unit="1" ) - + self.replay_events_processed = self._meter.create_counter( - name="replay.events.processed.total", - description="Total number of events replayed", - unit="1" + name="replay.events.processed.total", description="Total number of events replayed", unit="1" ) - + self.replay_events_failed = self._meter.create_counter( - name="replay.events.failed.total", - description="Total number of failed replay events", - unit="1" + name="replay.events.failed.total", description="Total number of failed replay events", unit="1" ) - + self.replay_events_skipped = self._meter.create_counter( - name="replay.events.skipped.total", - description="Total number of skipped replay events", - unit="1" + name="replay.events.skipped.total", description="Total number of skipped replay events", unit="1" ) - + # Performance metrics self.replay_duration = self._meter.create_histogram( - name="replay.duration", - description="Duration of replay sessions in seconds", - unit="s" + name="replay.duration", description="Duration of replay sessions in seconds", unit="s" ) - + self.replay_event_processing_time = self._meter.create_histogram( name="replay.event.processing.time", description="Time to process individual replay events in seconds", - unit="s" + unit="s", ) - + self.replay_throughput = self._meter.create_histogram( - name="replay.throughput", - description="Events replayed per second", - unit="event/s" + name="replay.throughput", description="Events replayed per second", unit="event/s" ) - + self.replay_batch_size = self._meter.create_histogram( - name="replay.batch.size", - description="Size of replay batches", - unit="1" + name="replay.batch.size", description="Size of replay 
batches", unit="1" ) - + # Status tracking self.replay_status_changes = self._meter.create_counter( - name="replay.status.changes.total", - description="Total replay session status changes", - unit="1" + name="replay.status.changes.total", description="Total replay session status changes", unit="1" ) - + self.replay_sessions_by_status = self._meter.create_up_down_counter( - name="replay.sessions.by.status", - description="Number of replay sessions by status", - unit="1" + name="replay.sessions.by.status", description="Number of replay sessions by status", unit="1" ) - + # Target metrics self.replay_by_target = self._meter.create_counter( - name="replay.by.target.total", - description="Total replays by target type", - unit="1" + name="replay.by.target.total", description="Total replays by target type", unit="1" ) - + self.replay_target_errors = self._meter.create_counter( - name="replay.target.errors.total", - description="Errors by replay target", - unit="1" + name="replay.target.errors.total", description="Errors by replay target", unit="1" ) - + # Speed control metrics self.replay_speed_multiplier = self._meter.create_histogram( - name="replay.speed.multiplier", - description="Speed multiplier used for replay sessions", - unit="x" + name="replay.speed.multiplier", description="Speed multiplier used for replay sessions", unit="x" ) - + self.replay_delay_applied = self._meter.create_histogram( - name="replay.delay.applied", - description="Delay applied between replay events in seconds", - unit="s" + name="replay.delay.applied", description="Delay applied between replay events in seconds", unit="s" ) - + # Filter metrics self.replay_events_filtered = self._meter.create_counter( - name="replay.events.filtered.total", - description="Total events filtered during replay", - unit="1" + name="replay.events.filtered.total", description="Total events filtered during replay", unit="1" ) - + self.replay_filter_effectiveness = self._meter.create_histogram( - name="replay.filter.effectiveness", - description="Percentage of events passing filters", - unit="%" + name="replay.filter.effectiveness", description="Percentage of events passing filters", unit="%" ) - + # Memory and resource metrics self.replay_memory_usage = self._meter.create_histogram( - name="replay.memory.usage", - description="Memory usage during replay in MB", - unit="MB" + name="replay.memory.usage", description="Memory usage during replay in MB", unit="MB" ) - + self.replay_queue_size = self._meter.create_up_down_counter( - name="replay.queue.size", - description="Size of replay event queue", - unit="1" + name="replay.queue.size", description="Size of replay event queue", unit="1" ) - + def record_session_created(self, replay_type: str, target: str) -> None: - self.replay_sessions_created.add( - 1, - attributes={ - "replay_type": replay_type, - "target": target - } - ) - + self.replay_sessions_created.add(1, attributes={"replay_type": replay_type, "target": target}) + def update_active_replays(self, count: int) -> None: # Track the delta for gauge-like behavior - key = '_active_replays' + key = "_active_replays" current_val = getattr(self, key, 0) delta = count - current_val if delta != 0: self.replay_sessions_active.add(delta) setattr(self, key, count) - + def increment_active_replays(self) -> None: self.replay_sessions_active.add(1) - + def decrement_active_replays(self) -> None: self.replay_sessions_active.add(-1) - + def record_events_replayed(self, replay_type: str, event_type: str, status: str, count: int = 1) -> None: if status 
== "success": - self.replay_events_processed.add( - count, - attributes={ - "replay_type": replay_type, - "event_type": event_type - } - ) + self.replay_events_processed.add(count, attributes={"replay_type": replay_type, "event_type": event_type}) elif status == "failed": - self.replay_events_failed.add( - count, - attributes={ - "replay_type": replay_type, - "event_type": event_type - } - ) + self.replay_events_failed.add(count, attributes={"replay_type": replay_type, "event_type": event_type}) elif status == "skipped": - self.replay_events_skipped.add( - count, - attributes={ - "replay_type": replay_type, - "event_type": event_type - } - ) - + self.replay_events_skipped.add(count, attributes={"replay_type": replay_type, "event_type": event_type}) + def record_event_replayed(self, replay_type: str, event_type: str, status: str) -> None: self.record_events_replayed(replay_type, event_type, status, 1) - - def record_replay_duration(self, duration_seconds: float, replay_type: str, - total_events: int = 0) -> None: - self.replay_duration.record( - duration_seconds, - attributes={"replay_type": replay_type} - ) - + + def record_replay_duration(self, duration_seconds: float, replay_type: str, total_events: int = 0) -> None: + self.replay_duration.record(duration_seconds, attributes={"replay_type": replay_type}) + # Calculate and record throughput if events were processed if total_events > 0 and duration_seconds > 0: throughput = total_events / duration_seconds - self.replay_throughput.record( - throughput, - attributes={"replay_type": replay_type} - ) - + self.replay_throughput.record(throughput, attributes={"replay_type": replay_type}) + def record_event_processing_time(self, duration_seconds: float, event_type: str) -> None: - self.replay_event_processing_time.record( - duration_seconds, - attributes={"event_type": event_type} - ) - + self.replay_event_processing_time.record(duration_seconds, attributes={"event_type": event_type}) + def record_replay_error(self, error_type: str, replay_type: str = "unknown") -> None: - self.replay_events_failed.add( - 1, - attributes={ - "error_type": error_type, - "replay_type": replay_type - } - ) - + self.replay_events_failed.add(1, attributes={"error_type": error_type, "replay_type": replay_type}) + def record_status_change(self, session_id: str, from_status: str, to_status: str) -> None: - self.replay_status_changes.add( - 1, - attributes={ - "from_status": from_status, - "to_status": to_status - } - ) - + self.replay_status_changes.add(1, attributes={"from_status": from_status, "to_status": to_status}) + # Update sessions by status self.update_sessions_by_status(from_status, -1) self.update_sessions_by_status(to_status, 1) - + def update_sessions_by_status(self, status: str, delta: int) -> None: if delta != 0: - self.replay_sessions_by_status.add( - delta, - attributes={"status": status} - ) - + self.replay_sessions_by_status.add(delta, attributes={"status": status}) + def record_replay_by_target(self, target: str, success: bool) -> None: - self.replay_by_target.add( - 1, - attributes={ - "target": target, - "success": str(success) - } - ) - + self.replay_by_target.add(1, attributes={"target": target, "success": str(success)}) + if not success: - self.replay_target_errors.add( - 1, - attributes={"target": target} - ) - + self.replay_target_errors.add(1, attributes={"target": target}) + def record_speed_multiplier(self, multiplier: float, replay_type: str) -> None: - self.replay_speed_multiplier.record( - multiplier, - attributes={"replay_type": 
replay_type} - ) - + self.replay_speed_multiplier.record(multiplier, attributes={"replay_type": replay_type}) + def record_delay_applied(self, delay_seconds: float) -> None: self.replay_delay_applied.record(delay_seconds) - + def record_batch_size(self, size: int, replay_type: str) -> None: - self.replay_batch_size.record( - size, - attributes={"replay_type": replay_type} - ) - + self.replay_batch_size.record(size, attributes={"replay_type": replay_type}) + def record_events_filtered(self, filter_type: str, count: int) -> None: - self.replay_events_filtered.add( - count, - attributes={"filter_type": filter_type} - ) - + self.replay_events_filtered.add(count, attributes={"filter_type": filter_type}) + def record_filter_effectiveness(self, passed: int, total: int, filter_type: str) -> None: if total > 0: effectiveness = (passed / total) * 100 - self.replay_filter_effectiveness.record( - effectiveness, - attributes={"filter_type": filter_type} - ) - + self.replay_filter_effectiveness.record(effectiveness, attributes={"filter_type": filter_type}) + def record_replay_memory_usage(self, memory_mb: float, session_id: str) -> None: - self.replay_memory_usage.record( - memory_mb, - attributes={"session_id": session_id} - ) - + self.replay_memory_usage.record(memory_mb, attributes={"session_id": session_id}) + def update_replay_queue_size(self, session_id: str, size: int) -> None: # Track the delta for gauge-like behavior - key = f'_queue_{session_id}' + key = f"_queue_{session_id}" current_val = getattr(self, key, 0) delta = size - current_val if delta != 0: - self.replay_queue_size.add( - delta, - attributes={"session_id": session_id} - ) + self.replay_queue_size.add(delta, attributes={"session_id": session_id}) setattr(self, key, size) diff --git a/backend/app/core/metrics/security.py b/backend/app/core/metrics/security.py index 31b7e252..2442aa1b 100644 --- a/backend/app/core/metrics/security.py +++ b/backend/app/core/metrics/security.py @@ -3,505 +3,320 @@ class SecurityMetrics(BaseMetrics): """Metrics for security events.""" - + def _create_instruments(self) -> None: # Core security event metrics self.security_events = self._meter.create_counter( - name="security.events.total", - description="Total number of security events by type", - unit="1" + name="security.events.total", description="Total number of security events by type", unit="1" ) - + self.security_violations = self._meter.create_counter( - name="security.violations.total", - description="Total number of security violations", - unit="1" + name="security.violations.total", description="Total number of security violations", unit="1" ) - + self.security_alerts = self._meter.create_counter( - name="security.alerts.total", - description="Total number of security alerts raised", - unit="1" + name="security.alerts.total", description="Total number of security alerts raised", unit="1" ) - + # Authentication metrics self.authentication_attempts = self._meter.create_counter( - name="authentication.attempts.total", - description="Total number of authentication attempts", - unit="1" + name="authentication.attempts.total", description="Total number of authentication attempts", unit="1" ) - + self.authentication_failures = self._meter.create_counter( - name="authentication.failures.total", - description="Total number of failed authentications", - unit="1" + name="authentication.failures.total", description="Total number of failed authentications", unit="1" ) - + self.authentication_duration = self._meter.create_histogram( - 
name="authentication.duration", - description="Time taken for authentication in seconds", - unit="s" + name="authentication.duration", description="Time taken for authentication in seconds", unit="s" ) - + self.active_sessions = self._meter.create_up_down_counter( - name="authentication.sessions.active", - description="Number of active user sessions", - unit="1" + name="authentication.sessions.active", description="Number of active user sessions", unit="1" ) - + # Token metrics self.tokens_generated = self._meter.create_counter( - name="tokens.generated.total", - description="Total number of tokens generated", - unit="1" + name="tokens.generated.total", description="Total number of tokens generated", unit="1" ) - + self.tokens_refreshed = self._meter.create_counter( - name="tokens.refreshed.total", - description="Total number of tokens refreshed", - unit="1" + name="tokens.refreshed.total", description="Total number of tokens refreshed", unit="1" ) - + self.tokens_revoked = self._meter.create_counter( - name="tokens.revoked.total", - description="Total number of tokens revoked", - unit="1" + name="tokens.revoked.total", description="Total number of tokens revoked", unit="1" ) - + self.token_validation_failures = self._meter.create_counter( - name="token.validation.failures.total", - description="Total number of token validation failures", - unit="1" + name="token.validation.failures.total", description="Total number of token validation failures", unit="1" ) - + self.token_expiry_time = self._meter.create_histogram( - name="token.expiry.time", - description="Token expiry time in seconds", - unit="s" + name="token.expiry.time", description="Token expiry time in seconds", unit="s" ) - + # Authorization metrics self.authorization_checks = self._meter.create_counter( - name="authorization.checks.total", - description="Total number of authorization checks", - unit="1" + name="authorization.checks.total", description="Total number of authorization checks", unit="1" ) - + self.authorization_denials = self._meter.create_counter( - name="authorization.denials.total", - description="Total number of authorization denials", - unit="1" + name="authorization.denials.total", description="Total number of authorization denials", unit="1" ) - + self.permission_checks = self._meter.create_counter( - name="permission.checks.total", - description="Total number of permission checks", - unit="1" + name="permission.checks.total", description="Total number of permission checks", unit="1" ) - + # CSRF protection metrics self.csrf_tokens_generated = self._meter.create_counter( - name="csrf.tokens.generated.total", - description="Total number of CSRF tokens generated", - unit="1" + name="csrf.tokens.generated.total", description="Total number of CSRF tokens generated", unit="1" ) - + self.csrf_validation_failures = self._meter.create_counter( - name="csrf.validation.failures.total", - description="Total number of CSRF validation failures", - unit="1" + name="csrf.validation.failures.total", description="Total number of CSRF validation failures", unit="1" ) - + # Network security metrics self.network_policy_violations = self._meter.create_counter( - name="network.policy.violations.total", - description="Total number of network policy violations", - unit="1" + name="network.policy.violations.total", description="Total number of network policy violations", unit="1" ) - + self.network_policy_created = self._meter.create_counter( - name="network.policies.created.total", - description="Total number of network policies 
created", - unit="1" + name="network.policies.created.total", description="Total number of network policies created", unit="1" ) - + # Privilege escalation metrics self.privilege_escalation_attempts = self._meter.create_counter( name="privilege.escalation.attempts.total", description="Total number of privilege escalation attempts", - unit="1" + unit="1", ) - + self.privilege_escalation_blocked = self._meter.create_counter( name="privilege.escalation.blocked.total", description="Total number of blocked privilege escalation attempts", - unit="1" + unit="1", ) - + # Rate limiting metrics self.rate_limit_hits = self._meter.create_counter( - name="rate.limit.hits.total", - description="Total number of rate limit hits", - unit="1" + name="rate.limit.hits.total", description="Total number of rate limit hits", unit="1" ) - + self.rate_limit_violations = self._meter.create_counter( - name="rate.limit.violations.total", - description="Total number of rate limit violations", - unit="1" + name="rate.limit.violations.total", description="Total number of rate limit violations", unit="1" ) - + # API key metrics self.api_keys_created = self._meter.create_counter( - name="api.keys.created.total", - description="Total number of API keys created", - unit="1" + name="api.keys.created.total", description="Total number of API keys created", unit="1" ) - + self.api_keys_revoked = self._meter.create_counter( - name="api.keys.revoked.total", - description="Total number of API keys revoked", - unit="1" + name="api.keys.revoked.total", description="Total number of API keys revoked", unit="1" ) - + self.api_key_usage = self._meter.create_counter( - name="api.key.usage.total", - description="Total API key usage", - unit="1" + name="api.key.usage.total", description="Total API key usage", unit="1" ) - + # Audit log metrics self.audit_events_logged = self._meter.create_counter( - name="audit.events.logged.total", - description="Total number of audit events logged", - unit="1" + name="audit.events.logged.total", description="Total number of audit events logged", unit="1" ) - + # Password metrics self.password_changes = self._meter.create_counter( - name="password.changes.total", - description="Total number of password changes", - unit="1" + name="password.changes.total", description="Total number of password changes", unit="1" ) - + self.password_reset_requests = self._meter.create_counter( - name="password.reset.requests.total", - description="Total number of password reset requests", - unit="1" + name="password.reset.requests.total", description="Total number of password reset requests", unit="1" ) - + self.weak_password_attempts = self._meter.create_counter( - name="weak.password.attempts.total", - description="Total number of weak password attempts", - unit="1" + name="weak.password.attempts.total", description="Total number of weak password attempts", unit="1" ) - + # Brute force detection self.brute_force_attempts = self._meter.create_counter( - name="brute.force.attempts.total", - description="Total number of detected brute force attempts", - unit="1" + name="brute.force.attempts.total", description="Total number of detected brute force attempts", unit="1" ) - + self.accounts_locked = self._meter.create_counter( - name="accounts.locked.total", - description="Total number of accounts locked due to security", - unit="1" + name="accounts.locked.total", description="Total number of accounts locked due to security", unit="1" ) - + def record_security_event(self, event_type: str, severity: str = "info", source: str = 
"system") -> None: - self.security_events.add( - 1, - attributes={ - "event_type": event_type, - "severity": severity, - "source": source - } - ) - + self.security_events.add(1, attributes={"event_type": event_type, "severity": severity, "source": source}) + if severity in ["critical", "high"]: - self.security_alerts.add( - 1, - attributes={ - "event_type": event_type, - "severity": severity - } - ) - - def record_security_violation(self, - violation_type: str, - user_id: str | None = None, - ip_address: str | None = None) -> None: + self.security_alerts.add(1, attributes={"event_type": event_type, "severity": severity}) + + def record_security_violation( + self, violation_type: str, user_id: str | None = None, ip_address: str | None = None + ) -> None: self.security_violations.add( 1, attributes={ "violation_type": violation_type, "user_id": user_id or "anonymous", - "ip_address": ip_address or "unknown" - } + "ip_address": ip_address or "unknown", + }, ) - - def record_authentication_attempt(self, method: str, success: bool, user_id: str | None = None, - duration_seconds: float | None = None) -> None: + + def record_authentication_attempt( + self, method: str, success: bool, user_id: str | None = None, duration_seconds: float | None = None + ) -> None: self.authentication_attempts.add( - 1, - attributes={ - "method": method, - "success": str(success), - "user_id": user_id or "unknown" - } + 1, attributes={"method": method, "success": str(success), "user_id": user_id or "unknown"} ) - + if not success: - self.authentication_failures.add( - 1, - attributes={ - "method": method, - "user_id": user_id or "unknown" - } - ) - + self.authentication_failures.add(1, attributes={"method": method, "user_id": user_id or "unknown"}) + if duration_seconds is not None: - self.authentication_duration.record( - duration_seconds, - attributes={"method": method} - ) - + self.authentication_duration.record(duration_seconds, attributes={"method": method}) + def update_active_sessions(self, count: int) -> None: # Track the delta for gauge-like behavior - key = '_active_sessions' + key = "_active_sessions" current_val = getattr(self, key, 0) delta = count - current_val if delta != 0: self.active_sessions.add(delta) setattr(self, key, count) - + def increment_active_sessions(self) -> None: self.active_sessions.add(1) - + def decrement_active_sessions(self) -> None: self.active_sessions.add(-1) - + def record_token_generated(self, token_type: str, expiry_seconds: float) -> None: - self.tokens_generated.add( - 1, - attributes={"token_type": token_type} - ) - - self.token_expiry_time.record( - expiry_seconds, - attributes={"token_type": token_type} - ) - + self.tokens_generated.add(1, attributes={"token_type": token_type}) + + self.token_expiry_time.record(expiry_seconds, attributes={"token_type": token_type}) + def record_token_refreshed(self, token_type: str) -> None: - self.tokens_refreshed.add( - 1, - attributes={"token_type": token_type} - ) - + self.tokens_refreshed.add(1, attributes={"token_type": token_type}) + def record_token_revoked(self, token_type: str, reason: str) -> None: - self.tokens_revoked.add( - 1, - attributes={ - "token_type": token_type, - "reason": reason - } - ) - + self.tokens_revoked.add(1, attributes={"token_type": token_type, "reason": reason}) + def record_token_validation_failure(self, token_type: str, failure_reason: str) -> None: - self.token_validation_failures.add( - 1, - attributes={ - "token_type": token_type, - "failure_reason": failure_reason - } - ) - - def 
record_authorization_check(self, - resource: str, - action: str, - allowed: bool, - user_role: str | None = None) -> None: + self.token_validation_failures.add(1, attributes={"token_type": token_type, "failure_reason": failure_reason}) + + def record_authorization_check( + self, resource: str, action: str, allowed: bool, user_role: str | None = None + ) -> None: self.authorization_checks.add( 1, attributes={ "resource": resource, "action": action, "allowed": str(allowed), - "user_role": user_role or "unknown" - } + "user_role": user_role or "unknown", + }, ) - + if not allowed: self.authorization_denials.add( - 1, - attributes={ - "resource": resource, - "action": action, - "user_role": user_role or "unknown" - } + 1, attributes={"resource": resource, "action": action, "user_role": user_role or "unknown"} ) - + def record_permission_check(self, permission: str, granted: bool, user_id: str | None = None) -> None: self.permission_checks.add( - 1, - attributes={ - "permission": permission, - "granted": str(granted), - "user_id": user_id or "unknown" - } + 1, attributes={"permission": permission, "granted": str(granted), "user_id": user_id or "unknown"} ) - + def record_csrf_token_generated(self) -> None: self.csrf_tokens_generated.add(1) - + def record_csrf_validation_failure(self, reason: str) -> None: - self.csrf_validation_failures.add( - 1, - attributes={"reason": reason} - ) - - def record_network_policy_violation(self, policy_name: str, pod_name: str | None = None, - violation_type: str = "ingress") -> None: + self.csrf_validation_failures.add(1, attributes={"reason": reason}) + + def record_network_policy_violation( + self, policy_name: str, pod_name: str | None = None, violation_type: str = "ingress" + ) -> None: self.network_policy_violations.add( 1, attributes={ "policy_name": policy_name, "pod_name": pod_name or "unknown", - "violation_type": violation_type - } + "violation_type": violation_type, + }, ) - + def record_network_policy_created(self, policy_name: str, namespace: str) -> None: - self.network_policy_created.add( - 1, - attributes={ - "policy_name": policy_name, - "namespace": namespace - } - ) - + self.network_policy_created.add(1, attributes={"policy_name": policy_name, "namespace": namespace}) + def record_privilege_escalation_attempt(self, user_id: str, target_privilege: str, blocked: bool) -> None: self.privilege_escalation_attempts.add( - 1, - attributes={ - "user_id": user_id, - "target_privilege": target_privilege, - "blocked": str(blocked) - } + 1, attributes={"user_id": user_id, "target_privilege": target_privilege, "blocked": str(blocked)} ) - + if blocked: self.privilege_escalation_blocked.add( - 1, - attributes={ - "user_id": user_id, - "target_privilege": target_privilege - } + 1, attributes={"user_id": user_id, "target_privilege": target_privilege} ) - + def record_rate_limit_hit(self, endpoint: str, user_id: str | None = None) -> None: - self.rate_limit_hits.add( - 1, - attributes={ - "endpoint": endpoint, - "user_id": user_id or "anonymous" - } - ) - + self.rate_limit_hits.add(1, attributes={"endpoint": endpoint, "user_id": user_id or "anonymous"}) + def record_rate_limit_violation(self, endpoint: str, user_id: str | None = None, limit: int | None = None) -> None: self.rate_limit_violations.add( 1, attributes={ "endpoint": endpoint, "user_id": user_id or "anonymous", - "limit": str(limit) if limit else "unknown" - } + "limit": str(limit) if limit else "unknown", + }, ) - + def record_api_key_created(self, key_id: str, scopes: str | None = None) -> None: - 
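# Illustrative sketch, not part of the diff: typical calls into the SecurityMetrics
# helpers reformatted in this file. The resource, role, and user values are
# placeholders for illustration only.
def example_security_flow(metrics: "SecurityMetrics") -> None:
    # A failed login is counted as an attempt and, inside the helper, as a failure.
    metrics.record_authentication_attempt(method="password", success=False, user_id="u-123", duration_seconds=0.08)
    # A denied check also increments the authorization denial counter internally.
    metrics.record_authorization_check(resource="admin/events", action="replay", allowed=False, user_role="viewer")
    # "critical" or "high" severity events additionally raise a security alert.
    metrics.record_security_event("privilege_escalation", severity="critical", source="api")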
self.api_keys_created.add( - 1, - attributes={ - "key_id": key_id, - "scopes": scopes or "default" - } - ) - + self.api_keys_created.add(1, attributes={"key_id": key_id, "scopes": scopes or "default"}) + def record_api_key_revoked(self, key_id: str, reason: str) -> None: - self.api_keys_revoked.add( - 1, - attributes={ - "key_id": key_id, - "reason": reason - } - ) - + self.api_keys_revoked.add(1, attributes={"key_id": key_id, "reason": reason}) + def record_api_key_usage(self, key_id: str, endpoint: str) -> None: - self.api_key_usage.add( - 1, - attributes={ - "key_id": key_id, - "endpoint": endpoint - } - ) - + self.api_key_usage.add(1, attributes={"key_id": key_id, "endpoint": endpoint}) + def record_audit_event(self, event_type: str, user_id: str, resource: str | None = None) -> None: self.audit_events_logged.add( - 1, - attributes={ - "event_type": event_type, - "user_id": user_id, - "resource": resource or "system" - } + 1, attributes={"event_type": event_type, "user_id": user_id, "resource": resource or "system"} ) - + def record_password_change(self, user_id: str, forced: bool = False) -> None: - self.password_changes.add( - 1, - attributes={ - "user_id": user_id, - "forced": str(forced) - } - ) - + self.password_changes.add(1, attributes={"user_id": user_id, "forced": str(forced)}) + def record_password_reset_request(self, user_id: str, method: str = "email") -> None: - self.password_reset_requests.add( - 1, - attributes={ - "user_id": user_id, - "method": method - } - ) - + self.password_reset_requests.add(1, attributes={"user_id": user_id, "method": method}) + def record_weak_password_attempt(self, user_id: str, weakness_type: str) -> None: - self.weak_password_attempts.add( - 1, - attributes={ - "user_id": user_id, - "weakness_type": weakness_type - } - ) - - def record_brute_force_attempt(self, - ip_address: str, - target_user: str | None = None, - action_taken: str = "logged") -> None: + self.weak_password_attempts.add(1, attributes={"user_id": user_id, "weakness_type": weakness_type}) + + def record_brute_force_attempt( + self, ip_address: str, target_user: str | None = None, action_taken: str = "logged" + ) -> None: self.brute_force_attempts.add( 1, attributes={ "ip_address": ip_address, "target_user": target_user or "multiple", - "action_taken": action_taken - } + "action_taken": action_taken, + }, ) - + def record_account_locked(self, user_id: str, reason: str, duration_seconds: float | None = None) -> None: self.accounts_locked.add( 1, attributes={ "user_id": user_id, "reason": reason, - "duration": str(duration_seconds) if duration_seconds else "permanent" - } + "duration": str(duration_seconds) if duration_seconds else "permanent", + }, ) diff --git a/backend/app/core/middlewares/cache.py b/backend/app/core/middlewares/cache.py index e2e8a780..fd4927f7 100644 --- a/backend/app/core/middlewares/cache.py +++ b/backend/app/core/middlewares/cache.py @@ -22,7 +22,7 @@ async def __call__(self, scope: Scope, receive: Receive, send: Send) -> None: method = scope["method"] path = scope["path"] - + # Only modify headers for GET requests if method != "GET": await self.app(scope, receive, send) @@ -40,11 +40,11 @@ async def send_wrapper(message: Message) -> None: if status_code == 200: headers = MutableHeaders(scope=message) headers["Cache-Control"] = cache_control - + # Add ETag support for better caching if "public" in cache_control: headers["Vary"] = "Accept-Encoding" - + await send(message) await self.app(scope, receive, send_wrapper) diff --git 
a/backend/app/core/middlewares/metrics.py b/backend/app/core/middlewares/metrics.py index 58920c8e..562b8489 100644 --- a/backend/app/core/middlewares/metrics.py +++ b/backend/app/core/middlewares/metrics.py @@ -25,33 +25,23 @@ def __init__(self, app: ASGIApp) -> None: # Create metrics instruments self.request_counter = self.meter.create_counter( - name="http_requests_total", - description="Total number of HTTP requests", - unit="requests" + name="http_requests_total", description="Total number of HTTP requests", unit="requests" ) self.request_duration = self.meter.create_histogram( - name="http_request_duration_seconds", - description="HTTP request duration in seconds", - unit="seconds" + name="http_request_duration_seconds", description="HTTP request duration in seconds", unit="seconds" ) self.request_size = self.meter.create_histogram( - name="http_request_size_bytes", - description="HTTP request size in bytes", - unit="bytes" + name="http_request_size_bytes", description="HTTP request size in bytes", unit="bytes" ) self.response_size = self.meter.create_histogram( - name="http_response_size_bytes", - description="HTTP response size in bytes", - unit="bytes" + name="http_response_size_bytes", description="HTTP response size in bytes", unit="bytes" ) self.active_requests = self.meter.create_up_down_counter( - name="http_requests_active", - description="Number of active HTTP requests", - unit="requests" + name="http_requests_active", description="Number of active HTTP requests", unit="requests" ) async def __call__(self, scope: Scope, receive: Receive, send: Send) -> None: @@ -60,7 +50,7 @@ async def __call__(self, scope: Scope, receive: Receive, send: Send) -> None: return path = scope["path"] - + # Skip metrics endpoint to avoid recursion if path == "/metrics": await self.app(scope, receive, send) @@ -76,10 +66,7 @@ async def __call__(self, scope: Scope, receive: Receive, send: Send) -> None: headers = dict(scope["headers"]) content_length = headers.get(b"content-length") if content_length: - self.request_size.record( - int(content_length), - {"method": method, "path": path_template} - ) + self.request_size.record(int(content_length), {"method": method, "path": path_template}) # Time the request start_time = time.time() @@ -88,14 +75,14 @@ async def __call__(self, scope: Scope, receive: Receive, send: Send) -> None: async def send_wrapper(message: Message) -> None: nonlocal status_code, response_content_length - + if message["type"] == "http.response.start": status_code = message["status"] response_headers = dict(message.get("headers", [])) content_length_header = response_headers.get(b"content-length") if content_length_header: response_content_length = int(content_length_header) - + await send(message) await self.app(scope, receive, send_wrapper) @@ -103,11 +90,7 @@ async def send_wrapper(message: Message) -> None: # Record metrics after response duration = time.time() - start_time - labels = { - "method": method, - "path": path_template, - "status": str(status_code) - } + labels = {"method": method, "path": path_template, "status": str(status_code)} self.request_counter.add(1, labels) self.request_duration.record(duration, labels) @@ -124,17 +107,13 @@ def _get_path_template(path: str) -> str: # Common patterns to replace # UUID pattern - path = re.sub( - r'/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}', - '/{id}', - path - ) + path = re.sub(r"/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}", "/{id}", path) # Numeric IDs - path = re.sub(r'/\d+', '/{id}', 
path) + path = re.sub(r"/\d+", "/{id}", path) # MongoDB ObjectIds - path = re.sub(r'/[0-9a-f]{24}', '/{id}', path) + path = re.sub(r"/[0-9a-f]{24}", "/{id}", path) return path @@ -146,49 +125,51 @@ def setup_metrics(app: FastAPI) -> None: if settings.TESTING or os.getenv("OTEL_SDK_DISABLED", "").lower() in {"1", "true", "yes"}: logger.info("OpenTelemetry metrics disabled (TESTING/OTEL_SDK_DISABLED)") return - + # Configure OpenTelemetry resource - resource = Resource.create({ - SERVICE_NAME: settings.PROJECT_NAME, - SERVICE_VERSION: "1.0.0", - "service.environment": "test" if settings.TESTING else "production", - }) - + resource = Resource.create( + { + SERVICE_NAME: settings.PROJECT_NAME, + SERVICE_VERSION: "1.0.0", + "service.environment": "test" if settings.TESTING else "production", + } + ) + # Configure OTLP exporter (sends to OpenTelemetry Collector or compatible backend) # Default endpoint is localhost:4317 for gRPC otlp_exporter = OTLPMetricExporter( endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "localhost:4317"), insecure=True, # Use insecure for local development ) - + # Create metric reader with 60 second export interval metric_reader = PeriodicExportingMetricReader( exporter=otlp_exporter, export_interval_millis=60000, ) - + # Set up the meter provider meter_provider = MeterProvider( resource=resource, metric_readers=[metric_reader], ) - + # Set the global meter provider metrics.set_meter_provider(meter_provider) - + # Create system metrics create_system_metrics() - + # Add the metrics middleware (disabled for now to avoid DNS issues) # app.add_middleware(MetricsMiddleware) - + logger.info("OpenTelemetry metrics configured with OTLP exporter") def create_system_metrics() -> None: """Create system metrics collectors.""" meter = metrics.get_meter(__name__) - + # Process for system metrics current_process = psutil.Process(os.getpid()) @@ -199,14 +180,11 @@ def get_memory_usage(_: CallbackOptions) -> list[Observation]: return [ Observation(memory.used, {"type": "used"}), Observation(memory.available, {"type": "available"}), - Observation(memory.percent, {"type": "percent"}) + Observation(memory.percent, {"type": "percent"}), ] meter.create_observable_gauge( - name="system_memory_bytes", - callbacks=[get_memory_usage], - description="System memory usage", - unit="bytes" + name="system_memory_bytes", callbacks=[get_memory_usage], description="System memory usage", unit="bytes" ) # CPU usage @@ -216,10 +194,7 @@ def get_cpu_usage(_: CallbackOptions) -> list[Observation]: return [Observation(cpu_percent)] meter.create_observable_gauge( - name="system_cpu_percent", - callbacks=[get_cpu_usage], - description="System CPU usage percentage", - unit="percent" + name="system_cpu_percent", callbacks=[get_cpu_usage], description="System CPU usage percentage", unit="percent" ) # Process metrics @@ -229,12 +204,9 @@ def get_process_metrics(_: CallbackOptions) -> list[Observation]: Observation(current_process.memory_info().rss, {"type": "rss"}), Observation(current_process.memory_info().vms, {"type": "vms"}), Observation(current_process.cpu_percent(), {"type": "cpu"}), - Observation(current_process.num_threads(), {"type": "threads"}) + Observation(current_process.num_threads(), {"type": "threads"}), ] meter.create_observable_gauge( - name="process_metrics", - callbacks=[get_process_metrics], - description="Process-level metrics", - unit="mixed" + name="process_metrics", callbacks=[get_process_metrics], description="Process-level metrics", unit="mixed" ) diff --git 
a/backend/app/core/middlewares/rate_limit.py b/backend/app/core/middlewares/rate_limit.py index e21098d1..a08a708e 100644 --- a/backend/app/core/middlewares/rate_limit.py +++ b/backend/app/core/middlewares/rate_limit.py @@ -15,7 +15,7 @@ class RateLimitMiddleware: """ Middleware for rate limiting API requests. - + Features: - User-based limits for authenticated requests - IP-based limits for anonymous requests @@ -24,16 +24,18 @@ class RateLimitMiddleware: """ # Paths exempt from rate limiting - EXCLUDED_PATHS = frozenset({ - "/health", - "/metrics", - "/docs", - "/openapi.json", - "/favicon.ico", - "/api/v1/auth/login", # Auth endpoints handle their own limits - "/api/v1/auth/register", - "/api/v1/auth/logout" - }) + EXCLUDED_PATHS = frozenset( + { + "/health", + "/metrics", + "/docs", + "/openapi.json", + "/favicon.ico", + "/api/v1/auth/login", # Auth endpoints handle their own limits + "/api/v1/auth/register", + "/api/v1/auth/logout", + } + ) def __init__( self, @@ -51,9 +53,9 @@ async def __call__(self, scope: Scope, receive: Receive, send: Send) -> None: if scope["type"] != "http": await self.app(scope, receive, send) return - + path = scope["path"] - + if not self.enabled or path in self.EXCLUDED_PATHS: await self.app(scope, receive, send) return @@ -65,7 +67,7 @@ async def __call__(self, scope: Scope, receive: Receive, send: Send) -> None: container = asgi_app.state.dishka_container async with container() as container_scope: self.rate_limit_service = await container_scope.get(RateLimitService) - + if self.rate_limit_service is None: await self.app(scope, receive, send) return @@ -98,11 +100,7 @@ def _extract_user_id(self, request: Request) -> str: return str(user.user_id) return f"ip:{get_client_ip(request)}" - async def _check_rate_limit( - self, - user_id: str, - endpoint: str - ) -> RateLimitStatus: + async def _check_rate_limit(self, user_id: str, endpoint: str) -> RateLimitStatus: # At this point service should be available; if not, allow request if self.rate_limit_service is None: return RateLimitStatus( @@ -112,17 +110,14 @@ async def _check_rate_limit( reset_at=datetime.now(timezone.utc), ) - return await self.rate_limit_service.check_rate_limit( - user_id=user_id, - endpoint=endpoint - ) + return await self.rate_limit_service.check_rate_limit(user_id=user_id, endpoint=endpoint) def _rate_limit_exceeded_response(self, status: RateLimitStatus) -> JSONResponse: headers = { "X-RateLimit-Limit": str(status.limit), "X-RateLimit-Remaining": "0", "X-RateLimit-Reset": str(int(status.reset_at.timestamp())), - "Retry-After": str(status.retry_after or 60) + "Retry-After": str(status.retry_after or 60), } return JSONResponse( @@ -130,7 +125,7 @@ def _rate_limit_exceeded_response(self, status: RateLimitStatus) -> JSONResponse content={ "detail": "Rate limit exceeded", "retry_after": status.retry_after, - "reset_at": status.reset_at.isoformat() + "reset_at": status.reset_at.isoformat(), }, - headers=headers + headers=headers, ) diff --git a/backend/app/core/middlewares/request_size_limit.py b/backend/app/core/middlewares/request_size_limit.py index a4ff33b0..dcfdecd0 100644 --- a/backend/app/core/middlewares/request_size_limit.py +++ b/backend/app/core/middlewares/request_size_limit.py @@ -16,17 +16,15 @@ async def __call__(self, scope: Scope, receive: Receive, send: Send) -> None: headers = dict(scope["headers"]) content_length_header = headers.get(b"content-length") - + if content_length_header: content_length = int(content_length_header) if content_length > self.max_size_bytes: 
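# Illustrative sketch, not part of the diff: a client-side reaction to the 429
# response built by _rate_limit_exceeded_response in the RateLimitMiddleware hunk
# above, which sets X-RateLimit-* and Retry-After headers. httpx and the retry
# policy here are assumptions, not something this codebase is shown to use.
import asyncio

import httpx


async def example_call_with_backoff(url: str) -> httpx.Response:
    async with httpx.AsyncClient() as client:
        resp = await client.get(url)
        if resp.status_code == 429:
            # Honor the advertised wait before retrying once.
            retry_after = int(resp.headers.get("Retry-After", "60"))
            await asyncio.sleep(retry_after)
            resp = await client.get(url)
        return resp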
response = JSONResponse( status_code=413, - content={ - "detail": f"Request too large. Maximum size is {self.max_size_bytes / 1024 / 1024}MB" - } + content={"detail": f"Request too large. Maximum size is {self.max_size_bytes / 1024 / 1024}MB"}, ) await response(scope, receive, send) return - + await self.app(scope, receive, send) diff --git a/backend/app/core/providers.py b/backend/app/core/providers.py index fbd0b2f2..ddc4937d 100644 --- a/backend/app/core/providers.py +++ b/backend/app/core/providers.py @@ -2,10 +2,10 @@ import redis.asyncio as redis from dishka import Provider, Scope, provide -from motor.motor_asyncio import AsyncIOMotorDatabase from app.core.database_context import ( AsyncDatabaseConnection, + Database, DatabaseConfig, create_database_connection, ) @@ -94,7 +94,7 @@ async def get_database_connection(self, settings: Settings) -> AsyncIterator[Asy server_selection_timeout_ms=5000, connect_timeout_ms=5000, max_pool_size=50, - min_pool_size=10 + min_pool_size=10, ) db_connection = create_database_connection(db_config) @@ -105,7 +105,7 @@ async def get_database_connection(self, settings: Settings) -> AsyncIterator[Asy await db_connection.disconnect() @provide - def get_database(self, db_connection: AsyncDatabaseConnection) -> AsyncIOMotorDatabase: + def get_database(self, db_connection: AsyncDatabaseConnection) -> Database: return db_connection.database @@ -127,10 +127,8 @@ async def get_redis_client(self, settings: Settings) -> AsyncIterator[redis.Redi socket_timeout=5, ) # Test connection - await client.execute_command("PING") - logger.info( - f"Redis connected: {settings.REDIS_HOST}:{settings.REDIS_PORT}/{settings.REDIS_DB}" - ) + await client.execute_command("PING") # type: ignore[no-untyped-call] + logger.info(f"Redis connected: {settings.REDIS_HOST}:{settings.REDIS_PORT}/{settings.REDIS_DB}") try: yield client finally: @@ -138,10 +136,7 @@ async def get_redis_client(self, settings: Settings) -> AsyncIterator[redis.Redi @provide def get_rate_limit_service( - self, - redis_client: redis.Redis, - settings: Settings, - rate_limit_metrics: RateLimitMetrics + self, redis_client: redis.Redis, settings: Settings, rate_limit_metrics: RateLimitMetrics ) -> RateLimitService: return RateLimitService(redis_client, settings, rate_limit_metrics) @@ -159,13 +154,9 @@ class MessagingProvider(Provider): @provide async def get_kafka_producer( - self, - settings: Settings, - schema_registry: SchemaRegistryManager + self, settings: Settings, schema_registry: SchemaRegistryManager ) -> AsyncIterator[UnifiedProducer]: - config = ProducerConfig( - bootstrap_servers=settings.KAFKA_BOOTSTRAP_SERVERS - ) + config = ProducerConfig(bootstrap_servers=settings.KAFKA_BOOTSTRAP_SERVERS) producer = UnifiedProducer(config, schema_registry) await producer.start() try: @@ -174,7 +165,7 @@ async def get_kafka_producer( await producer.stop() @provide - async def get_dlq_manager(self, database: AsyncIOMotorDatabase) -> AsyncIterator[DLQManager]: + async def get_dlq_manager(self, database: Database) -> AsyncIterator[DLQManager]: manager = create_dlq_manager(database) await manager.start() try: @@ -183,16 +174,12 @@ async def get_dlq_manager(self, database: AsyncIOMotorDatabase) -> AsyncIterator await manager.stop() @provide - def get_idempotency_repository(self, - redis_client: redis.Redis) -> RedisIdempotencyRepository: - return RedisIdempotencyRepository(redis_client, - key_prefix="idempotency") + def get_idempotency_repository(self, redis_client: redis.Redis) -> RedisIdempotencyRepository: + return 
RedisIdempotencyRepository(redis_client, key_prefix="idempotency") @provide - async def get_idempotency_manager(self, - repo: RedisIdempotencyRepository) -> AsyncIterator[IdempotencyManager]: - manager = create_idempotency_manager(repository=repo, - config=IdempotencyConfig()) + async def get_idempotency_manager(self, repo: RedisIdempotencyRepository) -> AsyncIterator[IdempotencyManager]: + manager = create_idempotency_manager(repository=repo, config=IdempotencyConfig()) await manager.initialize() try: yield manager @@ -208,31 +195,20 @@ def get_schema_registry(self) -> SchemaRegistryManager: return create_schema_registry_manager() @provide - async def get_event_store( - self, - database: AsyncIOMotorDatabase, - schema_registry: SchemaRegistryManager - ) -> EventStore: - store = create_event_store( - db=database, - schema_registry=schema_registry, - ttl_days=90 - ) + async def get_event_store(self, database: Database, schema_registry: SchemaRegistryManager) -> EventStore: + store = create_event_store(db=database, schema_registry=schema_registry, ttl_days=90) return store @provide async def get_event_store_consumer( - self, - event_store: EventStore, - schema_registry: SchemaRegistryManager, - kafka_producer: UnifiedProducer + self, event_store: EventStore, schema_registry: SchemaRegistryManager, kafka_producer: UnifiedProducer ) -> EventStoreConsumer: topics = get_all_topics() return create_event_store_consumer( event_store=event_store, topics=list(topics), schema_registry_manager=schema_registry, - producer=kafka_producer + producer=kafka_producer, ) @provide @@ -316,11 +292,11 @@ def get_sse_shutdown_manager(self) -> SSEShutdownManager: @provide(scope=Scope.APP) async def get_sse_kafka_redis_bridge( - self, - schema_registry: SchemaRegistryManager, - settings: Settings, - event_metrics: EventMetrics, - sse_redis_bus: SSERedisBus, + self, + schema_registry: SchemaRegistryManager, + settings: Settings, + event_metrics: EventMetrics, + sse_redis_bus: SSERedisBus, ) -> SSEKafkaRedisBridge: return create_sse_kafka_redis_bridge( schema_registry=schema_registry, @@ -330,10 +306,7 @@ async def get_sse_kafka_redis_bridge( ) @provide - def get_sse_repository( - self, - database: AsyncIOMotorDatabase - ) -> SSERepository: + def get_sse_repository(self, database: Database) -> SSERepository: return SSERepository(database) @provide @@ -343,12 +316,12 @@ async def get_sse_redis_bus(self, redis_client: redis.Redis) -> AsyncIterator[SS @provide(scope=Scope.REQUEST) def get_sse_service( - self, - sse_repository: SSERepository, - router: SSEKafkaRedisBridge, - sse_redis_bus: SSERedisBus, - shutdown_manager: SSEShutdownManager, - settings: Settings + self, + sse_repository: SSERepository, + router: SSEKafkaRedisBridge, + sse_redis_bus: SSERedisBus, + shutdown_manager: SSEShutdownManager, + settings: Settings, ) -> SSEService: # Ensure shutdown manager coordinates with the router in this request scope shutdown_manager.set_router(router) @@ -357,7 +330,7 @@ def get_sse_service( router=router, sse_bus=sse_redis_bus, shutdown_manager=shutdown_manager, - settings=settings + settings=settings, ) @@ -365,7 +338,7 @@ class AuthProvider(Provider): scope = Scope.APP @provide - def get_user_repository(self, database: AsyncIOMotorDatabase) -> UserRepository: + def get_user_repository(self, database: Database) -> UserRepository: return UserRepository(database) @provide @@ -377,11 +350,11 @@ class UserServicesProvider(Provider): scope = Scope.APP @provide - def get_user_settings_repository(self, database: 
AsyncIOMotorDatabase) -> UserSettingsRepository: + def get_user_settings_repository(self, database: Database) -> UserSettingsRepository: return UserSettingsRepository(database) @provide - def get_event_repository(self, database: AsyncIOMotorDatabase) -> EventRepository: + def get_event_repository(self, database: Database) -> EventRepository: return EventRepository(database) @provide @@ -390,21 +363,16 @@ async def get_event_service(self, event_repository: EventRepository) -> EventSer @provide async def get_kafka_event_service( - self, - event_repository: EventRepository, - kafka_producer: UnifiedProducer + self, event_repository: EventRepository, kafka_producer: UnifiedProducer ) -> KafkaEventService: - return KafkaEventService( - event_repository=event_repository, - kafka_producer=kafka_producer - ) + return KafkaEventService(event_repository=event_repository, kafka_producer=kafka_producer) @provide async def get_user_settings_service( - self, - repository: UserSettingsRepository, - kafka_event_service: KafkaEventService, - event_bus_manager: EventBusManager + self, + repository: UserSettingsRepository, + kafka_event_service: KafkaEventService, + event_bus_manager: EventBusManager, ) -> UserSettingsService: service = UserSettingsService(repository, kafka_event_service) await service.initialize(event_bus_manager) @@ -415,49 +383,49 @@ class AdminServicesProvider(Provider): scope = Scope.APP @provide - def get_admin_events_repository(self, database: AsyncIOMotorDatabase) -> AdminEventsRepository: + def get_admin_events_repository(self, database: Database) -> AdminEventsRepository: return AdminEventsRepository(database) @provide(scope=Scope.REQUEST) def get_admin_events_service( - self, - admin_events_repository: AdminEventsRepository, - replay_service: ReplayService, + self, + admin_events_repository: AdminEventsRepository, + replay_service: ReplayService, ) -> AdminEventsService: return AdminEventsService(admin_events_repository, replay_service) @provide - def get_admin_settings_repository(self, database: AsyncIOMotorDatabase) -> AdminSettingsRepository: + def get_admin_settings_repository(self, database: Database) -> AdminSettingsRepository: return AdminSettingsRepository(database) @provide def get_admin_settings_service( - self, - admin_settings_repository: AdminSettingsRepository, + self, + admin_settings_repository: AdminSettingsRepository, ) -> AdminSettingsService: return AdminSettingsService(admin_settings_repository) @provide - def get_admin_user_repository(self, database: AsyncIOMotorDatabase) -> AdminUserRepository: + def get_admin_user_repository(self, database: Database) -> AdminUserRepository: return AdminUserRepository(database) @provide - def get_saga_repository(self, database: AsyncIOMotorDatabase) -> SagaRepository: + def get_saga_repository(self, database: Database) -> SagaRepository: return SagaRepository(database) @provide - def get_notification_repository(self, database: AsyncIOMotorDatabase) -> NotificationRepository: + def get_notification_repository(self, database: Database) -> NotificationRepository: return NotificationRepository(database) @provide def get_notification_service( - self, - notification_repository: NotificationRepository, - kafka_event_service: KafkaEventService, - event_bus_manager: EventBusManager, - schema_registry: SchemaRegistryManager, - sse_redis_bus: SSERedisBus, - settings: Settings, + self, + notification_repository: NotificationRepository, + kafka_event_service: KafkaEventService, + event_bus_manager: EventBusManager, + schema_registry: 
SchemaRegistryManager, + sse_redis_bus: SSERedisBus, + settings: Settings, ) -> NotificationService: service = NotificationService( notification_repository=notification_repository, @@ -472,8 +440,8 @@ def get_notification_service( @provide def get_grafana_alert_processor( - self, - notification_service: NotificationService, + self, + notification_service: NotificationService, ) -> GrafanaAlertProcessor: return GrafanaAlertProcessor(notification_service) @@ -482,34 +450,34 @@ class BusinessServicesProvider(Provider): scope = Scope.REQUEST @provide - def get_execution_repository(self, database: AsyncIOMotorDatabase) -> ExecutionRepository: + def get_execution_repository(self, database: Database) -> ExecutionRepository: return ExecutionRepository(database) @provide - def get_resource_allocation_repository(self, database: AsyncIOMotorDatabase) -> ResourceAllocationRepository: + def get_resource_allocation_repository(self, database: Database) -> ResourceAllocationRepository: return ResourceAllocationRepository(database) @provide - def get_saved_script_repository(self, database: AsyncIOMotorDatabase) -> SavedScriptRepository: + def get_saved_script_repository(self, database: Database) -> SavedScriptRepository: return SavedScriptRepository(database) @provide - def get_dlq_repository(self, database: AsyncIOMotorDatabase) -> DLQRepository: + def get_dlq_repository(self, database: Database) -> DLQRepository: return DLQRepository(database) @provide - def get_replay_repository(self, database: AsyncIOMotorDatabase) -> ReplayRepository: + def get_replay_repository(self, database: Database) -> ReplayRepository: return ReplayRepository(database) @provide async def get_saga_orchestrator( - self, - saga_repository: SagaRepository, - kafka_producer: UnifiedProducer, - event_store: EventStore, - idempotency_manager: IdempotencyManager, - resource_allocation_repository: ResourceAllocationRepository, - settings: Settings, + self, + saga_repository: SagaRepository, + kafka_producer: UnifiedProducer, + event_store: EventStore, + idempotency_manager: IdempotencyManager, + resource_allocation_repository: ResourceAllocationRepository, + settings: Settings, ) -> AsyncIterator[SagaOrchestrator]: config = SagaConfig( name="main-orchestrator", @@ -535,60 +503,47 @@ async def get_saga_orchestrator( @provide def get_saga_service( - self, - saga_repository: SagaRepository, - execution_repository: ExecutionRepository, - saga_orchestrator: SagaOrchestrator + self, + saga_repository: SagaRepository, + execution_repository: ExecutionRepository, + saga_orchestrator: SagaOrchestrator, ) -> SagaService: return SagaService( - saga_repo=saga_repository, - execution_repo=execution_repository, - orchestrator=saga_orchestrator + saga_repo=saga_repository, execution_repo=execution_repository, orchestrator=saga_orchestrator ) @provide def get_execution_service( - self, - execution_repository: ExecutionRepository, - kafka_producer: UnifiedProducer, - event_store: EventStore, - settings: Settings + self, + execution_repository: ExecutionRepository, + kafka_producer: UnifiedProducer, + event_store: EventStore, + settings: Settings, ) -> ExecutionService: return ExecutionService( - execution_repo=execution_repository, - producer=kafka_producer, - event_store=event_store, - settings=settings + execution_repo=execution_repository, producer=kafka_producer, event_store=event_store, settings=settings ) @provide - def get_saved_script_service( - self, - saved_script_repository: SavedScriptRepository - ) -> SavedScriptService: + def 
get_saved_script_service(self, saved_script_repository: SavedScriptRepository) -> SavedScriptService: return SavedScriptService(saved_script_repository) @provide async def get_replay_service( - self, - replay_repository: ReplayRepository, - kafka_producer: UnifiedProducer, - event_store: EventStore + self, replay_repository: ReplayRepository, kafka_producer: UnifiedProducer, event_store: EventStore ) -> ReplayService: event_replay_service = EventReplayService( - repository=replay_repository, - producer=kafka_producer, - event_store=event_store + repository=replay_repository, producer=kafka_producer, event_store=event_store ) return ReplayService(replay_repository, event_replay_service) @provide def get_admin_user_service( - self, - admin_user_repository: AdminUserRepository, - event_service: EventService, - execution_service: ExecutionService, - rate_limit_service: RateLimitService, + self, + admin_user_repository: AdminUserRepository, + event_service: EventService, + execution_service: ExecutionService, + rate_limit_service: RateLimitService, ) -> AdminUserService: return AdminUserService( user_repository=admin_user_repository, @@ -599,12 +554,12 @@ def get_admin_user_service( @provide async def get_execution_coordinator( - self, - kafka_producer: UnifiedProducer, - schema_registry: SchemaRegistryManager, - event_store: EventStore, - execution_repository: ExecutionRepository, - idempotency_manager: IdempotencyManager, + self, + kafka_producer: UnifiedProducer, + schema_registry: SchemaRegistryManager, + event_store: EventStore, + execution_repository: ExecutionRepository, + idempotency_manager: IdempotencyManager, ) -> AsyncIterator[ExecutionCoordinator]: coordinator = ExecutionCoordinator( producer=kafka_producer, @@ -623,5 +578,5 @@ class ResultProcessorProvider(Provider): scope = Scope.APP @provide - def get_execution_repository(self, database: AsyncIOMotorDatabase) -> ExecutionRepository: + def get_execution_repository(self, database: Database) -> ExecutionRepository: return ExecutionRepository(database) diff --git a/backend/app/core/security.py b/backend/app/core/security.py index eb9b362f..4f0bb130 100644 --- a/backend/app/core/security.py +++ b/backend/app/core/security.py @@ -34,21 +34,17 @@ def verify_password(self, plain_password: str, hashed_password: str) -> bool: def get_password_hash(self, password: str) -> str: return self.pwd_context.hash(password) # type: ignore - def create_access_token( - self, data: dict, expires_delta: timedelta - ) -> str: + def create_access_token(self, data: dict[str, Any], expires_delta: timedelta) -> str: to_encode = data.copy() expire = datetime.now(timezone.utc) + expires_delta to_encode.update({"exp": expire}) - encoded_jwt = jwt.encode( - to_encode, self.settings.SECRET_KEY, algorithm=self.settings.ALGORITHM - ) + encoded_jwt = jwt.encode(to_encode, self.settings.SECRET_KEY, algorithm=self.settings.ALGORITHM) return encoded_jwt async def get_current_user( - self, - token: str, - user_repo: Any, # Avoid circular import by using Any + self, + token: str, + user_repo: Any, # Avoid circular import by using Any ) -> DomainAdminUser: credentials_exception = HTTPException( status_code=status.HTTP_401_UNAUTHORIZED, @@ -56,9 +52,7 @@ async def get_current_user( headers={"WWW-Authenticate": "Bearer"}, ) try: - payload = jwt.decode( - token, self.settings.SECRET_KEY, algorithms=[self.settings.ALGORITHM] - ) + payload = jwt.decode(token, self.settings.SECRET_KEY, algorithms=[self.settings.ALGORITHM]) username: str = payload.get("sub") if username is 
None: raise credentials_exception @@ -72,6 +66,7 @@ async def get_current_user( def generate_csrf_token(self) -> str: """Generate a CSRF token using secure random""" import secrets + return secrets.token_urlsafe(32) def validate_csrf_token(self, header_token: str, cookie_token: str) -> bool: @@ -80,6 +75,7 @@ def validate_csrf_token(self, header_token: str, cookie_token: str) -> bool: return False # Constant-time comparison to prevent timing attacks import hmac + return hmac.compare_digest(header_token, cookie_token) @@ -111,15 +107,9 @@ def validate_csrf_token(request: Request) -> str: cookie_token = request.cookies.get("csrf_token", "") if not header_token: - raise HTTPException( - status_code=status.HTTP_403_FORBIDDEN, - detail="CSRF token missing" - ) + raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail="CSRF token missing") if not security_service.validate_csrf_token(header_token, cookie_token): - raise HTTPException( - status_code=status.HTTP_403_FORBIDDEN, - detail="CSRF token invalid" - ) + raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail="CSRF token invalid") return header_token diff --git a/backend/app/core/startup.py b/backend/app/core/startup.py index 3edbca9a..fccdfb86 100644 --- a/backend/app/core/startup.py +++ b/backend/app/core/startup.py @@ -30,18 +30,18 @@ async def initialize_metrics_context(container: AsyncContainer) -> None: # Only add metrics that are actually provided by the container # Some metrics might not be needed for certain deployments - metrics_mapping['event'] = await container.get(EventMetrics) - metrics_mapping['connection'] = await container.get(ConnectionMetrics) - metrics_mapping['rate_limit'] = await container.get(RateLimitMetrics) - metrics_mapping['execution'] = await container.get(ExecutionMetrics) - metrics_mapping['database'] = await container.get(DatabaseMetrics) - metrics_mapping['health'] = await container.get(HealthMetrics) - metrics_mapping['kubernetes'] = await container.get(KubernetesMetrics) - metrics_mapping['coordinator'] = await container.get(CoordinatorMetrics) - metrics_mapping['dlq'] = await container.get(DLQMetrics) - metrics_mapping['notification'] = await container.get(NotificationMetrics) - metrics_mapping['replay'] = await container.get(ReplayMetrics) - metrics_mapping['security'] = await container.get(SecurityMetrics) + metrics_mapping["event"] = await container.get(EventMetrics) + metrics_mapping["connection"] = await container.get(ConnectionMetrics) + metrics_mapping["rate_limit"] = await container.get(RateLimitMetrics) + metrics_mapping["execution"] = await container.get(ExecutionMetrics) + metrics_mapping["database"] = await container.get(DatabaseMetrics) + metrics_mapping["health"] = await container.get(HealthMetrics) + metrics_mapping["kubernetes"] = await container.get(KubernetesMetrics) + metrics_mapping["coordinator"] = await container.get(CoordinatorMetrics) + metrics_mapping["dlq"] = await container.get(DLQMetrics) + metrics_mapping["notification"] = await container.get(NotificationMetrics) + metrics_mapping["replay"] = await container.get(ReplayMetrics) + metrics_mapping["security"] = await container.get(SecurityMetrics) # Initialize the context with available metrics MetricsContext.initialize_all(**metrics_mapping) @@ -54,10 +54,7 @@ async def initialize_metrics_context(container: AsyncContainer) -> None: # The context will lazy-initialize metrics as needed -async def initialize_rate_limits( - redis_client: redis.Redis, - settings: Settings -) -> None: +async def 
initialize_rate_limits(redis_client: redis.Redis, settings: Settings) -> None: """ Initialize default rate limits in Redis on application startup. This ensures default limits are always available. diff --git a/backend/app/core/tracing/config.py b/backend/app/core/tracing/config.py index bd1f1257..379ed081 100644 --- a/backend/app/core/tracing/config.py +++ b/backend/app/core/tracing/config.py @@ -26,7 +26,7 @@ class TracingConfiguration: """Configuration for OpenTelemetry tracing.""" - + def __init__( self, service_name: str, @@ -34,7 +34,7 @@ def __init__( otlp_endpoint: str | None = None, enable_console_exporter: bool = False, sampling_rate: float = 1.0, - adaptive_sampling: bool = False + adaptive_sampling: bool = False, ) -> None: self.service_name = service_name self.service_version = service_version @@ -43,40 +43,42 @@ def __init__( self.sampling_rate = sampling_rate self.adaptive_sampling = adaptive_sampling self._settings = get_settings() - + def create_resource(self) -> Resource: """Create OpenTelemetry resource with service metadata.""" - return Resource.create({ - SERVICE_NAME: self.service_name, - SERVICE_VERSION: self.service_version, - "deployment.environment": self._get_environment(), - "service.namespace": "integr8scode", - "service.instance.id": os.environ.get("HOSTNAME", "unknown"), - }) - + return Resource.create( + { + SERVICE_NAME: self.service_name, + SERVICE_VERSION: self.service_version, + "deployment.environment": self._get_environment(), + "service.namespace": "integr8scode", + "service.instance.id": os.environ.get("HOSTNAME", "unknown"), + } + ) + def create_sampler(self) -> Sampler: """Create appropriate sampler based on configuration.""" if self.adaptive_sampling: return create_adaptive_sampler() - + if self.sampling_rate <= 0: return ALWAYS_OFF - + if self.sampling_rate >= 1.0: return ALWAYS_ON - + return TraceIdRatioBased(self.sampling_rate) - + def get_otlp_endpoint(self) -> str | None: """Get OTLP endpoint from config or environment.""" if self.otlp_endpoint: return self.otlp_endpoint - + if self._settings.JAEGER_AGENT_HOST: return f"{self._settings.JAEGER_AGENT_HOST}:4317" - + return None - + def _get_environment(self) -> str: """Get deployment environment.""" return "test" if self._settings.TESTING else "production" @@ -84,34 +86,31 @@ def _get_environment(self) -> str: class TracingInitializer: """Initializes OpenTelemetry tracing with instrumentation.""" - + def __init__(self, config: TracingConfiguration) -> None: self.config = config self.instrumentation_report = InstrumentationReport() - + def initialize(self) -> InstrumentationReport: """Initialize tracing and instrument libraries.""" provider = self._create_provider() self._configure_exporters(provider) trace.set_tracer_provider(provider) set_global_textmap(TraceContextTextMapPropagator()) - + self._instrument_libraries() - + logger.info( f"OpenTelemetry tracing initialized for {self.config.service_name}", - extra={"instrumentation_summary": self.instrumentation_report.get_summary()} + extra={"instrumentation_summary": self.instrumentation_report.get_summary()}, ) - + return self.instrumentation_report - + def _create_provider(self) -> TracerProvider: """Create tracer provider with resource and sampler.""" - return TracerProvider( - resource=self.config.create_resource(), - sampler=self.config.create_sampler() - ) - + return TracerProvider(resource=self.config.create_resource(), sampler=self.config.create_sampler()) + def _configure_exporters(self, provider: TracerProvider) -> None: """Configure 
span exporters.""" otlp_endpoint = self.config.get_otlp_endpoint() @@ -121,19 +120,19 @@ def _configure_exporters(self, provider: TracerProvider) -> None: insecure=True, ) provider.add_span_processor(BatchSpanProcessor(otlp_exporter)) - + if self.config.enable_console_exporter: console_exporter = ConsoleSpanExporter() provider.add_span_processor(BatchSpanProcessor(console_exporter)) - + def _instrument_libraries(self) -> None: """Instrument all configured libraries.""" libraries = self._get_libraries_to_instrument() - + for lib in libraries: result = self._instrument_library(lib) self.instrumentation_report.add_result(result) - + def _get_libraries_to_instrument(self) -> list[LibraryInstrumentation]: """Get list of libraries to instrument.""" return [ @@ -142,48 +141,36 @@ def _get_libraries_to_instrument(self) -> list[LibraryInstrumentation]: instrumentor=FastAPIInstrumentor(), config={ "tracer_provider": trace.get_tracer_provider(), - "excluded_urls": "health,metrics,docs,openapi.json" - } + "excluded_urls": "health,metrics,docs,openapi.json", + }, ), LibraryInstrumentation( name="httpx", instrumentor=HTTPXClientInstrumentor(), - config={"tracer_provider": trace.get_tracer_provider()} + config={"tracer_provider": trace.get_tracer_provider()}, ), LibraryInstrumentation( name="pymongo", instrumentor=PymongoInstrumentor(), - config={"tracer_provider": trace.get_tracer_provider()} + config={"tracer_provider": trace.get_tracer_provider()}, ), LibraryInstrumentation( name="logging", instrumentor=LoggingInstrumentor(), - config={ - "set_logging_format": True, - "log_level": "INFO" - } - ) + config={"set_logging_format": True, "log_level": "INFO"}, + ), ] - + def _instrument_library(self, lib: LibraryInstrumentation) -> InstrumentationResult: """Instrument a single library and return result.""" try: lib.instrumentor.instrument(**lib.config) - return InstrumentationResult( - library=lib.name, - status=InstrumentationStatus.SUCCESS - ) + return InstrumentationResult(library=lib.name, status=InstrumentationStatus.SUCCESS) except Exception as e: logger.warning( - f"Failed to instrument {lib.name}", - exc_info=True, - extra={"library": lib.name, "error": str(e)} - ) - return InstrumentationResult( - library=lib.name, - status=InstrumentationStatus.FAILED, - error=e + f"Failed to instrument {lib.name}", exc_info=True, extra={"library": lib.name, "error": str(e)} ) + return InstrumentationResult(library=lib.name, status=InstrumentationStatus.FAILED, error=e) def init_tracing( @@ -192,7 +179,7 @@ def init_tracing( otlp_endpoint: str | None = None, enable_console_exporter: bool = False, sampling_rate: float = 1.0, - adaptive_sampling: bool = False + adaptive_sampling: bool = False, ) -> InstrumentationReport: """Initialize OpenTelemetry tracing with the given configuration.""" config = TracingConfiguration( @@ -201,8 +188,8 @@ def init_tracing( otlp_endpoint=otlp_endpoint, enable_console_exporter=enable_console_exporter, sampling_rate=sampling_rate, - adaptive_sampling=adaptive_sampling + adaptive_sampling=adaptive_sampling, ) - + initializer = TracingInitializer(config) return initializer.initialize() diff --git a/backend/app/core/tracing/models.py b/backend/app/core/tracing/models.py index 6fef732d..d5932e57 100644 --- a/backend/app/core/tracing/models.py +++ b/backend/app/core/tracing/models.py @@ -8,6 +8,7 @@ class EventAttributes(StringEnum): """Standard attribute names for tracing events.""" + EVENT_TYPE = "event.type" EVENT_ID = "event.id" EXECUTION_ID = "execution.id" @@ -28,6 +29,7 @@ class 
EventAttributes(StringEnum): class InstrumentationStatus(StringEnum): """Status of library instrumentation.""" + SUCCESS = "success" FAILED = "failed" NOT_ATTEMPTED = "not_attempted" @@ -36,6 +38,7 @@ class InstrumentationStatus(StringEnum): @dataclass class InstrumentationResult: """Result of instrumenting a single library.""" + library: str status: InstrumentationStatus error: Exception | None = None @@ -44,6 +47,7 @@ class InstrumentationResult: @dataclass class InstrumentationReport: """Report of all instrumentation results.""" + results: Dict[str, InstrumentationResult] = field(default_factory=dict) def add_result(self, result: InstrumentationResult) -> None: @@ -52,17 +56,11 @@ def add_result(self, result: InstrumentationResult) -> None: def get_summary(self) -> Dict[str, str]: """Get a summary of instrumentation statuses.""" - return { - library: result.status - for library, result in self.results.items() - } + return {library: result.status for library, result in self.results.items()} def has_failures(self) -> bool: """Check if any instrumentation failed.""" - return any( - result.status == InstrumentationStatus.FAILED - for result in self.results.values() - ) + return any(result.status == InstrumentationStatus.FAILED for result in self.results.values()) class Instrumentor(Protocol): @@ -74,6 +72,7 @@ def instrument(self, **kwargs: Any) -> None: ... @dataclass class LibraryInstrumentation: """Configuration for instrumenting a library.""" + name: str instrumentor: Instrumentor config: Dict[str, Any] = field(default_factory=dict) diff --git a/backend/app/core/tracing/utils.py b/backend/app/core/tracing/utils.py index ddd47ff9..598cdaf9 100644 --- a/backend/app/core/tracing/utils.py +++ b/backend/app/core/tracing/utils.py @@ -1,11 +1,15 @@ import asyncio import functools +from collections.abc import Callable, Generator from contextlib import contextmanager -from typing import Any, Callable, Dict, Generator +from typing import Any, ParamSpec, TypeVar from opentelemetry import context, propagate, trace from opentelemetry.trace import SpanKind, Status, StatusCode +P = ParamSpec("P") +R = TypeVar("R") + def get_tracer() -> trace.Tracer: """Get a tracer for the current module.""" @@ -16,31 +20,27 @@ def get_tracer() -> trace.Tracer: def trace_span( name: str, kind: SpanKind = SpanKind.INTERNAL, - attributes: Dict[str, Any] | None = None, + attributes: dict[str, Any] | None = None, set_status_on_exception: bool = True, - tracer: trace.Tracer | None = None + tracer: trace.Tracer | None = None, ) -> Generator[trace.Span, None, None]: """ Context manager for creating a traced span. - + Args: name: Name of the span kind: Kind of span (INTERNAL, CLIENT, SERVER, etc.) 
attributes: Additional attributes to set on the span set_status_on_exception: Whether to set error status on exception tracer: Optional tracer to use, defaults to module tracer - + Yields: The created span """ if tracer is None: tracer = get_tracer() - with tracer.start_as_current_span( - name, - kind=kind, - attributes=attributes or {} - ) as span: + with tracer.start_as_current_span(name, kind=kind, attributes=attributes or {}) as span: try: yield span except Exception as e: @@ -51,46 +51,47 @@ def trace_span( def trace_method( - name: str | None = None, - kind: SpanKind = SpanKind.INTERNAL, - attributes: Dict[str, Any] | None = None -) -> Callable: + name: str | None = None, kind: SpanKind = SpanKind.INTERNAL, attributes: dict[str, Any] | None = None +) -> Callable[[Callable[P, R]], Callable[P, R]]: """ Decorator for tracing method calls. - + Args: name: Custom span name, defaults to module.method_name kind: Kind of span (INTERNAL, CLIENT, SERVER, etc.) attributes: Additional attributes to set on the span - + Returns: Decorated function with tracing """ - def decorator(func: Callable) -> Callable: + + def decorator(func: Callable[P, R]) -> Callable[P, R]: span_name = name or f"{func.__module__}.{func.__name__}" @functools.wraps(func) - async def async_wrapper(*args: Any, **kwargs: Any) -> Any: + async def async_wrapper(*args: P.args, **kwargs: P.kwargs) -> R: with trace_span(span_name, kind=kind, attributes=attributes): - return await func(*args, **kwargs) + return await func(*args, **kwargs) # type: ignore[misc, no-any-return] @functools.wraps(func) - def sync_wrapper(*args: Any, **kwargs: Any) -> Any: + def sync_wrapper(*args: P.args, **kwargs: P.kwargs) -> R: with trace_span(span_name, kind=kind, attributes=attributes): return func(*args, **kwargs) - return async_wrapper if asyncio.iscoroutinefunction(func) else sync_wrapper + if asyncio.iscoroutinefunction(func): + return async_wrapper # type: ignore[return-value] + return sync_wrapper return decorator -def inject_trace_context(headers: Dict[str, str]) -> Dict[str, str]: +def inject_trace_context(headers: dict[str, str]) -> dict[str, str]: """ Inject current trace context into headers for propagation. - + Args: headers: Existing headers dictionary - + Returns: Headers with trace context injected """ @@ -99,13 +100,13 @@ def inject_trace_context(headers: Dict[str, str]) -> Dict[str, str]: return propagation_headers -def extract_trace_context(headers: Dict[str, str]) -> context.Context: +def extract_trace_context(headers: dict[str, str]) -> context.Context: """ Extract trace context from headers. - + Args: headers: Headers containing trace context - + Returns: Extracted OpenTelemetry context """ @@ -115,7 +116,7 @@ def extract_trace_context(headers: Dict[str, str]) -> context.Context: def add_span_attributes(**attributes: Any) -> None: """ Add attributes to the current span. - + Args: **attributes: Key-value pairs to add as span attributes """ @@ -126,10 +127,10 @@ def add_span_attributes(**attributes: Any) -> None: span.set_attribute(key, value) -def add_span_event(name: str, attributes: Dict[str, Any] | None = None) -> None: +def add_span_event(name: str, attributes: dict[str, Any] | None = None) -> None: """ Add an event to the current span. 
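A short usage sketch for the tracing helpers above, which are now typed with ParamSpec so decorated call sites keep their original signatures under mypy --strict. The import path app.core.tracing.utils is assumed from the file location, and the example functions are illustrative.

from opentelemetry.trace import SpanKind

from app.core.tracing.utils import add_span_attributes, trace_method, trace_span  # path assumed


@trace_method(kind=SpanKind.INTERNAL)
async def load_user(user_id: str) -> dict[str, str]:
    # Wrapped in a span named "<module>.load_user"; exceptions mark the span as ERROR.
    add_span_attributes(user_id=user_id)
    return {"user_id": user_id}


def export_report(report_id: str) -> None:
    # Explicit span with custom attributes and an event.
    with trace_span("export_report", attributes={"report.id": report_id}) as span:
        span.add_event("export.started")
        ...  # actual work would go here

When no name is given, trace_method derives the span name from the module and function name, matching the span_name fallback shown above.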
- + Args: name: Name of the event attributes: Optional attributes for the event @@ -142,7 +143,7 @@ def add_span_event(name: str, attributes: Dict[str, Any] | None = None) -> None: def set_span_status(code: StatusCode, description: str | None = None) -> None: """ Set the status of the current span. - + Args: code: Status code (OK, ERROR, UNSET) description: Optional description @@ -155,7 +156,7 @@ def set_span_status(code: StatusCode, description: str | None = None) -> None: def get_current_trace_id() -> str | None: """ Get the current trace ID as a hex string. - + Returns: Trace ID as 32-character hex string, or None if no active trace """ @@ -163,14 +164,14 @@ def get_current_trace_id() -> str | None: if span and span.is_recording(): span_context = span.get_span_context() if span_context.is_valid: - return format(span_context.trace_id, '032x') + return format(span_context.trace_id, "032x") return None def get_current_span_id() -> str | None: """ Get the current span ID as a hex string. - + Returns: Span ID as 16-character hex string, or None if no active span """ @@ -178,5 +179,5 @@ def get_current_span_id() -> str | None: if span and span.is_recording(): span_context = span.get_span_context() if span_context.is_valid: - return format(span_context.span_id, '016x') + return format(span_context.span_id, "016x") return None diff --git a/backend/app/core/utils.py b/backend/app/core/utils.py index 9c11da8a..37f48058 100644 --- a/backend/app/core/utils.py +++ b/backend/app/core/utils.py @@ -6,15 +6,15 @@ class StringEnum(StrEnum): """ A StrEnum subclass that behaves like a plain string in all representations. - + This fixes the issue where StrEnum.__repr__ returns the enum member representation (e.g., '') instead of just the string value. - + Usage: class MyEnum(StringEnum): VALUE1 = "value1" VALUE2 = "value2" - + # Now repr() returns just "value1" instead of "" """ diff --git a/backend/app/db/repositories/admin/admin_events_repository.py b/backend/app/db/repositories/admin/admin_events_repository.py index 80f4dc24..f7f311f8 100644 --- a/backend/app/db/repositories/admin/admin_events_repository.py +++ b/backend/app/db/repositories/admin/admin_events_repository.py @@ -1,9 +1,9 @@ from datetime import datetime, timedelta, timezone from typing import Any, Dict, List -from motor.motor_asyncio import AsyncIOMotorCollection, AsyncIOMotorDatabase from pymongo import ReturnDocument +from app.core.database_context import Collection, Database from app.domain.admin import ( ReplayQuery, ReplaySession, @@ -43,29 +43,26 @@ class AdminEventsRepository: """Repository for admin event operations using domain models.""" - def __init__(self, db: AsyncIOMotorDatabase): + def __init__(self, db: Database): self.db = db - self.events_collection: AsyncIOMotorCollection = self.db.get_collection(CollectionNames.EVENTS) - self.event_store_collection: AsyncIOMotorCollection = self.db.get_collection(CollectionNames.EVENT_STORE) + self.events_collection: Collection = self.db.get_collection(CollectionNames.EVENTS) + self.event_store_collection: Collection = self.db.get_collection(CollectionNames.EVENT_STORE) # Bind related collections used by this repository - self.executions_collection: AsyncIOMotorCollection = self.db.get_collection(CollectionNames.EXECUTIONS) - self.events_archive_collection: AsyncIOMotorCollection = self.db.get_collection( - CollectionNames.EVENTS_ARCHIVE - ) + self.executions_collection: Collection = self.db.get_collection(CollectionNames.EXECUTIONS) + self.events_archive_collection: Collection = 
self.db.get_collection(CollectionNames.EVENTS_ARCHIVE) self.replay_mapper = ReplaySessionMapper() self.replay_query_mapper = ReplayQueryMapper() - self.replay_sessions_collection: AsyncIOMotorCollection = self.db.get_collection( - CollectionNames.REPLAY_SESSIONS) + self.replay_sessions_collection: Collection = self.db.get_collection(CollectionNames.REPLAY_SESSIONS) self.mapper = EventMapper() self.summary_mapper = EventSummaryMapper() async def browse_events( - self, - filter: EventFilter, - skip: int = 0, - limit: int = 50, - sort_by: str = EventFields.TIMESTAMP, - sort_order: int = SortDirection.DESCENDING + self, + filter: EventFilter, + skip: int = 0, + limit: int = 50, + sort_by: str = EventFields.TIMESTAMP, + sort_order: int = SortDirection.DESCENDING, ) -> EventBrowseResult: """Browse events with filters using domain models.""" query = EventFilterMapper.to_mongo_query(filter) @@ -82,12 +79,7 @@ async def browse_events( event_docs = await cursor.to_list(length=limit) events = [self.mapper.from_mongo_document(doc) for doc in event_docs] - return EventBrowseResult( - events=events, - total=total, - skip=skip, - limit=limit - ) + return EventBrowseResult(events=events, total=total, skip=skip, limit=limit) async def get_event_detail(self, event_id: str) -> EventDetail | None: """Get detailed information about an event.""" @@ -99,10 +91,13 @@ async def get_event_detail(self, event_id: str) -> EventDetail | None: event = self.mapper.from_mongo_document(event_doc) # Get related events - cursor = self.events_collection.find({ - EventFields.METADATA_CORRELATION_ID: event.correlation_id, - EventFields.EVENT_ID: {"$ne": event_id} - }).sort(EventFields.TIMESTAMP, SortDirection.ASCENDING).limit(10) + cursor = ( + self.events_collection.find( + {EventFields.METADATA_CORRELATION_ID: event.correlation_id, EventFields.EVENT_ID: {"$ne": event_id}} + ) + .sort(EventFields.TIMESTAMP, SortDirection.ASCENDING) + .limit(10) + ) related_docs = await cursor.to_list(length=10) related_events = [self.summary_mapper.from_mongo_document(doc) for doc in related_docs] @@ -110,11 +105,7 @@ async def get_event_detail(self, event_id: str) -> EventDetail | None: # Build timeline (could be expanded with more logic) timeline = related_events[:5] # Simple timeline for now - detail = EventDetail( - event=event, - related_events=related_events, - timeline=timeline - ) + detail = EventDetail(event=event, related_events=related_events, timeline=timeline) return detail @@ -131,18 +122,19 @@ async def get_event_stats(self, hours: int = 24) -> EventStatistics: overview_pipeline = EventStatsAggregation.build_overview_pipeline(start_time) overview_result = await self.events_collection.aggregate(overview_pipeline).to_list(1) - stats = overview_result[0] if overview_result else { - "total_events": 0, - "event_type_count": 0, - "unique_user_count": 0, - "service_count": 0 - } + stats = ( + overview_result[0] + if overview_result + else {"total_events": 0, "event_type_count": 0, "unique_user_count": 0, "service_count": 0} + ) # Get error rate - error_count = await self.events_collection.count_documents({ - EventFields.TIMESTAMP: {"$gte": start_time}, - EventFields.EVENT_TYPE: {"$regex": "failed|error|timeout", "$options": "i"} - }) + error_count = await self.events_collection.count_documents( + { + EventFields.TIMESTAMP: {"$gte": start_time}, + EventFields.EVENT_TYPE: {"$regex": "failed|error|timeout", "$options": "i"}, + } + ) error_rate = (error_count / stats["total_events"] * 100) if stats["total_events"] > 0 else 0 @@ -155,8 
+147,7 @@ async def get_event_stats(self, hours: int = 24) -> EventStatistics: hourly_pipeline = EventStatsAggregation.build_hourly_events_pipeline(start_time) hourly_cursor = self.events_collection.aggregate(hourly_pipeline) events_by_hour: list[HourlyEventCount | dict[str, Any]] = [ - HourlyEventCount(hour=doc["_id"], count=doc["count"]) - async for doc in hourly_cursor + HourlyEventCount(hour=doc["_id"], count=doc["count"]) async for doc in hourly_cursor ] # Get top users @@ -178,20 +169,16 @@ async def get_event_stats(self, hours: int = 24) -> EventStatistics: "$match": { "created_at": {"$gte": start_time}, "status": "completed", - "resource_usage.execution_time_wall_seconds": {"$exists": True} + "resource_usage.execution_time_wall_seconds": {"$exists": True}, } }, - { - "$group": { - "_id": None, - "avg_duration": {"$avg": "$resource_usage.execution_time_wall_seconds"} - } - } + {"$group": {"_id": None, "avg_duration": {"$avg": "$resource_usage.execution_time_wall_seconds"}}}, ] exec_result = await executions_collection.aggregate(exec_pipeline).to_list(1) - avg_processing_time = exec_result[0]["avg_duration"] if exec_result and exec_result[0].get( - "avg_duration") else 0 + avg_processing_time = ( + exec_result[0]["avg_duration"] if exec_result and exec_result[0].get("avg_duration") else 0 + ) statistics = EventStatistics( total_events=stats["total_events"], @@ -199,7 +186,7 @@ async def get_event_stats(self, hours: int = 24) -> EventStatistics: events_by_hour=events_by_hour, top_users=top_users, error_rate=round(error_rate, 2), - avg_processing_time=round(avg_processing_time, 2) + avg_processing_time=round(avg_processing_time, 2), ) return statistics @@ -208,10 +195,7 @@ async def export_events_csv(self, filter: EventFilter) -> List[EventExportRow]: """Export events as CSV data.""" query = EventFilterMapper.to_mongo_query(filter) - cursor = self.events_collection.find(query).sort( - EventFields.TIMESTAMP, - SortDirection.DESCENDING - ).limit(10000) + cursor = self.events_collection.find(query).sort(EventFields.TIMESTAMP, SortDirection.DESCENDING).limit(10000) event_docs = await cursor.to_list(length=10000) @@ -243,9 +227,7 @@ async def create_replay_session(self, session: ReplaySession) -> str: async def get_replay_session(self, session_id: str) -> ReplaySession | None: """Get replay session by ID.""" - doc = await self.replay_sessions_collection.find_one({ - ReplaySessionFields.SESSION_ID: session_id - }) + doc = await self.replay_sessions_collection.find_one({ReplaySessionFields.SESSION_ID: session_id}) return self.replay_mapper.from_dict(doc) if doc else None async def update_replay_session(self, session_id: str, updates: ReplaySessionUpdate) -> bool: @@ -256,16 +238,13 @@ async def update_replay_session(self, session_id: str, updates: ReplaySessionUpd mongo_updates = updates.to_dict() result = await self.replay_sessions_collection.update_one( - {ReplaySessionFields.SESSION_ID: session_id}, - {"$set": mongo_updates} + {ReplaySessionFields.SESSION_ID: session_id}, {"$set": mongo_updates} ) return result.modified_count > 0 async def get_replay_status_with_progress(self, session_id: str) -> ReplaySessionStatusDetail | None: """Get replay session status with progress updates.""" - doc = await self.replay_sessions_collection.find_one({ - ReplaySessionFields.SESSION_ID: session_id - }) + doc = await self.replay_sessions_collection.find_one({ReplaySessionFields.SESSION_ID: session_id}) if not doc: return None @@ -278,17 +257,14 @@ async def get_replay_status_with_progress(self, 
session_id: str) -> ReplaySessio if time_since_created.total_seconds() > 2: # Use atomic update to prevent race conditions update_result = await self.replay_sessions_collection.find_one_and_update( - { - ReplaySessionFields.SESSION_ID: session_id, - ReplaySessionFields.STATUS: ReplayStatus.SCHEDULED - }, + {ReplaySessionFields.SESSION_ID: session_id, ReplaySessionFields.STATUS: ReplayStatus.SCHEDULED}, { "$set": { ReplaySessionFields.STATUS: ReplayStatus.RUNNING, - ReplaySessionFields.STARTED_AT: current_time + ReplaySessionFields.STARTED_AT: current_time, } }, - return_document=ReturnDocument.AFTER + return_document=ReturnDocument.AFTER, ) if update_result: # Update local session object with the atomically updated values @@ -298,18 +274,13 @@ async def get_replay_status_with_progress(self, session_id: str) -> ReplaySessio if session.is_running and session.started_at: time_since_started = current_time - session.started_at # Assume 10 events per second processing rate - estimated_progress = min( - int(time_since_started.total_seconds() * 10), - session.total_events - ) + estimated_progress = min(int(time_since_started.total_seconds() * 10), session.total_events) # Update progress - returns new instance updated_session = session.update_progress(estimated_progress) # Update in database - session_update = ReplaySessionUpdate( - replayed_events=updated_session.replayed_events - ) + session_update = ReplaySessionUpdate(replayed_events=updated_session.replayed_events) if updated_session.is_completed: session_update.status = updated_session.status @@ -359,23 +330,23 @@ async def get_replay_status_with_progress(self, session_id: str) -> ReplaySessio for exec_id in list(execution_ids)[:10]: # Limit to 10 exec_doc = await executions_collection.find_one({"execution_id": exec_id}) if exec_doc: - execution_results.append({ - "execution_id": exec_doc.get("execution_id"), - "status": exec_doc.get("status"), - "stdout": exec_doc.get("stdout"), - "stderr": exec_doc.get("stderr"), - "exit_code": exec_doc.get("exit_code"), - "execution_time": exec_doc.get("execution_time"), - "lang": exec_doc.get("lang"), - "lang_version": exec_doc.get("lang_version"), - "created_at": exec_doc.get("created_at"), - "updated_at": exec_doc.get("updated_at") - }) + execution_results.append( + { + "execution_id": exec_doc.get("execution_id"), + "status": exec_doc.get("status"), + "stdout": exec_doc.get("stdout"), + "stderr": exec_doc.get("stderr"), + "exit_code": exec_doc.get("exit_code"), + "execution_time": exec_doc.get("execution_time"), + "lang": exec_doc.get("lang"), + "lang_version": exec_doc.get("lang_version"), + "created_at": exec_doc.get("created_at"), + "updated_at": exec_doc.get("updated_at"), + } + ) return ReplaySessionStatusDetail( - session=session, - estimated_completion=estimated_completion, - execution_results=execution_results + session=session, estimated_completion=estimated_completion, execution_results=execution_results ) async def count_events_for_replay(self, query: Dict[str, Any]) -> int: @@ -402,11 +373,7 @@ def build_replay_query(self, replay_query: ReplayQuery) -> Dict[str, Any]: return self.replay_query_mapper.to_mongodb_query(replay_query) async def prepare_replay_session( - self, - query: Dict[str, Any], - dry_run: bool, - replay_correlation_id: str, - max_events: int = 1000 + self, query: Dict[str, Any], dry_run: bool, replay_correlation_id: str, max_events: int = 1000 ) -> ReplaySessionData: """Prepare replay session with validation and preview.""" event_count = await 
self.count_events_for_replay(query) @@ -427,23 +394,16 @@ async def prepare_replay_session( replay_correlation_id=replay_correlation_id, dry_run=dry_run, query=query, - events_preview=events_preview + events_preview=events_preview, ) return session_data async def get_replay_events_preview( - self, - event_ids: List[str] | None = None, - correlation_id: str | None = None, - aggregate_id: str | None = None + self, event_ids: List[str] | None = None, correlation_id: str | None = None, aggregate_id: str | None = None ) -> Dict[str, Any]: """Get preview of events that would be replayed - backward compatibility.""" - replay_query = ReplayQuery( - event_ids=event_ids, - correlation_id=correlation_id, - aggregate_id=aggregate_id - ) + replay_query = ReplayQuery(event_ids=event_ids, correlation_id=correlation_id, aggregate_id=aggregate_id) query = self.replay_query_mapper.to_mongodb_query(replay_query) @@ -452,15 +412,9 @@ async def get_replay_events_preview( total = await self.event_store_collection.count_documents(query) - cursor = self.event_store_collection.find(query).sort( - EventFields.TIMESTAMP, - SortDirection.ASCENDING - ).limit(100) + cursor = self.event_store_collection.find(query).sort(EventFields.TIMESTAMP, SortDirection.ASCENDING).limit(100) # Batch fetch all events from cursor events = await cursor.to_list(length=100) - return { - "events": events, - "total": total - } + return {"events": events, "total": total} diff --git a/backend/app/db/repositories/admin/admin_settings_repository.py b/backend/app/db/repositories/admin/admin_settings_repository.py index 04323046..1e2e0d19 100644 --- a/backend/app/db/repositories/admin/admin_settings_repository.py +++ b/backend/app/db/repositories/admin/admin_settings_repository.py @@ -1,7 +1,6 @@ from datetime import datetime, timezone -from motor.motor_asyncio import AsyncIOMotorCollection, AsyncIOMotorDatabase - +from app.core.database_context import Collection, Database from app.core.logging import logger from app.domain.admin import ( AuditAction, @@ -12,10 +11,10 @@ class AdminSettingsRepository: - def __init__(self, db: AsyncIOMotorDatabase): + def __init__(self, db: Database): self.db = db - self.settings_collection: AsyncIOMotorCollection = self.db.get_collection("system_settings") - self.audit_log_collection: AsyncIOMotorCollection = self.db.get_collection("audit_log") + self.settings_collection: Collection = self.db.get_collection("system_settings") + self.audit_log_collection: Collection = self.db.get_collection("audit_log") self.settings_mapper = SettingsMapper() self.audit_mapper = AuditLogMapper() @@ -34,12 +33,7 @@ async def get_system_settings(self) -> SystemSettings: return self.settings_mapper.system_settings_from_dict(settings_doc) - async def update_system_settings( - self, - settings: SystemSettings, - updated_by: str, - user_id: str - ) -> SystemSettings: + async def update_system_settings(self, settings: SystemSettings, updated_by: str, user_id: str) -> SystemSettings: """Update system-wide settings.""" # Update settings metadata settings.updated_at = datetime.now(timezone.utc) @@ -47,11 +41,7 @@ async def update_system_settings( # Convert to dict and save settings_dict = self.settings_mapper.system_settings_to_dict(settings) - await self.settings_collection.replace_one( - {"_id": "global"}, - settings_dict, - upsert=True - ) + await self.settings_collection.replace_one({"_id": "global"}, settings_dict, upsert=True) # Create audit log entry audit_entry = AuditLogEntry( @@ -59,12 +49,10 @@ async def update_system_settings( 
user_id=user_id, username=updated_by, timestamp=datetime.now(timezone.utc), - changes=settings_dict + changes=settings_dict, ) - await self.audit_log_collection.insert_one( - self.audit_mapper.to_dict(audit_entry) - ) + await self.audit_log_collection.insert_one(self.audit_mapper.to_dict(audit_entry)) return settings @@ -78,7 +66,7 @@ async def reset_system_settings(self, username: str, user_id: str) -> SystemSett action=AuditAction.SYSTEM_SETTINGS_RESET, user_id=user_id, username=username, - timestamp=datetime.now(timezone.utc) + timestamp=datetime.now(timezone.utc), ) await self.audit_log_collection.insert_one(self.audit_mapper.to_dict(audit_entry)) diff --git a/backend/app/db/repositories/admin/admin_user_repository.py b/backend/app/db/repositories/admin/admin_user_repository.py index ef04ed37..f4d38549 100644 --- a/backend/app/db/repositories/admin/admin_user_repository.py +++ b/backend/app/db/repositories/admin/admin_user_repository.py @@ -1,7 +1,6 @@ from datetime import datetime, timezone -from motor.motor_asyncio import AsyncIOMotorCollection, AsyncIOMotorDatabase - +from app.core.database_context import Collection, Database from app.core.security import SecurityService from app.domain.enums import UserRole from app.domain.events.event_models import CollectionNames @@ -17,33 +16,26 @@ class AdminUserRepository: - def __init__(self, db: AsyncIOMotorDatabase): + def __init__(self, db: Database): self.db = db - self.users_collection: AsyncIOMotorCollection = self.db.get_collection(CollectionNames.USERS) + self.users_collection: Collection = self.db.get_collection(CollectionNames.USERS) # Related collections used by this repository (e.g., cascade deletes) - self.executions_collection: AsyncIOMotorCollection = self.db.get_collection(CollectionNames.EXECUTIONS) - self.saved_scripts_collection: AsyncIOMotorCollection = self.db.get_collection(CollectionNames.SAVED_SCRIPTS) - self.notifications_collection: AsyncIOMotorCollection = self.db.get_collection(CollectionNames.NOTIFICATIONS) - self.user_settings_collection: AsyncIOMotorCollection = self.db.get_collection(CollectionNames.USER_SETTINGS) - self.events_collection: AsyncIOMotorCollection = self.db.get_collection(CollectionNames.EVENTS) - self.sagas_collection: AsyncIOMotorCollection = self.db.get_collection(CollectionNames.SAGAS) + self.executions_collection: Collection = self.db.get_collection(CollectionNames.EXECUTIONS) + self.saved_scripts_collection: Collection = self.db.get_collection(CollectionNames.SAVED_SCRIPTS) + self.notifications_collection: Collection = self.db.get_collection(CollectionNames.NOTIFICATIONS) + self.user_settings_collection: Collection = self.db.get_collection(CollectionNames.USER_SETTINGS) + self.events_collection: Collection = self.db.get_collection(CollectionNames.EVENTS) + self.sagas_collection: Collection = self.db.get_collection(CollectionNames.SAGAS) self.security_service = SecurityService() self.mapper = UserMapper() async def list_users( - self, - limit: int = 100, - offset: int = 0, - search: str | None = None, - role: UserRole | None = None + self, limit: int = 100, offset: int = 0, search: str | None = None, role: UserRole | None = None ) -> UserListResult: """List all users with optional filtering.""" # Create search filter - search_filter = UserSearchFilter( - search_text=search, - role=role - ) + search_filter = UserSearchFilter(search_text=search, role=role) query = self.mapper.search_filter_to_query(search_filter) @@ -57,12 +49,7 @@ async def list_users( async for user_doc in cursor: 
users.append(self.mapper.from_mongo_document(user_doc)) - return UserListResult( - users=users, - total=total, - offset=offset, - limit=limit - ) + return UserListResult(users=users, total=total, offset=offset, limit=limit) async def get_user_by_id(self, user_id: str) -> User | None: """Get user by ID.""" @@ -71,11 +58,7 @@ async def get_user_by_id(self, user_id: str) -> User | None: return self.mapper.from_mongo_document(user_doc) return None - async def update_user( - self, - user_id: str, - update_data: UserUpdate - ) -> User | None: + async def update_user(self, user_id: str, update_data: UserUpdate) -> User | None: """Update user details.""" if not update_data.has_updates(): return await self.get_user_by_id(user_id) @@ -92,10 +75,7 @@ async def update_user( # Add updated_at timestamp update_dict[UserFields.UPDATED_AT] = datetime.now(timezone.utc) - result = await self.users_collection.update_one( - {UserFields.USER_ID: user_id}, - {"$set": update_dict} - ) + result = await self.users_collection.update_one({UserFields.USER_ID: user_id}, {"$set": update_dict}) if result.modified_count > 0: return await self.get_user_by_id(user_id) @@ -147,10 +127,7 @@ async def reset_user_password(self, password_reset: PasswordReset) -> bool: result = await self.users_collection.update_one( {UserFields.USER_ID: password_reset.user_id}, - {"$set": { - UserFields.HASHED_PASSWORD: hashed_password, - UserFields.UPDATED_AT: datetime.now(timezone.utc) - }} + {"$set": {UserFields.HASHED_PASSWORD: hashed_password, UserFields.UPDATED_AT: datetime.now(timezone.utc)}}, ) return result.modified_count > 0 diff --git a/backend/app/db/repositories/dlq_repository.py b/backend/app/db/repositories/dlq_repository.py index cb15f9cd..e873ca36 100644 --- a/backend/app/db/repositories/dlq_repository.py +++ b/backend/app/db/repositories/dlq_repository.py @@ -1,8 +1,7 @@ from datetime import datetime, timezone from typing import Dict, List, Mapping -from motor.motor_asyncio import AsyncIOMotorCollection, AsyncIOMotorDatabase - +from app.core.database_context import Collection, Database from app.core.logging import logger from app.dlq import ( AgeStatistics, @@ -24,17 +23,14 @@ class DLQRepository: - def __init__(self, db: AsyncIOMotorDatabase): + def __init__(self, db: Database): self.db = db - self.dlq_collection: AsyncIOMotorCollection = self.db.get_collection(CollectionNames.DLQ_MESSAGES) + self.dlq_collection: Collection = self.db.get_collection(CollectionNames.DLQ_MESSAGES) async def get_dlq_stats(self) -> DLQStatistics: # Get counts by status status_pipeline: list[Mapping[str, object]] = [ - {"$group": { - "_id": f"${DLQFields.STATUS}", - "count": {"$sum": 1} - }} + {"$group": {"_id": f"${DLQFields.STATUS}", "count": {"$sum": 1}}} ] status_results = [] @@ -49,57 +45,52 @@ async def get_dlq_stats(self) -> DLQStatistics: # Get counts by topic topic_pipeline: list[Mapping[str, object]] = [ - {"$group": { - "_id": f"${DLQFields.ORIGINAL_TOPIC}", - "count": {"$sum": 1}, - "avg_retry_count": {"$avg": f"${DLQFields.RETRY_COUNT}"} - }}, + { + "$group": { + "_id": f"${DLQFields.ORIGINAL_TOPIC}", + "count": {"$sum": 1}, + "avg_retry_count": {"$avg": f"${DLQFields.RETRY_COUNT}"}, + } + }, {"$sort": {"count": -1}}, - {"$limit": 10} + {"$limit": 10}, ] by_topic: List[TopicStatistic] = [] async for doc in self.dlq_collection.aggregate(topic_pipeline): - by_topic.append(TopicStatistic( - topic=doc["_id"], - count=doc["count"], - avg_retry_count=round(doc["avg_retry_count"], 2) - )) + by_topic.append( + TopicStatistic(topic=doc["_id"], 
count=doc["count"], avg_retry_count=round(doc["avg_retry_count"], 2)) + ) # Get counts by event type event_type_pipeline: list[Mapping[str, object]] = [ - {"$group": { - "_id": f"${DLQFields.EVENT_TYPE}", - "count": {"$sum": 1} - }}, + {"$group": {"_id": f"${DLQFields.EVENT_TYPE}", "count": {"$sum": 1}}}, {"$sort": {"count": -1}}, - {"$limit": 10} + {"$limit": 10}, ] by_event_type: List[EventTypeStatistic] = [] async for doc in self.dlq_collection.aggregate(event_type_pipeline): if doc["_id"]: # Skip null event types - by_event_type.append(EventTypeStatistic( - event_type=doc["_id"], - count=doc["count"] - )) + by_event_type.append(EventTypeStatistic(event_type=doc["_id"], count=doc["count"])) # Get age statistics age_pipeline: list[Mapping[str, object]] = [ - {"$project": { - "age_seconds": { - "$divide": [ - {"$subtract": [datetime.now(timezone.utc), f"${DLQFields.FAILED_AT}"]}, - 1000 - ] + { + "$project": { + "age_seconds": { + "$divide": [{"$subtract": [datetime.now(timezone.utc), f"${DLQFields.FAILED_AT}"]}, 1000] + } } - }}, - {"$group": { - "_id": None, - "min_age": {"$min": "$age_seconds"}, - "max_age": {"$max": "$age_seconds"}, - "avg_age": {"$avg": "$age_seconds"} - }} + }, + { + "$group": { + "_id": None, + "min_age": {"$min": "$age_seconds"}, + "max_age": {"$max": "$age_seconds"}, + "avg_age": {"$avg": "$age_seconds"}, + } + }, ] age_result = await self.dlq_collection.aggregate(age_pipeline).to_list(1) @@ -107,48 +98,34 @@ async def get_dlq_stats(self) -> DLQStatistics: age_stats = AgeStatistics( min_age_seconds=age_stats_data.get("min_age", 0.0), max_age_seconds=age_stats_data.get("max_age", 0.0), - avg_age_seconds=age_stats_data.get("avg_age", 0.0) + avg_age_seconds=age_stats_data.get("avg_age", 0.0), ) - return DLQStatistics( - by_status=by_status, - by_topic=by_topic, - by_event_type=by_event_type, - age_stats=age_stats - ) + return DLQStatistics(by_status=by_status, by_topic=by_topic, by_event_type=by_event_type, age_stats=age_stats) async def get_messages( - self, - status: str | None = None, - topic: str | None = None, - event_type: str | None = None, - limit: int = 50, - offset: int = 0 + self, + status: str | None = None, + topic: str | None = None, + event_type: str | None = None, + limit: int = 50, + offset: int = 0, ) -> DLQMessageListResult: # Create filter filter = DLQMessageFilter( - status=DLQMessageStatus(status) if status else None, - topic=topic, - event_type=event_type + status=DLQMessageStatus(status) if status else None, topic=topic, event_type=event_type ) query = DLQMapper.filter_to_query(filter) total_count = await self.dlq_collection.count_documents(query) - cursor = self.dlq_collection.find(query).sort( - DLQFields.FAILED_AT, -1 - ).skip(offset).limit(limit) + cursor = self.dlq_collection.find(query).sort(DLQFields.FAILED_AT, -1).skip(offset).limit(limit) messages = [] async for doc in cursor: messages.append(DLQMapper.from_mongo_document(doc)) - return DLQMessageListResult( - messages=messages, - total=total_count, - offset=offset, - limit=limit - ) + return DLQMessageListResult(messages=messages, total=total_count, offset=offset, limit=limit) async def get_message_by_id(self, event_id: str) -> DLQMessage | None: doc = await self.dlq_collection.find_one({DLQFields.EVENT_ID: event_id}) @@ -159,16 +136,18 @@ async def get_message_by_id(self, event_id: str) -> DLQMessage | None: async def get_topics_summary(self) -> list[DLQTopicSummary]: pipeline: list[Mapping[str, object]] = [ - {"$group": { - "_id": f"${DLQFields.ORIGINAL_TOPIC}", - "count": 
{"$sum": 1}, - "statuses": {"$push": f"${DLQFields.STATUS}"}, - "oldest_message": {"$min": f"${DLQFields.FAILED_AT}"}, - "newest_message": {"$max": f"${DLQFields.FAILED_AT}"}, - "avg_retry_count": {"$avg": f"${DLQFields.RETRY_COUNT}"}, - "max_retry_count": {"$max": f"${DLQFields.RETRY_COUNT}"} - }}, - {"$sort": {"count": -1}} + { + "$group": { + "_id": f"${DLQFields.ORIGINAL_TOPIC}", + "count": {"$sum": 1}, + "statuses": {"$push": f"${DLQFields.STATUS}"}, + "oldest_message": {"$min": f"${DLQFields.FAILED_AT}"}, + "newest_message": {"$max": f"${DLQFields.FAILED_AT}"}, + "avg_retry_count": {"$avg": f"${DLQFields.RETRY_COUNT}"}, + "max_retry_count": {"$max": f"${DLQFields.RETRY_COUNT}"}, + } + }, + {"$sort": {"count": -1}}, ] topics = [] @@ -177,15 +156,17 @@ async def get_topics_summary(self) -> list[DLQTopicSummary]: for status in result["statuses"]: status_counts[status] = status_counts.get(status, 0) + 1 - topics.append(DLQTopicSummary( - topic=result["_id"], - total_messages=result["count"], - status_breakdown=status_counts, - oldest_message=result["oldest_message"], - newest_message=result["newest_message"], - avg_retry_count=round(result["avg_retry_count"], 2), - max_retry_count=result["max_retry_count"] - )) + topics.append( + DLQTopicSummary( + topic=result["_id"], + total_messages=result["count"], + status_breakdown=status_counts, + oldest_message=result["oldest_message"], + newest_message=result["newest_message"], + avg_retry_count=round(result["avg_retry_count"], 2), + max_retry_count=result["max_retry_count"], + ) + ) return topics @@ -197,9 +178,9 @@ async def mark_message_retried(self, event_id: str) -> bool: "$set": { DLQFields.STATUS: DLQMessageStatus.RETRIED, DLQFields.RETRIED_AT: now, - DLQFields.LAST_UPDATED: now + DLQFields.LAST_UPDATED: now, } - } + }, ) return result.modified_count > 0 @@ -212,9 +193,9 @@ async def mark_message_discarded(self, event_id: str, reason: str) -> bool: DLQFields.STATUS: DLQMessageStatus.DISCARDED.value, DLQFields.DISCARDED_AT: now, DLQFields.DISCARD_REASON: reason, - DLQFields.LAST_UPDATED: now + DLQFields.LAST_UPDATED: now, } - } + }, ) return result.modified_count > 0 @@ -231,11 +212,7 @@ async def retry_messages_batch(self, event_ids: list[str], dlq_manager: DLQManag if not message: failed += 1 - details.append(DLQRetryResult( - event_id=event_id, - status="failed", - error="Message not found" - )) + details.append(DLQRetryResult(event_id=event_id, status="failed", error="Message not found")) continue # Use dlq_manager for retry logic @@ -245,30 +222,14 @@ async def retry_messages_batch(self, event_ids: list[str], dlq_manager: DLQManag # Mark as retried await self.mark_message_retried(event_id) successful += 1 - details.append(DLQRetryResult( - event_id=event_id, - status="success" - )) + details.append(DLQRetryResult(event_id=event_id, status="success")) else: failed += 1 - details.append(DLQRetryResult( - event_id=event_id, - status="failed", - error="Retry failed" - )) + details.append(DLQRetryResult(event_id=event_id, status="failed", error="Retry failed")) except Exception as e: logger.error(f"Error retrying message {event_id}: {e}") failed += 1 - details.append(DLQRetryResult( - event_id=event_id, - status="failed", - error=str(e) - )) - - return DLQBatchRetryResult( - total=len(event_ids), - successful=successful, - failed=failed, - details=details - ) + details.append(DLQRetryResult(event_id=event_id, status="failed", error=str(e))) + + return DLQBatchRetryResult(total=len(event_ids), successful=successful, failed=failed, 
details=details) diff --git a/backend/app/db/repositories/event_repository.py b/backend/app/db/repositories/event_repository.py index 2d2789a9..f26d3e21 100644 --- a/backend/app/db/repositories/event_repository.py +++ b/backend/app/db/repositories/event_repository.py @@ -3,9 +3,9 @@ from types import MappingProxyType from typing import Any, AsyncIterator, Mapping -from motor.motor_asyncio import AsyncIOMotorCollection, AsyncIOMotorDatabase from pymongo import ASCENDING, DESCENDING +from app.core.database_context import Collection, Database from app.core.logging import logger from app.core.tracing import EventAttributes from app.core.tracing.utils import add_span_attributes @@ -25,34 +25,25 @@ class EventRepository: - def __init__(self, database: AsyncIOMotorDatabase) -> None: + def __init__(self, database: Database) -> None: self.database = database self.mapper = EventMapper() - self._collection: AsyncIOMotorCollection = self.database.get_collection(CollectionNames.EVENTS) + self._collection: Collection = self.database.get_collection(CollectionNames.EVENTS) - def _build_time_filter( - self, - start_time: datetime | None, - end_time: datetime | None - ) -> dict[str, object]: + def _build_time_filter(self, start_time: datetime | None, end_time: datetime | None) -> dict[str, object]: """Build time range filter, eliminating if-else branching.""" - return { - key: value for key, value in { - "$gte": start_time, - "$lte": end_time - }.items() if value is not None - } + return {key: value for key, value in {"$gte": start_time, "$lte": end_time}.items() if value is not None} async def store_event(self, event: Event) -> str: """ Store an event in the collection - + Args: event: Event domain model to store - + Returns: Event ID of stored event - + Raises: DuplicateKeyError: If event with same ID already exists """ @@ -75,10 +66,10 @@ async def store_event(self, event: Event) -> str: async def store_events_batch(self, events: list[Event]) -> list[str]: """ Store multiple events in a batch - + Args: events: List of event domain models to store - + Returns: List of stored event IDs """ @@ -106,12 +97,12 @@ async def get_event(self, event_id: str) -> Event | None: return self.mapper.from_mongo_document(result) if result else None async def get_events_by_type( - self, - event_type: str, - start_time: datetime | None = None, - end_time: datetime | None = None, - limit: int = 100, - skip: int = 0 + self, + event_type: str, + start_time: datetime | None = None, + end_time: datetime | None = None, + limit: int = 100, + skip: int = 0, ) -> list[Event]: query: dict[str, Any] = {EventFields.EVENT_TYPE: event_type} time_filter = self._build_time_filter(start_time, end_time) @@ -123,10 +114,7 @@ async def get_events_by_type( return [self.mapper.from_mongo_document(doc) for doc in docs] async def get_events_by_aggregate( - self, - aggregate_id: str, - event_types: list[str] | None = None, - limit: int = 100 + self, aggregate_id: str, event_types: list[str] | None = None, limit: int = 100 ) -> list[Event]: query: dict[str, Any] = {EventFields.AGGREGATE_ID: aggregate_id} if event_types: @@ -136,24 +124,23 @@ async def get_events_by_aggregate( docs = await cursor.to_list(length=limit) return [self.mapper.from_mongo_document(doc) for doc in docs] - async def get_events_by_correlation( - self, - correlation_id: str, - limit: int = 100 - ) -> list[Event]: - cursor = (self._collection.find({EventFields.METADATA_CORRELATION_ID: correlation_id}) - .sort(EventFields.TIMESTAMP, ASCENDING).limit(limit)) + async def 
get_events_by_correlation(self, correlation_id: str, limit: int = 100) -> list[Event]: + cursor = ( + self._collection.find({EventFields.METADATA_CORRELATION_ID: correlation_id}) + .sort(EventFields.TIMESTAMP, ASCENDING) + .limit(limit) + ) docs = await cursor.to_list(length=limit) return [self.mapper.from_mongo_document(doc) for doc in docs] async def get_events_by_user( - self, - user_id: str, - event_types: list[str] | None = None, - start_time: datetime | None = None, - end_time: datetime | None = None, - limit: int = 100, - skip: int = 0 + self, + user_id: str, + event_types: list[str] | None = None, + start_time: datetime | None = None, + end_time: datetime | None = None, + limit: int = 100, + skip: int = 0, ) -> list[Event]: query: dict[str, Any] = {EventFields.METADATA_USER_ID: user_id} if event_types: @@ -166,28 +153,15 @@ async def get_events_by_user( docs = await cursor.to_list(length=limit) return [self.mapper.from_mongo_document(doc) for doc in docs] - async def get_execution_events( - self, - execution_id: str, - limit: int = 100 - ) -> list[Event]: - query = { - "$or": [ - {EventFields.PAYLOAD_EXECUTION_ID: execution_id}, - {EventFields.AGGREGATE_ID: execution_id} - ] - } + async def get_execution_events(self, execution_id: str, limit: int = 100) -> list[Event]: + query = {"$or": [{EventFields.PAYLOAD_EXECUTION_ID: execution_id}, {EventFields.AGGREGATE_ID: execution_id}]} cursor = self._collection.find(query).sort(EventFields.TIMESTAMP, ASCENDING).limit(limit) docs = await cursor.to_list(length=limit) return [self.mapper.from_mongo_document(doc) for doc in docs] async def search_events( - self, - text_query: str, - filters: dict[str, object] | None = None, - limit: int = 100, - skip: int = 0 + self, text_query: str, filters: dict[str, object] | None = None, limit: int = 100, skip: int = 0 ) -> list[Event]: query: dict[str, object] = {"$text": {"$search": text_query}} if filters: @@ -198,9 +172,7 @@ async def search_events( return [self.mapper.from_mongo_document(doc) for doc in docs] async def get_event_statistics( - self, - start_time: datetime | None = None, - end_time: datetime | None = None + self, start_time: datetime | None = None, end_time: datetime | None = None ) -> EventStatistics: pipeline: list[Mapping[str, object]] = [] @@ -208,37 +180,37 @@ async def get_event_statistics( if time_filter: pipeline.append({"$match": {EventFields.TIMESTAMP: time_filter}}) - pipeline.extend([ - { - "$facet": { - "by_type": [ - {"$group": {"_id": f"${EventFields.EVENT_TYPE}", "count": {"$sum": 1}}}, - {"$sort": {"count": -1}} - ], - "by_service": [ - {"$group": {"_id": f"${EventFields.METADATA_SERVICE_NAME}", "count": {"$sum": 1}}}, - {"$sort": {"count": -1}} - ], - "by_hour": [ - { - "$group": { - "_id": { - "$dateToString": { - "format": "%Y-%m-%d %H:00", - "date": f"${EventFields.TIMESTAMP}" - } - }, - "count": {"$sum": 1} - } - }, - {"$sort": {"_id": 1}} - ], - "total": [ - {"$count": "count"} - ] + pipeline.extend( + [ + { + "$facet": { + "by_type": [ + {"$group": {"_id": f"${EventFields.EVENT_TYPE}", "count": {"$sum": 1}}}, + {"$sort": {"count": -1}}, + ], + "by_service": [ + {"$group": {"_id": f"${EventFields.METADATA_SERVICE_NAME}", "count": {"$sum": 1}}}, + {"$sort": {"count": -1}}, + ], + "by_hour": [ + { + "$group": { + "_id": { + "$dateToString": { + "format": "%Y-%m-%d %H:00", + "date": f"${EventFields.TIMESTAMP}", + } + }, + "count": {"$sum": 1}, + } + }, + {"$sort": {"_id": 1}}, + ], + "total": [{"$count": "count"}], + } } - } - ]) + ] + ) result = await 
self._collection.aggregate(pipeline).to_list(length=1) @@ -248,21 +220,16 @@ async def get_event_statistics( total_events=stats["total"][0]["count"] if stats["total"] else 0, events_by_type={item["_id"]: item["count"] for item in stats["by_type"]}, events_by_service={item["_id"]: item["count"] for item in stats["by_service"]}, - events_by_hour=stats["by_hour"] + events_by_hour=stats["by_hour"], ) - return EventStatistics( - total_events=0, - events_by_type={}, - events_by_service={}, - events_by_hour=[] - ) + return EventStatistics(total_events=0, events_by_type={}, events_by_service={}, events_by_hour=[]) async def get_event_statistics_filtered( - self, - match: Mapping[str, object] = MappingProxyType({}), - start_time: datetime | None = None, - end_time: datetime | None = None, + self, + match: Mapping[str, object] = MappingProxyType({}), + start_time: datetime | None = None, + end_time: datetime | None = None, ) -> EventStatistics: pipeline: list[Mapping[str, object]] = [] @@ -275,37 +242,37 @@ async def get_event_statistics_filtered( if and_clauses: pipeline.append({"$match": {"$and": and_clauses}}) - pipeline.extend([ - { - "$facet": { - "by_type": [ - {"$group": {"_id": f"${EventFields.EVENT_TYPE}", "count": {"$sum": 1}}}, - {"$sort": {"count": -1}} - ], - "by_service": [ - {"$group": {"_id": f"${EventFields.METADATA_SERVICE_NAME}", "count": {"$sum": 1}}}, - {"$sort": {"count": -1}} - ], - "by_hour": [ - { - "$group": { - "_id": { - "$dateToString": { - "format": "%Y-%m-%d %H:00", - "date": f"${EventFields.TIMESTAMP}" - } - }, - "count": {"$sum": 1} - } - }, - {"$sort": {"_id": 1}} - ], - "total": [ - {"$count": "count"} - ] + pipeline.extend( + [ + { + "$facet": { + "by_type": [ + {"$group": {"_id": f"${EventFields.EVENT_TYPE}", "count": {"$sum": 1}}}, + {"$sort": {"count": -1}}, + ], + "by_service": [ + {"$group": {"_id": f"${EventFields.METADATA_SERVICE_NAME}", "count": {"$sum": 1}}}, + {"$sort": {"count": -1}}, + ], + "by_hour": [ + { + "$group": { + "_id": { + "$dateToString": { + "format": "%Y-%m-%d %H:00", + "date": f"${EventFields.TIMESTAMP}", + } + }, + "count": {"$sum": 1}, + } + }, + {"$sort": {"_id": 1}}, + ], + "total": [{"$count": "count"}], + } } - } - ]) + ] + ) result = await self._collection.aggregate(pipeline).to_list(length=1) if result: @@ -314,18 +281,16 @@ async def get_event_statistics_filtered( total_events=stats["total"][0]["count"] if stats["total"] else 0, events_by_type={item["_id"]: item["count"] for item in stats["by_type"]}, events_by_service={item["_id"]: item["count"] for item in stats["by_service"]}, - events_by_hour=stats["by_hour"] + events_by_hour=stats["by_hour"], ) return EventStatistics(total_events=0, events_by_type={}, events_by_service={}, events_by_hour=[]) async def stream_events( - self, - filters: dict[str, object] | None = None, - start_after: dict[str, object] | None = None + self, filters: dict[str, object] | None = None, start_after: dict[str, object] | None = None ) -> AsyncIterator[dict[str, object]]: """ Stream events using change streams for real-time updates - + Args: filters: Optional filters for events start_after: Resume token for continuing from previous position @@ -334,29 +299,22 @@ async def stream_events( if filters: pipeline.append({"$match": filters}) - async with self._collection.watch( - pipeline, - start_after=start_after, - full_document="updateLookup" - ) as stream: + async with self._collection.watch(pipeline, start_after=start_after, full_document="updateLookup") as stream: async for change in stream: if 
change["operationType"] in ["insert", "update", "replace"]: yield change["fullDocument"] async def cleanup_old_events( - self, - older_than_days: int = 30, - event_types: list[str] | None = None, - dry_run: bool = False + self, older_than_days: int = 30, event_types: list[str] | None = None, dry_run: bool = False ) -> int: """ Manually cleanup old events (in addition to TTL) - + Args: older_than_days: Delete events older than this many days event_types: Only cleanup specific event types dry_run: If True, only count events without deleting - + Returns: Number of events deleted (or would be deleted if dry_run) """ @@ -378,14 +336,14 @@ async def cleanup_old_events( # Access checks are handled in the service layer. async def get_user_events_paginated( - self, - user_id: str, - event_types: list[str] | None = None, - start_time: datetime | None = None, - end_time: datetime | None = None, - limit: int = 100, - skip: int = 0, - sort_order: str = "desc" + self, + user_id: str, + event_types: list[str] | None = None, + start_time: datetime | None = None, + end_time: datetime | None = None, + limit: int = 100, + skip: int = 0, + sort_order: str = "desc", ) -> EventListResult: """Get paginated user events with count""" query: dict[str, Any] = {EventFields.METADATA_USER_ID: user_id} @@ -411,15 +369,10 @@ async def get_user_events_paginated( total=total_count, skip=skip, limit=limit, - has_more=(skip + limit) < total_count + has_more=(skip + limit) < total_count, ) - async def query_events_advanced( - self, - user_id: str, - user_role: str, - filters: EventFilter - ) -> EventListResult | None: + async def query_events_advanced(self, user_id: str, user_role: str, filters: EventFilter) -> EventListResult | None: """Advanced event query with filters""" query: dict[str, object] = {} @@ -453,16 +406,12 @@ async def query_events_advanced( total=total_count, skip=0, limit=100, - has_more=100 < total_count + has_more=100 < total_count, ) add_span_attributes(**{"events.query.total": total_count}) return result_obj - async def aggregate_events( - self, - pipeline: list[dict[str, object]], - limit: int = 100 - ) -> EventAggregationResult: + async def aggregate_events(self, pipeline: list[dict[str, object]], limit: int = 100) -> EventAggregationResult: pipeline = pipeline.copy() pipeline.append({"$limit": limit}) @@ -478,22 +427,19 @@ async def list_event_types(self, match: Mapping[str, object] = MappingProxyType( pipeline: list[Mapping[str, object]] = [] if match: pipeline.append({"$match": dict(match)}) - pipeline.extend([ - {"$group": {"_id": f"${EventFields.EVENT_TYPE}"}}, - {"$sort": {"_id": 1}} - ]) + pipeline.extend([{"$group": {"_id": f"${EventFields.EVENT_TYPE}"}}, {"$sort": {"_id": 1}}]) event_types: list[str] = [] async for doc in self._collection.aggregate(pipeline): event_types.append(doc["_id"]) return event_types async def query_events_generic( - self, - query: dict[str, object], - sort_field: str, - sort_direction: int, - skip: int, - limit: int, + self, + query: dict[str, object], + sort_field: str, + sort_direction: int, + skip: int, + limit: int, ) -> EventListResult: total_count = await self._collection.count_documents(query) @@ -514,10 +460,7 @@ async def query_events_generic( ) async def delete_event_with_archival( - self, - event_id: str, - deleted_by: str, - deletion_reason: str = "Admin deletion via API" + self, event_id: str, deleted_by: str, deletion_reason: str = "Admin deletion via API" ) -> ArchivedEvent | None: """Delete event and archive it""" event = await self.get_event(event_id) 
@@ -540,7 +483,7 @@ async def delete_event_with_archival( error=event.error, deleted_at=datetime.now(timezone.utc), deleted_by=deleted_by, - deletion_reason=deletion_reason + deletion_reason=deletion_reason, ) # Archive the event @@ -556,26 +499,16 @@ async def delete_event_with_archival( return archived_event - async def get_aggregate_events_for_replay( - self, - aggregate_id: str, - limit: int = 10000 - ) -> list[Event]: + async def get_aggregate_events_for_replay(self, aggregate_id: str, limit: int = 10000) -> list[Event]: """Get all events for an aggregate for replay purposes""" - events = await self.get_events_by_aggregate( - aggregate_id=aggregate_id, - limit=limit - ) + events = await self.get_events_by_aggregate(aggregate_id=aggregate_id, limit=limit) if not events: return [] return events - async def get_aggregate_replay_info( - self, - aggregate_id: str - ) -> EventReplayInfo | None: + async def get_aggregate_replay_info(self, aggregate_id: str) -> EventReplayInfo | None: """Get aggregate events and prepare replay information""" events = await self.get_aggregate_events_for_replay(aggregate_id) @@ -587,5 +520,5 @@ async def get_aggregate_replay_info( event_count=len(events), event_types=list(set(e.event_type for e in events)), start_time=min(e.timestamp for e in events), - end_time=max(e.timestamp for e in events) + end_time=max(e.timestamp for e in events), ) diff --git a/backend/app/db/repositories/execution_repository.py b/backend/app/db/repositories/execution_repository.py index 55fad752..a0d8fcd0 100644 --- a/backend/app/db/repositories/execution_repository.py +++ b/backend/app/db/repositories/execution_repository.py @@ -1,7 +1,7 @@ from datetime import datetime, timezone +from typing import Any -from motor.motor_asyncio import AsyncIOMotorCollection, AsyncIOMotorDatabase - +from app.core.database_context import Collection, Database from app.core.logging import logger from app.domain.enums.execution import ExecutionStatus from app.domain.events.event_models import CollectionNames @@ -9,10 +9,10 @@ class ExecutionRepository: - def __init__(self, db: AsyncIOMotorDatabase): + def __init__(self, db: Database): self.db = db - self.collection: AsyncIOMotorCollection = self.db.get_collection(CollectionNames.EXECUTIONS) - self.results_collection: AsyncIOMotorCollection = self.db.get_collection(CollectionNames.EXECUTION_RESULTS) + self.collection: Collection = self.db.get_collection(CollectionNames.EXECUTIONS) + self.results_collection: Collection = self.db.get_collection(CollectionNames.EXECUTION_RESULTS) async def create_execution(self, execution: DomainExecution) -> DomainExecution: execution_dict = { @@ -55,8 +55,9 @@ async def get_execution(self, execution_id: str) -> DomainExecution | None: document["status"] = result_doc.get("status") sv = document.get("status") + resource_usage_data = document.get("resource_usage") return DomainExecution( - execution_id=document.get("execution_id"), + execution_id=document["execution_id"], script=document.get("script", ""), status=ExecutionStatus(str(sv)), stdout=document.get("stdout"), @@ -66,22 +67,18 @@ async def get_execution(self, execution_id: str) -> DomainExecution | None: created_at=document.get("created_at", datetime.now(timezone.utc)), updated_at=document.get("updated_at", datetime.now(timezone.utc)), resource_usage=( - ResourceUsageDomain.from_dict(document.get("resource_usage")) - if document.get("resource_usage") is not None - else None + ResourceUsageDomain.from_dict(resource_usage_data) if resource_usage_data is not None else 
None ), user_id=document.get("user_id"), exit_code=document.get("exit_code"), error_type=document.get("error_type"), ) - async def update_execution(self, execution_id: str, update_data: dict) -> bool: + async def update_execution(self, execution_id: str, update_data: dict[str, Any]) -> bool: update_data.setdefault("updated_at", datetime.now(timezone.utc)) update_payload = {"$set": update_data} - result = await self.collection.update_one( - {"execution_id": execution_id}, update_payload - ) + result = await self.collection.update_one({"execution_id": execution_id}, update_payload) return result.matched_count > 0 async def write_terminal_result(self, exec_result: ExecutionResultDomain) -> bool: @@ -123,11 +120,7 @@ async def write_terminal_result(self, exec_result: ExecutionResultDomain) -> boo return True async def get_executions( - self, - query: dict, - limit: int = 50, - skip: int = 0, - sort: list | None = None + self, query: dict[str, Any], limit: int = 50, skip: int = 0, sort: list[tuple[str, int]] | None = None ) -> list[DomainExecution]: cursor = self.collection.find(query) if sort: @@ -161,7 +154,7 @@ async def get_executions( return executions - async def count_executions(self, query: dict) -> int: + async def count_executions(self, query: dict[str, Any]) -> int: return await self.collection.count_documents(query) async def delete_execution(self, execution_id: str) -> bool: diff --git a/backend/app/db/repositories/notification_repository.py b/backend/app/db/repositories/notification_repository.py index dfc99308..bc0f3709 100644 --- a/backend/app/db/repositories/notification_repository.py +++ b/backend/app/db/repositories/notification_repository.py @@ -1,8 +1,8 @@ from datetime import UTC, datetime, timedelta -from motor.motor_asyncio import AsyncIOMotorCollection, AsyncIOMotorDatabase from pymongo import ASCENDING, DESCENDING, IndexModel +from app.core.database_context import Collection, Database from app.core.logging import logger from app.domain.enums.notification import ( NotificationChannel, @@ -16,35 +16,38 @@ class NotificationRepository: - def __init__(self, database: AsyncIOMotorDatabase): - self.db: AsyncIOMotorDatabase = database + def __init__(self, database: Database): + self.db: Database = database - self.notifications_collection: AsyncIOMotorCollection = self.db.get_collection(CollectionNames.NOTIFICATIONS) - self.subscriptions_collection: AsyncIOMotorCollection = self.db.get_collection( - CollectionNames.NOTIFICATION_SUBSCRIPTIONS) + self.notifications_collection: Collection = self.db.get_collection(CollectionNames.NOTIFICATIONS) + self.subscriptions_collection: Collection = self.db.get_collection(CollectionNames.NOTIFICATION_SUBSCRIPTIONS) self.mapper = NotificationMapper() async def create_indexes(self) -> None: # Create indexes if only _id exists notif_indexes = await self.notifications_collection.list_indexes().to_list(None) if len(notif_indexes) <= 1: - await self.notifications_collection.create_indexes([ - IndexModel([("user_id", ASCENDING), ("created_at", DESCENDING)]), - IndexModel([("status", ASCENDING), ("scheduled_for", ASCENDING)]), - IndexModel([("created_at", ASCENDING)]), - IndexModel([("notification_id", ASCENDING)], unique=True), - # Multikey index to speed up tag queries (include/exclude/prefix) - IndexModel([("tags", ASCENDING)]), - ]) + await self.notifications_collection.create_indexes( + [ + IndexModel([("user_id", ASCENDING), ("created_at", DESCENDING)]), + IndexModel([("status", ASCENDING), ("scheduled_for", ASCENDING)]), + 
IndexModel([("created_at", ASCENDING)]), + IndexModel([("notification_id", ASCENDING)], unique=True), + # Multikey index to speed up tag queries (include/exclude/prefix) + IndexModel([("tags", ASCENDING)]), + ] + ) subs_indexes = await self.subscriptions_collection.list_indexes().to_list(None) if len(subs_indexes) <= 1: - await self.subscriptions_collection.create_indexes([ - IndexModel([("user_id", ASCENDING), ("channel", ASCENDING)], unique=True), - IndexModel([("enabled", ASCENDING)]), - IndexModel([("include_tags", ASCENDING)]), - IndexModel([("severities", ASCENDING)]), - ]) + await self.subscriptions_collection.create_indexes( + [ + IndexModel([("user_id", ASCENDING), ("channel", ASCENDING)], unique=True), + IndexModel([("enabled", ASCENDING)]), + IndexModel([("include_tags", ASCENDING)]), + IndexModel([("severities", ASCENDING)]), + ] + ) # Notifications async def create_notification(self, notification: DomainNotification) -> str: @@ -60,9 +63,7 @@ async def update_notification(self, notification: DomainNotification) -> bool: return result.modified_count > 0 async def get_notification(self, notification_id: str, user_id: str) -> DomainNotification | None: - doc = await self.notifications_collection.find_one( - {"notification_id": notification_id, "user_id": user_id} - ) + doc = await self.notifications_collection.find_one({"notification_id": notification_id, "user_id": user_id}) if not doc: return None return self.mapper.from_mongo_document(doc) @@ -88,14 +89,14 @@ async def delete_notification(self, notification_id: str, user_id: str) -> bool: return result.deleted_count > 0 async def list_notifications( - self, - user_id: str, - status: NotificationStatus | None = None, - skip: int = 0, - limit: int = 20, - include_tags: list[str] | None = None, - exclude_tags: list[str] | None = None, - tag_prefix: str | None = None, + self, + user_id: str, + status: NotificationStatus | None = None, + skip: int = 0, + limit: int = 20, + include_tags: list[str] | None = None, + exclude_tags: list[str] | None = None, + tag_prefix: str | None = None, ) -> list[DomainNotification]: base: dict[str, object] = {"user_id": user_id} if status: @@ -112,10 +113,7 @@ async def list_notifications( query = {"$and": [base] + tag_filters} cursor = ( - self.notifications_collection.find(query or base) - .sort("created_at", DESCENDING) - .skip(skip) - .limit(limit) + self.notifications_collection.find(query or base).sort("created_at", DESCENDING).skip(skip).limit(limit) ) items: list[DomainNotification] = [] @@ -124,11 +122,11 @@ async def list_notifications( return items async def list_notifications_by_tag( - self, - user_id: str, - tag: str, - skip: int = 0, - limit: int = 20, + self, + user_id: str, + tag: str, + skip: int = 0, + limit: int = 20, ) -> list[DomainNotification]: """Convenience helper to list notifications filtered by a single exact tag.""" return await self.list_notifications( @@ -138,9 +136,7 @@ async def list_notifications_by_tag( include_tags=[tag], ) - async def count_notifications( - self, user_id: str, additional_filters: dict[str, object] | None = None - ) -> int: + async def count_notifications(self, user_id: str, additional_filters: dict[str, object] | None = None) -> int: query: dict[str, object] = {"user_id": user_id} if additional_filters: query.update(additional_filters) @@ -204,43 +200,33 @@ async def cleanup_old_notifications(self, days: int = 30) -> int: # Subscriptions async def get_subscription( - self, user_id: str, channel: NotificationChannel + self, user_id: str, channel: 
NotificationChannel ) -> DomainNotificationSubscription | None: - doc = await self.subscriptions_collection.find_one( - {"user_id": user_id, "channel": channel} - ) + doc = await self.subscriptions_collection.find_one({"user_id": user_id, "channel": channel}) if not doc: return None return self.mapper.subscription_from_mongo_document(doc) async def upsert_subscription( - self, - user_id: str, - channel: NotificationChannel, - subscription: DomainNotificationSubscription, + self, + user_id: str, + channel: NotificationChannel, + subscription: DomainNotificationSubscription, ) -> None: subscription.user_id = user_id subscription.channel = channel subscription.updated_at = datetime.now(UTC) doc = self.mapper.subscription_to_mongo_document(subscription) - await self.subscriptions_collection.replace_one( - {"user_id": user_id, "channel": channel}, doc, upsert=True - ) + await self.subscriptions_collection.replace_one({"user_id": user_id, "channel": channel}, doc, upsert=True) - async def get_all_subscriptions( - self, user_id: str - ) -> dict[str, DomainNotificationSubscription]: + async def get_all_subscriptions(self, user_id: str) -> dict[str, DomainNotificationSubscription]: subs: dict[str, DomainNotificationSubscription] = {} for channel in NotificationChannel: - doc = await self.subscriptions_collection.find_one( - {"user_id": user_id, "channel": channel} - ) + doc = await self.subscriptions_collection.find_one({"user_id": user_id, "channel": channel}) if doc: subs[channel] = self.mapper.subscription_from_mongo_document(doc) else: - subs[channel] = DomainNotificationSubscription( - user_id=user_id, channel=channel, enabled=True - ) + subs[channel] = DomainNotificationSubscription(user_id=user_id, channel=channel, enabled=True) return subs # User query operations for system notifications @@ -282,9 +268,7 @@ async def get_active_users(self, days: int = 30) -> list[str]: user_ids.add(user["user_id"]) executions_collection = self.db.executions - exec_cursor = executions_collection.find( - {"created_at": {"$gte": cutoff_date}}, {"user_id": 1} - ).limit(1000) + exec_cursor = executions_collection.find({"created_at": {"$gte": cutoff_date}}, {"user_id": 1}).limit(1000) async for execution in exec_cursor: if execution.get("user_id"): diff --git a/backend/app/db/repositories/replay_repository.py b/backend/app/db/repositories/replay_repository.py index 6f5620d1..1bf86eea 100644 --- a/backend/app/db/repositories/replay_repository.py +++ b/backend/app/db/repositories/replay_repository.py @@ -1,8 +1,8 @@ from typing import Any, AsyncIterator, Dict, List -from motor.motor_asyncio import AsyncIOMotorCollection, AsyncIOMotorDatabase from pymongo import ASCENDING, DESCENDING +from app.core.database_context import Collection, Database from app.core.logging import logger from app.domain.admin.replay_updates import ReplaySessionUpdate from app.domain.events.event_models import CollectionNames @@ -11,10 +11,10 @@ class ReplayRepository: - def __init__(self, database: AsyncIOMotorDatabase) -> None: + def __init__(self, database: Database) -> None: self.db = database - self.replay_collection: AsyncIOMotorCollection = database.get_collection(CollectionNames.REPLAY_SESSIONS) - self.events_collection: AsyncIOMotorCollection = database.get_collection(CollectionNames.EVENTS) + self.replay_collection: Collection = database.get_collection(CollectionNames.REPLAY_SESSIONS) + self.events_collection: Collection = database.get_collection(CollectionNames.EVENTS) self._mapper = ReplayStateMapper() async def 
create_indexes(self) -> None: @@ -34,11 +34,7 @@ async def create_indexes(self) -> None: async def save_session(self, session: ReplaySessionState) -> None: """Save or update a replay session (domain → persistence).""" doc = self._mapper.to_mongo_document(session) - await self.replay_collection.update_one( - {"session_id": session.session_id}, - {"$set": doc}, - upsert=True - ) + await self.replay_collection.update_one({"session_id": session.session_id}, {"$set": doc}, upsert=True) async def get_session(self, session_id: str) -> ReplaySessionState | None: """Get a replay session by ID (persistence → domain).""" @@ -46,11 +42,7 @@ async def get_session(self, session_id: str) -> ReplaySessionState | None: return self._mapper.from_mongo_document(data) if data else None async def list_sessions( - self, - status: str | None = None, - user_id: str | None = None, - limit: int = 100, - skip: int = 0 + self, status: str | None = None, user_id: str | None = None, limit: int = 100, skip: int = 0 ) -> list[ReplaySessionState]: collection = self.replay_collection @@ -68,38 +60,27 @@ async def list_sessions( async def update_session_status(self, session_id: str, status: str) -> bool: """Update the status of a replay session""" - result = await self.replay_collection.update_one( - {"session_id": session_id}, - {"$set": {"status": status}} - ) + result = await self.replay_collection.update_one({"session_id": session_id}, {"$set": {"status": status}}) return result.modified_count > 0 async def delete_old_sessions(self, cutoff_time: str) -> int: """Delete old completed/failed/cancelled sessions""" - result = await self.replay_collection.delete_many({ - "created_at": {"$lt": cutoff_time}, - "status": {"$in": ["completed", "failed", "cancelled"]} - }) + result = await self.replay_collection.delete_many( + {"created_at": {"$lt": cutoff_time}, "status": {"$in": ["completed", "failed", "cancelled"]}} + ) return result.deleted_count async def count_sessions(self, query: dict[str, object] | None = None) -> int: """Count sessions matching the given query""" return await self.replay_collection.count_documents(query or {}) - async def update_replay_session( - self, - session_id: str, - updates: ReplaySessionUpdate - ) -> bool: + async def update_replay_session(self, session_id: str, updates: ReplaySessionUpdate) -> bool: """Update specific fields of a replay session""" if not updates.has_updates(): return False mongo_updates = updates.to_dict() - result = await self.replay_collection.update_one( - {"session_id": session_id}, - {"$set": mongo_updates} - ) + result = await self.replay_collection.update_one({"session_id": session_id}, {"$set": mongo_updates}) return result.modified_count > 0 async def count_events(self, filter: ReplayFilter) -> int: @@ -108,10 +89,7 @@ async def count_events(self, filter: ReplayFilter) -> int: return await self.events_collection.count_documents(query) async def fetch_events( - self, - filter: ReplayFilter, - batch_size: int = 100, - skip: int = 0 + self, filter: ReplayFilter, batch_size: int = 100, skip: int = 0 ) -> AsyncIterator[List[Dict[str, Any]]]: """Fetch events in batches based on filter""" query = filter.to_mongo_query() diff --git a/backend/app/db/repositories/resource_allocation_repository.py b/backend/app/db/repositories/resource_allocation_repository.py index 56a4d5a9..c0aa8454 100644 --- a/backend/app/db/repositories/resource_allocation_repository.py +++ b/backend/app/db/repositories/resource_allocation_repository.py @@ -1,33 +1,34 @@ from datetime import datetime, timezone 
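# Illustrative sketch (not part of this diff): the skip/limit batching that
# ReplayRepository.fetch_events above relies on, written as an async generator.
# Assumes a running MongoDB; the URI, database/collection names, and the sample
# query are hypothetical.
import asyncio
from typing import Any, AsyncIterator, Dict, List

from motor.motor_asyncio import AsyncIOMotorClient


async def fetch_in_batches(
    collection: Any, query: Dict[str, Any], batch_size: int = 100
) -> AsyncIterator[List[Dict[str, Any]]]:
    skip = 0
    while True:
        cursor = collection.find(query).sort("timestamp", 1).skip(skip).limit(batch_size)
        batch = await cursor.to_list(length=batch_size)
        if not batch:
            break
        yield batch
        skip += batch_size


async def main() -> None:
    client = AsyncIOMotorClient("mongodb://localhost:27017")  # hypothetical URI
    events = client["app"]["events"]                          # hypothetical names
    async for batch in fetch_in_batches(events, {"event_type": "execution.completed"}):
        print(f"replaying {len(batch)} events")


if __name__ == "__main__":
    asyncio.run(main())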
-from motor.motor_asyncio import AsyncIOMotorCollection, AsyncIOMotorDatabase - +from app.core.database_context import Collection, Database from app.domain.events.event_models import CollectionNames class ResourceAllocationRepository: """Repository for resource allocation bookkeeping used by saga steps.""" - def __init__(self, database: AsyncIOMotorDatabase): + def __init__(self, database: Database): self._db = database - self._collection: AsyncIOMotorCollection = self._db.get_collection(CollectionNames.RESOURCE_ALLOCATIONS) + self._collection: Collection = self._db.get_collection(CollectionNames.RESOURCE_ALLOCATIONS) async def count_active(self, language: str) -> int: - return await self._collection.count_documents({ - "status": "active", - "language": language, - }) + return await self._collection.count_documents( + { + "status": "active", + "language": language, + } + ) async def create_allocation( - self, - allocation_id: str, - *, - execution_id: str, - language: str, - cpu_request: str, - memory_request: str, - cpu_limit: str, - memory_limit: str, + self, + allocation_id: str, + *, + execution_id: str, + language: str, + cpu_request: str, + memory_request: str, + cpu_limit: str, + memory_limit: str, ) -> bool: doc = { "_id": allocation_id, @@ -45,7 +46,6 @@ async def create_allocation( async def release_allocation(self, allocation_id: str) -> bool: result = await self._collection.update_one( - {"_id": allocation_id}, - {"$set": {"status": "released", "released_at": datetime.now(timezone.utc)}} + {"_id": allocation_id}, {"$set": {"status": "released", "released_at": datetime.now(timezone.utc)}} ) return result.modified_count > 0 diff --git a/backend/app/db/repositories/saga_repository.py b/backend/app/db/repositories/saga_repository.py index 6ee276dc..e5bcf86c 100644 --- a/backend/app/db/repositories/saga_repository.py +++ b/backend/app/db/repositories/saga_repository.py @@ -1,8 +1,8 @@ from datetime import datetime, timezone -from motor.motor_asyncio import AsyncIOMotorCollection, AsyncIOMotorDatabase from pymongo import DESCENDING +from app.core.database_context import Collection, Database from app.domain.enums.saga import SagaState from app.domain.events.event_models import CollectionNames from app.domain.saga.models import Saga, SagaFilter, SagaListResult @@ -11,16 +11,16 @@ class SagaRepository: """Repository for saga data access. - + This repository handles all database operations for sagas, following clean architecture principles with no business logic or HTTP-specific concerns. 
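# Illustrative sketch (not part of this diff): how a saga step might drive the
# allocation bookkeeping in ResourceAllocationRepository above. The per-language
# cap, URI, database/collection names, and any document fields beyond
# _id/status/language are assumptions for illustration.
import asyncio
import uuid
from datetime import datetime, timezone

from motor.motor_asyncio import AsyncIOMotorClient

MAX_ACTIVE_PER_LANGUAGE = 10  # hypothetical cap enforced by the calling saga step


async def allocate_then_release() -> None:
    client = AsyncIOMotorClient("mongodb://localhost:27017")  # hypothetical URI
    allocations = client["app"]["resource_allocations"]       # hypothetical names

    active = await allocations.count_documents({"status": "active", "language": "python"})
    if active >= MAX_ACTIVE_PER_LANGUAGE:
        raise RuntimeError("allocation limit reached for python")

    allocation_id = str(uuid.uuid4())
    await allocations.insert_one(
        {
            "_id": allocation_id,
            "status": "active",
            "language": "python",
            "cpu_request": "100m",      # illustrative values
            "memory_request": "128Mi",
            "allocated_at": datetime.now(timezone.utc),
        }
    )

    # ... run the execution; release the slot afterwards (a compensation step
    # would issue the same update on failure).
    await allocations.update_one(
        {"_id": allocation_id},
        {"$set": {"status": "released", "released_at": datetime.now(timezone.utc)}},
    )


if __name__ == "__main__":
    asyncio.run(allocate_then_release())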
""" - def __init__(self, database: AsyncIOMotorDatabase): + def __init__(self, database: Database): self.db = database - self.sagas: AsyncIOMotorCollection = self.db.get_collection(CollectionNames.SAGAS) - self.executions: AsyncIOMotorCollection = self.db.get_collection(CollectionNames.EXECUTIONS) + self.sagas: Collection = self.db.get_collection(CollectionNames.SAGAS) + self.executions: Collection = self.db.get_collection(CollectionNames.EXECUTIONS) self.mapper = SagaMapper() self.filter_mapper = SagaFilterMapper() @@ -34,21 +34,19 @@ async def upsert_saga(self, saga: Saga) -> bool: return result.modified_count > 0 async def get_saga_by_execution_and_name(self, execution_id: str, saga_name: str) -> Saga | None: - doc = await self.sagas.find_one({ - "execution_id": execution_id, - "saga_name": saga_name, - }) + doc = await self.sagas.find_one( + { + "execution_id": execution_id, + "saga_name": saga_name, + } + ) return self.mapper.from_mongo(doc) if doc else None async def get_saga(self, saga_id: str) -> Saga | None: doc = await self.sagas.find_one({"saga_id": saga_id}) return self.mapper.from_mongo(doc) if doc else None - async def get_sagas_by_execution( - self, - execution_id: str, - state: str | None = None - ) -> list[Saga]: + async def get_sagas_by_execution(self, execution_id: str, state: str | None = None) -> list[Saga]: query: dict[str, object] = {"execution_id": execution_id} if state: query["state"] = state @@ -57,69 +55,37 @@ async def get_sagas_by_execution( docs = await cursor.to_list(length=None) return [self.mapper.from_mongo(doc) for doc in docs] - async def list_sagas( - self, - filter: SagaFilter, - limit: int = 100, - skip: int = 0 - ) -> SagaListResult: + async def list_sagas(self, filter: SagaFilter, limit: int = 100, skip: int = 0) -> SagaListResult: query = self.filter_mapper.to_mongodb_query(filter) # Get total count total = await self.sagas.count_documents(query) # Get sagas with pagination - cursor = (self.sagas.find(query) - .sort("created_at", DESCENDING) - .skip(skip) - .limit(limit)) + cursor = self.sagas.find(query).sort("created_at", DESCENDING).skip(skip).limit(limit) docs = await cursor.to_list(length=limit) sagas = [self.mapper.from_mongo(doc) for doc in docs] - return SagaListResult( - sagas=sagas, - total=total, - skip=skip, - limit=limit - ) + return SagaListResult(sagas=sagas, total=total, skip=skip, limit=limit) - async def update_saga_state( - self, - saga_id: str, - state: str, - error_message: str | None = None - ) -> bool: - update_data = { - "state": state, - "updated_at": datetime.now(timezone.utc) - } + async def update_saga_state(self, saga_id: str, state: str, error_message: str | None = None) -> bool: + update_data = {"state": state, "updated_at": datetime.now(timezone.utc)} if error_message: update_data["error_message"] = error_message - result = await self.sagas.update_one( - {"saga_id": saga_id}, - {"$set": update_data} - ) + result = await self.sagas.update_one({"saga_id": saga_id}, {"$set": update_data}) return result.modified_count > 0 async def get_user_execution_ids(self, user_id: str) -> list[str]: - cursor = self.executions.find( - {"user_id": user_id}, - {"execution_id": 1} - ) + cursor = self.executions.find({"user_id": user_id}, {"execution_id": 1}) docs = await cursor.to_list(length=None) return [doc["execution_id"] for doc in docs] async def count_sagas_by_state(self) -> dict[str, int]: - pipeline = [ - {"$group": { - "_id": "$state", - "count": {"$sum": 1} - }} - ] + pipeline = [{"$group": {"_id": "$state", "count": 
{"$sum": 1}}}] result = {} async for doc in self.sagas.aggregate(pipeline): @@ -128,10 +94,10 @@ async def count_sagas_by_state(self) -> dict[str, int]: return result async def find_timed_out_sagas( - self, - cutoff_time: datetime, - states: list[SagaState] | None = None, - limit: int = 100, + self, + cutoff_time: datetime, + states: list[SagaState] | None = None, + limit: int = 100, ) -> list[Saga]: states = states or [SagaState.RUNNING, SagaState.COMPENSATING] query = { @@ -142,23 +108,14 @@ async def find_timed_out_sagas( docs = await cursor.to_list(length=limit) return [self.mapper.from_mongo(doc) for doc in docs] - async def get_saga_statistics( - self, - filter: SagaFilter | None = None - ) -> dict[str, object]: + async def get_saga_statistics(self, filter: SagaFilter | None = None) -> dict[str, object]: query = self.filter_mapper.to_mongodb_query(filter) if filter else {} # Basic counts total = await self.sagas.count_documents(query) # State distribution - state_pipeline = [ - {"$match": query}, - {"$group": { - "_id": "$state", - "count": {"$sum": 1} - }} - ] + state_pipeline = [{"$match": query}, {"$group": {"_id": "$state", "count": {"$sum": 1}}}] states = {} async for doc in self.sagas.aggregate(state_pipeline): @@ -167,15 +124,8 @@ async def get_saga_statistics( # Average duration for completed sagas duration_pipeline = [ {"$match": {**query, "state": "completed", "completed_at": {"$ne": None}}}, - {"$project": { - "duration": { - "$subtract": ["$completed_at", "$created_at"] - } - }}, - {"$group": { - "_id": None, - "avg_duration": {"$avg": "$duration"} - }} + {"$project": {"duration": {"$subtract": ["$completed_at", "$created_at"]}}}, + {"$group": {"_id": None, "avg_duration": {"$avg": "$duration"}}}, ] avg_duration = 0.0 @@ -183,8 +133,4 @@ async def get_saga_statistics( # Convert milliseconds to seconds avg_duration = doc["avg_duration"] / 1000.0 if doc["avg_duration"] else 0.0 - return { - "total": total, - "by_state": states, - "average_duration_seconds": avg_duration - } + return {"total": total, "by_state": states, "average_duration_seconds": avg_duration} diff --git a/backend/app/db/repositories/saved_script_repository.py b/backend/app/db/repositories/saved_script_repository.py index 6aa557af..eb26fa1d 100644 --- a/backend/app/db/repositories/saved_script_repository.py +++ b/backend/app/db/repositories/saved_script_repository.py @@ -1,5 +1,4 @@ -from motor.motor_asyncio import AsyncIOMotorCollection, AsyncIOMotorDatabase - +from app.core.database_context import Collection, Database from app.domain.events.event_models import CollectionNames from app.domain.saved_script import ( DomainSavedScript, @@ -10,9 +9,9 @@ class SavedScriptRepository: - def __init__(self, database: AsyncIOMotorDatabase): + def __init__(self, database: Database): self.db = database - self.collection: AsyncIOMotorCollection = self.db.get_collection(CollectionNames.SAVED_SCRIPTS) + self.collection: Collection = self.db.get_collection(CollectionNames.SAVED_SCRIPTS) self.mapper = SavedScriptMapper() async def create_saved_script(self, saved_script: DomainSavedScriptCreate, user_id: str) -> DomainSavedScript: @@ -24,24 +23,16 @@ async def create_saved_script(self, saved_script: DomainSavedScriptCreate, user_ raise ValueError("Insert not acknowledged") return self.mapper.from_mongo_document(doc) - async def get_saved_script( - self, script_id: str, user_id: str - ) -> DomainSavedScript | None: - saved_script = await self.collection.find_one( - {"script_id": script_id, "user_id": user_id} - ) + async def 
get_saved_script(self, script_id: str, user_id: str) -> DomainSavedScript | None: + saved_script = await self.collection.find_one({"script_id": script_id, "user_id": user_id}) if not saved_script: return None return self.mapper.from_mongo_document(saved_script) - async def update_saved_script( - self, script_id: str, user_id: str, update_data: DomainSavedScriptUpdate - ) -> None: + async def update_saved_script(self, script_id: str, user_id: str, update_data: DomainSavedScriptUpdate) -> None: update = self.mapper.to_update_dict(update_data) - await self.collection.update_one( - {"script_id": script_id, "user_id": user_id}, {"$set": update} - ) + await self.collection.update_one({"script_id": script_id, "user_id": user_id}, {"$set": update}) async def delete_saved_script(self, script_id: str, user_id: str) -> None: await self.collection.delete_one({"script_id": script_id, "user_id": user_id}) diff --git a/backend/app/db/repositories/sse_repository.py b/backend/app/db/repositories/sse_repository.py index 536aa076..8b578413 100644 --- a/backend/app/db/repositories/sse_repository.py +++ b/backend/app/db/repositories/sse_repository.py @@ -1,5 +1,4 @@ -from motor.motor_asyncio import AsyncIOMotorCollection, AsyncIOMotorDatabase - +from app.core.database_context import Collection, Database from app.domain.events.event_models import CollectionNames from app.domain.execution import DomainExecution from app.domain.sse import SSEEventDomain, SSEExecutionStatusDomain @@ -7,31 +6,25 @@ class SSERepository: - def __init__(self, database: AsyncIOMotorDatabase): + def __init__(self, database: Database): self.db = database - self.executions_collection: AsyncIOMotorCollection = self.db.get_collection(CollectionNames.EXECUTIONS) - self.events_collection: AsyncIOMotorCollection = self.db.get_collection(CollectionNames.EVENTS) + self.executions_collection: Collection = self.db.get_collection(CollectionNames.EXECUTIONS) + self.events_collection: Collection = self.db.get_collection(CollectionNames.EVENTS) self.mapper = SSEMapper() async def get_execution_status(self, execution_id: str) -> SSEExecutionStatusDomain | None: execution = await self.executions_collection.find_one( - {"execution_id": execution_id}, - {"status": 1, "execution_id": 1, "_id": 0} + {"execution_id": execution_id}, {"status": 1, "execution_id": 1, "_id": 0} ) if execution: return self.mapper.to_execution_status(execution_id, execution.get("status", "unknown")) return None - async def get_execution_events( - self, - execution_id: str, - limit: int = 100, - skip: int = 0 - ) -> list[SSEEventDomain]: - cursor = self.events_collection.find( - {"aggregate_id": execution_id} - ).sort("timestamp", 1).skip(skip).limit(limit) + async def get_execution_events(self, execution_id: str, limit: int = 100, skip: int = 0) -> list[SSEEventDomain]: + cursor = ( + self.events_collection.find({"aggregate_id": execution_id}).sort("timestamp", 1).skip(skip).limit(limit) + ) events: list[SSEEventDomain] = [] async for event in cursor: @@ -39,18 +32,13 @@ async def get_execution_events( return events async def get_execution_for_user(self, execution_id: str, user_id: str) -> DomainExecution | None: - doc = await self.executions_collection.find_one({ - "execution_id": execution_id, - "user_id": user_id - }) + doc = await self.executions_collection.find_one({"execution_id": execution_id, "user_id": user_id}) if not doc: return None return self.mapper.execution_from_mongo_document(doc) async def get_execution(self, execution_id: str) -> DomainExecution | None: - doc = 
await self.executions_collection.find_one({ - "execution_id": execution_id - }) + doc = await self.executions_collection.find_one({"execution_id": execution_id}) if not doc: return None return self.mapper.execution_from_mongo_document(doc) diff --git a/backend/app/db/repositories/user_repository.py b/backend/app/db/repositories/user_repository.py index 64a761a9..3ee60369 100644 --- a/backend/app/db/repositories/user_repository.py +++ b/backend/app/db/repositories/user_repository.py @@ -2,8 +2,7 @@ import uuid from datetime import datetime, timezone -from motor.motor_asyncio import AsyncIOMotorCollection, AsyncIOMotorDatabase - +from app.core.database_context import Collection, Database from app.domain.enums.user import UserRole from app.domain.events.event_models import CollectionNames from app.domain.user import User as DomainAdminUser @@ -13,9 +12,9 @@ class UserRepository: - def __init__(self, db: AsyncIOMotorDatabase): + def __init__(self, db: Database): self.db = db - self.collection: AsyncIOMotorCollection = self.db.get_collection(CollectionNames.USERS) + self.collection: Collection = self.db.get_collection(CollectionNames.USERS) self.mapper = UserMapper() async def get_user(self, username: str) -> DomainAdminUser | None: @@ -43,11 +42,7 @@ async def get_user_by_id(self, user_id: str) -> DomainAdminUser | None: return None async def list_users( - self, - limit: int = 100, - offset: int = 0, - search: str | None = None, - role: UserRole | None = None + self, limit: int = 100, offset: int = 0, search: str | None = None, role: UserRole | None = None ) -> list[DomainAdminUser]: query: dict[str, object] = {} @@ -56,7 +51,7 @@ async def list_users( escaped_search = re.escape(search) query["$or"] = [ {"username": {"$regex": escaped_search, "$options": "i"}}, - {"email": {"$regex": escaped_search, "$options": "i"}} + {"email": {"$regex": escaped_search, "$options": "i"}}, ] if role: @@ -77,10 +72,7 @@ async def update_user(self, user_id: str, update_data: DomainUserUpdate) -> Doma if update_data.password: update_dict[UserFields.HASHED_PASSWORD] = update_data.password # caller should pass hashed if desired update_dict[UserFields.UPDATED_AT] = datetime.now(timezone.utc) - result = await self.collection.update_one( - {UserFields.USER_ID: user_id}, - {"$set": update_dict} - ) + result = await self.collection.update_one({UserFields.USER_ID: user_id}, {"$set": update_dict}) if result.modified_count > 0: return await self.get_user_by_id(user_id) return None diff --git a/backend/app/db/repositories/user_settings_repository.py b/backend/app/db/repositories/user_settings_repository.py index dfda8cda..bca4f614 100644 --- a/backend/app/db/repositories/user_settings_repository.py +++ b/backend/app/db/repositories/user_settings_repository.py @@ -1,9 +1,9 @@ from datetime import datetime from typing import Any, Dict, List -from motor.motor_asyncio import AsyncIOMotorCollection, AsyncIOMotorDatabase from pymongo import ASCENDING, DESCENDING, IndexModel +from app.core.database_context import Collection, Database from app.core.logging import logger from app.domain.enums.events import EventType from app.domain.events.event_models import CollectionNames @@ -15,26 +15,28 @@ class UserSettingsRepository: - def __init__(self, database: AsyncIOMotorDatabase) -> None: + def __init__(self, database: Database) -> None: self.db = database - self.snapshots_collection: AsyncIOMotorCollection = self.db.get_collection( - CollectionNames.USER_SETTINGS_SNAPSHOTS - ) - self.events_collection: AsyncIOMotorCollection = 
self.db.get_collection(CollectionNames.EVENTS) + self.snapshots_collection: Collection = self.db.get_collection(CollectionNames.USER_SETTINGS_SNAPSHOTS) + self.events_collection: Collection = self.db.get_collection(CollectionNames.EVENTS) self.mapper = UserSettingsMapper() async def create_indexes(self) -> None: # Create indexes for settings snapshots - await self.snapshots_collection.create_indexes([ - IndexModel([("user_id", ASCENDING)], unique=True), - IndexModel([("updated_at", DESCENDING)]), - ]) + await self.snapshots_collection.create_indexes( + [ + IndexModel([("user_id", ASCENDING)], unique=True), + IndexModel([("updated_at", DESCENDING)]), + ] + ) # Create indexes for settings events - await self.events_collection.create_indexes([ - IndexModel([("event_type", ASCENDING), ("aggregate_id", ASCENDING)]), - IndexModel([("aggregate_id", ASCENDING), ("timestamp", ASCENDING)]), - ]) + await self.events_collection.create_indexes( + [ + IndexModel([("event_type", ASCENDING), ("aggregate_id", ASCENDING)]), + IndexModel([("aggregate_id", ASCENDING), ("timestamp", ASCENDING)]), + ] + ) logger.info("User settings repository indexes created successfully") @@ -46,25 +48,18 @@ async def get_snapshot(self, user_id: str) -> DomainUserSettings | None: async def create_snapshot(self, settings: DomainUserSettings) -> None: doc = self.mapper.to_snapshot_document(settings) - await self.snapshots_collection.replace_one( - {"user_id": settings.user_id}, - doc, - upsert=True - ) + await self.snapshots_collection.replace_one({"user_id": settings.user_id}, doc, upsert=True) logger.info(f"Created settings snapshot for user {settings.user_id}") async def get_settings_events( - self, - user_id: str, - event_types: List[EventType], - since: datetime | None = None, - until: datetime | None = None, - limit: int | None = None + self, + user_id: str, + event_types: List[EventType], + since: datetime | None = None, + until: datetime | None = None, + limit: int | None = None, ) -> List[DomainSettingsEvent]: - query = { - "aggregate_id": f"user_settings_{user_id}", - "event_type": {"$in": [str(et) for et in event_types]} - } + query = {"aggregate_id": f"user_settings_{user_id}", "event_type": {"$in": [str(et) for et in event_types]}} if since or until: timestamp_query: Dict[str, Any] = {} @@ -86,28 +81,21 @@ async def count_events_since_snapshot(self, user_id: str) -> int: snapshot = await self.get_snapshot(user_id) if not snapshot: - return await self.events_collection.count_documents({ - "aggregate_id": f"user_settings_{user_id}" - }) + return await self.events_collection.count_documents({"aggregate_id": f"user_settings_{user_id}"}) - return await self.events_collection.count_documents({ - "aggregate_id": f"user_settings_{user_id}", - "timestamp": {"$gt": snapshot.updated_at} - }) + return await self.events_collection.count_documents( + {"aggregate_id": f"user_settings_{user_id}", "timestamp": {"$gt": snapshot.updated_at}} + ) async def count_events_for_user(self, user_id: str) -> int: - return await self.events_collection.count_documents({ - "aggregate_id": f"user_settings_{user_id}" - }) - + return await self.events_collection.count_documents({"aggregate_id": f"user_settings_{user_id}"}) + async def delete_user_settings(self, user_id: str) -> None: """Delete all settings data for a user (snapshot and events).""" # Delete snapshot await self.snapshots_collection.delete_one({"user_id": user_id}) - + # Delete all events - await self.events_collection.delete_many({ - "aggregate_id": f"user_settings_{user_id}" - }) - + 
await self.events_collection.delete_many({"aggregate_id": f"user_settings_{user_id}"}) + logger.info(f"Deleted all settings data for user {user_id}") diff --git a/backend/app/db/schema/schema_manager.py b/backend/app/db/schema/schema_manager.py index 5f1e60c8..c3eb40f0 100644 --- a/backend/app/db/schema/schema_manager.py +++ b/backend/app/db/schema/schema_manager.py @@ -1,11 +1,11 @@ from __future__ import annotations from datetime import datetime, timezone -from typing import Awaitable, Callable, Iterable +from typing import Any, Awaitable, Callable, Iterable -from motor.motor_asyncio import AsyncIOMotorDatabase from pymongo import ASCENDING, DESCENDING, IndexModel +from app.core.database_context import Database from app.core.logging import logger from app.domain.events.event_models import EventFields @@ -13,7 +13,7 @@ class SchemaManager: """Applies idempotent, versioned MongoDB migrations per database.""" - def __init__(self, database: AsyncIOMotorDatabase) -> None: + def __init__(self, database: Database) -> None: self.db = database self._versions = self.db["schema_versions"] @@ -44,8 +44,11 @@ async def apply_all(self) -> None: ("0006_saga_indexes", "Create saga indexes", self._m_0006_sagas), ("0007_execution_results_indexes", "Create execution results indexes", self._m_0007_execution_results), ("0008_dlq_indexes", "Create DLQ indexes", self._m_0008_dlq), - ("0009_event_store_extra_indexes", "Additional events indexes for event_store", - self._m_0009_event_store_extra), + ( + "0009_event_store_extra_indexes", + "Additional events indexes for event_store", + self._m_0009_event_store_extra, + ), ] for mig_id, desc, func in migrations: @@ -62,17 +65,22 @@ async def _m_0001_events_init(self) -> None: # Create named, idempotent indexes indexes: Iterable[IndexModel] = [ IndexModel([(EventFields.EVENT_ID, ASCENDING)], name="idx_event_id_unique", unique=True), - IndexModel([(EventFields.EVENT_TYPE, ASCENDING), (EventFields.TIMESTAMP, DESCENDING)], - name="idx_event_type_ts"), - IndexModel([(EventFields.AGGREGATE_ID, ASCENDING), (EventFields.TIMESTAMP, DESCENDING)], - name="idx_aggregate_ts"), + IndexModel( + [(EventFields.EVENT_TYPE, ASCENDING), (EventFields.TIMESTAMP, DESCENDING)], name="idx_event_type_ts" + ), + IndexModel( + [(EventFields.AGGREGATE_ID, ASCENDING), (EventFields.TIMESTAMP, DESCENDING)], name="idx_aggregate_ts" + ), IndexModel([(EventFields.METADATA_CORRELATION_ID, ASCENDING)], name="idx_meta_correlation"), - IndexModel([(EventFields.METADATA_USER_ID, ASCENDING), (EventFields.TIMESTAMP, DESCENDING)], - name="idx_meta_user_ts"), - IndexModel([(EventFields.METADATA_SERVICE_NAME, ASCENDING), (EventFields.TIMESTAMP, DESCENDING)], - name="idx_meta_service_ts"), - IndexModel([(EventFields.STATUS, ASCENDING), (EventFields.TIMESTAMP, DESCENDING)], - name="idx_status_ts"), + IndexModel( + [(EventFields.METADATA_USER_ID, ASCENDING), (EventFields.TIMESTAMP, DESCENDING)], + name="idx_meta_user_ts", + ), + IndexModel( + [(EventFields.METADATA_SERVICE_NAME, ASCENDING), (EventFields.TIMESTAMP, DESCENDING)], + name="idx_meta_service_ts", + ), + IndexModel([(EventFields.STATUS, ASCENDING), (EventFields.TIMESTAMP, DESCENDING)], name="idx_status_ts"), IndexModel([(EventFields.PAYLOAD_EXECUTION_ID, ASCENDING)], name="idx_payload_execution", sparse=True), IndexModel([(EventFields.PAYLOAD_POD_NAME, ASCENDING)], name="idx_payload_pod", sparse=True), # Optional TTL on ttl_expires_at (no effect for nulls) @@ -81,12 +89,17 @@ async def _m_0001_events_init(self) -> None: # Use language_override: "none" 
to prevent MongoDB from interpreting # the "language" field as a text search language (which causes # "language override unsupported: python" errors) - IndexModel([ - (EventFields.EVENT_TYPE, "text"), - (EventFields.METADATA_SERVICE_NAME, "text"), - (EventFields.METADATA_USER_ID, "text"), - (EventFields.PAYLOAD, "text"), - ], name="idx_text_search", language_override="none", default_language="english"), + IndexModel( + [ + (EventFields.EVENT_TYPE, "text"), + (EventFields.METADATA_SERVICE_NAME, "text"), + (EventFields.METADATA_USER_ID, "text"), + (EventFields.PAYLOAD, "text"), + ], + name="idx_text_search", + language_override="none", + default_language="english", + ), ] try: @@ -97,18 +110,20 @@ async def _m_0001_events_init(self) -> None: # Validator (moderate, warn) — non-blocking try: - await self.db.command({ - "collMod": "events", - "validator": {"$jsonSchema": self._event_json_schema()}, - "validationLevel": "moderate", - "validationAction": "warn", - }) + await self.db.command( + { + "collMod": "events", + "validator": {"$jsonSchema": self._event_json_schema()}, + "validationLevel": "moderate", + "validationAction": "warn", + } + ) logger.info("Events collection validator ensured") except Exception as e: logger.warning(f"Could not set events validator: {e}") @staticmethod - def _event_json_schema() -> dict: + def _event_json_schema() -> dict[str, Any]: return { "bsonType": "object", "required": [ @@ -135,14 +150,18 @@ async def _m_0002_user_settings(self) -> None: snapshots = self.db["user_settings_snapshots"] events = self.db["events"] try: - await snapshots.create_indexes([ - IndexModel([("user_id", ASCENDING)], name="idx_settings_user_unique", unique=True), - IndexModel([("updated_at", DESCENDING)], name="idx_settings_updated_at_desc"), - ]) - await events.create_indexes([ - IndexModel([("event_type", ASCENDING), ("aggregate_id", ASCENDING)], name="idx_events_type_agg"), - IndexModel([("aggregate_id", ASCENDING), ("timestamp", ASCENDING)], name="idx_events_agg_ts"), - ]) + await snapshots.create_indexes( + [ + IndexModel([("user_id", ASCENDING)], name="idx_settings_user_unique", unique=True), + IndexModel([("updated_at", DESCENDING)], name="idx_settings_updated_at_desc"), + ] + ) + await events.create_indexes( + [ + IndexModel([("event_type", ASCENDING), ("aggregate_id", ASCENDING)], name="idx_events_type_agg"), + IndexModel([("aggregate_id", ASCENDING), ("timestamp", ASCENDING)], name="idx_events_agg_ts"), + ] + ) logger.info("User settings indexes ensured") except Exception as e: logger.warning(f"Failed ensuring user settings indexes: {e}") @@ -151,17 +170,21 @@ async def _m_0003_replay(self) -> None: sessions = self.db["replay_sessions"] events = self.db["events"] try: - await sessions.create_indexes([ - IndexModel([("session_id", ASCENDING)], name="idx_replay_session_id", unique=True), - IndexModel([("status", ASCENDING)], name="idx_replay_status"), - IndexModel([("created_at", DESCENDING)], name="idx_replay_created_at_desc"), - IndexModel([("user_id", ASCENDING)], name="idx_replay_user"), - ]) - await events.create_indexes([ - IndexModel([("execution_id", ASCENDING), ("timestamp", ASCENDING)], name="idx_events_exec_ts"), - IndexModel([("event_type", ASCENDING), ("timestamp", ASCENDING)], name="idx_events_type_ts"), - IndexModel([("metadata.user_id", ASCENDING), ("timestamp", ASCENDING)], name="idx_events_user_ts"), - ]) + await sessions.create_indexes( + [ + IndexModel([("session_id", ASCENDING)], name="idx_replay_session_id", unique=True), + IndexModel([("status", 
ASCENDING)], name="idx_replay_status"), + IndexModel([("created_at", DESCENDING)], name="idx_replay_created_at_desc"), + IndexModel([("user_id", ASCENDING)], name="idx_replay_user"), + ] + ) + await events.create_indexes( + [ + IndexModel([("execution_id", ASCENDING), ("timestamp", ASCENDING)], name="idx_events_exec_ts"), + IndexModel([("event_type", ASCENDING), ("timestamp", ASCENDING)], name="idx_events_type_ts"), + IndexModel([("metadata.user_id", ASCENDING), ("timestamp", ASCENDING)], name="idx_events_user_ts"), + ] + ) logger.info("Replay indexes ensured") except Exception as e: logger.warning(f"Failed ensuring replay indexes: {e}") @@ -171,21 +194,32 @@ async def _m_0004_notifications(self) -> None: rules = self.db["notification_rules"] subs = self.db["notification_subscriptions"] try: - await notifications.create_indexes([ - IndexModel([("user_id", ASCENDING), ("created_at", DESCENDING)], name="idx_notif_user_created_desc"), - IndexModel([("status", ASCENDING), ("scheduled_for", ASCENDING)], name="idx_notif_status_sched"), - IndexModel([("created_at", ASCENDING)], name="idx_notif_created_at"), - IndexModel([("notification_id", ASCENDING)], name="idx_notif_id_unique", unique=True), - ]) - await rules.create_indexes([ - IndexModel([("event_types", ASCENDING)], name="idx_rules_event_types"), - IndexModel([("enabled", ASCENDING)], name="idx_rules_enabled"), - ]) - await subs.create_indexes([ - IndexModel([("user_id", ASCENDING), ("channel", ASCENDING)], - name="idx_sub_user_channel_unique", unique=True), - IndexModel([("enabled", ASCENDING)], name="idx_sub_enabled"), - ]) + await notifications.create_indexes( + [ + IndexModel( + [("user_id", ASCENDING), ("created_at", DESCENDING)], name="idx_notif_user_created_desc" + ), + IndexModel([("status", ASCENDING), ("scheduled_for", ASCENDING)], name="idx_notif_status_sched"), + IndexModel([("created_at", ASCENDING)], name="idx_notif_created_at"), + IndexModel([("notification_id", ASCENDING)], name="idx_notif_id_unique", unique=True), + ] + ) + await rules.create_indexes( + [ + IndexModel([("event_types", ASCENDING)], name="idx_rules_event_types"), + IndexModel([("enabled", ASCENDING)], name="idx_rules_enabled"), + ] + ) + await subs.create_indexes( + [ + IndexModel( + [("user_id", ASCENDING), ("channel", ASCENDING)], + name="idx_sub_user_channel_unique", + unique=True, + ), + IndexModel([("enabled", ASCENDING)], name="idx_sub_enabled"), + ] + ) logger.info("Notification indexes ensured") except Exception as e: logger.warning(f"Failed ensuring notification indexes: {e}") @@ -193,12 +227,14 @@ async def _m_0004_notifications(self) -> None: async def _m_0005_idempotency(self) -> None: coll = self.db["idempotency_keys"] try: - await coll.create_indexes([ - IndexModel([("key", ASCENDING)], name="idx_idem_key_unique", unique=True), - IndexModel([("created_at", ASCENDING)], name="idx_idem_created_ttl", expireAfterSeconds=3600), - IndexModel([("status", ASCENDING)], name="idx_idem_status"), - IndexModel([("event_type", ASCENDING)], name="idx_idem_event_type"), - ]) + await coll.create_indexes( + [ + IndexModel([("key", ASCENDING)], name="idx_idem_key_unique", unique=True), + IndexModel([("created_at", ASCENDING)], name="idx_idem_created_ttl", expireAfterSeconds=3600), + IndexModel([("status", ASCENDING)], name="idx_idem_status"), + IndexModel([("event_type", ASCENDING)], name="idx_idem_event_type"), + ] + ) logger.info("Idempotency indexes ensured") except Exception as e: logger.warning(f"Failed ensuring idempotency indexes: {e}") @@ -206,13 +242,15 
@@ async def _m_0005_idempotency(self) -> None: async def _m_0006_sagas(self) -> None: coll = self.db["sagas"] try: - await coll.create_indexes([ - IndexModel([("saga_id", ASCENDING)], name="idx_saga_id_unique", unique=True), - IndexModel([("execution_id", ASCENDING)], name="idx_saga_execution"), - IndexModel([("state", ASCENDING)], name="idx_saga_state"), - IndexModel([("created_at", ASCENDING)], name="idx_saga_created_at"), - IndexModel([("state", ASCENDING), ("created_at", ASCENDING)], name="idx_saga_state_created"), - ]) + await coll.create_indexes( + [ + IndexModel([("saga_id", ASCENDING)], name="idx_saga_id_unique", unique=True), + IndexModel([("execution_id", ASCENDING)], name="idx_saga_execution"), + IndexModel([("state", ASCENDING)], name="idx_saga_state"), + IndexModel([("created_at", ASCENDING)], name="idx_saga_created_at"), + IndexModel([("state", ASCENDING), ("created_at", ASCENDING)], name="idx_saga_state_created"), + ] + ) logger.info("Saga indexes ensured") except Exception as e: logger.warning(f"Failed ensuring saga indexes: {e}") @@ -220,11 +258,15 @@ async def _m_0006_sagas(self) -> None: async def _m_0007_execution_results(self) -> None: coll = self.db["execution_results"] try: - await coll.create_indexes([ - IndexModel([("execution_id", ASCENDING)], name="idx_results_execution_unique", unique=True), - IndexModel([("created_at", ASCENDING)], name="idx_results_created_at"), - IndexModel([("user_id", ASCENDING), ("created_at", DESCENDING)], name="idx_results_user_created_desc"), - ]) + await coll.create_indexes( + [ + IndexModel([("execution_id", ASCENDING)], name="idx_results_execution_unique", unique=True), + IndexModel([("created_at", ASCENDING)], name="idx_results_created_at"), + IndexModel( + [("user_id", ASCENDING), ("created_at", DESCENDING)], name="idx_results_user_created_desc" + ), + ] + ) logger.info("Execution results indexes ensured") except Exception as e: logger.warning(f"Failed ensuring execution results indexes: {e}") @@ -232,16 +274,20 @@ async def _m_0007_execution_results(self) -> None: async def _m_0008_dlq(self) -> None: coll = self.db["dlq_messages"] try: - await coll.create_indexes([ - IndexModel([("event_id", ASCENDING)], name="idx_dlq_event_id_unique", unique=True), - IndexModel([("original_topic", ASCENDING)], name="idx_dlq_topic"), - IndexModel([("event_type", ASCENDING)], name="idx_dlq_event_type"), - IndexModel([("failed_at", DESCENDING)], name="idx_dlq_failed_desc"), - IndexModel([("retry_count", ASCENDING)], name="idx_dlq_retry_count"), - IndexModel([("status", ASCENDING)], name="idx_dlq_status"), - IndexModel([("next_retry_at", ASCENDING)], name="idx_dlq_next_retry"), - IndexModel([("created_at", ASCENDING)], name="idx_dlq_created_ttl", expireAfterSeconds=7 * 24 * 3600), - ]) + await coll.create_indexes( + [ + IndexModel([("event_id", ASCENDING)], name="idx_dlq_event_id_unique", unique=True), + IndexModel([("original_topic", ASCENDING)], name="idx_dlq_topic"), + IndexModel([("event_type", ASCENDING)], name="idx_dlq_event_type"), + IndexModel([("failed_at", DESCENDING)], name="idx_dlq_failed_desc"), + IndexModel([("retry_count", ASCENDING)], name="idx_dlq_retry_count"), + IndexModel([("status", ASCENDING)], name="idx_dlq_status"), + IndexModel([("next_retry_at", ASCENDING)], name="idx_dlq_next_retry"), + IndexModel( + [("created_at", ASCENDING)], name="idx_dlq_created_ttl", expireAfterSeconds=7 * 24 * 3600 + ), + ] + ) logger.info("DLQ indexes ensured") except Exception as e: logger.warning(f"Failed ensuring DLQ indexes: {e}") @@ -249,11 
+295,17 @@ async def _m_0008_dlq(self) -> None: async def _m_0009_event_store_extra(self) -> None: events = self.db["events"] try: - await events.create_indexes([ - IndexModel([("metadata.user_id", ASCENDING), ("event_type", ASCENDING)], name="idx_events_user_type"), - IndexModel([("event_type", ASCENDING), ("metadata.user_id", ASCENDING), ("timestamp", DESCENDING)], - name="idx_events_type_user_ts"), - ]) + await events.create_indexes( + [ + IndexModel( + [("metadata.user_id", ASCENDING), ("event_type", ASCENDING)], name="idx_events_user_type" + ), + IndexModel( + [("event_type", ASCENDING), ("metadata.user_id", ASCENDING), ("timestamp", DESCENDING)], + name="idx_events_type_user_ts", + ), + ] + ) logger.info("Additional event store indexes ensured") except Exception as e: logger.warning(f"Failed ensuring event store extra indexes: {e}") diff --git a/backend/app/dlq/manager.py b/backend/app/dlq/manager.py index 133b2ebf..e6d15ed4 100644 --- a/backend/app/dlq/manager.py +++ b/backend/app/dlq/manager.py @@ -4,9 +4,9 @@ from typing import Any, Awaitable, Callable, Mapping, Sequence from confluent_kafka import Consumer, KafkaError, Message, Producer -from motor.motor_asyncio import AsyncIOMotorCollection, AsyncIOMotorDatabase from opentelemetry.trace import SpanKind +from app.core.database_context import Collection, Database from app.core.lifecycle import LifecycleEnabled from app.core.logging import logger from app.core.metrics.context import get_dlq_metrics @@ -29,28 +29,27 @@ class DLQManager(LifecycleEnabled): def __init__( - self, - database: AsyncIOMotorDatabase, - consumer: Consumer, - producer: Producer, - dlq_topic: KafkaTopic = KafkaTopic.DEAD_LETTER_QUEUE, - retry_topic_suffix: str = "-retry", - default_retry_policy: RetryPolicy | None = None, + self, + database: Database, + consumer: Consumer, + producer: Producer, + dlq_topic: KafkaTopic = KafkaTopic.DEAD_LETTER_QUEUE, + retry_topic_suffix: str = "-retry", + default_retry_policy: RetryPolicy | None = None, ): self.metrics = get_dlq_metrics() self.dlq_topic = dlq_topic self.retry_topic_suffix = retry_topic_suffix self.default_retry_policy = default_retry_policy or RetryPolicy( - topic="default", - strategy=RetryStrategy.EXPONENTIAL_BACKOFF + topic="default", strategy=RetryStrategy.EXPONENTIAL_BACKOFF ) self.consumer: Consumer = consumer self.producer: Producer = producer - self.dlq_collection: AsyncIOMotorCollection[Any] = database.get_collection(CollectionNames.DLQ_MESSAGES) + self.dlq_collection: Collection = database.get_collection(CollectionNames.DLQ_MESSAGES) self._running = False - self._process_task: asyncio.Task | None = None - self._monitor_task: asyncio.Task | None = None + self._process_task: asyncio.Task[None] | None = None + self._monitor_task: asyncio.Task[None] | None = None # Topic-specific retry policies self._retry_policies: dict[str, RetryPolicy] = {} @@ -153,10 +152,7 @@ def _extract_headers(self, msg: Message) -> dict[str, str]: async def _record_message_metrics(self, dlq_message: DLQMessage) -> None: """Record metrics for received DLQ message.""" - self.metrics.record_dlq_message_received( - dlq_message.original_topic, - dlq_message.event_type - ) + self.metrics.record_dlq_message_received(dlq_message.original_topic, dlq_message.event_type) self.metrics.record_dlq_message_age(dlq_message.age_seconds) async def _process_message_with_tracing(self, msg: Message, dlq_message: DLQMessage) -> None: @@ -194,10 +190,7 @@ async def _process_dlq_message(self, message: DLQMessage) -> None: await 
self._store_message(message) # Get retry policy for topic - retry_policy = self._retry_policies.get( - message.original_topic, - self.default_retry_policy - ) + retry_policy = self._retry_policies.get(message.original_topic, self.default_retry_policy) # Check if should retry if not retry_policy.should_retry(message): @@ -224,11 +217,7 @@ async def _store_message(self, message: DLQMessage) -> None: doc = DLQMapper.to_mongo_document(message) - await self.dlq_collection.update_one( - {DLQFields.EVENT_ID: message.event_id}, - {"$set": doc}, - upsert=True - ) + await self.dlq_collection.update_one({DLQFields.EVENT_ID: message.event_id}, {"$set": doc}, upsert=True) async def _update_message_status(self, event_id: str, update: DLQMessageUpdate) -> None: update_doc = DLQMapper.update_to_mongo(update) @@ -248,6 +237,7 @@ async def _retry_message(self, message: DLQMessage) -> None: } hdrs = inject_trace_context(hdrs) from typing import cast + kafka_headers = cast(list[tuple[str, str | bytes]], [(k, v.encode()) for k, v in hdrs.items()]) # Get the original event @@ -274,11 +264,7 @@ async def _retry_message(self, message: DLQMessage) -> None: await asyncio.to_thread(self.producer.flush, timeout=5) # Update metrics - self.metrics.record_dlq_message_retried( - message.original_topic, - message.event_type, - "success" - ) + self.metrics.record_dlq_message_retried(message.original_topic, message.event_type, "success") # Update status await self._update_message_status( @@ -297,11 +283,7 @@ async def _retry_message(self, message: DLQMessage) -> None: async def _discard_message(self, message: DLQMessage, reason: str) -> None: # Update metrics - self.metrics.record_dlq_message_discarded( - message.original_topic, - message.event_type, - reason - ) + self.metrics.record_dlq_message_discarded(message.original_topic, message.event_type, reason) # Update status await self._update_message_status( @@ -324,10 +306,9 @@ async def _monitor_dlq(self) -> None: # Find messages ready for retry now = datetime.now(timezone.utc) - cursor = self.dlq_collection.find({ - "status": DLQMessageStatus.SCHEDULED, - "next_retry_at": {"$lte": now} - }).limit(100) + cursor = self.dlq_collection.find( + {"status": DLQMessageStatus.SCHEDULED, "next_retry_at": {"$lte": now}} + ).limit(100) async for doc in cursor: # Recreate DLQ message from MongoDB document @@ -349,12 +330,8 @@ async def _monitor_dlq(self) -> None: async def _update_queue_metrics(self) -> None: # Get counts by topic pipeline: Sequence[Mapping[str, Any]] = [ - {"$match": {str(DLQFields.STATUS): {"$in": [DLQMessageStatus.PENDING, - DLQMessageStatus.SCHEDULED]}}}, - {"$group": { - "_id": f"${DLQFields.ORIGINAL_TOPIC}", - "count": {"$sum": 1} - }} + {"$match": {str(DLQFields.STATUS): {"$in": [DLQMessageStatus.PENDING, DLQMessageStatus.SCHEDULED]}}}, + {"$group": {"_id": f"${DLQFields.ORIGINAL_TOPIC}", "count": {"$sum": 1}}}, ] async for result in self.dlq_collection.aggregate(pipeline): @@ -395,35 +372,36 @@ async def retry_message_manually(self, event_id: str) -> bool: await self._retry_message(message) return True + def create_dlq_manager( - database: AsyncIOMotorDatabase, - dlq_topic: KafkaTopic = KafkaTopic.DEAD_LETTER_QUEUE, - retry_topic_suffix: str = "-retry", - default_retry_policy: RetryPolicy | None = None, + database: Database, + dlq_topic: KafkaTopic = KafkaTopic.DEAD_LETTER_QUEUE, + retry_topic_suffix: str = "-retry", + default_retry_policy: RetryPolicy | None = None, ) -> DLQManager: - settings = get_settings() - consumer = Consumer({ - 'bootstrap.servers': 
settings.KAFKA_BOOTSTRAP_SERVERS, - 'group.id': f"{GroupId.DLQ_MANAGER}.{settings.KAFKA_GROUP_SUFFIX}", - 'enable.auto.commit': False, - 'auto.offset.reset': 'earliest', - 'client.id': 'dlq-manager-consumer' - }) - producer = Producer({ - 'bootstrap.servers': settings.KAFKA_BOOTSTRAP_SERVERS, - 'client.id': 'dlq-manager-producer', - 'acks': 'all', - 'enable.idempotence': True, - 'compression.type': 'gzip', - 'batch.size': 16384, - 'linger.ms': 10 - }) + consumer = Consumer( + { + "bootstrap.servers": settings.KAFKA_BOOTSTRAP_SERVERS, + "group.id": f"{GroupId.DLQ_MANAGER}.{settings.KAFKA_GROUP_SUFFIX}", + "enable.auto.commit": False, + "auto.offset.reset": "earliest", + "client.id": "dlq-manager-consumer", + } + ) + producer = Producer( + { + "bootstrap.servers": settings.KAFKA_BOOTSTRAP_SERVERS, + "client.id": "dlq-manager-producer", + "acks": "all", + "enable.idempotence": True, + "compression.type": "gzip", + "batch.size": 16384, + "linger.ms": 10, + } + ) if default_retry_policy is None: - default_retry_policy = RetryPolicy( - topic="default", - strategy=RetryStrategy.EXPONENTIAL_BACKOFF - ) + default_retry_policy = RetryPolicy(topic="default", strategy=RetryStrategy.EXPONENTIAL_BACKOFF) return DLQManager( database=database, consumer=consumer, diff --git a/backend/app/dlq/models.py b/backend/app/dlq/models.py index a960f2ab..95523e61 100644 --- a/backend/app/dlq/models.py +++ b/backend/app/dlq/models.py @@ -8,6 +8,7 @@ class DLQMessageStatus(StringEnum): """Status of a message in the Dead Letter Queue.""" + PENDING = "pending" SCHEDULED = "scheduled" RETRIED = "retried" @@ -16,6 +17,7 @@ class DLQMessageStatus(StringEnum): class RetryStrategy(StringEnum): """Retry strategies for DLQ messages.""" + IMMEDIATE = "immediate" EXPONENTIAL_BACKOFF = "exponential_backoff" FIXED_INTERVAL = "fixed_interval" @@ -25,6 +27,7 @@ class RetryStrategy(StringEnum): class DLQFields(StringEnum): """Database field names for DLQ messages collection.""" + EVENT_ID = "event_id" EVENT = "event" EVENT_TYPE = "event.event_type" @@ -94,6 +97,7 @@ def event_type(self) -> str: @dataclass class DLQMessageUpdate: """Strongly-typed update descriptor for DLQ message status changes.""" + status: DLQMessageStatus next_retry_at: datetime | None = None retried_at: datetime | None = None @@ -107,6 +111,7 @@ class DLQMessageUpdate: @dataclass class DLQMessageFilter: """Filter criteria for querying DLQ messages.""" + status: DLQMessageStatus | None = None topic: str | None = None event_type: str | None = None @@ -115,6 +120,7 @@ class DLQMessageFilter: @dataclass class RetryPolicy: """Retry policy configuration for DLQ messages.""" + topic: str strategy: RetryStrategy max_retries: int = 5 @@ -140,10 +146,7 @@ def get_next_retry_time(self, message: DLQMessage) -> datetime: delay = self.base_delay_seconds elif self.strategy == RetryStrategy.EXPONENTIAL_BACKOFF: - delay = min( - self.base_delay_seconds * (self.retry_multiplier ** message.retry_count), - self.max_delay_seconds - ) + delay = min(self.base_delay_seconds * (self.retry_multiplier**message.retry_count), self.max_delay_seconds) # Add jitter to avoid thundering herd jitter = delay * self.jitter_factor * (2 * random.random() - 1) delay = max(0, delay + jitter) @@ -158,6 +161,7 @@ def get_next_retry_time(self, message: DLQMessage) -> datetime: @dataclass class TopicStatistic: """Statistics for a single topic.""" + topic: str count: int avg_retry_count: float @@ -166,6 +170,7 @@ class TopicStatistic: @dataclass class EventTypeStatistic: """Statistics for a single event 
type.""" + event_type: str count: int @@ -173,6 +178,7 @@ class EventTypeStatistic: @dataclass class AgeStatistics: """Age statistics for DLQ messages.""" + min_age_seconds: float max_age_seconds: float avg_age_seconds: float @@ -181,6 +187,7 @@ class AgeStatistics: @dataclass class DLQStatistics: """Comprehensive DLQ statistics.""" + by_status: dict[str, int] by_topic: list[TopicStatistic] by_event_type: list[EventTypeStatistic] @@ -191,6 +198,7 @@ class DLQStatistics: @dataclass class DLQRetryResult: """Result of a single retry operation.""" + event_id: str status: str # "success" or "failed" error: str | None = None @@ -199,6 +207,7 @@ class DLQRetryResult: @dataclass class DLQBatchRetryResult: """Result of batch retry operation.""" + total: int successful: int failed: int @@ -208,6 +217,7 @@ class DLQBatchRetryResult: @dataclass class DLQMessageListResult: """Result of listing DLQ messages.""" + messages: list[DLQMessage] total: int offset: int @@ -217,6 +227,7 @@ class DLQMessageListResult: @dataclass class DLQTopicSummary: """Summary of a topic in DLQ.""" + topic: str total_messages: int status_breakdown: dict[str, int] diff --git a/backend/app/domain/admin/replay_models.py b/backend/app/domain/admin/replay_models.py index 18479867..220b3c8b 100644 --- a/backend/app/domain/admin/replay_models.py +++ b/backend/app/domain/admin/replay_models.py @@ -9,6 +9,7 @@ class ReplaySessionFields(StringEnum): """Database field names for replay sessions.""" + SESSION_ID = "session_id" TYPE = "type" STATUS = "status" @@ -64,20 +65,11 @@ def is_running(self) -> bool: def update_progress(self, replayed: int, failed: int = 0, skipped: int = 0) -> "ReplaySession": # Create new instance with updated values - new_session = replace( - self, - replayed_events=replayed, - failed_events=failed, - skipped_events=skipped - ) + new_session = replace(self, replayed_events=replayed, failed_events=failed, skipped_events=skipped) # Check if completed and update status if new_session.replayed_events >= new_session.total_events: - new_session = replace( - new_session, - status=ReplayStatus.COMPLETED, - completed_at=datetime.now(timezone.utc) - ) + new_session = replace(new_session, status=ReplayStatus.COMPLETED, completed_at=datetime.now(timezone.utc)) return new_session @@ -114,18 +106,13 @@ class ReplayQuery: end_time: datetime | None = None def is_empty(self) -> bool: - return not any([ - self.event_ids, - self.correlation_id, - self.aggregate_id, - self.start_time, - self.end_time - ]) + return not any([self.event_ids, self.correlation_id, self.aggregate_id, self.start_time, self.end_time]) @dataclass class ReplaySessionData: """Unified replay session data for both preview and actual replay.""" + total_events: int replay_correlation_id: str dry_run: bool diff --git a/backend/app/domain/admin/replay_updates.py b/backend/app/domain/admin/replay_updates.py index ec45d6bf..c4450a1c 100644 --- a/backend/app/domain/admin/replay_updates.py +++ b/backend/app/domain/admin/replay_updates.py @@ -27,7 +27,7 @@ def to_dict(self) -> dict[str, object]: result: dict[str, object] = {} if self.status is not None: - result["status"] = self.status.value if hasattr(self.status, 'value') else self.status + result["status"] = self.status.value if hasattr(self.status, "value") else self.status if self.total_events is not None: result["total_events"] = self.total_events if self.replayed_events is not None: diff --git a/backend/app/domain/admin/settings_models.py b/backend/app/domain/admin/settings_models.py index 67d7ab93..e3b0398a 
100644 --- a/backend/app/domain/admin/settings_models.py +++ b/backend/app/domain/admin/settings_models.py @@ -7,6 +7,7 @@ class SettingsFields(StringEnum): """Database field names for settings collection.""" + ID = "_id" CREATED_AT = "created_at" UPDATED_AT = "updated_at" @@ -18,6 +19,7 @@ class SettingsFields(StringEnum): class AuditLogFields(StringEnum): """Database field names for audit log collection.""" + ACTION = "action" USER_ID = "user_id" USERNAME = "username" @@ -27,12 +29,14 @@ class AuditLogFields(StringEnum): class AuditAction(StringEnum): """Audit log action types.""" + SYSTEM_SETTINGS_UPDATED = "system_settings_updated" SYSTEM_SETTINGS_RESET = "system_settings_reset" class LogLevel(StringEnum): """Log level options.""" + DEBUG = "DEBUG" INFO = "INFO" WARNING = "WARNING" @@ -67,6 +71,7 @@ class MonitoringSettings: @dataclass class SystemSettings: """Complete system settings configuration.""" + execution_limits: ExecutionLimits = field(default_factory=ExecutionLimits) security_settings: SecuritySettings = field(default_factory=SecuritySettings) monitoring_settings: MonitoringSettings = field(default_factory=MonitoringSettings) diff --git a/backend/app/domain/enums/__init__.py b/backend/app/domain/enums/__init__.py index 907a0b8a..50722b89 100644 --- a/backend/app/domain/enums/__init__.py +++ b/backend/app/domain/enums/__init__.py @@ -27,5 +27,5 @@ # Saga "SagaState", # User - "UserRole" + "UserRole", ] diff --git a/backend/app/domain/enums/auth.py b/backend/app/domain/enums/auth.py index ff0a7a84..8488ccee 100644 --- a/backend/app/domain/enums/auth.py +++ b/backend/app/domain/enums/auth.py @@ -3,6 +3,7 @@ class LoginMethod(StringEnum): """User login methods.""" + PASSWORD = "password" OAUTH = "oauth" SSO = "sso" @@ -11,6 +12,7 @@ class LoginMethod(StringEnum): class SettingsType(StringEnum): """Types of user settings.""" + PREFERENCES = "preferences" NOTIFICATION = "notification" EDITOR = "editor" diff --git a/backend/app/domain/enums/common.py b/backend/app/domain/enums/common.py index 5030101c..a850ae54 100644 --- a/backend/app/domain/enums/common.py +++ b/backend/app/domain/enums/common.py @@ -3,6 +3,7 @@ class ErrorType(StringEnum): """Classification of error types in execution platform.""" + SCRIPT_ERROR = "script_error" # User code had errors SYSTEM_ERROR = "system_error" # Infrastructure/platform issues SUCCESS = "success" # No errors @@ -10,6 +11,7 @@ class ErrorType(StringEnum): class Theme(StringEnum): """Available UI themes.""" + LIGHT = "light" DARK = "dark" AUTO = "auto" @@ -17,12 +19,14 @@ class Theme(StringEnum): class SortOrder(StringEnum): """Sort order for queries.""" + ASC = "asc" DESC = "desc" class Environment(StringEnum): """Deployment environments.""" + DEVELOPMENT = "development" STAGING = "staging" PRODUCTION = "production" diff --git a/backend/app/domain/enums/execution.py b/backend/app/domain/enums/execution.py index e830b384..abb4809d 100644 --- a/backend/app/domain/enums/execution.py +++ b/backend/app/domain/enums/execution.py @@ -3,6 +3,7 @@ class ExecutionStatus(StringEnum): """Status of an execution.""" + QUEUED = "queued" SCHEDULED = "scheduled" RUNNING = "running" diff --git a/backend/app/domain/enums/health.py b/backend/app/domain/enums/health.py index 5f1706ff..00985562 100644 --- a/backend/app/domain/enums/health.py +++ b/backend/app/domain/enums/health.py @@ -3,6 +3,7 @@ class AlertSeverity(StringEnum): """Alert severity levels.""" + CRITICAL = "critical" WARNING = "warning" INFO = "info" @@ -10,12 +11,14 @@ class 
AlertSeverity(StringEnum): class AlertStatus(StringEnum): """Alert status.""" + FIRING = "firing" RESOLVED = "resolved" class ComponentStatus(StringEnum): """Health check component status.""" + HEALTHY = "healthy" DEGRADED = "degraded" UNHEALTHY = "unhealthy" diff --git a/backend/app/domain/enums/kafka.py b/backend/app/domain/enums/kafka.py index 036d9cc5..148d29c0 100644 --- a/backend/app/domain/enums/kafka.py +++ b/backend/app/domain/enums/kafka.py @@ -58,6 +58,7 @@ class KafkaTopic(StringEnum): class GroupId(StringEnum): """Kafka consumer group IDs.""" + EXECUTION_COORDINATOR = "execution-coordinator" K8S_WORKER = "k8s-worker" POD_MONITOR = "pod-monitor" diff --git a/backend/app/domain/enums/notification.py b/backend/app/domain/enums/notification.py index 08576814..ecd377fe 100644 --- a/backend/app/domain/enums/notification.py +++ b/backend/app/domain/enums/notification.py @@ -3,6 +3,7 @@ class NotificationChannel(StringEnum): """Notification delivery channels.""" + IN_APP = "in_app" WEBHOOK = "webhook" SLACK = "slack" @@ -10,6 +11,7 @@ class NotificationChannel(StringEnum): class NotificationSeverity(StringEnum): """Notification severity levels.""" + LOW = "low" MEDIUM = "medium" HIGH = "high" @@ -18,6 +20,7 @@ class NotificationSeverity(StringEnum): class NotificationStatus(StringEnum): """Notification delivery status.""" + PENDING = "pending" QUEUED = "queued" SENDING = "sending" diff --git a/backend/app/domain/enums/saga.py b/backend/app/domain/enums/saga.py index 4edac34e..7c563663 100644 --- a/backend/app/domain/enums/saga.py +++ b/backend/app/domain/enums/saga.py @@ -3,6 +3,7 @@ class SagaState(StringEnum): """Saga execution states.""" + CREATED = "created" RUNNING = "running" COMPENSATING = "compensating" diff --git a/backend/app/domain/enums/storage.py b/backend/app/domain/enums/storage.py index 735af107..9f234a38 100644 --- a/backend/app/domain/enums/storage.py +++ b/backend/app/domain/enums/storage.py @@ -5,6 +5,7 @@ class ExecutionErrorType(StringEnum): """Types of execution errors.""" + SYSTEM_ERROR = "system_error" TIMEOUT = "timeout" RESOURCE_LIMIT = "resource_limit" @@ -14,6 +15,7 @@ class ExecutionErrorType(StringEnum): class StorageType(StringEnum): """Types of storage backends.""" + DATABASE = "database" S3 = "s3" FILESYSTEM = "filesystem" diff --git a/backend/app/domain/enums/user.py b/backend/app/domain/enums/user.py index 72a5fdb3..30a127d0 100644 --- a/backend/app/domain/enums/user.py +++ b/backend/app/domain/enums/user.py @@ -3,6 +3,7 @@ class UserRole(StringEnum): """User roles in the system.""" + USER = "user" ADMIN = "admin" MODERATOR = "moderator" diff --git a/backend/app/domain/events/__init__.py b/backend/app/domain/events/__init__.py index c9be24dd..c2c7d4d6 100644 --- a/backend/app/domain/events/__init__.py +++ b/backend/app/domain/events/__init__.py @@ -5,7 +5,6 @@ EventFields, EventFilter, EventListResult, - EventMetadata, EventProjection, EventQuery, EventReplayInfo, @@ -13,6 +12,7 @@ EventStatistics, ExecutionEventsResult, ) +from app.infrastructure.kafka.events.metadata import EventMetadata __all__ = [ "ArchivedEvent", diff --git a/backend/app/domain/events/event_models.py b/backend/app/domain/events/event_models.py index 072f1d57..99f0c2ba 100644 --- a/backend/app/domain/events/event_models.py +++ b/backend/app/domain/events/event_models.py @@ -5,15 +5,13 @@ from app.core.utils import StringEnum from app.infrastructure.kafka.events.metadata import EventMetadata -MongoQueryValue = ( - str | - dict[str, str | list[str] | float | datetime] -) 
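For reference, a standalone sketch of the backoff rule used by RetryPolicy.get_next_retry_time in backend/app/dlq/models.py above; the formula mirrors the diff, while the default parameter values here are assumptions:

import random

def next_retry_delay_seconds(
    retry_count: int,
    base_delay_seconds: float = 60.0,   # assumed default
    retry_multiplier: float = 2.0,      # assumed default
    max_delay_seconds: float = 3600.0,  # assumed default
    jitter_factor: float = 0.1,         # assumed default
) -> float:
    # Exponential growth capped at max_delay_seconds, then +/- jitter so
    # retries for many DLQ messages do not all fire at the same instant.
    delay = min(base_delay_seconds * (retry_multiplier ** retry_count), max_delay_seconds)
    jitter = delay * jitter_factor * (2 * random.random() - 1)
    return max(0.0, delay + jitter)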
+MongoQueryValue = str | dict[str, str | list[str] | float | datetime] MongoQuery = dict[str, MongoQueryValue] class EventFields(StringEnum): """Database field names for events collection.""" + ID = "_id" EVENT_ID = "event_id" EVENT_TYPE = "event_type" @@ -74,12 +72,10 @@ class CollectionNames(StringEnum): DLQ_MESSAGES = "dlq_messages" - - - @dataclass class Event: """Domain model for an event.""" + event_id: str event_type: str event_version: str @@ -100,6 +96,7 @@ def correlation_id(self) -> str | None: @dataclass class EventSummary: """Lightweight event summary for lists and previews.""" + event_id: str event_type: str timestamp: datetime @@ -109,6 +106,7 @@ class EventSummary: @dataclass class EventFilter: """Filter criteria for querying events.""" + event_types: list[str] | None = None aggregate_id: str | None = None correlation_id: str | None = None @@ -124,6 +122,7 @@ class EventFilter: @dataclass class EventQuery: """Query parameters for event search.""" + filter: EventFilter sort_by: str = EventFields.TIMESTAMP sort_order: EventSortOrder = EventSortOrder.DESC @@ -137,6 +136,7 @@ def get_sort_direction(self) -> int: @dataclass class EventListResult: """Result of event list query.""" + events: list[Event] total: int skip: int @@ -147,6 +147,7 @@ class EventListResult: @dataclass class EventBrowseResult: """Result for event browsing.""" + events: list[Event] total: int skip: int @@ -156,6 +157,7 @@ class EventBrowseResult: @dataclass class EventDetail: """Detailed event information with related events.""" + event: Event related_events: list[EventSummary] = field(default_factory=list) timeline: list[EventSummary] = field(default_factory=list) @@ -176,6 +178,7 @@ class UserEventCount: @dataclass class EventStatistics: """Event statistics.""" + total_events: int events_by_type: dict[str, int] = field(default_factory=dict) events_by_service: dict[str, int] = field(default_factory=dict) @@ -190,6 +193,7 @@ class EventStatistics: @dataclass class EventProjection: """Configuration for event projections.""" + name: str pipeline: list[dict[str, Any]] output_collection: str @@ -202,6 +206,7 @@ class EventProjection: @dataclass class ArchivedEvent(Event): """Archived event with deletion metadata.""" + deleted_at: datetime | None = None deleted_by: str | None = None deletion_reason: str | None = None @@ -210,6 +215,7 @@ class ArchivedEvent(Event): @dataclass class EventReplayInfo: """Information for event replay.""" + events: list[Event] event_count: int event_types: list[str] @@ -220,6 +226,7 @@ class EventReplayInfo: @dataclass class ExecutionEventsResult: """Result of execution events query.""" + events: list[Event] access_allowed: bool include_system_events: bool @@ -231,10 +238,7 @@ def get_filtered_events(self) -> list[Event]: events = self.events if not self.include_system_events: - events = [ - e for e in events - if not e.metadata.service_name.startswith("system-") - ] + events = [e for e in events if not e.metadata.service_name.startswith("system-")] return events @@ -242,6 +246,7 @@ def get_filtered_events(self) -> list[Event]: @dataclass class EventExportRow: """Event export row for CSV.""" + event_id: str event_type: str timestamp: str @@ -256,6 +261,7 @@ class EventExportRow: @dataclass class EventAggregationResult: """Result of event aggregation.""" + results: list[dict[str, Any]] pipeline: list[dict[str, Any]] execution_time_ms: float | None = None diff --git a/backend/app/domain/events/query_builders.py b/backend/app/domain/events/query_builders.py index 5e8ade9a..69ffd158 
100644 --- a/backend/app/domain/events/query_builders.py +++ b/backend/app/domain/events/query_builders.py @@ -53,12 +53,7 @@ def size(field: str) -> dict[str, str]: @staticmethod def date_to_string(date_field: str, format: str = "%Y-%m-%d-%H") -> dict[str, Any]: """Create a $dateToString expression.""" - return { - "$dateToString": { - "format": format, - "date": date_field - } - } + return {"$dateToString": {"format": format, "date": date_field}} class EventStatsAggregation: @@ -66,69 +61,67 @@ class EventStatsAggregation: def build_overview_pipeline(start_time: datetime) -> list[dict[str, Any]]: return [ AggregationStages.match({EventFields.TIMESTAMP: {"$gte": start_time}}), - AggregationStages.group({ - "_id": None, - "total_events": AggregationStages.sum(), - "event_types": AggregationStages.add_to_set(f"${EventFields.EVENT_TYPE}"), - "unique_users": AggregationStages.add_to_set(f"${EventFields.METADATA_USER_ID}"), - "services": AggregationStages.add_to_set(f"${EventFields.METADATA_SERVICE_NAME}") - }), - AggregationStages.project({ - "_id": 0, - "total_events": 1, - "event_type_count": AggregationStages.size("$event_types"), - "unique_user_count": AggregationStages.size("$unique_users"), - "service_count": AggregationStages.size("$services") - }) + AggregationStages.group( + { + "_id": None, + "total_events": AggregationStages.sum(), + "event_types": AggregationStages.add_to_set(f"${EventFields.EVENT_TYPE}"), + "unique_users": AggregationStages.add_to_set(f"${EventFields.METADATA_USER_ID}"), + "services": AggregationStages.add_to_set(f"${EventFields.METADATA_SERVICE_NAME}"), + } + ), + AggregationStages.project( + { + "_id": 0, + "total_events": 1, + "event_type_count": AggregationStages.size("$event_types"), + "unique_user_count": AggregationStages.size("$unique_users"), + "service_count": AggregationStages.size("$services"), + } + ), ] @staticmethod def build_event_types_pipeline(start_time: datetime, limit: int = 10) -> list[dict[str, Any]]: return [ AggregationStages.match({EventFields.TIMESTAMP: {"$gte": start_time}}), - AggregationStages.group({ - "_id": f"${EventFields.EVENT_TYPE}", - "count": AggregationStages.sum() - }), + AggregationStages.group({"_id": f"${EventFields.EVENT_TYPE}", "count": AggregationStages.sum()}), AggregationStages.sort({"count": -1}), - AggregationStages.limit(limit) + AggregationStages.limit(limit), ] @staticmethod def build_hourly_events_pipeline(start_time: datetime) -> list[dict[str, Any]]: return [ AggregationStages.match({EventFields.TIMESTAMP: {"$gte": start_time}}), - AggregationStages.group({ - "_id": AggregationStages.date_to_string(f"${EventFields.TIMESTAMP}"), - "count": AggregationStages.sum() - }), - AggregationStages.sort({"_id": 1}) + AggregationStages.group( + {"_id": AggregationStages.date_to_string(f"${EventFields.TIMESTAMP}"), "count": AggregationStages.sum()} + ), + AggregationStages.sort({"_id": 1}), ] @staticmethod def build_top_users_pipeline(start_time: datetime, limit: int = 10) -> list[dict[str, Any]]: return [ AggregationStages.match({EventFields.TIMESTAMP: {"$gte": start_time}}), - AggregationStages.group({ - "_id": f"${EventFields.METADATA_USER_ID}", - "count": AggregationStages.sum() - }), + AggregationStages.group({"_id": f"${EventFields.METADATA_USER_ID}", "count": AggregationStages.sum()}), AggregationStages.sort({"count": -1}), - AggregationStages.limit(limit) + AggregationStages.limit(limit), ] @staticmethod def build_avg_duration_pipeline(start_time: datetime, event_type: str) -> list[dict[str, Any]]: return [ - 
AggregationStages.match({ - EventFields.TIMESTAMP: {"$gte": start_time}, - EventFields.EVENT_TYPE: event_type, - EventFields.PAYLOAD_DURATION_SECONDS: {"$exists": True} - }), - AggregationStages.group({ - "_id": None, - "avg_duration": AggregationStages.avg(f"${EventFields.PAYLOAD_DURATION_SECONDS}") - }) + AggregationStages.match( + { + EventFields.TIMESTAMP: {"$gte": start_time}, + EventFields.EVENT_TYPE: event_type, + EventFields.PAYLOAD_DURATION_SECONDS: {"$exists": True}, + } + ), + AggregationStages.group( + {"_id": None, "avg_duration": AggregationStages.avg(f"${EventFields.PAYLOAD_DURATION_SECONDS}")} + ), ] diff --git a/backend/app/domain/execution/__init__.py b/backend/app/domain/execution/__init__.py index 5ecff136..4b66b31c 100644 --- a/backend/app/domain/execution/__init__.py +++ b/backend/app/domain/execution/__init__.py @@ -7,12 +7,14 @@ from .models import ( DomainExecution, ExecutionResultDomain, + ResourceLimitsDomain, ResourceUsageDomain, ) __all__ = [ "DomainExecution", "ExecutionResultDomain", + "ResourceLimitsDomain", "ResourceUsageDomain", "ExecutionServiceError", "RuntimeNotSupportedError", diff --git a/backend/app/domain/execution/exceptions.py b/backend/app/domain/execution/exceptions.py index 79e56f6d..03d3b3b7 100644 --- a/backend/app/domain/execution/exceptions.py +++ b/backend/app/domain/execution/exceptions.py @@ -1,19 +1,22 @@ class ExecutionServiceError(Exception): """Base exception for execution service errors.""" + pass class RuntimeNotSupportedError(ExecutionServiceError): """Raised when requested runtime is not supported.""" + pass class EventPublishError(ExecutionServiceError): """Raised when event publishing fails.""" + pass class ExecutionNotFoundError(ExecutionServiceError): """Raised when execution is not found.""" - pass + pass diff --git a/backend/app/domain/execution/models.py b/backend/app/domain/execution/models.py index 1442d3c2..482e9f39 100644 --- a/backend/app/domain/execution/models.py +++ b/backend/app/domain/execution/models.py @@ -62,3 +62,15 @@ def from_dict(data: dict[str, Any]) -> "ResourceUsageDomain": clk_tck_hertz=int(data.get("clk_tck_hertz", 0)), peak_memory_kb=int(data.get("peak_memory_kb", 0)), ) + + +@dataclass +class ResourceLimitsDomain: + """K8s resource limits configuration.""" + + cpu_limit: str + memory_limit: str + cpu_request: str + memory_request: str + execution_timeout: int + supported_runtimes: dict[str, list[str]] diff --git a/backend/app/domain/idempotency/__init__.py b/backend/app/domain/idempotency/__init__.py index 4e995ecc..342314e5 100644 --- a/backend/app/domain/idempotency/__init__.py +++ b/backend/app/domain/idempotency/__init__.py @@ -9,4 +9,3 @@ "IdempotencyRecord", "IdempotencyStats", ] - diff --git a/backend/app/domain/rate_limit/__init__.py b/backend/app/domain/rate_limit/__init__.py index 44c8e3e8..e066a8c5 100644 --- a/backend/app/domain/rate_limit/__init__.py +++ b/backend/app/domain/rate_limit/__init__.py @@ -4,7 +4,9 @@ RateLimitConfig, RateLimitRule, RateLimitStatus, + RateLimitUpdateResult, UserRateLimit, + UserRateLimitsResult, UserRateLimitSummary, ) @@ -14,6 +16,8 @@ "RateLimitConfig", "RateLimitRule", "RateLimitStatus", + "RateLimitUpdateResult", "UserRateLimit", + "UserRateLimitsResult", "UserRateLimitSummary", ] diff --git a/backend/app/domain/rate_limit/rate_limit_models.py b/backend/app/domain/rate_limit/rate_limit_models.py index 15246d5d..59713554 100644 --- a/backend/app/domain/rate_limit/rate_limit_models.py +++ b/backend/app/domain/rate_limit/rate_limit_models.py @@ -67,7 +67,7 
@@ def get_default_config(cls) -> "RateLimitConfig": burst_multiplier=1.5, algorithm=RateLimitAlgorithm.SLIDING_WINDOW, priority=10, - enabled=True + enabled=True, ), RateLimitRule( endpoint_pattern=r"^/api/v1/admin/.*", @@ -77,7 +77,7 @@ def get_default_config(cls) -> "RateLimitConfig": burst_multiplier=2.0, algorithm=RateLimitAlgorithm.SLIDING_WINDOW, priority=5, - enabled=True + enabled=True, ), RateLimitRule( endpoint_pattern=r"^/api/v1/events/.*", @@ -87,7 +87,7 @@ def get_default_config(cls) -> "RateLimitConfig": burst_multiplier=1.0, algorithm=RateLimitAlgorithm.SLIDING_WINDOW, priority=3, - enabled=True + enabled=True, ), RateLimitRule( endpoint_pattern=r"^/api/v1/ws", @@ -97,7 +97,7 @@ def get_default_config(cls) -> "RateLimitConfig": burst_multiplier=1.0, algorithm=RateLimitAlgorithm.SLIDING_WINDOW, priority=3, - enabled=True + enabled=True, ), RateLimitRule( endpoint_pattern=r"^/api/v1/auth/.*", @@ -107,7 +107,7 @@ def get_default_config(cls) -> "RateLimitConfig": burst_multiplier=1.5, algorithm=RateLimitAlgorithm.SLIDING_WINDOW, priority=7, - enabled=True + enabled=True, ), RateLimitRule( endpoint_pattern=r"^/api/v1/.*", @@ -117,11 +117,11 @@ def get_default_config(cls) -> "RateLimitConfig": burst_multiplier=1.5, algorithm=RateLimitAlgorithm.SLIDING_WINDOW, priority=1, - enabled=True - ) + enabled=True, + ), ], global_enabled=True, - redis_ttl=3600 + redis_ttl=3600, ) @@ -142,8 +142,27 @@ class UserRateLimitSummary: Always present for callers; reflects defaults when no override exists. """ + user_id: str has_custom_limits: bool bypass_rate_limit: bool global_multiplier: float rules_count: int + + +@dataclass +class UserRateLimitsResult: + """Result of fetching user rate limits with usage stats.""" + + user_id: str + rate_limit_config: Optional[UserRateLimit] + current_usage: Dict[str, Dict[str, object]] + + +@dataclass +class RateLimitUpdateResult: + """Result of updating user rate limits.""" + + user_id: str + updated: bool + config: UserRateLimit diff --git a/backend/app/domain/replay/__init__.py b/backend/app/domain/replay/__init__.py index 10acf809..b3f36291 100644 --- a/backend/app/domain/replay/__init__.py +++ b/backend/app/domain/replay/__init__.py @@ -13,4 +13,3 @@ "ReplayOperationResult", "CleanupResult", ] - diff --git a/backend/app/domain/replay/models.py b/backend/app/domain/replay/models.py index 52bbc8cb..36195dcd 100644 --- a/backend/app/domain/replay/models.py +++ b/backend/app/domain/replay/models.py @@ -84,6 +84,7 @@ def get_progress_callback(self) -> Any: @dataclass class ReplaySessionState: """Domain replay session model used by services only.""" + session_id: str config: ReplayConfig status: ReplayStatus = ReplayStatus.CREATED diff --git a/backend/app/domain/saga/exceptions.py b/backend/app/domain/saga/exceptions.py index 1c36c753..f7080368 100644 --- a/backend/app/domain/saga/exceptions.py +++ b/backend/app/domain/saga/exceptions.py @@ -1,33 +1,40 @@ class SagaError(Exception): """Base exception for saga-related errors.""" + pass class SagaNotFoundError(SagaError): """Raised when a saga is not found.""" + pass class SagaAccessDeniedError(SagaError): """Raised when access to a saga is denied.""" + pass class SagaInvalidStateError(SagaError): """Raised when a saga operation is invalid for the current state.""" + pass class SagaCompensationError(SagaError): """Raised when saga compensation fails.""" + pass class SagaTimeoutError(SagaError): """Raised when a saga times out.""" + pass class SagaConcurrencyError(SagaError): """Raised when there's a concurrency 
conflict with saga operations.""" + pass diff --git a/backend/app/domain/saga/models.py b/backend/app/domain/saga/models.py index b30c642f..1998258b 100644 --- a/backend/app/domain/saga/models.py +++ b/backend/app/domain/saga/models.py @@ -9,6 +9,7 @@ @dataclass class Saga: """Domain model for saga.""" + saga_id: str saga_name: str execution_id: str @@ -27,6 +28,7 @@ class Saga: @dataclass class SagaFilter: """Filter criteria for saga queries.""" + state: SagaState | None = None execution_ids: list[str] | None = None user_id: str | None = None @@ -39,6 +41,7 @@ class SagaFilter: @dataclass class SagaQuery: """Query parameters for saga search.""" + filter: SagaFilter sort_by: str = "created_at" sort_order: str = "desc" @@ -49,6 +52,7 @@ class SagaQuery: @dataclass class SagaListResult: """Result of saga list query.""" + sagas: list[Saga] total: int skip: int @@ -63,6 +67,7 @@ def __post_init__(self) -> None: @dataclass class SagaDetail: """Detailed saga information.""" + saga: Saga execution_details: dict[str, Any] | None = None step_details: list[dict[str, Any]] = field(default_factory=list) @@ -71,6 +76,7 @@ class SagaDetail: @dataclass class SagaStatistics: """Saga statistics.""" + total_sagas: int sagas_by_state: dict[str, int] = field(default_factory=dict) sagas_by_name: dict[str, int] = field(default_factory=dict) @@ -83,6 +89,7 @@ class SagaStatistics: @dataclass class SagaConfig: """Configuration for saga orchestration (domain).""" + name: str timeout_seconds: int = 300 max_retries: int = 3 @@ -98,6 +105,7 @@ class SagaConfig: @dataclass class SagaInstance: """Runtime instance of a saga execution (domain).""" + saga_name: str execution_id: str state: SagaState = SagaState.CREATED diff --git a/backend/app/domain/saved_script/__init__.py b/backend/app/domain/saved_script/__init__.py index f1ded779..444470f3 100644 --- a/backend/app/domain/saved_script/__init__.py +++ b/backend/app/domain/saved_script/__init__.py @@ -9,4 +9,3 @@ "DomainSavedScriptCreate", "DomainSavedScriptUpdate", ] - diff --git a/backend/app/domain/sse/__init__.py b/backend/app/domain/sse/__init__.py index faa2c31c..3b0d7c67 100644 --- a/backend/app/domain/sse/__init__.py +++ b/backend/app/domain/sse/__init__.py @@ -1,12 +1,13 @@ from .models import ( + ShutdownStatus, SSEEventDomain, SSEExecutionStatusDomain, SSEHealthDomain, ) __all__ = [ + "ShutdownStatus", "SSEHealthDomain", "SSEExecutionStatusDomain", "SSEEventDomain", ] - diff --git a/backend/app/domain/sse/models.py b/backend/app/domain/sse/models.py index 0630b851..576637c2 100644 --- a/backend/app/domain/sse/models.py +++ b/backend/app/domain/sse/models.py @@ -2,7 +2,18 @@ from dataclasses import dataclass from datetime import datetime -from typing import Any, Dict + + +@dataclass +class ShutdownStatus: + """Status of SSE shutdown process.""" + + phase: str + initiated: bool + complete: bool + active_connections: int + draining_connections: int + duration: float | None = None @dataclass @@ -13,7 +24,7 @@ class SSEHealthDomain: active_executions: int active_consumers: int max_connections_per_user: int - shutdown: Dict[str, Any] + shutdown: ShutdownStatus timestamp: datetime @@ -27,4 +38,4 @@ class SSEExecutionStatusDomain: @dataclass class SSEEventDomain: aggregate_id: str - timestamp: Any + timestamp: datetime diff --git a/backend/app/domain/user/settings_models.py b/backend/app/domain/user/settings_models.py index f0e72f42..66f2f715 100644 --- a/backend/app/domain/user/settings_models.py +++ b/backend/app/domain/user/settings_models.py @@ -105,6 +105,7 @@ 
class DomainSettingsHistoryEntry: @dataclass class CachedSettings: """Wrapper for cached user settings with expiration time.""" + settings: DomainUserSettings expires_at: datetime diff --git a/backend/app/domain/user/user_models.py b/backend/app/domain/user/user_models.py index 9cc95b79..da91d34a 100644 --- a/backend/app/domain/user/user_models.py +++ b/backend/app/domain/user/user_models.py @@ -6,11 +6,12 @@ from app.core.utils import StringEnum from app.domain.enums.user import UserRole -EMAIL_PATTERN = re.compile(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$') +EMAIL_PATTERN = re.compile(r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$") class UserFields(StringEnum): """Database field names for users collection.""" + USER_ID = "user_id" USERNAME = "username" EMAIL = "email" @@ -24,6 +25,7 @@ class UserFields(StringEnum): class UserFilterType(StringEnum): """Types of user filters.""" + USERNAME = "username" EMAIL = "email" ROLE = "role" @@ -32,6 +34,7 @@ class UserFilterType(StringEnum): @dataclass class UserSearchFilter: """User search filter criteria.""" + search_text: str | None = None role: UserRole | None = None @@ -39,6 +42,7 @@ class UserSearchFilter: @dataclass class User: """User domain model.""" + user_id: str username: str email: str @@ -53,6 +57,7 @@ class User: @dataclass class UserUpdate: """User update domain model.""" + username: str | None = None email: str | None = None role: UserRole | None = None @@ -60,18 +65,21 @@ class UserUpdate: password: str | None = None def has_updates(self) -> bool: - return any([ - self.username is not None, - self.email is not None, - self.role is not None, - self.is_active is not None, - self.password is not None - ]) + return any( + [ + self.username is not None, + self.email is not None, + self.role is not None, + self.is_active is not None, + self.password is not None, + ] + ) @dataclass class UserListResult: """Result of listing users.""" + users: List[User] total: int offset: int @@ -81,6 +89,7 @@ class UserListResult: @dataclass class PasswordReset: """Password reset domain model.""" + user_id: str new_password: str @@ -91,6 +100,7 @@ def is_valid(self) -> bool: @dataclass class UserCreation: """User creation domain model.""" + username: str email: str password: str @@ -99,9 +109,11 @@ class UserCreation: is_superuser: bool = False def is_valid(self) -> bool: - return all([ - self.username, - self.email, - self.password and len(self.password) >= 8, - EMAIL_PATTERN.match(self.email) is not None # Proper email validation - ]) + return all( + [ + self.username, + self.email, + self.password and len(self.password) >= 8, + EMAIL_PATTERN.match(self.email) is not None, # Proper email validation + ] + ) diff --git a/backend/app/events/admin_utils.py b/backend/app/events/admin_utils.py index 4d0ce63f..3aef289a 100644 --- a/backend/app/events/admin_utils.py +++ b/backend/app/events/admin_utils.py @@ -9,19 +9,21 @@ class AdminUtils: """Minimal admin utilities using native AdminClient.""" - + def __init__(self, bootstrap_servers: str | None = None): settings = get_settings() - self._admin = AdminClient({ - 'bootstrap.servers': bootstrap_servers or settings.KAFKA_BOOTSTRAP_SERVERS, - 'client.id': 'integr8scode-admin' - }) - + self._admin = AdminClient( + { + "bootstrap.servers": bootstrap_servers or settings.KAFKA_BOOTSTRAP_SERVERS, + "client.id": "integr8scode-admin", + } + ) + @property def admin_client(self) -> AdminClient: """Get the native AdminClient instance.""" return self._admin - + async def check_topic_exists(self, topic: str) 
-> bool: """Check if topic exists.""" try: @@ -30,23 +32,21 @@ async def check_topic_exists(self, topic: str) -> bool: except Exception as e: logger.error(f"Failed to check topic {topic}: {e}") return False - + async def create_topic(self, topic: str, num_partitions: int = 1, replication_factor: int = 1) -> bool: """Create a single topic.""" try: new_topic = NewTopic(topic, num_partitions=num_partitions, replication_factor=replication_factor) futures = self._admin.create_topics([new_topic], operation_timeout=30.0) - + # Wait for result - result() returns None on success, raises exception on failure - await asyncio.get_event_loop().run_in_executor( - None, lambda: futures[topic].result(timeout=30.0) - ) + await asyncio.get_event_loop().run_in_executor(None, lambda: futures[topic].result(timeout=30.0)) logger.info(f"Topic {topic} created successfully") return True except Exception as e: logger.error(f"Failed to create topic {topic}: {e}") return False - + async def ensure_topics_exist(self, topics: List[tuple[str, int]]) -> Dict[str, bool]: """Ensure topics exist, creating them if necessary.""" results = {} @@ -56,7 +56,7 @@ async def ensure_topics_exist(self, topics: List[tuple[str, int]]) -> Dict[str, else: results[topic] = await self.create_topic(topic, partitions) return results - + def get_admin_client(self) -> AdminClient: """Get the native AdminClient for direct operations.""" return self._admin diff --git a/backend/app/events/consumer_group_monitor.py b/backend/app/events/consumer_group_monitor.py index d11b10e7..cf424834 100644 --- a/backend/app/events/consumer_group_monitor.py +++ b/backend/app/events/consumer_group_monitor.py @@ -14,6 +14,7 @@ class ConsumerGroupHealth(StringEnum): """Consumer group health status.""" + HEALTHY = "healthy" DEGRADED = "degraded" UNHEALTHY = "unhealthy" @@ -23,6 +24,7 @@ class ConsumerGroupHealth(StringEnum): @dataclass(slots=True) class ConsumerGroupMember: """Information about a consumer group member.""" + member_id: str client_id: str host: str @@ -32,6 +34,7 @@ class ConsumerGroupMember: @dataclass(slots=True) class ConsumerGroupStatus: """Comprehensive consumer group status information.""" + group_id: str state: str protocol: str @@ -65,21 +68,21 @@ def __post_init__(self) -> None: class NativeConsumerGroupMonitor: """ Enhanced consumer group monitoring using confluent-kafka native operations. - + Provides detailed consumer group health monitoring, lag tracking, and rebalancing detection using AdminClient's native capabilities. 
""" def __init__( - self, - bootstrap_servers: str | None = None, - client_id: str = "integr8scode-consumer-group-monitor", - request_timeout_ms: int = 30000, - # Health thresholds - max_rebalance_time_seconds: int = 300, # 5 minutes - critical_lag_threshold: int = 10000, - warning_lag_threshold: int = 1000, - min_members_threshold: int = 1, + self, + bootstrap_servers: str | None = None, + client_id: str = "integr8scode-consumer-group-monitor", + request_timeout_ms: int = 30000, + # Health thresholds + max_rebalance_time_seconds: int = 300, # 5 minutes + critical_lag_threshold: int = 10000, + warning_lag_threshold: int = 1000, + min_members_threshold: int = 1, ): settings = get_settings() self.bootstrap_servers = bootstrap_servers or settings.KAFKA_BOOTSTRAP_SERVERS @@ -97,10 +100,7 @@ def __init__( self._cache_ttl_seconds = 30 async def get_consumer_group_status( - self, - group_id: str, - timeout: float = 30.0, - include_lag: bool = True + self, group_id: str, timeout: float = 30.0, include_lag: bool = True ) -> ConsumerGroupStatus: """Get comprehensive status for a consumer group.""" try: @@ -123,16 +123,18 @@ async def get_consumer_group_status( for member in group_desc.members: # Parse assigned partitions assigned_partitions = [] - if member.assignment and hasattr(member.assignment, 'topic_partitions'): + if member.assignment and hasattr(member.assignment, "topic_partitions"): for tp in member.assignment.topic_partitions: assigned_partitions.append(f"{tp.topic}:{tp.partition}") - members.append(ConsumerGroupMember( - member_id=member.member_id, - client_id=member.client_id, - host=member.host, - assigned_partitions=assigned_partitions - )) + members.append( + ConsumerGroupMember( + member_id=member.member_id, + client_id=member.client_id, + host=member.host, + assigned_partitions=assigned_partitions, + ) + ) partition_distribution[member.member_id] = len(assigned_partitions) total_assigned_partitions += len(assigned_partitions) @@ -146,8 +148,8 @@ async def get_consumer_group_status( if include_lag and group_desc.state == ConsumerGroupState.STABLE: try: lag_info = await self._get_consumer_group_lag(group_id, timeout) - total_lag = lag_info.get('total_lag', 0) - partition_lags = lag_info.get('partition_lags', {}) + total_lag = lag_info.get("total_lag", 0) + partition_lags = lag_info.get("partition_lags", {}) except Exception as e: logger.warning(f"Failed to get lag info for group {group_id}: {e}") @@ -155,15 +157,15 @@ async def get_consumer_group_status( status = ConsumerGroupStatus( group_id=group_id, state=group_desc.state.name if group_desc.state else "UNKNOWN", - protocol=getattr(group_desc, 'protocol', 'unknown'), - protocol_type=getattr(group_desc, 'protocol_type', 'unknown'), + protocol=getattr(group_desc, "protocol", "unknown"), + protocol_type=getattr(group_desc, "protocol_type", "unknown"), coordinator=coordinator, members=members, member_count=len(members), assigned_partitions=total_assigned_partitions, partition_distribution=partition_distribution, total_lag=total_lag, - partition_lags=partition_lags + partition_lags=partition_lags, ) # Assess health @@ -189,23 +191,17 @@ async def get_consumer_group_status( assigned_partitions=0, partition_distribution={}, health=ConsumerGroupHealth.UNHEALTHY, - health_message=f"Failed to get group status: {e}" + health_message=f"Failed to get group status: {e}", ) async def get_multiple_group_status( - self, - group_ids: List[str], - timeout: float = 30.0, - include_lag: bool = True + self, group_ids: List[str], timeout: float = 30.0, 
include_lag: bool = True ) -> Dict[str, ConsumerGroupStatus]: """Get status for multiple consumer groups efficiently.""" results = {} # Process groups concurrently - tasks = [ - self.get_consumer_group_status(group_id, timeout, include_lag) - for group_id in group_ids - ] + tasks = [self.get_consumer_group_status(group_id, timeout, include_lag) for group_id in group_ids] try: statuses = await asyncio.gather(*tasks, return_exceptions=True) @@ -224,7 +220,7 @@ async def get_multiple_group_status( assigned_partitions=0, partition_distribution={}, health=ConsumerGroupHealth.UNHEALTHY, - health_message=str(status) + health_message=str(status), ) elif isinstance(status, ConsumerGroupStatus): results[group_id] = status @@ -244,7 +240,7 @@ async def get_multiple_group_status( assigned_partitions=0, partition_distribution={}, health=ConsumerGroupHealth.UNHEALTHY, - health_message=str(e) + health_message=str(e), ) return results @@ -261,12 +257,12 @@ async def list_consumer_groups(self, timeout: float = 10.0) -> List[str]: # Extract group IDs from result # ListConsumerGroupsResult has .valid and .errors attributes group_ids = [] - if hasattr(result, 'valid'): + if hasattr(result, "valid"): # result.valid contains a list of ConsumerGroupListing objects group_ids = [group_listing.group_id for group_listing in result.valid] # Log any errors that occurred - if hasattr(result, 'errors') and result.errors: + if hasattr(result, "errors") and result.errors: for error in result.errors: logger.warning(f"Error listing some consumer groups: {error}") @@ -295,28 +291,26 @@ async def _describe_consumer_group(self, group_id: str, timeout: float) -> Consu return group_desc except Exception as e: - if hasattr(e, 'args') and e.args and isinstance(e.args[0], KafkaError): + if hasattr(e, "args") and e.args and isinstance(e.args[0], KafkaError): kafka_err = e.args[0] - logger.error(f"Kafka error describing group {group_id}: " - f"code={kafka_err.code()}, " - f"name={kafka_err.name()}, " - f"message={kafka_err}") + logger.error( + f"Kafka error describing group {group_id}: " + f"code={kafka_err.code()}, " + f"name={kafka_err.name()}, " + f"message={kafka_err}" + ) raise ValueError(f"Failed to describe group {group_id}: {kafka_err}") raise ValueError(f"Failed to describe group {group_id}: {e}") - async def _get_consumer_group_lag( - self, - group_id: str, - timeout: float - ) -> Dict[str, Any]: + async def _get_consumer_group_lag(self, group_id: str, timeout: float) -> Dict[str, Any]: """Get consumer group lag information.""" try: # Create a temporary consumer to get lag info consumer_config = { - 'bootstrap.servers': self.bootstrap_servers, - 'group.id': f"{group_id}-lag-monitor-{datetime.now().timestamp()}", - 'enable.auto.commit': False, - 'auto.offset.reset': 'earliest' + "bootstrap.servers": self.bootstrap_servers, + "group.id": f"{group_id}-lag-monitor-{datetime.now().timestamp()}", + "enable.auto.commit": False, + "auto.offset.reset": "earliest", } consumer = Consumer(consumer_config) @@ -328,12 +322,12 @@ async def _get_consumer_group_lag( # Extract topics from member assignments topics = set() for member in group_desc.members: - if member.assignment and hasattr(member.assignment, 'topic_partitions'): + if member.assignment and hasattr(member.assignment, "topic_partitions"): for tp in member.assignment.topic_partitions: topics.add(tp.topic) if not topics: - return {'total_lag': 0, 'partition_lags': {}} + return {"total_lag": 0, "partition_lags": {}} # Get topic metadata to find all partitions metadata = await 
asyncio.to_thread(consumer.list_topics, timeout=timeout) @@ -351,16 +345,12 @@ async def _get_consumer_group_lag( try: # Get high water mark low, high = await asyncio.to_thread( - consumer.get_watermark_offsets, - TopicPartition(topic, partition_id), - timeout=timeout + consumer.get_watermark_offsets, TopicPartition(topic, partition_id), timeout=timeout ) # Get committed offset for the group committed = await asyncio.to_thread( - consumer.committed, - [TopicPartition(topic, partition_id)], - timeout=timeout + consumer.committed, [TopicPartition(topic, partition_id)], timeout=timeout ) if committed and len(committed) > 0: @@ -374,17 +364,14 @@ async def _get_consumer_group_lag( logger.debug(f"Failed to get lag for {topic}:{partition_id}: {e}") continue - return { - 'total_lag': total_lag, - 'partition_lags': partition_lags - } + return {"total_lag": total_lag, "partition_lags": partition_lags} finally: consumer.close() except Exception as e: logger.warning(f"Failed to get consumer group lag for {group_id}: {e}") - return {'total_lag': 0, 'partition_lags': {}} + return {"total_lag": 0, "partition_lags": {}} def _assess_group_health(self, status: ConsumerGroupStatus) -> tuple[ConsumerGroupHealth, str]: """Assess the health of a consumer group based on its status.""" @@ -436,7 +423,7 @@ def get_health_summary(self, status: ConsumerGroupStatus) -> Dict[str, Any]: "total_lag": status.total_lag, "coordinator": status.coordinator, "timestamp": status.timestamp.isoformat() if status.timestamp else None, - "partition_distribution": status.partition_distribution + "partition_distribution": status.partition_distribution, } def clear_cache(self) -> None: @@ -444,8 +431,5 @@ def clear_cache(self) -> None: self._group_status_cache.clear() -def create_consumer_group_monitor( - bootstrap_servers: str | None = None, - **kwargs: Any -) -> NativeConsumerGroupMonitor: +def create_consumer_group_monitor(bootstrap_servers: str | None = None, **kwargs: Any) -> NativeConsumerGroupMonitor: return NativeConsumerGroupMonitor(bootstrap_servers=bootstrap_servers, **kwargs) diff --git a/backend/app/events/core/consumer.py b/backend/app/events/core/consumer.py index 27c943a2..2a482ac3 100644 --- a/backend/app/events/core/consumer.py +++ b/backend/app/events/core/consumer.py @@ -2,6 +2,7 @@ import json from collections.abc import Awaitable, Callable from datetime import datetime, timezone +from typing import Any from confluent_kafka import OFFSET_BEGINNING, OFFSET_END, Consumer, Message, TopicPartition from confluent_kafka.error import KafkaError @@ -17,15 +18,15 @@ from app.settings import get_settings from .dispatcher import EventDispatcher -from .types import ConsumerConfig, ConsumerMetrics, ConsumerState +from .types import ConsumerConfig, ConsumerMetrics, ConsumerMetricsSnapshot, ConsumerState, ConsumerStatus class UnifiedConsumer: def __init__( - self, - config: ConsumerConfig, - event_dispatcher: EventDispatcher, - stats_callback: Callable[[dict], None] | None = None + self, + config: ConsumerConfig, + event_dispatcher: EventDispatcher, + stats_callback: Callable[[dict[str, Any]], None] | None = None, ): self._config = config self._schema_registry = SchemaRegistryManager() @@ -41,14 +42,11 @@ def __init__( self._topic_prefix = get_settings().KAFKA_TOPIC_PREFIX async def start(self, topics: list[KafkaTopic]) -> None: - self._state = ( - self._state if self._state != ConsumerState.STOPPED - else ConsumerState.STARTING - ) + self._state = self._state if self._state != ConsumerState.STOPPED else ConsumerState.STARTING 
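# A minimal sketch of the lag calculation used in _get_consumer_group_lag above: per partition,
# lag is the high watermark minus the group's committed offset. The topic, partition and timeout
# values are assumed placeholders, and the temporary Consumer is created by the caller.
from confluent_kafka import Consumer, TopicPartition

def partition_lag(consumer: Consumer, topic: str, partition: int, timeout: float = 5.0) -> int:
    # get_watermark_offsets returns (low, high); high is the next offset to be written.
    _low, high = consumer.get_watermark_offsets(TopicPartition(topic, partition), timeout=timeout)
    # committed returns the group's committed position for the requested partitions.
    committed = consumer.committed([TopicPartition(topic, partition)], timeout=timeout)
    if committed and committed[0].offset >= 0:
        return max(high - committed[0].offset, 0)
    return 0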
consumer_config = self._config.to_consumer_config() if self._stats_callback: - consumer_config['stats_cb'] = self._handle_stats + consumer_config["stats_cb"] = self._handle_stats self._consumer = Consumer(consumer_config) topic_strings = [f"{self._topic_prefix}{str(topic)}" for topic in topics] @@ -102,16 +100,19 @@ async def _consume_loop(self) -> None: self._metrics.processing_errors += 1 else: message_count += 1 - logger.debug(f"Message received from topic {msg.topic()}, " - f"partition {msg.partition()}, offset {msg.offset()}") + logger.debug( + f"Message received from topic {msg.topic()}, partition {msg.partition()}, offset {msg.offset()}" + ) await self._process_message(msg) if not self._config.enable_auto_commit: await asyncio.to_thread(self._consumer.commit, msg) else: await asyncio.sleep(0.01) - - logger.warning(f"Consumer loop ended for group {self._config.group_id}: " - f"running={self._running}, consumer={self._consumer is not None}") + + logger.warning( + f"Consumer loop ended for group {self._config.group_id}: " + f"running={self._running}, consumer={self._consumer is not None}" + ) async def _process_message(self, message: Message) -> None: topic = message.topic() @@ -162,18 +163,13 @@ async def _process_message(self, message: Message) -> None: self._metrics.bytes_consumed += len(raw_value) self._metrics.last_message_time = datetime.now(timezone.utc) # Record Kafka consumption metrics - self._event_metrics.record_kafka_message_consumed( - topic=topic, - consumer_group=self._config.group_id - ) + self._event_metrics.record_kafka_message_consumed(topic=topic, consumer_group=self._config.group_id) except Exception as e: logger.error(f"Dispatcher error for event {event.event_type}: {e}") self._metrics.processing_errors += 1 # Record Kafka consumption error self._event_metrics.record_kafka_consumption_error( - topic=topic, - consumer_group=self._config.group_id, - error_type=type(e).__name__ + topic=topic, consumer_group=self._config.group_id, error_type=type(e).__name__ ) if self._error_callback: await self._error_callback(e, event) @@ -184,15 +180,15 @@ def register_error_callback(self, callback: Callable[[Exception, BaseEvent], Awa def _handle_stats(self, stats_json: str) -> None: stats = json.loads(stats_json) - self._metrics.messages_consumed = stats.get('rxmsgs', 0) - self._metrics.bytes_consumed = stats.get('rxmsg_bytes', 0) + self._metrics.messages_consumed = stats.get("rxmsgs", 0) + self._metrics.bytes_consumed = stats.get("rxmsg_bytes", 0) - topics = stats.get('topics', {}) + topics = stats.get("topics", {}) self._metrics.consumer_lag = sum( - partition_stats.get('consumer_lag', 0) + partition_stats.get("consumer_lag", 0) for topic_stats in topics.values() - for partition_stats in topic_stats.get('partitions', {}).values() - if partition_stats.get('consumer_lag', 0) >= 0 + for partition_stats in topic_stats.get("partitions", {}).values() + if partition_stats.get("consumer_lag", 0) >= 0 ) self._metrics.last_updated = datetime.now(timezone.utc) @@ -214,28 +210,24 @@ def is_running(self) -> bool: def consumer(self) -> Consumer | None: return self._consumer - def get_status(self) -> dict: - return { - "state": self._state.value, - "is_running": self.is_running, - "group_id": self._config.group_id, - "client_id": self._config.client_id, - "metrics": { - "messages_consumed": self._metrics.messages_consumed, - "bytes_consumed": self._metrics.bytes_consumed, - "consumer_lag": self._metrics.consumer_lag, - "commit_failures": self._metrics.commit_failures, - "processing_errors": 
self._metrics.processing_errors, - "last_message_time": ( - self._metrics.last_message_time.isoformat() - if self._metrics.last_message_time else None - ), - "last_updated": ( - self._metrics.last_updated.isoformat() - if self._metrics.last_updated else None + def get_status(self) -> ConsumerStatus: + return ConsumerStatus( + state=self._state.value, + is_running=self.is_running, + group_id=self._config.group_id, + client_id=self._config.client_id, + metrics=ConsumerMetricsSnapshot( + messages_consumed=self._metrics.messages_consumed, + bytes_consumed=self._metrics.bytes_consumed, + consumer_lag=self._metrics.consumer_lag, + commit_failures=self._metrics.commit_failures, + processing_errors=self._metrics.processing_errors, + last_message_time=( + self._metrics.last_message_time.isoformat() if self._metrics.last_message_time else None ), - } - } + last_updated=self._metrics.last_updated.isoformat() if self._metrics.last_updated else None, + ), + ) async def seek_to_beginning(self) -> None: self._seek_all_partitions(OFFSET_BEGINNING) diff --git a/backend/app/events/core/dispatcher.py b/backend/app/events/core/dispatcher.py index 5cc0e1e3..1727922b 100644 --- a/backend/app/events/core/dispatcher.py +++ b/backend/app/events/core/dispatcher.py @@ -1,20 +1,21 @@ import asyncio from collections import defaultdict from collections.abc import Awaitable, Callable -from typing import TypeVar +from typing import TypeAlias, TypeVar from app.core.logging import logger from app.domain.enums.events import EventType from app.infrastructure.kafka.events.base import BaseEvent from app.infrastructure.kafka.mappings import get_event_class_for_type -T = TypeVar('T', bound=BaseEvent) +T = TypeVar("T", bound=BaseEvent) +EventHandler: TypeAlias = Callable[[BaseEvent], Awaitable[None]] class EventDispatcher: """ Type-safe event dispatcher with automatic routing. - + This dispatcher eliminates the need for manual if/elif routing by maintaining a direct mapping from event types to their handlers. """ @@ -37,32 +38,32 @@ def __init__(self) -> None: def _build_topic_mapping(self) -> None: """Build mapping of topics to event types based on event classes.""" for event_class in BaseEvent.__subclasses__(): - if hasattr(event_class, 'topic'): + if hasattr(event_class, "topic"): topic = str(event_class.topic) self._topic_event_types[topic].add(event_class) logger.debug(f"Mapped {event_class.__name__} to topic {topic}") - def register(self, event_type: EventType) -> Callable: + def register(self, event_type: EventType) -> Callable[[EventHandler], EventHandler]: """ Decorator for registering type-safe event handlers. - + Usage: @dispatcher.register(EventType.EXECUTION_REQUESTED) async def handle_execution(event: ExecutionRequestedEvent) -> None: # Handler logic here """ - def decorator(handler: Callable[[BaseEvent], Awaitable[None]]) -> Callable: + def decorator(handler: EventHandler) -> EventHandler: logger.info(f"Registering handler '{handler.__name__}' for event type '{event_type.value}'") self._handlers[event_type].append(handler) return handler return decorator - def register_handler(self, event_type: EventType, handler: Callable[[BaseEvent], Awaitable[None]]) -> None: + def register_handler(self, event_type: EventType, handler: EventHandler) -> None: """ Direct registration method for handlers. 
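# A hypothetical registration sketch for EventDispatcher: the decorator form and the direct
# register_handler form are equivalent; the handler names and bodies here are examples only.
dispatcher = EventDispatcher()

@dispatcher.register(EventType.EXECUTION_REQUESTED)
async def handle_execution_requested(event: BaseEvent) -> None:
    logger.info(f"Execution requested: {event.event_id}")

async def audit_execution_failure(event: BaseEvent) -> None:
    logger.debug(f"Audit: {event.event_type} ({event.event_id})")

dispatcher.register_handler(EventType.EXECUTION_FAILED, audit_execution_failure)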
- + Args: event_type: The event type this handler processes handler: The async handler function @@ -70,14 +71,14 @@ def register_handler(self, event_type: EventType, handler: Callable[[BaseEvent], logger.info(f"Registering handler '{handler.__name__}' for event type '{event_type.value}'") self._handlers[event_type].append(handler) - def remove_handler(self, event_type: EventType, handler: Callable[[BaseEvent], Awaitable[None]]) -> bool: + def remove_handler(self, event_type: EventType, handler: EventHandler) -> bool: """ Remove a specific handler for an event type. - + Args: event_type: The event type to remove handler from handler: The handler function to remove - + Returns: True if handler was found and removed, False otherwise """ @@ -93,15 +94,16 @@ def remove_handler(self, event_type: EventType, handler: Callable[[BaseEvent], A async def dispatch(self, event: BaseEvent) -> None: """ Dispatch an event to all registered handlers for its type. - + Args: event: The event to dispatch """ event_type = event.event_type handlers = self._handlers.get(event_type, []) logger.debug(f"Dispatcher has {len(self._handlers)} event types registered") - logger.debug(f"For event type {event_type}, found {len(handlers)} handlers: " - f"{[h.__class__.__name__ for h in handlers]}") + logger.debug( + f"For event type {event_type}, found {len(handlers)} handlers: {[h.__class__.__name__ for h in handlers]}" + ) if not handlers: self._event_metrics[event_type]["skipped"] += 1 @@ -124,29 +126,28 @@ async def dispatch(self, event: BaseEvent) -> None: else: self._event_metrics[event_type]["processed"] += 1 - async def _execute_handler(self, handler: Callable, event: BaseEvent) -> None: + async def _execute_handler(self, handler: EventHandler, event: BaseEvent) -> None: """ Execute a single handler with error handling. - + Args: handler: The handler function event: The event to process """ try: logger.debug(f"Executing handler {handler.__class__.__name__} for event {event.event_id}") - result = await handler(event) - logger.debug(f"Handler {handler.__class__.__name__} completed, result: {result}") + await handler(event) + logger.debug(f"Handler {handler.__class__.__name__} completed") except Exception as e: logger.error( - f"Handler '{handler.__class__.__name__}' failed for event {event.event_id}: {e}", - exc_info=True + f"Handler '{handler.__class__.__name__}' failed for event {event.event_id}: {e}", exc_info=True ) raise def get_topics_for_registered_handlers(self) -> set[str]: """ Get all topics that have registered handlers. 
- + Returns: Set of topic names that should be subscribed to """ @@ -154,16 +155,13 @@ def get_topics_for_registered_handlers(self) -> set[str]: for event_type in self._handlers.keys(): # Find event class for this type event_class = get_event_class_for_type(event_type) - if event_class and hasattr(event_class, 'topic'): + if event_class and hasattr(event_class, "topic"): topics.add(str(event_class.topic)) return topics def get_metrics(self) -> dict[str, dict[str, int]]: """Get processing metrics for all event types.""" - return { - event_type.value: metrics - for event_type, metrics in self._event_metrics.items() - } + return {event_type.value: metrics for event_type, metrics in self._event_metrics.items()} def clear_handlers(self) -> None: """Clear all registered handlers (useful for testing).""" diff --git a/backend/app/events/core/dlq_handler.py b/backend/app/events/core/dlq_handler.py index 50ab0001..a674b5a7 100644 --- a/backend/app/events/core/dlq_handler.py +++ b/backend/app/events/core/dlq_handler.py @@ -7,18 +7,16 @@ def create_dlq_error_handler( - producer: UnifiedProducer, - original_topic: str, - max_retries: int = 3 + producer: UnifiedProducer, original_topic: str, max_retries: int = 3 ) -> Callable[[Exception, BaseEvent], Awaitable[None]]: """ Create an error handler that sends failed events to DLQ. - + Args: producer: The Kafka producer to use for sending to DLQ original_topic: The topic where the event originally failed max_retries: Maximum number of retries before sending to DLQ - + Returns: An async error handler function suitable for UnifiedConsumer.register_error_callback """ @@ -28,7 +26,7 @@ def create_dlq_error_handler( async def handle_error_with_dlq(error: Exception, event: BaseEvent) -> None: """ Handle processing errors by sending to DLQ after max retries. - + Args: error: The exception that occurred event: The event that failed processing @@ -40,23 +38,16 @@ async def handle_error_with_dlq(error: Exception, event: BaseEvent) -> None: retry_counts[event_id] = retry_count + 1 logger.error( - f"Error processing event {event_id} ({event.event_type}): {error}. " - f"Retry {retry_count + 1}/{max_retries}", - exc_info=True + f"Error processing event {event_id} ({event.event_type}): {error}. Retry {retry_count + 1}/{max_retries}", + exc_info=True, ) # Send to DLQ if we've exceeded max retries if retry_count >= max_retries: - logger.warning( - f"Event {event_id} exceeded max retries ({max_retries}). " - f"Sending to DLQ." - ) + logger.warning(f"Event {event_id} exceeded max retries ({max_retries}). Sending to DLQ.") await producer.send_to_dlq( - original_event=event, - original_topic=original_topic, - error=error, - retry_count=retry_count + original_event=event, original_topic=original_topic, error=error, retry_count=retry_count ) # Clear retry count for this event @@ -70,18 +61,17 @@ async def handle_error_with_dlq(error: Exception, event: BaseEvent) -> None: def create_immediate_dlq_handler( - producer: UnifiedProducer, - original_topic: str + producer: UnifiedProducer, original_topic: str ) -> Callable[[Exception, BaseEvent], Awaitable[None]]: """ Create an error handler that immediately sends failed events to DLQ. - + This is useful for critical errors where retry won't help. 
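# A hypothetical wiring sketch: pair create_dlq_error_handler above with a consumer's error
# callback so events that exhaust their retries are forwarded to the DLQ. The topic name and
# the `producer`/`consumer` instances are assumed to exist in the caller's setup.
dlq_handler = create_dlq_error_handler(
    producer=producer, original_topic="execution-requests", max_retries=3
)
consumer.register_error_callback(dlq_handler)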
- + Args: producer: The Kafka producer to use for sending to DLQ original_topic: The topic where the event originally failed - + Returns: An async error handler function suitable for UnifiedConsumer.register_error_callback """ @@ -89,7 +79,7 @@ def create_immediate_dlq_handler( async def handle_error_immediate_dlq(error: Exception, event: BaseEvent) -> None: """ Handle processing errors by immediately sending to DLQ. - + Args: error: The exception that occurred event: The event that failed processing @@ -97,14 +87,9 @@ async def handle_error_immediate_dlq(error: Exception, event: BaseEvent) -> None logger.error( f"Critical error processing event {event.event_id} ({event.event_type}): {error}. " f"Sending immediately to DLQ.", - exc_info=True + exc_info=True, ) - await producer.send_to_dlq( - original_event=event, - original_topic=original_topic, - error=error, - retry_count=0 - ) + await producer.send_to_dlq(original_event=event, original_topic=original_topic, error=error, retry_count=0) return handle_error_immediate_dlq diff --git a/backend/app/events/core/producer.py b/backend/app/events/core/producer.py index d7353b71..ab5241b1 100644 --- a/backend/app/events/core/producer.py +++ b/backend/app/events/core/producer.py @@ -24,10 +24,10 @@ class UnifiedProducer(LifecycleEnabled): def __init__( - self, - config: ProducerConfig, - schema_registry_manager: SchemaRegistryManager, - stats_callback: StatsCallback | None = None + self, + config: ProducerConfig, + schema_registry_manager: SchemaRegistryManager, + stats_callback: StatsCallback | None = None, ): self._config = config self._schema_registry = schema_registry_manager @@ -37,7 +37,7 @@ def __init__( self._running = False self._metrics = ProducerMetrics() self._event_metrics = get_event_metrics() # Singleton for Kafka metrics - self._poll_task: asyncio.Task | None = None + self._poll_task: asyncio.Task[None] | None = None # Topic prefix (for tests/local isolation); cached on init self._topic_prefix = get_settings().KAFKA_TOPIC_PREFIX @@ -65,8 +65,7 @@ def _handle_delivery(self, error: KafkaError | None, message: Message) -> None: # Record Kafka production error topic = message.topic() if message else None self._event_metrics.record_kafka_production_error( - topic=topic if topic is not None else "unknown", - error_type=str(error.code()) if error else "unknown" + topic=topic if topic is not None else "unknown", error_type=str(error.code()) ) logger.error(f"Message delivery failed: {error}") else: @@ -79,18 +78,18 @@ def _handle_delivery(self, error: KafkaError | None, message: Message) -> None: def _handle_stats(self, stats_json: str) -> None: try: stats = json.loads(stats_json) - self._metrics.queue_size = stats.get('msg_cnt', 0) + self._metrics.queue_size = stats.get("msg_cnt", 0) - topics = stats.get('topics', {}) + topics = stats.get("topics", {}) total_messages = 0 total_latency = 0 for topic_stats in topics.values(): - partitions = topic_stats.get('partitions', {}) + partitions = topic_stats.get("partitions", {}) for partition_stats in partitions.values(): - msg_cnt = partition_stats.get('msgq_cnt', 0) + msg_cnt = partition_stats.get("msgq_cnt", 0) total_messages += msg_cnt - latency = partition_stats.get('rtt', {}).get('avg', 0) + latency = partition_stats.get("rtt", {}).get("avg", 0) if latency > 0 and msg_cnt > 0: total_latency += latency * msg_cnt @@ -111,8 +110,8 @@ async def start(self) -> None: logger.info("Starting producer...") producer_config = self._config.to_producer_config() - producer_config['stats_cb'] = 
self._handle_stats - producer_config['statistics.interval.ms'] = 30000 + producer_config["stats_cb"] = self._handle_stats + producer_config["statistics.interval.ms"] = 30000 self._producer = Producer(producer_config) self._running = True @@ -139,7 +138,7 @@ def get_status(self) -> dict[str, Any]: "avg_latency_ms": self._metrics.avg_latency_ms, "last_error": self._metrics.last_error, "last_error_time": self._metrics.last_error_time.isoformat() if self._metrics.last_error_time else None, - } + }, } async def stop(self) -> None: @@ -173,14 +172,11 @@ async def _poll_loop(self) -> None: logger.info("Producer poll loop ended") async def produce( - self, - event_to_produce: BaseEvent, - key: str | None = None, - headers: dict[str, str] | None = None + self, event_to_produce: BaseEvent, key: str | None = None, headers: dict[str, str] | None = None ) -> None: """ Produce a message to Kafka. - + Args: event_to_produce: Message value (BaseEvent) N.B. each instance of BaseEvent has .topic classvar, returning type of KafkaTopic @@ -200,7 +196,7 @@ async def produce( value=serialized_value, key=key.encode() if isinstance(key, str) else key, headers=[(k, v.encode()) for k, v in headers.items()] if headers else None, - callback=self._handle_delivery + callback=self._handle_delivery, ) # Record Kafka metrics @@ -209,15 +205,11 @@ async def produce( logger.debug(f"Message [{event_to_produce}] queued for topic: {topic}") async def send_to_dlq( - self, - original_event: BaseEvent, - original_topic: str, - error: Exception, - retry_count: int = 0 + self, original_event: BaseEvent, original_topic: str, error: Exception, retry_count: int = 0 ) -> None: """ Send a failed event to the Dead Letter Queue. - + Args: original_event: The event that failed processing original_topic: The topic where the event originally failed @@ -231,7 +223,7 @@ async def send_to_dlq( try: # Get producer ID (hostname + task name) current_task = asyncio.current_task() - task_name = current_task.get_name() if current_task else 'main' + task_name = current_task.get_name() if current_task else "main" producer_id = f"{socket.gethostname()}-{task_name}" # Create DLQ message @@ -240,7 +232,7 @@ async def send_to_dlq( original_topic=original_topic, error=str(error), producer_id=producer_id, - retry_count=retry_count + retry_count=retry_count, ) # Create DLQ event wrapper @@ -257,7 +249,7 @@ async def send_to_dlq( } # Serialize as JSON (DLQ uses JSON format for flexibility) - serialized_value = json.dumps(dlq_event_data).encode('utf-8') + serialized_value = json.dumps(dlq_event_data).encode("utf-8") # Send to DLQ topic self._producer.produce( @@ -269,7 +261,7 @@ async def send_to_dlq( ("error_type", type(error).__name__.encode()), ("retry_count", str(retry_count).encode()), ], - callback=self._handle_delivery + callback=self._handle_delivery, ) # Record metrics @@ -287,8 +279,6 @@ async def send_to_dlq( except Exception as e: # If we can't send to DLQ, log critically but don't crash logger.critical( - f"Failed to send event {original_event.event_id} to DLQ: {e}. " - f"Original error: {error}", - exc_info=True + f"Failed to send event {original_event.event_id} to DLQ: {e}. 
Original error: {error}", exc_info=True ) self._metrics.messages_failed += 1 diff --git a/backend/app/events/core/types.py b/backend/app/events/core/types.py index cd590430..259622d7 100644 --- a/backend/app/events/core/types.py +++ b/backend/app/events/core/types.py @@ -7,6 +7,7 @@ class ProducerState(StringEnum): """Kafka producer state enumeration.""" + STOPPED = "stopped" STARTING = "starting" RUNNING = "running" @@ -16,6 +17,7 @@ class ProducerState(StringEnum): class ConsumerState(StringEnum): """Kafka consumer state enumeration.""" + STOPPED = "stopped" STARTING = "starting" RUNNING = "running" @@ -45,16 +47,16 @@ class ProducerConfig: def to_producer_config(self) -> dict[str, Any]: """Convert to Confluent Kafka producer configuration.""" return { - 'bootstrap.servers': self.bootstrap_servers, - 'client.id': self.client_id, - 'batch.size': self.batch_size, - 'linger.ms': self.linger_ms, - 'compression.type': self.compression_type, - 'request.timeout.ms': self.request_timeout_ms, - 'retries': self.retries, - 'enable.idempotence': self.enable_idempotence, - 'acks': self.acks, - 'max.in.flight.requests.per.connection': self.max_in_flight_requests_per_connection, + "bootstrap.servers": self.bootstrap_servers, + "client.id": self.client_id, + "batch.size": self.batch_size, + "linger.ms": self.linger_ms, + "compression.type": self.compression_type, + "request.timeout.ms": self.request_timeout_ms, + "retries": self.retries, + "enable.idempotence": self.enable_idempotence, + "acks": self.acks, + "max.in.flight.requests.per.connection": self.max_in_flight_requests_per_connection, } @@ -86,17 +88,17 @@ class ConsumerConfig: def to_consumer_config(self) -> dict[str, object]: """Convert to Confluent Kafka consumer configuration.""" return { - 'bootstrap.servers': self.bootstrap_servers, - 'group.id': self.group_id, - 'client.id': self.client_id, - 'auto.offset.reset': self.auto_offset_reset, - 'enable.auto.commit': self.enable_auto_commit, - 'session.timeout.ms': self.session_timeout_ms, - 'heartbeat.interval.ms': self.heartbeat_interval_ms, - 'max.poll.interval.ms': self.max_poll_interval_ms, - 'fetch.min.bytes': self.fetch_min_bytes, - 'fetch.wait.max.ms': self.fetch_max_wait_ms, - 'statistics.interval.ms': self.statistics_interval_ms, + "bootstrap.servers": self.bootstrap_servers, + "group.id": self.group_id, + "client.id": self.client_id, + "auto.offset.reset": self.auto_offset_reset, + "enable.auto.commit": self.enable_auto_commit, + "session.timeout.ms": self.session_timeout_ms, + "heartbeat.interval.ms": self.heartbeat_interval_ms, + "max.poll.interval.ms": self.max_poll_interval_ms, + "fetch.min.bytes": self.fetch_min_bytes, + "fetch.wait.max.ms": self.fetch_max_wait_ms, + "statistics.interval.ms": self.statistics_interval_ms, } @@ -140,3 +142,27 @@ class ConsumerMetrics: def __post_init__(self) -> None: """Initialize timestamps if not provided.""" self.last_updated = self.last_updated or datetime.now(timezone.utc) + + +@dataclass(slots=True) +class ConsumerMetricsSnapshot: + """Snapshot of consumer metrics for status reporting.""" + + messages_consumed: int + bytes_consumed: int + consumer_lag: int + commit_failures: int + processing_errors: int + last_message_time: str | None + last_updated: str | None + + +@dataclass(slots=True) +class ConsumerStatus: + """Consumer status information.""" + + state: str + is_running: bool + group_id: str + client_id: str + metrics: ConsumerMetricsSnapshot diff --git a/backend/app/events/event_store.py b/backend/app/events/event_store.py index 
a0040d59..b3d175eb 100644 --- a/backend/app/events/event_store.py +++ b/backend/app/events/event_store.py @@ -3,10 +3,10 @@ from datetime import datetime, timezone from typing import Any, Dict, List -from motor.motor_asyncio import AsyncIOMotorCollection, AsyncIOMotorCursor, AsyncIOMotorDatabase from pymongo import ASCENDING, DESCENDING, IndexModel from pymongo.errors import BulkWriteError, DuplicateKeyError +from app.core.database_context import Collection, Cursor, Database from app.core.logging import logger from app.core.metrics.context import get_event_metrics from app.core.tracing import EventAttributes @@ -18,18 +18,18 @@ class EventStore: def __init__( - self, - db: AsyncIOMotorDatabase, - schema_registry: SchemaRegistryManager, - collection_name: str = "events", - ttl_days: int = 90, - batch_size: int = 100, + self, + db: Database, + schema_registry: SchemaRegistryManager, + collection_name: str = "events", + ttl_days: int = 90, + batch_size: int = 100, ): self.db = db self.metrics = get_event_metrics() self.schema_registry = schema_registry self.collection_name = collection_name - self.collection: AsyncIOMotorCollection = db[collection_name] + self.collection: Collection = db[collection_name] self.ttl_days = ttl_days self.batch_size = batch_size self._initialized = False @@ -54,11 +54,13 @@ async def initialize(self) -> None: IndexModel([("execution_id", ASCENDING), ("timestamp", ASCENDING)]), IndexModel("metadata.correlation_id"), IndexModel("metadata.service_name"), - IndexModel([ - ("event_type", ASCENDING), - ("metadata.user_id", ASCENDING), - ("timestamp", DESCENDING), - ]), + IndexModel( + [ + ("event_type", ASCENDING), + ("metadata.user_id", ASCENDING), + ("timestamp", DESCENDING), + ] + ), IndexModel( "timestamp", expireAfterSeconds=self.ttl_days * 24 * 60 * 60, @@ -154,12 +156,12 @@ async def get_event(self, event_id: str) -> BaseEvent | None: return event async def get_events_by_type( - self, - event_type: EventType, - start_time: datetime | None = None, - end_time: datetime | None = None, - limit: int = 100, - offset: int = 0, + self, + event_type: EventType, + start_time: datetime | None = None, + end_time: datetime | None = None, + limit: int = 100, + offset: int = 0, ) -> List[BaseEvent]: start = asyncio.get_event_loop().time() q: Dict[str, Any] = {"event_type": str(event_type)} @@ -173,9 +175,9 @@ async def get_events_by_type( return events async def get_execution_events( - self, - execution_id: str, - event_types: List[EventType] | None = None, + self, + execution_id: str, + event_types: List[EventType] | None = None, ) -> List[BaseEvent]: start = asyncio.get_event_loop().time() q: Dict[str, Any] = {"execution_id": execution_id} @@ -189,12 +191,12 @@ async def get_execution_events( return events async def get_user_events( - self, - user_id: str, - event_types: List[EventType] | None = None, - start_time: datetime | None = None, - end_time: datetime | None = None, - limit: int = 100, + self, + user_id: str, + event_types: List[EventType] | None = None, + start_time: datetime | None = None, + end_time: datetime | None = None, + limit: int = 100, ) -> List[BaseEvent]: start = asyncio.get_event_loop().time() q: Dict[str, Any] = {"metadata.user_id": str(user_id)} @@ -210,11 +212,11 @@ async def get_user_events( return events async def get_security_events( - self, - start_time: datetime | None = None, - end_time: datetime | None = None, - user_id: str | None = None, - limit: int = 100, + self, + start_time: datetime | None = None, + end_time: datetime | None = None, + 
user_id: str | None = None, + limit: int = 100, ) -> List[BaseEvent]: start = asyncio.get_event_loop().time() q: Dict[str, Any] = {"event_type": {"$in": self._SECURITY_TYPES}} @@ -239,11 +241,11 @@ async def get_correlation_chain(self, correlation_id: str) -> List[BaseEvent]: return events async def replay_events( - self, - start_time: datetime, - end_time: datetime | None = None, - event_types: List[EventType] | None = None, - callback: Callable[[BaseEvent], Awaitable[None]] | None = None, + self, + start_time: datetime, + end_time: datetime | None = None, + event_types: List[EventType] | None = None, + callback: Callable[[BaseEvent], Awaitable[None]] | None = None, ) -> int: start = asyncio.get_event_loop().time() count = 0 @@ -271,9 +273,9 @@ async def replay_events( return count async def get_event_stats( - self, - start_time: datetime | None = None, - end_time: datetime | None = None, + self, + start_time: datetime | None = None, + end_time: datetime | None = None, ) -> Dict[str, Any]: pipeline: List[Dict[str, Any]] = [] if start_time or end_time: @@ -284,15 +286,19 @@ async def get_event_stats( match.setdefault("timestamp", {})["$lte"] = end_time pipeline.append({"$match": match}) - pipeline.extend([ - {"$group": { - "_id": "$event_type", - "count": {"$sum": 1}, - "first_event": {"$min": "$timestamp"}, - "last_event": {"$max": "$timestamp"}, - }}, - {"$sort": {"count": -1}}, - ]) + pipeline.extend( + [ + { + "$group": { + "_id": "$event_type", + "count": {"$sum": 1}, + "first_event": {"$min": "$timestamp"}, + "last_event": {"$max": "$timestamp"}, + } + }, + {"$sort": {"count": -1}}, + ] + ) cursor = self.collection.aggregate(pipeline) stats: Dict[str, Any] = {"total_events": 0, "event_types": {}, "start_time": start_time, "end_time": end_time} @@ -307,7 +313,7 @@ async def get_event_stats( stats["total_events"] += c return stats - async def _deserialize_cursor(self, cursor: AsyncIOMotorCursor) -> list[BaseEvent]: + async def _deserialize_cursor(self, cursor: Cursor) -> list[BaseEvent]: return [self.schema_registry.deserialize_json(doc) async for doc in cursor] def _time_range(self, start_time: datetime | None, end_time: datetime | None) -> Dict[str, Any] | None: @@ -321,12 +327,12 @@ def _time_range(self, start_time: datetime | None, end_time: datetime | None) -> return tr async def _find_events( - self, - query: Dict[str, Any], - *, - sort: tuple[str, int], - limit: int | None = None, - offset: int = 0, + self, + query: Dict[str, Any], + *, + sort: tuple[str, int], + limit: int | None = None, + offset: int = 0, ) -> List[BaseEvent]: cur = self.collection.find(query, self._PROJECTION).sort(*sort).skip(offset) if limit is not None: @@ -349,11 +355,11 @@ async def health_check(self) -> Dict[str, Any]: def create_event_store( - db: AsyncIOMotorDatabase, - schema_registry: SchemaRegistryManager, - collection_name: str = "events", - ttl_days: int = 90, - batch_size: int = 100, + db: Database, + schema_registry: SchemaRegistryManager, + collection_name: str = "events", + ttl_days: int = 90, + batch_size: int = 100, ) -> EventStore: return EventStore( db=db, diff --git a/backend/app/events/event_store_consumer.py b/backend/app/events/event_store_consumer.py index cdd1058d..9276a476 100644 --- a/backend/app/events/event_store_consumer.py +++ b/backend/app/events/event_store_consumer.py @@ -18,14 +18,14 @@ class EventStoreConsumer(LifecycleEnabled): """Consumes events from Kafka and stores them in MongoDB.""" def __init__( - self, - event_store: EventStore, - topics: list[KafkaTopic], - 
schema_registry_manager: SchemaRegistryManager, - producer: UnifiedProducer | None = None, - group_id: GroupId = GroupId.EVENT_STORE_CONSUMER, - batch_size: int = 100, - batch_timeout_seconds: float = 5.0, + self, + event_store: EventStore, + topics: list[KafkaTopic], + schema_registry_manager: SchemaRegistryManager, + producer: UnifiedProducer | None = None, + group_id: GroupId = GroupId.EVENT_STORE_CONSUMER, + batch_size: int = 100, + batch_timeout_seconds: float = 5.0, ): self.event_store = event_store self.topics = topics @@ -39,7 +39,7 @@ def __init__( self._batch_buffer: list[BaseEvent] = [] self._batch_lock = asyncio.Lock() self._last_batch_time = asyncio.get_event_loop().time() - self._batch_task: asyncio.Task | None = None + self._batch_task: asyncio.Task[None] | None = None self._running = False async def start(self) -> None: @@ -52,13 +52,10 @@ async def start(self) -> None: bootstrap_servers=settings.KAFKA_BOOTSTRAP_SERVERS, group_id=f"{self.group_id}.{settings.KAFKA_GROUP_SUFFIX}", enable_auto_commit=False, - max_poll_records=self.batch_size + max_poll_records=self.batch_size, ) - self.consumer = UnifiedConsumer( - config, - event_dispatcher=self.dispatcher - ) + self.consumer = UnifiedConsumer(config, event_dispatcher=self.dispatcher) # Register handler for all event types - store everything for event_type in EventType: @@ -70,7 +67,7 @@ async def start(self) -> None: dlq_handler = create_dlq_error_handler( producer=self.producer, original_topic="event-store", # Generic topic name for event store - max_retries=3 + max_retries=3, ) self.consumer.register_error_callback(dlq_handler) else: @@ -117,10 +114,7 @@ async def _handle_event(self, event: BaseEvent) -> None: async def _handle_error_with_event(self, error: Exception, event: BaseEvent) -> None: """Handle processing errors with event context.""" - logger.error( - f"Error processing event {event.event_id} ({event.event_type}): {error}", - exc_info=True - ) + logger.error(f"Error processing event {event.event_id} ({event.event_type}): {error}", exc_info=True) async def _batch_processor(self) -> None: """Periodically flush batches based on timeout.""" @@ -131,8 +125,7 @@ async def _batch_processor(self) -> None: async with self._batch_lock: time_since_last_batch = asyncio.get_event_loop().time() - self._last_batch_time - if (self._batch_buffer and - time_since_last_batch >= self.batch_timeout): + if self._batch_buffer and time_since_last_batch >= self.batch_timeout: await self._flush_batch() except Exception as e: @@ -162,13 +155,13 @@ async def _flush_batch(self) -> None: def create_event_store_consumer( - event_store: EventStore, - topics: list[KafkaTopic], - schema_registry_manager: SchemaRegistryManager, - producer: UnifiedProducer | None = None, - group_id: GroupId = GroupId.EVENT_STORE_CONSUMER, - batch_size: int = 100, - batch_timeout_seconds: float = 5.0, + event_store: EventStore, + topics: list[KafkaTopic], + schema_registry_manager: SchemaRegistryManager, + producer: UnifiedProducer | None = None, + group_id: GroupId = GroupId.EVENT_STORE_CONSUMER, + batch_size: int = 100, + batch_timeout_seconds: float = 5.0, ) -> EventStoreConsumer: return EventStoreConsumer( event_store=event_store, @@ -177,5 +170,5 @@ def create_event_store_consumer( batch_size=batch_size, batch_timeout_seconds=batch_timeout_seconds, schema_registry_manager=schema_registry_manager, - producer=producer + producer=producer, ) diff --git a/backend/app/events/metadata.py b/backend/app/events/metadata.py index 17393fe7..f73470a4 100644 --- 
a/backend/app/events/metadata.py +++ b/backend/app/events/metadata.py @@ -2,14 +2,14 @@ from uuid import uuid4 from pydantic import ConfigDict, Field -from pydantic_avro import AvroBase +from pydantic_avro import AvroBase # type: ignore[attr-defined] from app.domain.enums.common import Environment class EventMetadata(AvroBase): """Unified event metadata for auditing and tracing.""" - + service_name: str service_version: str correlation_id: str = Field(default_factory=lambda: str(uuid4())) @@ -17,16 +17,12 @@ class EventMetadata(AvroBase): ip_address: str | None = None user_agent: str | None = None environment: Environment = Environment.PRODUCTION - - model_config = ConfigDict( - extra="allow", - str_strip_whitespace=True, - use_enum_values=True - ) - + + model_config = ConfigDict(extra="allow", str_strip_whitespace=True, use_enum_values=True) + def to_dict(self, exclude_none: bool = True) -> Dict[str, Any]: return self.model_dump(exclude_none=exclude_none) - + @classmethod def from_dict(cls, data: Dict[str, Any]) -> "EventMetadata": return cls( @@ -36,18 +32,16 @@ def from_dict(cls, data: Dict[str, Any]) -> "EventMetadata": user_id=data.get("user_id"), ip_address=data.get("ip_address"), user_agent=data.get("user_agent"), - environment=data.get("environment", Environment.PRODUCTION) + environment=data.get("environment", Environment.PRODUCTION), ) - - + def with_correlation(self, correlation_id: str) -> "EventMetadata": return self.model_copy(update={"correlation_id": correlation_id}) - + def with_user(self, user_id: str) -> "EventMetadata": return self.model_copy(update={"user_id": user_id}) - + def ensure_correlation_id(self) -> "EventMetadata": if self.correlation_id: return self return self.model_copy(update={"correlation_id": str(uuid4())}) - diff --git a/backend/app/events/schema/schema_registry.py b/backend/app/events/schema/schema_registry.py index 12e3923e..09a036c5 100644 --- a/backend/app/events/schema/schema_registry.py +++ b/backend/app/events/schema/schema_registry.py @@ -124,9 +124,7 @@ def serialize_event(self, event: BaseEvent) -> bytes: schema_str = json.dumps(event.__class__.avro_schema(namespace=self.namespace)) # Use record_subject_name_strategy to ensure subject is based on record name, not topic self._serializers[subject_key] = AvroSerializer( - self.client, - schema_str, - conf={'subject.name.strategy': record_subject_name_strategy} + self.client, schema_str, conf={"subject.name.strategy": record_subject_name_strategy} ) # Prepare payload dict (exclude event_type: schema id implies the concrete record) @@ -201,8 +199,13 @@ def set_compatibility(self, subject: str, mode: str) -> None: Valid: BACKWARD, FORWARD, FULL, NONE, BACKWARD_TRANSITIVE, FORWARD_TRANSITIVE, FULL_TRANSITIVE """ valid_modes = { - "BACKWARD", "FORWARD", "FULL", "NONE", - "BACKWARD_TRANSITIVE", "FORWARD_TRANSITIVE", "FULL_TRANSITIVE", + "BACKWARD", + "FORWARD", + "FULL", + "NONE", + "BACKWARD_TRANSITIVE", + "FORWARD_TRANSITIVE", + "FULL_TRANSITIVE", } if mode not in valid_modes: raise ValueError(f"Invalid compatibility mode: {mode}") diff --git a/backend/app/infrastructure/kafka/events/base.py b/backend/app/infrastructure/kafka/events/base.py index d20bb5c4..e48e75ec 100644 --- a/backend/app/infrastructure/kafka/events/base.py +++ b/backend/app/infrastructure/kafka/events/base.py @@ -3,7 +3,7 @@ from uuid import uuid4 from pydantic import ConfigDict, Field, field_serializer -from pydantic_avro import AvroBase +from pydantic_avro import AvroBase # type: ignore[attr-defined] from app.domain.enums.events 
import EventType from app.domain.enums.kafka import KafkaTopic @@ -12,6 +12,7 @@ class BaseEvent(AvroBase): """Base class for all events.""" + event_id: str = Field(default_factory=lambda: str(uuid4())) event_type: EventType event_version: str = "1.0" @@ -24,13 +25,13 @@ class BaseEvent(AvroBase): model_config = ConfigDict() - @field_serializer('timestamp', when_used='json') + @field_serializer("timestamp", when_used="json") def serialize_timestamp(self, dt: datetime) -> str: return dt.isoformat() def to_dict(self) -> dict[str, Any]: # Use mode='json' to properly serialize datetime objects to ISO strings - return self.model_dump(by_alias=True, mode='json') + return self.model_dump(by_alias=True, mode="json") def to_json(self) -> str: return self.model_dump_json(by_alias=True) diff --git a/backend/app/infrastructure/kafka/events/execution.py b/backend/app/infrastructure/kafka/events/execution.py index 7c891697..f596d03a 100644 --- a/backend/app/infrastructure/kafka/events/execution.py +++ b/backend/app/infrastructure/kafka/events/execution.py @@ -43,7 +43,7 @@ class ExecutionRequestedEvent(BaseEvent): "memory_limit": "128Mi", "cpu_request": "50m", "memory_request": "64Mi", - "priority": 5 + "priority": 5, } } ) @@ -130,7 +130,7 @@ class ExecutionCancelledEvent(BaseEvent): "execution_id": "550e8400-e29b-41d4-a716-446655440000", "reason": "user_requested", "cancelled_by": "user123", - "force_terminated": False + "force_terminated": False, } } ) diff --git a/backend/app/infrastructure/kafka/events/metadata.py b/backend/app/infrastructure/kafka/events/metadata.py index 14d60c20..f73470a4 100644 --- a/backend/app/infrastructure/kafka/events/metadata.py +++ b/backend/app/infrastructure/kafka/events/metadata.py @@ -2,14 +2,14 @@ from uuid import uuid4 from pydantic import ConfigDict, Field -from pydantic_avro import AvroBase +from pydantic_avro import AvroBase # type: ignore[attr-defined] from app.domain.enums.common import Environment class EventMetadata(AvroBase): """Unified event metadata for auditing and tracing.""" - + service_name: str service_version: str correlation_id: str = Field(default_factory=lambda: str(uuid4())) @@ -17,16 +17,12 @@ class EventMetadata(AvroBase): ip_address: str | None = None user_agent: str | None = None environment: Environment = Environment.PRODUCTION - - model_config = ConfigDict( - extra="allow", - str_strip_whitespace=True, - use_enum_values=True - ) - + + model_config = ConfigDict(extra="allow", str_strip_whitespace=True, use_enum_values=True) + def to_dict(self, exclude_none: bool = True) -> Dict[str, Any]: return self.model_dump(exclude_none=exclude_none) - + @classmethod def from_dict(cls, data: Dict[str, Any]) -> "EventMetadata": return cls( @@ -36,15 +32,15 @@ def from_dict(cls, data: Dict[str, Any]) -> "EventMetadata": user_id=data.get("user_id"), ip_address=data.get("ip_address"), user_agent=data.get("user_agent"), - environment=data.get("environment", Environment.PRODUCTION) + environment=data.get("environment", Environment.PRODUCTION), ) - + def with_correlation(self, correlation_id: str) -> "EventMetadata": return self.model_copy(update={"correlation_id": correlation_id}) - + def with_user(self, user_id: str) -> "EventMetadata": return self.model_copy(update={"user_id": user_id}) - + def ensure_correlation_id(self) -> "EventMetadata": if self.correlation_id: return self diff --git a/backend/app/infrastructure/kafka/mappings.py b/backend/app/infrastructure/kafka/mappings.py index 9a6b5500..431f1c86 100644 --- 
a/backend/app/infrastructure/kafka/mappings.py +++ b/backend/app/infrastructure/kafka/mappings.py @@ -84,7 +84,6 @@ def get_event_class_for_type(event_type: EventType) -> Type[BaseEvent] | None: EventType.EXECUTION_FAILED: ExecutionFailedEvent, EventType.EXECUTION_TIMEOUT: ExecutionTimeoutEvent, EventType.EXECUTION_CANCELLED: ExecutionCancelledEvent, - # Pod events EventType.POD_CREATED: PodCreatedEvent, EventType.POD_SCHEDULED: PodScheduledEvent, @@ -93,7 +92,6 @@ def get_event_class_for_type(event_type: EventType) -> Type[BaseEvent] | None: EventType.POD_FAILED: PodFailedEvent, EventType.POD_TERMINATED: PodTerminatedEvent, EventType.POD_DELETED: PodDeletedEvent, - # User events EventType.USER_REGISTERED: UserRegisteredEvent, EventType.USER_LOGGED_IN: UserLoggedInEvent, @@ -104,7 +102,6 @@ def get_event_class_for_type(event_type: EventType) -> Type[BaseEvent] | None: EventType.USER_THEME_CHANGED: UserThemeChangedEvent, EventType.USER_NOTIFICATION_SETTINGS_UPDATED: UserNotificationSettingsUpdatedEvent, EventType.USER_EDITOR_SETTINGS_UPDATED: UserEditorSettingsUpdatedEvent, - # Notification events EventType.NOTIFICATION_CREATED: NotificationCreatedEvent, EventType.NOTIFICATION_SENT: NotificationSentEvent, @@ -112,30 +109,24 @@ def get_event_class_for_type(event_type: EventType) -> Type[BaseEvent] | None: EventType.NOTIFICATION_FAILED: NotificationFailedEvent, EventType.NOTIFICATION_READ: NotificationReadEvent, EventType.NOTIFICATION_CLICKED: NotificationClickedEvent, - # Script events EventType.SCRIPT_SAVED: ScriptSavedEvent, EventType.SCRIPT_DELETED: ScriptDeletedEvent, EventType.SCRIPT_SHARED: ScriptSharedEvent, - # Security events EventType.SECURITY_VIOLATION: SecurityViolationEvent, EventType.RATE_LIMIT_EXCEEDED: RateLimitExceededEvent, EventType.AUTH_FAILED: AuthFailedEvent, - # Resource events EventType.RESOURCE_LIMIT_EXCEEDED: ResourceLimitExceededEvent, EventType.QUOTA_EXCEEDED: QuotaExceededEvent, - # System events EventType.SYSTEM_ERROR: SystemErrorEvent, EventType.SERVICE_UNHEALTHY: ServiceUnhealthyEvent, EventType.SERVICE_RECOVERED: ServiceRecoveredEvent, - # Result events EventType.RESULT_STORED: ResultStoredEvent, EventType.RESULT_FAILED: ResultFailedEvent, - # Saga events EventType.SAGA_STARTED: SagaStartedEvent, EventType.SAGA_COMPLETED: SagaCompletedEvent, @@ -143,14 +134,13 @@ def get_event_class_for_type(event_type: EventType) -> Type[BaseEvent] | None: EventType.SAGA_CANCELLED: SagaCancelledEvent, EventType.SAGA_COMPENSATING: SagaCompensatingEvent, EventType.SAGA_COMPENSATED: SagaCompensatedEvent, - # Saga command events EventType.CREATE_POD_COMMAND: CreatePodCommandEvent, EventType.DELETE_POD_COMMAND: DeletePodCommandEvent, EventType.ALLOCATE_RESOURCES_COMMAND: AllocateResourcesCommandEvent, EventType.RELEASE_RESOURCES_COMMAND: ReleaseResourcesCommandEvent, } - + return event_map.get(event_type) @@ -160,7 +150,7 @@ def get_topic_for_event(event_type: EventType) -> KafkaTopic: event_class = get_event_class_for_type(event_type) if event_class: return event_class.topic - + # Default fallback return KafkaTopic.SYSTEM_EVENTS diff --git a/backend/app/infrastructure/kafka/topics.py b/backend/app/infrastructure/kafka/topics.py index 0fae304a..389c4121 100644 --- a/backend/app/infrastructure/kafka/topics.py +++ b/backend/app/infrastructure/kafka/topics.py @@ -18,7 +18,7 @@ def get_topic_configs() -> dict[KafkaTopic, dict[str, Any]]: "config": { "retention.ms": "604800000", # 7 days "compression.type": "gzip", - } + }, }, KafkaTopic.EXECUTION_COMPLETED: { "num_partitions": 10, @@ 
-26,7 +26,7 @@ def get_topic_configs() -> dict[KafkaTopic, dict[str, Any]]: "config": { "retention.ms": "604800000", # 7 days "compression.type": "gzip", - } + }, }, KafkaTopic.EXECUTION_FAILED: { "num_partitions": 10, @@ -34,7 +34,7 @@ def get_topic_configs() -> dict[KafkaTopic, dict[str, Any]]: "config": { "retention.ms": "604800000", # 7 days "compression.type": "gzip", - } + }, }, KafkaTopic.EXECUTION_TIMEOUT: { "num_partitions": 10, @@ -42,7 +42,7 @@ def get_topic_configs() -> dict[KafkaTopic, dict[str, Any]]: "config": { "retention.ms": "604800000", # 7 days "compression.type": "gzip", - } + }, }, KafkaTopic.EXECUTION_REQUESTS: { "num_partitions": 10, @@ -50,7 +50,7 @@ def get_topic_configs() -> dict[KafkaTopic, dict[str, Any]]: "config": { "retention.ms": "604800000", # 7 days "compression.type": "gzip", - } + }, }, KafkaTopic.EXECUTION_COMMANDS: { "num_partitions": 10, @@ -58,7 +58,7 @@ def get_topic_configs() -> dict[KafkaTopic, dict[str, Any]]: "config": { "retention.ms": "86400000", # 1 day "compression.type": "gzip", - } + }, }, KafkaTopic.EXECUTION_TASKS: { "num_partitions": 10, @@ -66,9 +66,8 @@ def get_topic_configs() -> dict[KafkaTopic, dict[str, Any]]: "config": { "retention.ms": "86400000", # 1 day "compression.type": "gzip", - } + }, }, - # Pod lifecycle topics KafkaTopic.POD_EVENTS: { "num_partitions": 10, @@ -76,7 +75,7 @@ def get_topic_configs() -> dict[KafkaTopic, dict[str, Any]]: "config": { "retention.ms": "86400000", # 1 day "compression.type": "gzip", - } + }, }, KafkaTopic.POD_STATUS_UPDATES: { "num_partitions": 10, @@ -84,7 +83,7 @@ def get_topic_configs() -> dict[KafkaTopic, dict[str, Any]]: "config": { "retention.ms": "86400000", # 1 day "compression.type": "gzip", - } + }, }, KafkaTopic.POD_RESULTS: { "num_partitions": 10, @@ -92,9 +91,8 @@ def get_topic_configs() -> dict[KafkaTopic, dict[str, Any]]: "config": { "retention.ms": "604800000", # 7 days "compression.type": "gzip", - } + }, }, - # Result topics KafkaTopic.EXECUTION_RESULTS: { "num_partitions": 10, @@ -102,9 +100,8 @@ def get_topic_configs() -> dict[KafkaTopic, dict[str, Any]]: "config": { "retention.ms": "604800000", # 7 days "compression.type": "gzip", - } + }, }, - # User topics KafkaTopic.USER_EVENTS: { "num_partitions": 5, @@ -112,7 +109,7 @@ def get_topic_configs() -> dict[KafkaTopic, dict[str, Any]]: "config": { "retention.ms": "2592000000", # 30 days "compression.type": "gzip", - } + }, }, KafkaTopic.USER_NOTIFICATIONS: { "num_partitions": 5, @@ -120,7 +117,7 @@ def get_topic_configs() -> dict[KafkaTopic, dict[str, Any]]: "config": { "retention.ms": "604800000", # 7 days "compression.type": "gzip", - } + }, }, KafkaTopic.USER_SETTINGS_EVENTS: { "num_partitions": 3, @@ -128,7 +125,7 @@ def get_topic_configs() -> dict[KafkaTopic, dict[str, Any]]: "config": { "retention.ms": "2592000000", # 30 days "compression.type": "gzip", - } + }, }, KafkaTopic.USER_SETTINGS_THEME_EVENTS: { "num_partitions": 3, @@ -136,7 +133,7 @@ def get_topic_configs() -> dict[KafkaTopic, dict[str, Any]]: "config": { "retention.ms": "2592000000", # 30 days "compression.type": "gzip", - } + }, }, KafkaTopic.USER_SETTINGS_NOTIFICATION_EVENTS: { "num_partitions": 3, @@ -144,7 +141,7 @@ def get_topic_configs() -> dict[KafkaTopic, dict[str, Any]]: "config": { "retention.ms": "2592000000", # 30 days "compression.type": "gzip", - } + }, }, KafkaTopic.USER_SETTINGS_EDITOR_EVENTS: { "num_partitions": 3, @@ -152,9 +149,8 @@ def get_topic_configs() -> dict[KafkaTopic, dict[str, Any]]: "config": { "retention.ms": "2592000000", # 30 
days "compression.type": "gzip", - } + }, }, - # Script topics KafkaTopic.SCRIPT_EVENTS: { "num_partitions": 3, @@ -162,9 +158,8 @@ def get_topic_configs() -> dict[KafkaTopic, dict[str, Any]]: "config": { "retention.ms": "2592000000", # 30 days "compression.type": "gzip", - } + }, }, - # Security topics KafkaTopic.SECURITY_EVENTS: { "num_partitions": 5, @@ -172,9 +167,8 @@ def get_topic_configs() -> dict[KafkaTopic, dict[str, Any]]: "config": { "retention.ms": "2592000000", # 30 days "compression.type": "gzip", - } + }, }, - # Resource topics KafkaTopic.RESOURCE_EVENTS: { "num_partitions": 5, @@ -182,9 +176,8 @@ def get_topic_configs() -> dict[KafkaTopic, dict[str, Any]]: "config": { "retention.ms": "604800000", # 7 days "compression.type": "gzip", - } + }, }, - # Notification topics KafkaTopic.NOTIFICATION_EVENTS: { "num_partitions": 5, @@ -192,9 +185,8 @@ def get_topic_configs() -> dict[KafkaTopic, dict[str, Any]]: "config": { "retention.ms": "604800000", # 7 days "compression.type": "gzip", - } + }, }, - # System topics KafkaTopic.SYSTEM_EVENTS: { "num_partitions": 5, @@ -202,9 +194,8 @@ def get_topic_configs() -> dict[KafkaTopic, dict[str, Any]]: "config": { "retention.ms": "604800000", # 7 days "compression.type": "gzip", - } + }, }, - # Saga topics KafkaTopic.SAGA_EVENTS: { "num_partitions": 5, @@ -212,9 +203,8 @@ def get_topic_configs() -> dict[KafkaTopic, dict[str, Any]]: "config": { "retention.ms": "604800000", # 7 days "compression.type": "gzip", - } + }, }, - # Infrastructure topics KafkaTopic.DEAD_LETTER_QUEUE: { "num_partitions": 3, @@ -222,7 +212,7 @@ def get_topic_configs() -> dict[KafkaTopic, dict[str, Any]]: "config": { "retention.ms": "1209600000", # 14 days "compression.type": "gzip", - } + }, }, KafkaTopic.EVENT_BUS_STREAM: { "num_partitions": 10, @@ -230,7 +220,7 @@ def get_topic_configs() -> dict[KafkaTopic, dict[str, Any]]: "config": { "retention.ms": "86400000", # 1 day "compression.type": "gzip", - } + }, }, KafkaTopic.WEBSOCKET_EVENTS: { "num_partitions": 5, @@ -238,6 +228,6 @@ def get_topic_configs() -> dict[KafkaTopic, dict[str, Any]]: "config": { "retention.ms": "86400000", # 1 day "compression.type": "gzip", - } + }, }, } diff --git a/backend/app/infrastructure/mappers/admin_mapper.py b/backend/app/infrastructure/mappers/admin_mapper.py index 10c9c008..fa192b16 100644 --- a/backend/app/infrastructure/mappers/admin_mapper.py +++ b/backend/app/infrastructure/mappers/admin_mapper.py @@ -26,7 +26,7 @@ ) from app.schemas_pydantic.user import User as ServiceUser -EMAIL_PATTERN = re.compile(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$') +EMAIL_PATTERN = re.compile(r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$") class UserMapper: @@ -41,7 +41,7 @@ def to_mongo_document(user: DomainAdminUser) -> Dict[str, Any]: UserFields.IS_SUPERUSER: user.is_superuser, UserFields.HASHED_PASSWORD: user.hashed_password, UserFields.CREATED_AT: user.created_at, - UserFields.UPDATED_AT: user.updated_at + UserFields.UPDATED_AT: user.updated_at, } @staticmethod @@ -64,7 +64,7 @@ def from_mongo_document(data: Dict[str, Any]) -> DomainAdminUser: is_superuser=data.get(UserFields.IS_SUPERUSER, False), hashed_password=data.get(UserFields.HASHED_PASSWORD, ""), created_at=data.get(UserFields.CREATED_AT, datetime.now(timezone.utc)), - updated_at=data.get(UserFields.UPDATED_AT, datetime.now(timezone.utc)) + updated_at=data.get(UserFields.UPDATED_AT, datetime.now(timezone.utc)), ) @staticmethod @@ -80,7 +80,7 @@ def to_response_dict(user: DomainAdminUser) -> Dict[str, Any]: "is_active": 
user.is_active, "is_superuser": user.is_superuser, "created_at": created_at_ts, - "updated_at": updated_at_ts + "updated_at": updated_at_ts, } @staticmethod @@ -136,7 +136,7 @@ def user_creation_to_dict(creation: UserCreation) -> Dict[str, Any]: UserFields.IS_ACTIVE: creation.is_active, UserFields.IS_SUPERUSER: creation.is_superuser, UserFields.CREATED_AT: datetime.now(timezone.utc), - UserFields.UPDATED_AT: datetime.now(timezone.utc) + UserFields.UPDATED_AT: datetime.now(timezone.utc), } @@ -148,7 +148,7 @@ def to_dict(result: UserListResult) -> Dict[str, Any]: "users": [user_mapper.to_response_dict(user) for user in result.users], "total": result.total, "offset": result.offset, - "limit": result.limit + "limit": result.limit, } @@ -159,7 +159,7 @@ def execution_limits_to_dict(limits: ExecutionLimits) -> dict[str, int]: "max_timeout_seconds": limits.max_timeout_seconds, "max_memory_mb": limits.max_memory_mb, "max_cpu_cores": limits.max_cpu_cores, - "max_concurrent_executions": limits.max_concurrent_executions + "max_concurrent_executions": limits.max_concurrent_executions, } @staticmethod @@ -170,7 +170,7 @@ def execution_limits_from_dict(data: dict[str, Any] | None) -> ExecutionLimits: max_timeout_seconds=data.get("max_timeout_seconds", 300), max_memory_mb=data.get("max_memory_mb", 512), max_cpu_cores=data.get("max_cpu_cores", 2), - max_concurrent_executions=data.get("max_concurrent_executions", 10) + max_concurrent_executions=data.get("max_concurrent_executions", 10), ) @staticmethod @@ -179,7 +179,7 @@ def security_settings_to_dict(settings: SecuritySettings) -> dict[str, int]: "password_min_length": settings.password_min_length, "session_timeout_minutes": settings.session_timeout_minutes, "max_login_attempts": settings.max_login_attempts, - "lockout_duration_minutes": settings.lockout_duration_minutes + "lockout_duration_minutes": settings.lockout_duration_minutes, } @staticmethod @@ -190,7 +190,7 @@ def security_settings_from_dict(data: dict[str, Any] | None) -> SecuritySettings password_min_length=data.get("password_min_length", 8), session_timeout_minutes=data.get("session_timeout_minutes", 60), max_login_attempts=data.get("max_login_attempts", 5), - lockout_duration_minutes=data.get("lockout_duration_minutes", 15) + lockout_duration_minutes=data.get("lockout_duration_minutes", 15), ) @staticmethod @@ -199,7 +199,7 @@ def monitoring_settings_to_dict(settings: MonitoringSettings) -> dict[str, Any]: "metrics_retention_days": settings.metrics_retention_days, "log_level": settings.log_level.value, "enable_tracing": settings.enable_tracing, - "sampling_rate": settings.sampling_rate + "sampling_rate": settings.sampling_rate, } @staticmethod @@ -210,7 +210,7 @@ def monitoring_settings_from_dict(data: dict[str, Any] | None) -> MonitoringSett metrics_retention_days=data.get("metrics_retention_days", 30), log_level=LogLevel(data.get("log_level", LogLevel.INFO)), enable_tracing=data.get("enable_tracing", True), - sampling_rate=data.get("sampling_rate", 0.1) + sampling_rate=data.get("sampling_rate", 0.1), ) @staticmethod @@ -221,7 +221,7 @@ def system_settings_to_dict(settings: SystemSettings) -> dict[str, Any]: SettingsFields.SECURITY_SETTINGS: mapper.security_settings_to_dict(settings.security_settings), SettingsFields.MONITORING_SETTINGS: mapper.monitoring_settings_to_dict(settings.monitoring_settings), SettingsFields.CREATED_AT: settings.created_at, - SettingsFields.UPDATED_AT: settings.updated_at + SettingsFields.UPDATED_AT: settings.updated_at, } @staticmethod @@ -234,7 +234,7 @@ def 
system_settings_from_dict(data: dict[str, Any] | None) -> SystemSettings: security_settings=mapper.security_settings_from_dict(data.get(SettingsFields.SECURITY_SETTINGS)), monitoring_settings=mapper.monitoring_settings_from_dict(data.get(SettingsFields.MONITORING_SETTINGS)), created_at=data.get(SettingsFields.CREATED_AT, datetime.now(timezone.utc)), - updated_at=data.get(SettingsFields.UPDATED_AT, datetime.now(timezone.utc)) + updated_at=data.get(SettingsFields.UPDATED_AT, datetime.now(timezone.utc)), ) @staticmethod @@ -243,7 +243,7 @@ def system_settings_to_pydantic_dict(settings: SystemSettings) -> dict[str, Any] return { "execution_limits": mapper.execution_limits_to_dict(settings.execution_limits), "security_settings": mapper.security_settings_to_dict(settings.security_settings), - "monitoring_settings": mapper.monitoring_settings_to_dict(settings.monitoring_settings) + "monitoring_settings": mapper.monitoring_settings_to_dict(settings.monitoring_settings), } @staticmethod @@ -265,7 +265,7 @@ def to_dict(entry: AuditLogEntry) -> dict[str, Any]: AuditLogFields.USER_ID: entry.user_id, AuditLogFields.USERNAME: entry.username, AuditLogFields.CHANGES: entry.changes, - "reason": entry.reason # reason is not in the enum but used as additional field + "reason": entry.reason, # reason is not in the enum but used as additional field } @staticmethod @@ -276,5 +276,5 @@ def from_dict(data: dict[str, Any]) -> AuditLogEntry: user_id=data[AuditLogFields.USER_ID], username=data.get(AuditLogFields.USERNAME, ""), changes=data.get(AuditLogFields.CHANGES, {}), - reason=data.get("reason", "") + reason=data.get("reason", ""), ) diff --git a/backend/app/infrastructure/mappers/dlq_mapper.py b/backend/app/infrastructure/mappers/dlq_mapper.py index 3f9d3b22..9d1d20eb 100644 --- a/backend/app/infrastructure/mappers/dlq_mapper.py +++ b/backend/app/infrastructure/mappers/dlq_mapper.py @@ -188,11 +188,11 @@ def batch_retry_result_to_dict(result: DLQBatchRetryResult) -> dict[str, object] # Domain construction and updates @staticmethod def from_failed_event( - event: BaseEvent, - original_topic: str, - error: str, - producer_id: str, - retry_count: int = 0, + event: BaseEvent, + original_topic: str, + error: str, + producer_id: str, + retry_count: int = 0, ) -> DLQMessage: return DLQMessage( event=event, diff --git a/backend/app/infrastructure/mappers/event_mapper.py b/backend/app/infrastructure/mappers/event_mapper.py index d1b7b9d4..6ea1e790 100644 --- a/backend/app/infrastructure/mappers/event_mapper.py +++ b/backend/app/infrastructure/mappers/event_mapper.py @@ -22,7 +22,7 @@ class EventMapper: """Handles all Event serialization/deserialization.""" - + @staticmethod def to_mongo_document(event: Event) -> dict[str, Any]: """Convert domain event to MongoDB document.""" @@ -32,9 +32,9 @@ def to_mongo_document(event: Event) -> dict[str, Any]: EventFields.EVENT_VERSION: event.event_version, EventFields.TIMESTAMP: event.timestamp, EventFields.METADATA: event.metadata.to_dict(), - EventFields.PAYLOAD: event.payload + EventFields.PAYLOAD: event.payload, } - + if event.aggregate_id is not None: doc[EventFields.AGGREGATE_ID] = event.aggregate_id if event.stored_at is not None: @@ -45,23 +45,31 @@ def to_mongo_document(event: Event) -> dict[str, Any]: doc[EventFields.STATUS] = event.status if event.error is not None: doc[EventFields.ERROR] = event.error - + return doc - + @staticmethod def from_mongo_document(document: dict[str, Any]) -> Event: """Create domain event from MongoDB document.""" # Define base event fields 
that should NOT be in payload base_fields = { - EventFields.EVENT_ID, EventFields.EVENT_TYPE, EventFields.EVENT_VERSION, - EventFields.TIMESTAMP, EventFields.METADATA, EventFields.AGGREGATE_ID, - EventFields.STORED_AT, EventFields.TTL_EXPIRES_AT, EventFields.STATUS, - EventFields.ERROR, "_id", "stored_at" + EventFields.EVENT_ID, + EventFields.EVENT_TYPE, + EventFields.EVENT_VERSION, + EventFields.TIMESTAMP, + EventFields.METADATA, + EventFields.AGGREGATE_ID, + EventFields.STORED_AT, + EventFields.TTL_EXPIRES_AT, + EventFields.STATUS, + EventFields.ERROR, + "_id", + "stored_at", } - + # Extract all non-base fields as payload payload = {k: v for k, v in document.items() if k not in base_fields} - + return Event( event_id=document[EventFields.EVENT_ID], event_type=document[EventFields.EVENT_TYPE], @@ -73,9 +81,9 @@ def from_mongo_document(document: dict[str, Any]) -> Event: stored_at=document.get(EventFields.STORED_AT), ttl_expires_at=document.get(EventFields.TTL_EXPIRES_AT), status=document.get(EventFields.STATUS), - error=document.get(EventFields.ERROR) + error=document.get(EventFields.ERROR), ) - + @staticmethod def to_dict(event: Event) -> dict[str, Any]: """Convert event to API response dictionary.""" @@ -85,9 +93,9 @@ def to_dict(event: Event) -> dict[str, Any]: "event_version": event.event_version, "timestamp": event.timestamp, "metadata": event.metadata.to_dict(), - "payload": event.payload + "payload": event.payload, } - + if event.aggregate_id is not None: result["aggregate_id"] = event.aggregate_id if event.correlation_id: @@ -100,9 +108,9 @@ def to_dict(event: Event) -> dict[str, Any]: result["status"] = event.status if event.error is not None: result["error"] = event.error - + return result - + @staticmethod def from_dict(data: dict[str, Any]) -> Event: """Create event from API request dictionary.""" @@ -117,52 +125,52 @@ def from_dict(data: dict[str, Any]) -> Event: stored_at=data.get("stored_at"), ttl_expires_at=data.get("ttl_expires_at"), status=data.get("status"), - error=data.get("error") + error=data.get("error"), ) class EventSummaryMapper: """Handles EventSummary serialization.""" - + @staticmethod def to_dict(summary: EventSummary) -> dict[EventFields, Any]: result = { EventFields.EVENT_ID: summary.event_id, EventFields.EVENT_TYPE: summary.event_type, - EventFields.TIMESTAMP: summary.timestamp + EventFields.TIMESTAMP: summary.timestamp, } if summary.aggregate_id is not None: result[EventFields.AGGREGATE_ID] = summary.aggregate_id return result - + @staticmethod def from_mongo_document(document: dict[str, Any]) -> EventSummary: return EventSummary( event_id=document[EventFields.EVENT_ID], event_type=document[EventFields.EVENT_TYPE], timestamp=document[EventFields.TIMESTAMP], - aggregate_id=document.get(EventFields.AGGREGATE_ID) + aggregate_id=document.get(EventFields.AGGREGATE_ID), ) class EventDetailMapper: """Handles EventDetail serialization.""" - + @staticmethod def to_dict(detail: EventDetail) -> dict[str, Any]: event_mapper = EventMapper() summary_mapper = EventSummaryMapper() - + return { "event": event_mapper.to_dict(detail.event), "related_events": [summary_mapper.to_dict(e) for e in detail.related_events], - "timeline": [summary_mapper.to_dict(e) for e in detail.timeline] + "timeline": [summary_mapper.to_dict(e) for e in detail.timeline], } class EventListResultMapper: """Handles EventListResult serialization.""" - + @staticmethod def to_dict(result: EventListResult) -> dict[str, Any]: event_mapper = EventMapper() @@ -171,13 +179,13 @@ def to_dict(result: 
EventListResult) -> dict[str, Any]: "total": result.total, "skip": result.skip, "limit": result.limit, - "has_more": result.has_more + "has_more": result.has_more, } class EventBrowseResultMapper: """Handles EventBrowseResult serialization.""" - + @staticmethod def to_dict(result: EventBrowseResult) -> dict[str, Any]: event_mapper = EventMapper() @@ -185,13 +193,13 @@ def to_dict(result: EventBrowseResult) -> dict[str, Any]: "events": [event_mapper.to_dict(event) for event in result.events], "total": result.total, "skip": result.skip, - "limit": result.limit + "limit": result.limit, } class EventStatisticsMapper: """Handles EventStatistics serialization.""" - + @staticmethod def to_dict(stats: EventStatistics) -> dict[str, Any]: result: dict[str, Any] = { @@ -202,61 +210,58 @@ def to_dict(stats: EventStatistics) -> dict[str, Any]: {"hour": h.hour, "count": h.count} if isinstance(h, HourlyEventCount) else h for h in stats.events_by_hour ], - "top_users": [ - {"user_id": u.user_id, "event_count": u.event_count} - for u in stats.top_users - ], + "top_users": [{"user_id": u.user_id, "event_count": u.event_count} for u in stats.top_users], "error_rate": stats.error_rate, - "avg_processing_time": stats.avg_processing_time + "avg_processing_time": stats.avg_processing_time, } - + if stats.start_time is not None: result["start_time"] = stats.start_time if stats.end_time is not None: result["end_time"] = stats.end_time - + return result class EventProjectionMapper: """Handles EventProjection serialization.""" - + @staticmethod def to_dict(projection: EventProjection) -> dict[str, Any]: result: dict[str, Any] = { "name": projection.name, "pipeline": projection.pipeline, "output_collection": projection.output_collection, - "refresh_interval_seconds": projection.refresh_interval_seconds + "refresh_interval_seconds": projection.refresh_interval_seconds, } - + if projection.description is not None: result["description"] = projection.description if projection.source_events is not None: result["source_events"] = projection.source_events if projection.last_updated is not None: result["last_updated"] = projection.last_updated - + return result class ArchivedEventMapper: """Handles ArchivedEvent serialization.""" - + @staticmethod def to_mongo_document(event: ArchivedEvent) -> dict[str, Any]: event_mapper = EventMapper() doc = event_mapper.to_mongo_document(event) - + if event.deleted_at is not None: doc[EventFields.DELETED_AT] = event.deleted_at if event.deleted_by is not None: doc[EventFields.DELETED_BY] = event.deleted_by if event.deletion_reason is not None: doc[EventFields.DELETION_REASON] = event.deletion_reason - + return doc - + @staticmethod def from_event(event: Event, deleted_by: str, deletion_reason: str) -> ArchivedEvent: return ArchivedEvent( @@ -273,13 +278,13 @@ def from_event(event: Event, deleted_by: str, deletion_reason: str) -> ArchivedE error=event.error, deleted_at=datetime.now(timezone.utc), deleted_by=deleted_by, - deletion_reason=deletion_reason + deletion_reason=deletion_reason, ) class EventExportRowMapper: """Handles EventExportRow serialization.""" - + @staticmethod def to_dict(row: EventExportRow) -> dict[str, str]: return { @@ -291,7 +296,7 @@ def to_dict(row: EventExportRow) -> dict[str, str]: "User ID": row.user_id, "Service": row.service, "Status": row.status, - "Error": row.error + "Error": row.error, } @staticmethod @@ -363,7 +368,7 @@ def from_admin_pydantic(pflt: AdminEventFilter) -> EventFilter: class EventReplayInfoMapper: """Handles EventReplayInfo serialization.""" - 
+ @staticmethod def to_dict(info: EventReplayInfo) -> dict[str, Any]: event_mapper = EventMapper() @@ -372,5 +377,5 @@ def to_dict(info: EventReplayInfo) -> dict[str, Any]: "event_count": info.event_count, "event_types": info.event_types, "start_time": info.start_time, - "end_time": info.end_time + "end_time": info.end_time, } diff --git a/backend/app/infrastructure/mappers/execution_api_mapper.py b/backend/app/infrastructure/mappers/execution_api_mapper.py index 2f6f7ff9..a72105a7 100644 --- a/backend/app/infrastructure/mappers/execution_api_mapper.py +++ b/backend/app/infrastructure/mappers/execution_api_mapper.py @@ -22,6 +22,7 @@ def to_result(e: DomainExecution) -> ExecutionResult: ru = None if isinstance(e.resource_usage, ResourceUsageDomain): ru = ResourceUsageSchema(**e.resource_usage.to_dict()) + # Map domain ExecutionErrorType -> public ErrorType def _map_error(t: Optional[ExecutionErrorType]) -> Optional[ErrorType]: if t is None: @@ -30,6 +31,7 @@ def _map_error(t: Optional[ExecutionErrorType]) -> Optional[ErrorType]: return ErrorType.SCRIPT_ERROR # TIMEOUT, RESOURCE_LIMIT, SYSTEM_ERROR, PERMISSION_DENIED -> SYSTEM_ERROR class return ErrorType.SYSTEM_ERROR + return ExecutionResult( execution_id=e.execution_id, status=e.status, diff --git a/backend/app/infrastructure/mappers/notification_mapper.py b/backend/app/infrastructure/mappers/notification_mapper.py index 8edc32c3..f5e7e63b 100644 --- a/backend/app/infrastructure/mappers/notification_mapper.py +++ b/backend/app/infrastructure/mappers/notification_mapper.py @@ -1,4 +1,5 @@ from dataclasses import asdict, fields +from typing import Any from app.domain.notification import ( DomainNotification, @@ -11,28 +12,28 @@ class NotificationMapper: # DomainNotification @staticmethod - def to_mongo_document(notification: DomainNotification) -> dict: + def to_mongo_document(notification: DomainNotification) -> dict[str, Any]: return asdict(notification) @staticmethod - def to_update_dict(notification: DomainNotification) -> dict: + def to_update_dict(notification: DomainNotification) -> dict[str, Any]: doc = asdict(notification) doc.pop("notification_id", None) return doc @staticmethod - def from_mongo_document(doc: dict) -> DomainNotification: + def from_mongo_document(doc: dict[str, Any]) -> DomainNotification: allowed = {f.name for f in fields(DomainNotification)} filtered = {k: v for k, v in doc.items() if k in allowed} return DomainNotification(**filtered) # DomainNotificationSubscription @staticmethod - def subscription_to_mongo_document(subscription: DomainNotificationSubscription) -> dict: + def subscription_to_mongo_document(subscription: DomainNotificationSubscription) -> dict[str, Any]: return asdict(subscription) @staticmethod - def subscription_from_mongo_document(doc: dict) -> DomainNotificationSubscription: + def subscription_from_mongo_document(doc: dict[str, Any]) -> DomainNotificationSubscription: allowed = {f.name for f in fields(DomainNotificationSubscription)} filtered = {k: v for k, v in doc.items() if k in allowed} return DomainNotificationSubscription(**filtered) diff --git a/backend/app/infrastructure/mappers/rate_limit_mapper.py b/backend/app/infrastructure/mappers/rate_limit_mapper.py index fe7bbc78..bca38d2e 100644 --- a/backend/app/infrastructure/mappers/rate_limit_mapper.py +++ b/backend/app/infrastructure/mappers/rate_limit_mapper.py @@ -24,9 +24,9 @@ def to_dict(rule: RateLimitRule) -> Dict[str, Any]: "burst_multiplier": rule.burst_multiplier, "algorithm": rule.algorithm.value, "priority": rule.priority, - 
"enabled": rule.enabled + "enabled": rule.enabled, } - + @staticmethod def from_dict(data: Dict[str, Any]) -> RateLimitRule: return RateLimitRule( @@ -37,7 +37,7 @@ def from_dict(data: Dict[str, Any]) -> RateLimitRule: burst_multiplier=data.get("burst_multiplier", 1.5), algorithm=RateLimitAlgorithm(data.get("algorithm", RateLimitAlgorithm.SLIDING_WINDOW)), priority=data.get("priority", 0), - enabled=data.get("enabled", True) + enabled=data.get("enabled", True), ) @@ -52,25 +52,25 @@ def to_dict(user_limit: UserRateLimit) -> Dict[str, Any]: "rules": [rule_mapper.to_dict(rule) for rule in user_limit.rules], "created_at": user_limit.created_at.isoformat() if user_limit.created_at else None, "updated_at": user_limit.updated_at.isoformat() if user_limit.updated_at else None, - "notes": user_limit.notes + "notes": user_limit.notes, } - + @staticmethod def from_dict(data: Dict[str, Any]) -> UserRateLimit: rule_mapper = RateLimitRuleMapper() - + created_at = data.get("created_at") if created_at and isinstance(created_at, str): created_at = datetime.fromisoformat(created_at) elif not created_at: created_at = datetime.now(timezone.utc) - + updated_at = data.get("updated_at") if updated_at and isinstance(updated_at, str): updated_at = datetime.fromisoformat(updated_at) elif not updated_at: updated_at = datetime.now(timezone.utc) - + return UserRateLimit( user_id=data["user_id"], bypass_rate_limit=data.get("bypass_rate_limit", False), @@ -78,9 +78,9 @@ def from_dict(data: Dict[str, Any]) -> UserRateLimit: rules=[rule_mapper.from_dict(rule_data) for rule_data in data.get("rules", [])], created_at=created_at, updated_at=updated_at, - notes=data.get("notes") + notes=data.get("notes"), ) - + @staticmethod def model_dump(user_limit: UserRateLimit) -> Dict[str, Any]: """Pydantic-compatible method for serialization.""" @@ -94,31 +94,32 @@ def to_dict(config: RateLimitConfig) -> Dict[str, Any]: user_mapper = UserRateLimitMapper() return { "default_rules": [rule_mapper.to_dict(rule) for rule in config.default_rules], - "user_overrides": {uid: user_mapper.to_dict(user_limit) - for uid, user_limit in config.user_overrides.items()}, + "user_overrides": { + uid: user_mapper.to_dict(user_limit) for uid, user_limit in config.user_overrides.items() + }, "global_enabled": config.global_enabled, - "redis_ttl": config.redis_ttl + "redis_ttl": config.redis_ttl, } - + @staticmethod def from_dict(data: Dict[str, Any]) -> RateLimitConfig: rule_mapper = RateLimitRuleMapper() user_mapper = UserRateLimitMapper() return RateLimitConfig( - default_rules=[rule_mapper.from_dict(rule_data) - for rule_data in data.get("default_rules", [])], - user_overrides={uid: user_mapper.from_dict(user_data) - for uid, user_data in data.get("user_overrides", {}).items()}, + default_rules=[rule_mapper.from_dict(rule_data) for rule_data in data.get("default_rules", [])], + user_overrides={ + uid: user_mapper.from_dict(user_data) for uid, user_data in data.get("user_overrides", {}).items() + }, global_enabled=data.get("global_enabled", True), - redis_ttl=data.get("redis_ttl", 3600) + redis_ttl=data.get("redis_ttl", 3600), ) - + @staticmethod def model_validate_json(json_str: str | bytes) -> RateLimitConfig: """Pydantic-compatible method for deserialization from JSON.""" data = json.loads(json_str) return RateLimitConfigMapper.from_dict(data) - + @staticmethod def model_dump_json(config: RateLimitConfig) -> str: """Pydantic-compatible method for serialization to JSON.""" diff --git a/backend/app/infrastructure/mappers/replay_mapper.py 
b/backend/app/infrastructure/mappers/replay_mapper.py index d903f393..c1ee2efb 100644 --- a/backend/app/infrastructure/mappers/replay_mapper.py +++ b/backend/app/infrastructure/mappers/replay_mapper.py @@ -30,7 +30,7 @@ def to_dict(session: ReplaySession) -> dict[str, Any]: ReplaySessionFields.CORRELATION_ID: session.correlation_id, ReplaySessionFields.CREATED_AT: session.created_at, ReplaySessionFields.DRY_RUN: session.dry_run, - "triggered_executions": session.triggered_executions + "triggered_executions": session.triggered_executions, } if session.started_at: @@ -64,7 +64,7 @@ def from_dict(data: dict[str, Any]) -> ReplaySession: created_by=data.get(ReplaySessionFields.CREATED_BY), target_service=data.get(ReplaySessionFields.TARGET_SERVICE), dry_run=data.get(ReplaySessionFields.DRY_RUN, False), - triggered_executions=data.get("triggered_executions", []) + triggered_executions=data.get("triggered_executions", []), ) @staticmethod @@ -82,7 +82,7 @@ def status_detail_to_dict(detail: ReplaySessionStatusDetail) -> dict[str, Any]: "completed_at": detail.session.completed_at, "error": detail.session.error, "progress_percentage": detail.session.progress_percentage, - "execution_results": detail.execution_results + "execution_results": detail.execution_results, } if detail.estimated_completion: @@ -157,7 +157,7 @@ def to_dict(data: ReplaySessionData) -> dict[str, Any]: "dry_run": data.dry_run, "total_events": data.total_events, "replay_correlation_id": data.replay_correlation_id, - "query": data.query + "query": data.query, } if data.dry_run and data.events_preview: @@ -166,7 +166,7 @@ def to_dict(data: ReplaySessionData) -> dict[str, Any]: "event_id": e.event_id, "event_type": e.event_type, "timestamp": e.timestamp, - "aggregate_id": e.aggregate_id + "aggregate_id": e.aggregate_id, } for e in data.events_preview ] diff --git a/backend/app/infrastructure/mappers/saga_mapper.py b/backend/app/infrastructure/mappers/saga_mapper.py index f07f2c55..fe2ef1b6 100644 --- a/backend/app/infrastructure/mappers/saga_mapper.py +++ b/backend/app/infrastructure/mappers/saga_mapper.py @@ -25,7 +25,7 @@ def from_mongo(self, doc: dict[str, Any]) -> Saga: created_at=doc["created_at"], updated_at=doc["updated_at"], completed_at=doc.get("completed_at"), - retry_count=doc.get("retry_count", 0) + retry_count=doc.get("retry_count", 0), ) def to_mongo(self, saga: Saga) -> dict[str, Any]: @@ -51,7 +51,7 @@ def to_mongo(self, saga: Saga) -> dict[str, Any]: "created_at": saga.created_at, "updated_at": saga.updated_at, "completed_at": saga.completed_at, - "retry_count": saga.retry_count + "retry_count": saga.retry_count, } def from_instance(self, instance: SagaInstance) -> Saga: @@ -86,7 +86,7 @@ def to_dict(self, saga: Saga) -> dict[str, Any]: "created_at": saga.created_at.isoformat(), "updated_at": saga.updated_at.isoformat(), "completed_at": saga.completed_at.isoformat() if saga.completed_at else None, - "retry_count": saga.retry_count + "retry_count": saga.retry_count, } @@ -118,7 +118,6 @@ class SagaInstanceMapper: @staticmethod def from_mongo(doc: dict[str, Any]) -> SagaInstance: - # Robust state conversion raw_state = doc.get("state", SagaState.CREATED) try: @@ -140,13 +139,13 @@ def from_mongo(doc: dict[str, Any]) -> SagaInstance: "completed_at": doc.get("completed_at"), "retry_count": int(doc.get("retry_count", 0)), } - + # Only add datetime fields if they exist and are valid if doc.get("created_at"): kwargs["created_at"] = doc.get("created_at") if doc.get("updated_at"): kwargs["updated_at"] = doc.get("updated_at") 
- + return SagaInstance(**kwargs) @staticmethod diff --git a/backend/app/infrastructure/mappers/sse_mapper.py b/backend/app/infrastructure/mappers/sse_mapper.py index 85f1145e..aa8f6b3f 100644 --- a/backend/app/infrastructure/mappers/sse_mapper.py +++ b/backend/app/infrastructure/mappers/sse_mapper.py @@ -23,7 +23,7 @@ def to_execution_status(execution_id: str, status: str) -> SSEExecutionStatusDom def event_from_mongo_document(doc: Dict[str, Any]) -> SSEEventDomain: return SSEEventDomain( aggregate_id=str(doc.get("aggregate_id", "")), - timestamp=doc.get("timestamp"), + timestamp=doc["timestamp"], ) # Executions diff --git a/backend/app/infrastructure/mappers/user_settings_api_mapper.py b/backend/app/infrastructure/mappers/user_settings_api_mapper.py index 91106e8c..95afb3fd 100644 --- a/backend/app/infrastructure/mappers/user_settings_api_mapper.py +++ b/backend/app/infrastructure/mappers/user_settings_api_mapper.py @@ -22,8 +22,9 @@ class UserSettingsApiMapper: @staticmethod def to_domain_update(upd: UserSettingsUpdate) -> DomainUserSettingsUpdate: - notifications = UserSettingsApiMapper._to_domain_notifications(upd.notifications) \ - if upd.notifications is not None else None + notifications = ( + UserSettingsApiMapper._to_domain_notifications(upd.notifications) if upd.notifications is not None else None + ) return DomainUserSettingsUpdate( theme=upd.theme, timezone=upd.timezone, @@ -99,4 +100,3 @@ def history_to_api(items: List[DomainSettingsHistoryEntry]) -> SettingsHistoryRe for i in items ] return SettingsHistoryResponse(history=entries, total=len(entries)) - diff --git a/backend/app/runtime_registry.py b/backend/app/runtime_registry.py index 40917d77..cf4dd441 100644 --- a/backend/app/runtime_registry.py +++ b/backend/app/runtime_registry.py @@ -168,11 +168,7 @@ def _make_runtime_configs() -> dict[str, dict[str, RuntimeConfig]]: v: RuntimeConfig( image=image_tpl.format(version=v), file_name=file_name, - command=( - interpreter_cmd - if "{file}" in " ".join(interpreter_cmd) - else interpreter_cmd + [full_path] - ), + command=(interpreter_cmd if "{file}" in " ".join(interpreter_cmd) else interpreter_cmd + [full_path]), ) for v in versions } @@ -182,6 +178,4 @@ def _make_runtime_configs() -> dict[str, dict[str, RuntimeConfig]]: RUNTIME_REGISTRY: dict[str, dict[str, RuntimeConfig]] = _make_runtime_configs() -SUPPORTED_RUNTIMES: dict[str, list[str]] = { - lang: list(versions.keys()) for lang, versions in RUNTIME_REGISTRY.items() -} +SUPPORTED_RUNTIMES: dict[str, list[str]] = {lang: list(versions.keys()) for lang, versions in RUNTIME_REGISTRY.items()} diff --git a/backend/app/schemas_pydantic/admin_events.py b/backend/app/schemas_pydantic/admin_events.py index 894c0aff..4212a1ce 100644 --- a/backend/app/schemas_pydantic/admin_events.py +++ b/backend/app/schemas_pydantic/admin_events.py @@ -8,6 +8,7 @@ class EventFilter(BaseModel): """Filter criteria for browsing events""" + event_types: List[EventType] | None = None aggregate_id: str | None = None correlation_id: str | None = None @@ -20,6 +21,7 @@ class EventFilter(BaseModel): class EventBrowseRequest(BaseModel): """Request model for browsing events""" + filters: EventFilter skip: int = 0 limit: int = Field(default=50, le=500) @@ -29,6 +31,7 @@ class EventBrowseRequest(BaseModel): class EventReplayRequest(BaseModel): """Request model for replaying events""" + event_ids: List[str] | None = None correlation_id: str | None = None aggregate_id: str | None = None @@ -40,6 +43,7 @@ class EventReplayRequest(BaseModel): class 
EventBrowseResponse(BaseModel): """Response model for browsing events""" + events: List[Dict[str, Any]] total: int skip: int @@ -48,6 +52,7 @@ class EventBrowseResponse(BaseModel): class EventDetailResponse(BaseModel): """Response model for event detail""" + event: Dict[str, Any] related_events: List[Dict[str, Any]] timeline: List[Dict[str, Any]] @@ -55,6 +60,7 @@ class EventDetailResponse(BaseModel): class EventReplayResponse(BaseModel): """Response model for event replay""" + dry_run: bool total_events: int replay_correlation_id: str @@ -65,6 +71,7 @@ class EventReplayResponse(BaseModel): class EventReplayStatusResponse(BaseModel): """Response model for replay status""" + session_id: str status: str total_events: int @@ -83,12 +90,14 @@ class EventReplayStatusResponse(BaseModel): class EventDeleteResponse(BaseModel): """Response model for event deletion""" + message: str event_id: str class EventStatsResponse(BaseModel): """Response model for event statistics""" + total_events: int events_by_type: Dict[str, int] events_by_hour: List[Dict[str, Any]] diff --git a/backend/app/schemas_pydantic/admin_settings.py b/backend/app/schemas_pydantic/admin_settings.py index 0528c149..23645420 100644 --- a/backend/app/schemas_pydantic/admin_settings.py +++ b/backend/app/schemas_pydantic/admin_settings.py @@ -3,6 +3,7 @@ class ExecutionLimitsSchema(BaseModel): """Execution resource limits schema.""" + max_timeout_seconds: int = Field(default=300, ge=10, le=3600, description="Maximum execution timeout") max_memory_mb: int = Field(default=512, ge=128, le=4096, description="Maximum memory in MB") max_cpu_cores: int = Field(default=2, ge=1, le=8, description="Maximum CPU cores") @@ -11,6 +12,7 @@ class ExecutionLimitsSchema(BaseModel): class SecuritySettingsSchema(BaseModel): """Security configuration schema.""" + password_min_length: int = Field(default=8, ge=6, le=32, description="Minimum password length") session_timeout_minutes: int = Field(default=60, ge=5, le=1440, description="Session timeout in minutes") max_login_attempts: int = Field(default=5, ge=3, le=10, description="Maximum login attempts") @@ -19,6 +21,7 @@ class SecuritySettingsSchema(BaseModel): class MonitoringSettingsSchema(BaseModel): """Monitoring and observability schema.""" + metrics_retention_days: int = Field(default=30, ge=7, le=90, description="Metrics retention in days") log_level: str = Field(default="INFO", pattern="^(DEBUG|INFO|WARNING|ERROR|CRITICAL)$", description="Log level") enable_tracing: bool = Field(default=True, description="Enable distributed tracing") @@ -27,6 +30,7 @@ class MonitoringSettingsSchema(BaseModel): class SystemSettings(BaseModel): """System-wide settings model.""" + model_config = ConfigDict(extra="ignore") execution_limits: ExecutionLimitsSchema = Field(default_factory=ExecutionLimitsSchema) diff --git a/backend/app/schemas_pydantic/dlq.py b/backend/app/schemas_pydantic/dlq.py index 690b6c35..b0e9f8e2 100644 --- a/backend/app/schemas_pydantic/dlq.py +++ b/backend/app/schemas_pydantic/dlq.py @@ -8,6 +8,7 @@ class DLQStats(BaseModel): """Statistics for the Dead Letter Queue.""" + by_status: dict[str, int] by_topic: list[dict[str, Any]] by_event_type: list[dict[str, Any]] @@ -17,6 +18,7 @@ class DLQStats(BaseModel): class DLQMessageResponse(BaseModel): """Response model for a DLQ message.""" + event_id: str event_type: str original_topic: str @@ -30,6 +32,7 @@ class DLQMessageResponse(BaseModel): class RetryPolicyRequest(BaseModel): """Request model for setting a retry policy.""" + topic: str strategy: 
RetryStrategy max_retries: int = 5 @@ -40,11 +43,13 @@ class RetryPolicyRequest(BaseModel): class ManualRetryRequest(BaseModel): """Request model for manual retry of messages.""" + event_ids: list[str] class DLQMessagesResponse(BaseModel): """Response model for listing DLQ messages.""" + messages: list[DLQMessageResponse] total: int offset: int @@ -53,6 +58,7 @@ class DLQMessagesResponse(BaseModel): class DLQBatchRetryResponse(BaseModel): """Response model for batch retry operation.""" + total: int successful: int failed: int @@ -61,6 +67,7 @@ class DLQBatchRetryResponse(BaseModel): class DLQTopicSummaryResponse(BaseModel): """Response model for topic summary.""" + topic: str total_messages: int status_breakdown: dict[str, int] @@ -72,6 +79,7 @@ class DLQTopicSummaryResponse(BaseModel): class DLQMessageDetail(BaseModel): """Detailed DLQ message response.""" + event_id: str event: dict[str, Any] # BaseEvent as dict event_type: str diff --git a/backend/app/schemas_pydantic/events.py b/backend/app/schemas_pydantic/events.py index e07659e3..daa6c8bf 100644 --- a/backend/app/schemas_pydantic/events.py +++ b/backend/app/schemas_pydantic/events.py @@ -3,6 +3,7 @@ This module contains Pydantic models for event-related API requests and responses. For Avro-based event schemas used in Kafka streaming, see app.schemas_avro.event_schemas. """ + from datetime import datetime, timedelta, timezone from typing import Any, Dict, List from uuid import uuid4 @@ -37,6 +38,7 @@ class EventListResponse(BaseModel): class EventFilterRequest(BaseModel): """Request model for filtering events.""" + event_types: List[EventType] | None = Field(None, description="Filter by event types") aggregate_id: str | None = Field(None, description="Filter by aggregate ID") correlation_id: str | None = Field(None, description="Filter by correlation ID") @@ -53,10 +55,7 @@ class EventFilterRequest(BaseModel): @field_validator("sort_by") @classmethod def validate_sort_field(cls, v: str) -> str: - allowed_fields = { - "timestamp", "event_type", "aggregate_id", - "correlation_id", "stored_at" - } + allowed_fields = {"timestamp", "event_type", "aggregate_id", "correlation_id", "stored_at"} if v not in allowed_fields: raise ValueError(f"Sort field must be one of {allowed_fields}") return v @@ -64,15 +63,14 @@ def validate_sort_field(cls, v: str) -> str: class EventAggregationRequest(BaseModel): """Request model for event aggregation queries.""" - pipeline: List[Dict[str, Any]] = Field( - ..., - description="MongoDB aggregation pipeline" - ) + + pipeline: List[Dict[str, Any]] = Field(..., description="MongoDB aggregation pipeline") limit: int = Field(100, ge=1, le=1000) class PublishEventRequest(BaseModel): """Request model for publishing events.""" + event_type: EventType = Field(..., description="Type of event to publish") payload: Dict[str, Any] = Field(..., description="Event payload data") aggregate_id: str | None = Field(None, description="Aggregate root ID") @@ -83,6 +81,7 @@ class PublishEventRequest(BaseModel): class EventBase(BaseModel): """Base event model for API responses.""" + event_id: str = Field(default_factory=lambda: str(uuid4())) event_type: EventType event_version: str = "1.0" @@ -106,14 +105,14 @@ class EventBase(BaseModel): "user_id": "user-789", "service_name": "api-gateway", "service_version": "1.0.0", - "ip_address": "192.168.1.1" + "ip_address": "192.168.1.1", }, "payload": { "execution_id": "execution-123", "script": "print('hello')", "language": "python", - "version": "3.11" - } + "version": "3.11", + }, } } ) 
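Aside: the reformatted request schemas above rely on Pydantic validators to constrain query parameters, e.g. EventFilterRequest.validate_sort_field, which whitelists sortable fields. Below is a minimal standalone sketch of that same field_validator pattern; the SortableQuery model name and its limit/skip defaults are illustrative assumptions, only the allowed-field set is taken from the diff.

# Sketch of the sort-field validation pattern used by EventFilterRequest
# (model name and defaults are hypothetical; allowed_fields matches the diff).
from pydantic import BaseModel, Field, field_validator


class SortableQuery(BaseModel):
    limit: int = Field(100, ge=1, le=1000)
    skip: int = 0
    sort_by: str = "timestamp"

    @field_validator("sort_by")
    @classmethod
    def validate_sort_field(cls, v: str) -> str:
        allowed_fields = {"timestamp", "event_type", "aggregate_id", "correlation_id", "stored_at"}
        if v not in allowed_fields:
            raise ValueError(f"Sort field must be one of {allowed_fields}")
        return v


SortableQuery(sort_by="event_type")    # accepted
try:
    SortableQuery(sort_by="payload")   # rejected by the validator
except ValueError as exc:
    print(exc)
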
@@ -121,6 +120,7 @@ class EventBase(BaseModel): class ExecutionEventPayload(BaseModel): """Common payload for execution-related events in API responses.""" + execution_id: str user_id: str status: str | None = None @@ -135,6 +135,7 @@ class ExecutionEventPayload(BaseModel): class PodEventPayload(BaseModel): """Common payload for pod-related events in API responses.""" + pod_name: str namespace: str execution_id: str @@ -148,12 +149,14 @@ class PodEventPayload(BaseModel): class EventInDB(EventBase): """Event as stored in database.""" + stored_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc)) ttl_expires_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc) + timedelta(days=30)) class EventQuery(BaseModel): """Query parameters for event search.""" + event_types: List[EventType] | None = None aggregate_id: str | None = None correlation_id: str | None = None @@ -173,7 +176,7 @@ class EventQuery(BaseModel): "start_time": "2024-01-20T00:00:00Z", "end_time": "2024-01-20T23:59:59Z", "limit": 100, - "skip": 0 + "skip": 0, } } ) @@ -181,6 +184,7 @@ class EventQuery(BaseModel): class EventStatistics(BaseModel): """Event statistics response.""" + total_events: int events_by_type: Dict[str, int] events_by_service: Dict[str, int] @@ -195,16 +199,13 @@ class EventStatistics(BaseModel): "events_by_type": { EventType.EXECUTION_REQUESTED: 523, EventType.EXECUTION_COMPLETED: 498, - EventType.POD_CREATED: 522 - }, - "events_by_service": { - "api-gateway": 523, - "execution-service": 1020 + EventType.POD_CREATED: 522, }, + "events_by_service": {"api-gateway": 523, "execution-service": 1020}, "events_by_hour": [ {"hour": "2024-01-20 10:00", "count": 85}, - {"hour": "2024-01-20 11:00", "count": 92} - ] + {"hour": "2024-01-20 11:00", "count": 92}, + ], } } ) @@ -212,6 +213,7 @@ class EventStatistics(BaseModel): class EventProjection(BaseModel): """Configuration for event projections.""" + name: str description: str | None = None source_events: List[EventType] # Event types to include @@ -227,18 +229,16 @@ class EventProjection(BaseModel): "description": "Summary of executions by user and status", "source_events": [EventType.EXECUTION_REQUESTED, EventType.EXECUTION_COMPLETED], "aggregation_pipeline": [ - {"$match": {"event_type": {"$in": [EventType.EXECUTION_REQUESTED, - EventType.EXECUTION_COMPLETED]}}}, - {"$group": { - "_id": { - "user_id": "$metadata.user_id", - "status": "$payload.status" - }, - "count": {"$sum": 1} - }} + {"$match": {"event_type": {"$in": [EventType.EXECUTION_REQUESTED, EventType.EXECUTION_COMPLETED]}}}, + { + "$group": { + "_id": {"user_id": "$metadata.user_id", "status": "$payload.status"}, + "count": {"$sum": 1}, + } + }, ], "output_collection": "execution_summary", - "refresh_interval_seconds": 300 + "refresh_interval_seconds": 300, } } ) @@ -246,6 +246,7 @@ class EventProjection(BaseModel): class ResourceUsage(BaseModel): """Resource usage statistics.""" + cpu_seconds: float memory_mb_seconds: float disk_io_mb: float @@ -254,6 +255,7 @@ class ResourceUsage(BaseModel): class PublishEventResponse(BaseModel): """Response model for publishing events""" + event_id: str status: str timestamp: datetime @@ -261,6 +263,7 @@ class PublishEventResponse(BaseModel): class DeleteEventResponse(BaseModel): """Response model for deleting events""" + message: str event_id: str deleted_at: datetime @@ -268,6 +271,7 @@ class DeleteEventResponse(BaseModel): class ReplayAggregateResponse(BaseModel): """Response model for replaying aggregate events""" + dry_run: bool 
aggregate_id: str event_count: int | None = None diff --git a/backend/app/schemas_pydantic/execution.py b/backend/app/schemas_pydantic/execution.py index fb513201..ad91cf9a 100644 --- a/backend/app/schemas_pydantic/execution.py +++ b/backend/app/schemas_pydantic/execution.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from datetime import datetime, timezone from typing import Any from uuid import uuid4 @@ -11,6 +13,7 @@ class ExecutionBase(BaseModel): """Base model for execution data.""" + script: str = Field(..., max_length=50000, description="Script content (max 50,000 characters)") status: ExecutionStatus = ExecutionStatus.QUEUED stdout: str | None = None @@ -21,59 +24,52 @@ class ExecutionBase(BaseModel): class ExecutionCreate(ExecutionBase): """Model for creating a new execution.""" + pass class ExecutionInDB(ExecutionBase): """Model for execution as stored in database.""" + execution_id: str = Field(default_factory=lambda: str(uuid4())) created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc)) updated_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc)) - resource_usage: dict | None = None + resource_usage: ResourceUsage | None = None user_id: str | None = None exit_code: int | None = None error_type: ErrorType | None = None - model_config = ConfigDict( - populate_by_name=True - ) + model_config = ConfigDict(populate_by_name=True) class ExecutionUpdate(BaseModel): """Model for updating an execution.""" + status: ExecutionStatus | None = None stdout: str | None = None stderr: str | None = None - resource_usage: dict | None = None + resource_usage: ResourceUsage | None = None exit_code: int | None = None error_type: ErrorType | None = None class ResourceUsage(BaseModel): """Model for execution resource usage.""" - execution_time_wall_seconds: float | None = Field( - default=None, description="Wall clock execution time in seconds" - ) + + execution_time_wall_seconds: float | None = Field(default=None, description="Wall clock execution time in seconds") cpu_time_jiffies: int | None = Field( default=None, description="CPU time in jiffies (multiply by 10 for milliseconds)" ) - clk_tck_hertz: int | None = Field( - default=None, description="Clock ticks per second (usually 100)" - ) - peak_memory_kb: int | None = Field( - default=None, description="Peak memory usage in KB" - ) + clk_tck_hertz: int | None = Field(default=None, description="Clock ticks per second (usually 100)") + peak_memory_kb: int | None = Field(default=None, description="Peak memory usage in KB") class ExecutionRequest(BaseModel): """Model for execution request.""" + script: str = Field(..., max_length=50000, description="Script content (max 50,000 characters)") - lang: str = Field( - default="python", description="Language name" - ) - lang_version: str = Field( - default="3.11", description="Language version to use for execution" - ) + lang: str = Field(default="python", description="Language name") + lang_version: str = Field(default="3.11", description="Language version to use for execution") @model_validator(mode="after") def validate_runtime_supported(self) -> "ExecutionRequest": # noqa: D401 @@ -83,24 +79,22 @@ def validate_runtime_supported(self) -> "ExecutionRequest": # noqa: D401 raise ValueError(f"Language '{self.lang}' not supported. Supported: {list(runtimes.keys())}") versions = runtimes.get(self.lang, []) if self.lang_version not in versions: - raise ValueError( - f"Version '{self.lang_version}' not supported for {self.lang}. 
Supported: {versions}" - ) + raise ValueError(f"Version '{self.lang_version}' not supported for {self.lang}. Supported: {versions}") return self class ExecutionResponse(BaseModel): """Model for execution response.""" + execution_id: str status: ExecutionStatus - model_config = ConfigDict( - from_attributes=True - ) + model_config = ConfigDict(from_attributes=True) class ExecutionResult(BaseModel): """Model for execution result.""" + execution_id: str status: ExecutionStatus stdout: str | None = None @@ -111,13 +105,12 @@ class ExecutionResult(BaseModel): exit_code: int | None = None error_type: ErrorType | None = None - model_config = ConfigDict( - from_attributes=True - ) + model_config = ConfigDict(from_attributes=True) class ResourceLimits(BaseModel): """Model for resource limits configuration.""" + cpu_limit: str memory_limit: str cpu_request: str @@ -128,22 +121,26 @@ class ResourceLimits(BaseModel): class ExampleScripts(BaseModel): """Model for example scripts.""" + scripts: dict[str, str] # lang: str with script class CancelExecutionRequest(BaseModel): """Model for cancelling an execution.""" + reason: str | None = Field(None, description="Reason for cancellation") class RetryExecutionRequest(BaseModel): """Model for retrying an execution.""" + reason: str | None = Field(None, description="Reason for retry") preserve_output: bool = Field(False, description="Keep output from previous attempt") class ExecutionEventResponse(BaseModel): """Model for execution event response.""" + event_id: str event_type: str timestamp: datetime @@ -152,6 +149,7 @@ class ExecutionEventResponse(BaseModel): class ExecutionListResponse(BaseModel): """Model for paginated execution list.""" + executions: list[ExecutionResult] total: int limit: int @@ -161,6 +159,7 @@ class ExecutionListResponse(BaseModel): class CancelResponse(BaseModel): """Model for execution cancellation response.""" + execution_id: str status: str message: str @@ -169,5 +168,6 @@ class CancelResponse(BaseModel): class DeleteResponse(BaseModel): """Model for execution deletion response.""" + message: str execution_id: str diff --git a/backend/app/schemas_pydantic/health_dashboard.py b/backend/app/schemas_pydantic/health_dashboard.py index 24b3a9bd..634bde38 100644 --- a/backend/app/schemas_pydantic/health_dashboard.py +++ b/backend/app/schemas_pydantic/health_dashboard.py @@ -7,6 +7,7 @@ class HealthAlert(BaseModel): """Health alert information.""" + id: str = Field(..., description="Unique alert identifier") severity: AlertSeverity = Field(..., description="Alert severity level") service: str = Field(..., description="Service name that triggered the alert") @@ -19,6 +20,7 @@ class HealthAlert(BaseModel): class HealthMetricsSummary(BaseModel): """Summary of health metrics for dashboard display""" + total_checks: int healthy_checks: int failed_checks: int @@ -29,6 +31,7 @@ class HealthMetricsSummary(BaseModel): class ServiceMetrics(BaseModel): """Detailed metrics for a specific service""" + service_name: str check_count_24h: int failure_count_24h: int @@ -42,6 +45,7 @@ class ServiceMetrics(BaseModel): class HealthTrend(BaseModel): """Health trend data point""" + timestamp: datetime = Field(..., description="Trend data timestamp") status: str healthy_count: int @@ -51,6 +55,7 @@ class HealthTrend(BaseModel): class ServiceHealth(BaseModel): """Service health information""" + name: str status: str uptime_percentage: float @@ -61,6 +66,7 @@ class ServiceHealth(BaseModel): class HealthDashboardResponse(BaseModel): """Complete health dashboard 
response""" + overall_status: str last_updated: datetime = Field(..., description="Dashboard last update timestamp") services: list[ServiceHealth] @@ -71,11 +77,13 @@ class HealthDashboardResponse(BaseModel): class SimpleHealthStatus(BaseModel): """Simple health status response for public endpoint.""" + status: str = Field(..., description="Health status: 'healthy' or 'unhealthy'") class HealthStatistics(BaseModel): """Health check statistics.""" + total_checks: int healthy: int degraded: int @@ -85,6 +93,7 @@ class HealthStatistics(BaseModel): class CategoryServices(BaseModel): """Services within a health category.""" + status: str message: str duration_ms: float @@ -93,6 +102,7 @@ class CategoryServices(BaseModel): class DetailedHealthStatus(BaseModel): """Detailed health status with all categories and statistics.""" + timestamp: str = Field(..., description="ISO timestamp of health check") overall_status: str = Field(..., description="Overall system health status") categories: dict[str, dict[str, CategoryServices]] = Field( @@ -103,6 +113,7 @@ class DetailedHealthStatus(BaseModel): class HealthCheckConfig(BaseModel): """Health check configuration details.""" + type: str | None = None critical: bool | None = None interval_seconds: float | None = None @@ -112,12 +123,14 @@ class HealthCheckConfig(BaseModel): class HealthCheckState(BaseModel): """Current state of health check.""" + consecutive_failures: int consecutive_successes: int class ServiceHealthDetails(BaseModel): """Detailed health information for a specific service.""" + name: str status: str message: str @@ -131,6 +144,7 @@ class ServiceHealthDetails(BaseModel): class CategoryHealthStatistics(BaseModel): """Statistics for a health category.""" + total: int healthy: int degraded: int @@ -139,6 +153,7 @@ class CategoryHealthStatistics(BaseModel): class CategoryHealthResponse(BaseModel): """Health information for a specific category.""" + category: str status: str services: dict[str, CategoryServices] = Field(default_factory=dict) @@ -147,6 +162,7 @@ class CategoryHealthResponse(BaseModel): class DependencyNode(BaseModel): """Service dependency graph node.""" + id: str label: str status: str @@ -156,6 +172,7 @@ class DependencyNode(BaseModel): class DependencyEdge(BaseModel): """Service dependency graph edge.""" + from_service: str = Field(..., alias="from") to_service: str = Field(..., alias="to") critical: bool @@ -165,12 +182,14 @@ class DependencyEdge(BaseModel): class DependencyGraph(BaseModel): """Service dependency graph structure.""" + nodes: list[DependencyNode] edges: list[DependencyEdge] class ServiceImpactAnalysis(BaseModel): """Impact analysis for an unhealthy service.""" + status: str affected_services: list[str] is_critical: bool @@ -178,6 +197,7 @@ class ServiceImpactAnalysis(BaseModel): class ServiceDependenciesResponse(BaseModel): """Service dependencies and impact analysis.""" + dependency_graph: DependencyGraph impact_analysis: dict[str, ServiceImpactAnalysis] total_services: int @@ -187,6 +207,7 @@ class ServiceDependenciesResponse(BaseModel): class HealthCheckTriggerResponse(BaseModel): """Response from manually triggered health check.""" + service: str status: str message: str @@ -199,6 +220,7 @@ class HealthCheckTriggerResponse(BaseModel): class ServiceHistoryDataPoint(BaseModel): """Single data point in service history.""" + timestamp: datetime status: str duration_ms: float @@ -207,6 +229,7 @@ class ServiceHistoryDataPoint(BaseModel): class ServiceHistorySummary(BaseModel): """Summary statistics for 
service history.""" + uptime_percentage: float total_checks: int healthy_checks: int @@ -215,6 +238,7 @@ class ServiceHistorySummary(BaseModel): class ServiceHistoryResponse(BaseModel): """Historical health data for a service.""" + service_name: str time_range_hours: int data_points: list[ServiceHistoryDataPoint] @@ -223,6 +247,7 @@ class ServiceHistoryResponse(BaseModel): class SystemMetrics(BaseModel): """System-level metrics for real-time status.""" + mongodb_connections: int mongodb_ops_per_sec: int kafka_total_lag: int @@ -232,6 +257,7 @@ class SystemMetrics(BaseModel): class ServiceRealtimeStatus(BaseModel): """Real-time status for a single service.""" + status: str message: str duration_ms: float @@ -241,6 +267,7 @@ class ServiceRealtimeStatus(BaseModel): class LastIncident(BaseModel): """Information about the last system incident.""" + time: datetime | None = None service: str | None = None duration_minutes: int | None = None @@ -248,6 +275,7 @@ class LastIncident(BaseModel): class RealtimeStatusResponse(BaseModel): """Real-time health status with live metrics.""" + timestamp: datetime overall_status: str services: dict[str, ServiceRealtimeStatus] diff --git a/backend/app/schemas_pydantic/notification.py b/backend/app/schemas_pydantic/notification.py index d208ca71..1cea301f 100644 --- a/backend/app/schemas_pydantic/notification.py +++ b/backend/app/schemas_pydantic/notification.py @@ -15,6 +15,7 @@ class Notification(BaseModel): """Individual notification instance""" + notification_id: str = Field(default_factory=lambda: str(uuid4())) user_id: str channel: NotificationChannel @@ -56,13 +57,12 @@ def validate_scheduled_for(cls, v: datetime | None) -> datetime | None: raise ValueError("scheduled_for must be in the future") return v - model_config = ConfigDict( - from_attributes=True - ) + model_config = ConfigDict(from_attributes=True) class NotificationBatch(BaseModel): """Batch of notifications for bulk processing""" + batch_id: str = Field(default_factory=lambda: str(uuid4())) notifications: list[Notification] created_at: datetime = Field(default_factory=lambda: datetime.now(UTC)) @@ -78,9 +78,7 @@ def validate_notifications(cls, v: list[Notification]) -> list[Notification]: raise ValueError("Batch cannot exceed 1000 notifications") return v - model_config = ConfigDict( - from_attributes=True - ) + model_config = ConfigDict(from_attributes=True) # Rules removed in unified model @@ -88,6 +86,7 @@ def validate_notifications(cls, v: list[Notification]) -> list[Notification]: class NotificationSubscription(BaseModel): """User subscription preferences for notifications""" + user_id: str channel: NotificationChannel severities: list[NotificationSeverity] = Field(default_factory=list) @@ -111,13 +110,12 @@ class NotificationSubscription(BaseModel): created_at: datetime = Field(default_factory=lambda: datetime.now(UTC)) updated_at: datetime = Field(default_factory=lambda: datetime.now(UTC)) - model_config = ConfigDict( - from_attributes=True - ) + model_config = ConfigDict(from_attributes=True) class NotificationStats(BaseModel): """Statistics for notification delivery""" + user_id: str | None = None channel: NotificationChannel | None = None tags: list[str] | None = None @@ -143,13 +141,12 @@ class NotificationStats(BaseModel): avg_delivery_time_seconds: float = 0.0 avg_read_time_seconds: float = 0.0 - model_config = ConfigDict( - from_attributes=True - ) + model_config = ConfigDict(from_attributes=True) class NotificationResponse(BaseModel): """Response schema for notification 
endpoints""" + notification_id: str channel: NotificationChannel status: NotificationStatus @@ -161,24 +158,22 @@ class NotificationResponse(BaseModel): severity: NotificationSeverity tags: list[str] - model_config = ConfigDict( - from_attributes=True - ) + model_config = ConfigDict(from_attributes=True) class NotificationListResponse(BaseModel): """Response schema for notification list endpoints""" + notifications: list[NotificationResponse] total: int unread_count: int - model_config = ConfigDict( - from_attributes=True - ) + model_config = ConfigDict(from_attributes=True) class SubscriptionUpdate(BaseModel): """Request schema for updating notification subscriptions""" + enabled: bool severities: list[NotificationSeverity] = Field(default_factory=list) include_tags: list[str] = Field(default_factory=list) @@ -191,9 +186,7 @@ class SubscriptionUpdate(BaseModel): timezone: str = "UTC" batch_interval_minutes: int = 60 - model_config = ConfigDict( - from_attributes=True - ) + model_config = ConfigDict(from_attributes=True) # TestNotificationRequest removed in unified model; use Notification schema directly for test endpoints @@ -201,26 +194,23 @@ class SubscriptionUpdate(BaseModel): class SubscriptionsResponse(BaseModel): """Response schema for user subscriptions""" + subscriptions: list[NotificationSubscription] - model_config = ConfigDict( - from_attributes=True - ) + model_config = ConfigDict(from_attributes=True) class UnreadCountResponse(BaseModel): """Response schema for unread notification count""" + unread_count: int - model_config = ConfigDict( - from_attributes=True - ) + model_config = ConfigDict(from_attributes=True) class DeleteNotificationResponse(BaseModel): """Response schema for notification deletion""" + message: str - model_config = ConfigDict( - from_attributes=True - ) + model_config = ConfigDict(from_attributes=True) diff --git a/backend/app/schemas_pydantic/replay.py b/backend/app/schemas_pydantic/replay.py index 6bdfe22e..73bd7556 100644 --- a/backend/app/schemas_pydantic/replay.py +++ b/backend/app/schemas_pydantic/replay.py @@ -9,6 +9,7 @@ class ReplayRequest(BaseModel): """Request schema for creating replay sessions""" + replay_type: ReplayType target: ReplayTarget = ReplayTarget.KAFKA @@ -31,6 +32,7 @@ class ReplayRequest(BaseModel): class ReplayResponse(BaseModel): """Response schema for replay operations""" + session_id: str status: ReplayStatus message: str @@ -38,6 +40,7 @@ class ReplayResponse(BaseModel): class SessionSummary(BaseModel): """Summary information for replay sessions""" + session_id: str replay_type: ReplayType target: ReplayTarget @@ -55,5 +58,6 @@ class SessionSummary(BaseModel): class CleanupResponse(BaseModel): """Response schema for cleanup operations""" + removed_sessions: int message: str diff --git a/backend/app/schemas_pydantic/saga.py b/backend/app/schemas_pydantic/saga.py index 70b58a5f..ebf4abca 100644 --- a/backend/app/schemas_pydantic/saga.py +++ b/backend/app/schemas_pydantic/saga.py @@ -6,6 +6,7 @@ class SagaStatusResponse(BaseModel): """Response schema for saga status""" + saga_id: str saga_name: str execution_id: str @@ -34,18 +35,20 @@ def from_domain(cls, saga: "Saga") -> "SagaStatusResponse": created_at=saga.created_at.isoformat(), updated_at=saga.updated_at.isoformat(), completed_at=saga.completed_at.isoformat() if saga.completed_at else None, - retry_count=saga.retry_count + retry_count=saga.retry_count, ) class SagaListResponse(BaseModel): """Response schema for saga list""" + sagas: list[SagaStatusResponse] total: int 
class SagaCancellationResponse(BaseModel): """Response schema for saga cancellation""" + success: bool message: str saga_id: str diff --git a/backend/app/schemas_pydantic/saved_script.py b/backend/app/schemas_pydantic/saved_script.py index 6340d9a7..e315c656 100644 --- a/backend/app/schemas_pydantic/saved_script.py +++ b/backend/app/schemas_pydantic/saved_script.py @@ -11,9 +11,7 @@ class SavedScriptBase(BaseModel): lang_version: str = "3.11" description: str | None = None - model_config = ConfigDict( - from_attributes=True - ) + model_config = ConfigDict(from_attributes=True) class SavedScriptCreate(SavedScriptBase): @@ -26,9 +24,7 @@ class SavedScriptInDB(SavedScriptBase): created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc)) updated_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc)) - model_config = ConfigDict( - from_attributes=True - ) + model_config = ConfigDict(from_attributes=True) class SavedScriptUpdate(BaseModel): @@ -39,9 +35,7 @@ class SavedScriptUpdate(BaseModel): description: str | None = None updated_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc)) - model_config = ConfigDict( - from_attributes=True - ) + model_config = ConfigDict(from_attributes=True) class SavedScriptCreateRequest(SavedScriptBase): @@ -58,14 +52,10 @@ class SavedScriptResponse(BaseModel): created_at: datetime updated_at: datetime - model_config = ConfigDict( - from_attributes=True - ) + model_config = ConfigDict(from_attributes=True) class SavedScriptListResponse(BaseModel): scripts: list[SavedScriptResponse] - model_config = ConfigDict( - from_attributes=True - ) + model_config = ConfigDict(from_attributes=True) diff --git a/backend/app/schemas_pydantic/sse.py b/backend/app/schemas_pydantic/sse.py index f2cc044c..ed2b22fd 100644 --- a/backend/app/schemas_pydantic/sse.py +++ b/backend/app/schemas_pydantic/sse.py @@ -6,24 +6,38 @@ class SSEEvent(BaseModel): """Base model for SSE events.""" + event: str = Field(description="Event type") data: str = Field(description="JSON-encoded event data") +class ShutdownStatusResponse(BaseModel): + """Response model for shutdown status.""" + + phase: str = Field(description="Current shutdown phase") + initiated: bool = Field(description="Whether shutdown has been initiated") + complete: bool = Field(description="Whether shutdown is complete") + active_connections: int = Field(description="Number of active connections") + draining_connections: int = Field(description="Number of connections being drained") + duration: float | None = Field(None, description="Duration of shutdown in seconds") + + class SSEHealthResponse(BaseModel): """Response model for SSE health check.""" + status: str = Field(description="Health status: healthy or draining") kafka_enabled: bool = Field(True, description="Whether Kafka features are enabled") active_connections: int = Field(description="Total number of active SSE connections") active_executions: int = Field(description="Number of executions being monitored") active_consumers: int = Field(description="Number of active Kafka consumers") max_connections_per_user: int = Field(description="Maximum connections allowed per user") - shutdown: Dict[str, Any] = Field(description="Shutdown status information") + shutdown: ShutdownStatusResponse = Field(description="Shutdown status information") timestamp: datetime = Field(description="Health check timestamp") class ExecutionStreamEvent(BaseModel): """Model for execution stream events.""" + event_id: str | None = Field(None, 
description="Unique event identifier") timestamp: datetime | None = Field(None, description="Event timestamp") type: str | None = Field(None, description="Event type") @@ -36,6 +50,7 @@ class ExecutionStreamEvent(BaseModel): class NotificationStreamEvent(BaseModel): """Model for notification stream events.""" + message: str = Field(description="Notification message") user_id: str = Field(description="User ID") timestamp: datetime = Field(description="Event timestamp") @@ -43,6 +58,7 @@ class NotificationStreamEvent(BaseModel): class HeartbeatEvent(BaseModel): """Model for heartbeat events.""" + timestamp: datetime = Field(description="Heartbeat timestamp") execution_id: str | None = Field(None, description="Associated execution ID") user_id: str | None = Field(None, description="Associated user ID") diff --git a/backend/app/schemas_pydantic/user.py b/backend/app/schemas_pydantic/user.py index 1c966b06..2a450977 100644 --- a/backend/app/schemas_pydantic/user.py +++ b/backend/app/schemas_pydantic/user.py @@ -9,54 +9,50 @@ class UserBase(BaseModel): """Base user model with common fields""" + username: str email: EmailStr role: UserRole = UserRole.USER is_active: bool = True - model_config = ConfigDict( - from_attributes=True - ) + model_config = ConfigDict(from_attributes=True) class UserCreate(UserBase): """Model for creating a new user""" + password: str = Field(..., min_length=8) - model_config = ConfigDict( - from_attributes=True - ) + model_config = ConfigDict(from_attributes=True) class UserInDB(UserBase): """User model as stored in database (with hashed password)""" + user_id: str = Field(default_factory=lambda: str(uuid4())) hashed_password: str is_superuser: bool = False created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc)) updated_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc)) - model_config = ConfigDict( - from_attributes=True, - arbitrary_types_allowed=True - ) + model_config = ConfigDict(from_attributes=True, arbitrary_types_allowed=True) class UserUpdate(BaseModel): """Model for updating a user""" + username: Optional[str] = None email: Optional[EmailStr] = None role: Optional[UserRole] = None is_active: Optional[bool] = None password: Optional[str] = Field(None, min_length=8) - model_config = ConfigDict( - from_attributes=True - ) + model_config = ConfigDict(from_attributes=True) class UserResponse(UserBase): """User model for API responses (without password)""" + user_id: str is_superuser: bool = False created_at: datetime @@ -73,6 +69,7 @@ class UserResponse(UserBase): class User(BaseModel): """User model for internal service use (without sensitive data)""" + user_id: str username: str email: EmailStr @@ -98,42 +95,40 @@ def from_response(cls, user_response: UserResponse) -> "User": is_active=user_response.is_active, is_superuser=user_response.is_superuser, created_at=user_response.created_at, - updated_at=user_response.updated_at + updated_at=user_response.updated_at, ) class UserListResponse(BaseModel): """Response model for listing users""" + users: List[UserResponse] total: int offset: int limit: int - model_config = ConfigDict( - from_attributes=True - ) + model_config = ConfigDict(from_attributes=True) class PasswordResetRequest(BaseModel): """Request model for password reset""" + new_password: str = Field(..., min_length=8, description="New password for the user") - model_config = ConfigDict( - from_attributes=True - ) + model_config = ConfigDict(from_attributes=True) class MessageResponse(BaseModel): """Generic 
message response""" + message: str - model_config = ConfigDict( - from_attributes=True - ) + model_config = ConfigDict(from_attributes=True) class LoginResponse(BaseModel): """Response model for successful login""" + message: str username: str role: str @@ -144,9 +139,68 @@ class LoginResponse(BaseModel): class TokenValidationResponse(BaseModel): """Response model for token validation""" + valid: bool username: str role: str csrf_token: str model_config = ConfigDict(from_attributes=True) + + +class DeleteUserResponse(BaseModel): + """Response model for user deletion.""" + + message: str + deleted_counts: dict[str, int] + + model_config = ConfigDict(from_attributes=True) + + +class RateLimitRuleResponse(BaseModel): + """Response model for rate limit rule.""" + + endpoint_pattern: str + group: str + requests: int + window_seconds: int + algorithm: str + burst_multiplier: float = 1.5 + priority: int = 0 + enabled: bool = True + + model_config = ConfigDict(from_attributes=True) + + +class UserRateLimitConfigResponse(BaseModel): + """Response model for user rate limit config.""" + + user_id: str + bypass_rate_limit: bool + global_multiplier: float + rules: list[RateLimitRuleResponse] + created_at: Optional[datetime] = None + updated_at: Optional[datetime] = None + notes: Optional[str] = None + + model_config = ConfigDict(from_attributes=True) + + +class UserRateLimitsResponse(BaseModel): + """Response model for user rate limits with usage stats.""" + + user_id: str + rate_limit_config: Optional[UserRateLimitConfigResponse] = None + current_usage: dict[str, dict[str, object]] + + model_config = ConfigDict(from_attributes=True) + + +class RateLimitUpdateResponse(BaseModel): + """Response model for rate limit update.""" + + user_id: str + updated: bool + config: UserRateLimitConfigResponse + + model_config = ConfigDict(from_attributes=True) diff --git a/backend/app/schemas_pydantic/user_settings.py b/backend/app/schemas_pydantic/user_settings.py index b2224432..2066ca45 100644 --- a/backend/app/schemas_pydantic/user_settings.py +++ b/backend/app/schemas_pydantic/user_settings.py @@ -10,6 +10,7 @@ class NotificationSettings(BaseModel): """User notification preferences""" + execution_completed: bool = True execution_failed: bool = True system_updates: bool = True @@ -19,6 +20,7 @@ class NotificationSettings(BaseModel): class EditorSettings(BaseModel): """Code editor preferences""" + theme: str = "one-dark" font_size: int = 14 tab_size: int = 4 @@ -31,14 +33,14 @@ class EditorSettings(BaseModel): bracket_matching: bool = True highlight_active_line: bool = True default_language: str = "python" - + @field_validator("font_size") @classmethod def validate_font_size(cls, v: int) -> int: if v < 8 or v > 32: raise ValueError("Font size must be between 8 and 32") return v - + @field_validator("tab_size") @classmethod def validate_tab_size(cls, v: int) -> int: @@ -49,6 +51,7 @@ def validate_tab_size(cls, v: int) -> int: class UserSettings(BaseModel): """Complete user settings model""" + user_id: str theme: Theme = Theme.AUTO timezone: str = "UTC" @@ -64,6 +67,7 @@ class UserSettings(BaseModel): class UserSettingsUpdate(BaseModel): """Partial update model for user settings""" + theme: Theme | None = None timezone: str | None = None date_format: str | None = None @@ -75,6 +79,7 @@ class UserSettingsUpdate(BaseModel): class SettingChange(BaseModel): """Represents a single setting change for event sourcing""" + field_path: str # e.g., "theme", "editor.font_size", "notifications.channels" old_value: Any new_value: 
Any @@ -84,11 +89,13 @@ class SettingChange(BaseModel): class ThemeUpdateRequest(BaseModel): """Request model for theme update""" + theme: Theme class SettingsHistoryEntry(BaseModel): """Single entry in settings history""" + timestamp: datetime event_type: str field: str @@ -100,18 +107,21 @@ class SettingsHistoryEntry(BaseModel): class SettingsHistoryResponse(BaseModel): """Response model for settings history""" + history: List[SettingsHistoryEntry] total: int class RestoreSettingsRequest(BaseModel): """Request model for restoring settings""" + timestamp: datetime reason: str | None = None class SettingsEvent(BaseModel): """Minimal event model for user settings service consumption.""" + event_type: str | EventType timestamp: datetime payload: Dict[str, Any] diff --git a/backend/app/services/admin/admin_events_service.py b/backend/app/services/admin/admin_events_service.py index 7f9cfe82..2419f675 100644 --- a/backend/app/services/admin/admin_events_service.py +++ b/backend/app/services/admin/admin_events_service.py @@ -27,14 +27,14 @@ class AdminReplayResult: def __init__( - self, - *, - dry_run: bool, - total_events: int, - replay_correlation_id: str, - status: str, - session_id: str | None = None, - events_preview: List[Dict[str, Any]] | None = None, + self, + *, + dry_run: bool, + total_events: int, + replay_correlation_id: str, + status: str, + session_id: str | None = None, + events_preview: List[Dict[str, Any]] | None = None, ) -> None: self.dry_run = dry_run self.total_events = total_events @@ -46,7 +46,7 @@ def __init__( @dataclass class ExportResult: - filename: str + file_name: str # not 'filename' - conflicts with LogRecord reserved attribute content: str media_type: str @@ -57,13 +57,13 @@ def __init__(self, repository: AdminEventsRepository, replay_service: ReplayServ self._replay_service = replay_service async def browse_events( - self, - *, - filter: EventFilter, - skip: int, - limit: int, - sort_by: str, - sort_order: int, + self, + *, + filter: EventFilter, + skip: int, + limit: int, + sort_by: str, + sort_order: int, ) -> EventBrowseResult: return await self._repo.browse_events( filter=filter, skip=skip, limit=limit, sort_by=sort_by, sort_order=sort_order @@ -76,22 +76,25 @@ async def get_event_stats(self, *, hours: int) -> EventStatistics: return await self._repo.get_event_stats(hours=hours) async def prepare_or_schedule_replay( - self, - *, - replay_query: ReplayQuery, - dry_run: bool, - replay_correlation_id: str, - target_service: str | None, + self, + *, + replay_query: ReplayQuery, + dry_run: bool, + replay_correlation_id: str, + target_service: str | None, ) -> AdminReplayResult: query = self._repo.build_replay_query(replay_query) if not query: raise ValueError("Must specify at least one filter for replay") # Prepare and optionally preview - logger.info("Preparing replay session", extra={ - "dry_run": dry_run, - "replay_correlation_id": replay_correlation_id, - }) + logger.info( + "Preparing replay session", + extra={ + "dry_run": dry_run, + "replay_correlation_id": replay_correlation_id, + }, + ) session_data = await self._repo.prepare_replay_session( query=query, dry_run=dry_run, @@ -117,10 +120,13 @@ async def prepare_or_schedule_replay( status="Preview", events_preview=previews, ) - logger.info("Replay dry-run prepared", extra={ - "total_events": result.total_events, - "replay_correlation_id": result.replay_correlation_id, - }) + logger.info( + "Replay dry-run prepared", + extra={ + "total_events": result.total_events, + "replay_correlation_id": 
result.replay_correlation_id, + }, + ) return result # Build config for actual replay and create session via replay service @@ -157,11 +163,14 @@ async def prepare_or_schedule_replay( session_id=session_id, status="Replay scheduled", ) - logger.info("Replay scheduled", extra={ - "session_id": result.session_id, - "total_events": result.total_events, - "replay_correlation_id": result.replay_correlation_id, - }) + logger.info( + "Replay scheduled", + extra={ + "session_id": result.session_id, + "total_events": result.total_events, + "replay_correlation_id": result.replay_correlation_id, + }, + ) return result async def start_replay_session(self, session_id: str) -> None: @@ -178,26 +187,37 @@ async def export_events_csv(self, filter: EventFilter) -> List[EventExportRow]: async def export_events_csv_content(self, *, filter: EventFilter, limit: int) -> ExportResult: rows = await self._repo.export_events_csv(filter) output = StringIO() - writer = csv.DictWriter(output, fieldnames=[ - "Event ID", "Event Type", "Timestamp", "Correlation ID", - "Aggregate ID", "User ID", "Service", "Status", "Error", - ]) + writer = csv.DictWriter( + output, + fieldnames=[ + "Event ID", + "Event Type", + "Timestamp", + "Correlation ID", + "Aggregate ID", + "User ID", + "Service", + "Status", + "Error", + ], + ) writer.writeheader() row_mapper = EventExportRowMapper() for row in rows[:limit]: writer.writerow(row_mapper.to_dict(row)) output.seek(0) filename = f"events_export_{datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')}.csv" - logger.info("Exported events CSV", extra={ - "row_count": len(rows), - "filename": filename, - }) - return ExportResult(filename=filename, content=output.getvalue(), media_type="text/csv") + logger.info( + "Exported events CSV", + extra={ + "row_count": len(rows), + "file_name": filename, + }, + ) + return ExportResult(file_name=filename, content=output.getvalue(), media_type="text/csv") async def export_events_json_content(self, *, filter: EventFilter, limit: int) -> ExportResult: - result = await self._repo.browse_events( - filter=filter, skip=0, limit=limit, sort_by="timestamp", sort_order=-1 - ) + result = await self._repo.browse_events(filter=filter, skip=0, limit=limit, sort_by="timestamp", sort_order=-1) event_mapper = EventMapper() events_data: list[dict[str, Any]] = [] for event in result.events: @@ -226,11 +246,14 @@ async def export_events_json_content(self, *, filter: EventFilter, limit: int) - } json_content = json.dumps(export_data, indent=2, default=str) filename = f"events_export_{datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')}.json" - logger.info("Exported events JSON", extra={ - "event_count": len(events_data), - "filename": filename, - }) - return ExportResult(filename=filename, content=json_content, media_type="application/json") + logger.info( + "Exported events JSON", + extra={ + "event_count": len(events_data), + "file_name": filename, + }, + ) + return ExportResult(file_name=filename, content=json_content, media_type="application/json") async def delete_event(self, *, event_id: str, deleted_by: str) -> bool: # Load event for archival; archive then delete @@ -241,10 +264,13 @@ async def delete_event(self, *, event_id: str, deleted_by: str) -> bool: await self._repo.archive_event(detail.event, deleted_by) deleted = await self._repo.delete_event(event_id) if deleted: - logger.info("Event deleted", extra={ - "event_id": event_id, - "event_type": detail.event.event_type, - "correlation_id": detail.event.correlation_id, - "deleted_by": deleted_by, - }) + 
logger.info( + "Event deleted", + extra={ + "event_id": event_id, + "event_type": detail.event.event_type, + "correlation_id": detail.event.correlation_id, + "deleted_by": deleted_by, + }, + ) return deleted diff --git a/backend/app/services/admin/admin_settings_service.py b/backend/app/services/admin/admin_settings_service.py index f71b9d0b..88754c80 100644 --- a/backend/app/services/admin/admin_settings_service.py +++ b/backend/app/services/admin/admin_settings_service.py @@ -16,18 +16,16 @@ async def get_system_settings(self, admin_username: str) -> SystemSettings: return settings async def update_system_settings( - self, - settings: SystemSettings, - updated_by: str, - user_id: str, + self, + settings: SystemSettings, + updated_by: str, + user_id: str, ) -> SystemSettings: logger.info( "Admin updating system settings", extra={"admin_username": updated_by}, ) - updated = await self._repo.update_system_settings( - settings=settings, updated_by=updated_by, user_id=user_id - ) + updated = await self._repo.update_system_settings(settings=settings, updated_by=updated_by, user_id=user_id) logger.info("System settings updated successfully") return updated diff --git a/backend/app/services/admin/admin_user_service.py b/backend/app/services/admin/admin_user_service.py index 7cc55cc2..8914f270 100644 --- a/backend/app/services/admin/admin_user_service.py +++ b/backend/app/services/admin/admin_user_service.py @@ -8,7 +8,7 @@ from app.domain.enums.events import EventType from app.domain.enums.execution import ExecutionStatus from app.domain.enums.user import UserRole -from app.domain.rate_limit import UserRateLimit +from app.domain.rate_limit import RateLimitUpdateResult, UserRateLimit, UserRateLimitsResult from app.domain.user import PasswordReset, User, UserListResult, UserUpdate from app.infrastructure.mappers import UserRateLimitMapper from app.schemas_pydantic.user import UserCreate @@ -19,11 +19,11 @@ class AdminUserService: def __init__( - self, - user_repository: AdminUserRepository, - event_service: EventService, - execution_service: ExecutionService, - rate_limit_service: RateLimitService, + self, + user_repository: AdminUserRepository, + event_service: EventService, + execution_service: ExecutionService, + rate_limit_service: RateLimitService, ) -> None: self._users = user_repository self._events = event_service @@ -31,8 +31,7 @@ def __init__( self._rate_limits = rate_limit_service async def get_user_overview(self, user_id: str, hours: int = 24) -> AdminUserOverviewDomain: - logger.info("Admin getting user overview", - extra={"target_user_id": user_id, "hours": hours}) + logger.info("Admin getting user overview", extra={"target_user_id": user_id, "hours": hours}) user = await self._users.get_user_by_id(user_id) if not user: raise ValueError("User not found") @@ -46,10 +45,7 @@ async def get_user_overview(self, user_id: str, hours: int = 24) -> AdminUserOve end_time=now, include_all_users=False, ) - exec_stats = await self._executions.get_execution_stats( - user_id=user_id, - time_range=(start, now) - ) + exec_stats = await self._executions.get_execution_stats(user_id=user_id, time_range=(start, now)) by_status = exec_stats.get("by_status", {}) or {} def _count(status: ExecutionStatus) -> int: @@ -102,13 +98,9 @@ def _count(status: ExecutionStatus) -> int: recent_events=recent_events, ) - async def list_users(self, - *, - admin_username: str, - limit: int, - offset: int, - search: str | None, - role: UserRole | None) -> UserListResult: + async def list_users( + self, *, admin_username: 
str, limit: int, offset: int, search: str | None, role: UserRole | None + ) -> UserListResult: logger.info( "Admin listing users", extra={ @@ -150,8 +142,9 @@ async def create_user(self, *, admin_username: str, user_data: UserCreate) -> Us "updated_at": now, } await self._users.users_collection.insert_one(user_doc) - logger.info("User created successfully", - extra={"new_username": user_data.username, "admin_username": admin_username}) + logger.info( + "User created successfully", extra={"new_username": user_data.username, "admin_username": admin_username} + ) # Return fresh domain user created = await self._users.get_user_by_id(user_id) if not created: @@ -159,8 +152,7 @@ async def create_user(self, *, admin_username: str, user_data: UserCreate) -> Us return created async def get_user(self, *, admin_username: str, user_id: str) -> User | None: - logger.info("Admin getting user details", - extra={"admin_username": admin_username, "target_user_id": user_id}) + logger.info("Admin getting user details", extra={"admin_username": admin_username, "target_user_id": user_id}) return await self._users.get_user_by_id(user_id) async def update_user(self, *, admin_username: str, user_id: str, update: UserUpdate) -> User | None: @@ -183,31 +175,30 @@ async def delete_user(self, *, admin_username: str, user_id: str, cascade: bool) return deleted_counts async def reset_user_password(self, *, admin_username: str, user_id: str, new_password: str) -> bool: - logger.info("Admin resetting user password", - extra={"admin_username": admin_username, "target_user_id": user_id}) + logger.info( + "Admin resetting user password", extra={"admin_username": admin_username, "target_user_id": user_id} + ) pr = PasswordReset(user_id=user_id, new_password=new_password) ok = await self._users.reset_user_password(pr) if ok: logger.info("User password reset successfully", extra={"target_user_id": user_id}) return ok - async def get_user_rate_limits(self, *, admin_username: str, user_id: str) -> dict: - logger.info("Admin getting user rate limits", - extra={"admin_username": admin_username, "target_user_id": user_id}) + async def get_user_rate_limits(self, *, admin_username: str, user_id: str) -> UserRateLimitsResult: + logger.info( + "Admin getting user rate limits", extra={"admin_username": admin_username, "target_user_id": user_id} + ) user_limit = await self._rate_limits.get_user_rate_limit(user_id) usage_stats = await self._rate_limits.get_usage_stats(user_id) - rate_limit_mapper = UserRateLimitMapper() - return { - "user_id": user_id, - "rate_limit_config": rate_limit_mapper.to_dict(user_limit) if user_limit else None, - "current_usage": usage_stats, - } + return UserRateLimitsResult( + user_id=user_id, + rate_limit_config=user_limit, + current_usage=usage_stats, + ) - async def update_user_rate_limits(self, - *, - admin_username: str, - user_id: str, - config: UserRateLimit) -> dict[str, object]: + async def update_user_rate_limits( + self, *, admin_username: str, user_id: str, config: UserRateLimit + ) -> RateLimitUpdateResult: mapper = UserRateLimitMapper() logger.info( "Admin updating user rate limits", @@ -215,10 +206,11 @@ async def update_user_rate_limits(self, ) config.user_id = user_id await self._rate_limits.update_user_rate_limit(user_id, config) - return {"message": "Rate limits updated successfully", "config": mapper.to_dict(config)} + return RateLimitUpdateResult(user_id=user_id, updated=True, config=config) async def reset_user_rate_limits(self, *, admin_username: str, user_id: str) -> bool: - 
logger.info("Admin resetting user rate limits", - extra={"admin_username": admin_username, "target_user_id": user_id}) + logger.info( + "Admin resetting user rate limits", extra={"admin_username": admin_username, "target_user_id": user_id} + ) await self._rate_limits.reset_user_limits(user_id) return True diff --git a/backend/app/services/auth_service.py b/backend/app/services/auth_service.py index 3c1a4cc9..68caeb1c 100644 --- a/backend/app/services/auth_service.py +++ b/backend/app/services/auth_service.py @@ -35,9 +35,7 @@ async def get_current_user(self, request: Request) -> UserResponse: async def get_admin(self, request: Request) -> UserResponse: user = await self.get_current_user(request) if user.role != UserRole.ADMIN: - logger.warning( - f"Admin access denied for user: {user.username} (role: {user.role})" - ) + logger.warning(f"Admin access denied for user: {user.username} (role: {user.role})") raise HTTPException( status_code=status.HTTP_403_FORBIDDEN, detail="Admin access required", diff --git a/backend/app/services/coordinator/coordinator.py b/backend/app/services/coordinator/coordinator.py index df255568..a6d562a8 100644 --- a/backend/app/services/coordinator/coordinator.py +++ b/backend/app/services/coordinator/coordinator.py @@ -8,6 +8,7 @@ import redis.asyncio as redis from motor.motor_asyncio import AsyncIOMotorClient +from app.core.database_context import DBClient from app.core.lifecycle import LifecycleEnabled from app.core.logging import logger from app.core.metrics.context import get_coordinator_metrics @@ -49,7 +50,7 @@ class ExecutionCoordinator(LifecycleEnabled): """ Coordinates execution scheduling across the system. - + This service: 1. Consumes ExecutionRequested events 2. Manages execution queue with priority @@ -59,15 +60,15 @@ class ExecutionCoordinator(LifecycleEnabled): """ def __init__( - self, - producer: UnifiedProducer, - schema_registry_manager: SchemaRegistryManager, - event_store: EventStore, - execution_repository: ExecutionRepository, - idempotency_manager: IdempotencyManager, - consumer_group: str = "execution-coordinator", - max_concurrent_scheduling: int = 10, - scheduling_interval_seconds: float = 0.5, + self, + producer: UnifiedProducer, + schema_registry_manager: SchemaRegistryManager, + event_store: EventStore, + execution_repository: ExecutionRepository, + idempotency_manager: IdempotencyManager, + consumer_group: str = "execution-coordinator", + max_concurrent_scheduling: int = 10, + scheduling_interval_seconds: float = 0.5, ): self.metrics = get_coordinator_metrics() settings = get_settings() @@ -77,17 +78,9 @@ def __init__( self.consumer_group = consumer_group # Components - self.queue_manager = QueueManager( - max_queue_size=10000, - max_executions_per_user=100, - stale_timeout_seconds=3600 - ) + self.queue_manager = QueueManager(max_queue_size=10000, max_executions_per_user=100, stale_timeout_seconds=3600) - self.resource_manager = ResourceManager( - total_cpu_cores=32.0, - total_memory_mb=65536, - total_gpu_count=0 - ) + self.resource_manager = ResourceManager(total_cpu_cores=32.0, total_memory_mb=65536, total_gpu_count=0) # Kafka components self.consumer: UnifiedConsumer | None = None @@ -106,7 +99,7 @@ def __init__( # State tracking self._running = False - self._scheduling_task: asyncio.Task | None = None + self._scheduling_task: asyncio.Task[None] | None = None self._active_executions: set[str] = set() self._execution_resources: ExecutionMap = {} self._schema_registry_manager = schema_registry_manager @@ -134,13 +127,10 @@ async def 
start(self) -> None: max_poll_interval_ms=300000, # 5 minutes - max time between polls max_poll_records=100, # Process max 100 messages at a time for flow control fetch_max_wait_ms=500, # Wait max 500ms for data (reduces latency) - fetch_min_bytes=1 # Return immediately if any data available + fetch_min_bytes=1, # Return immediately if any data available ) - self.consumer = UnifiedConsumer( - consumer_config, - event_dispatcher=self.dispatcher - ) + self.consumer = UnifiedConsumer(consumer_config, event_dispatcher=self.dispatcher) # Register handlers with EventDispatcher BEFORE wrapping with idempotency @self.dispatcher.register(EventType.EXECUTION_REQUESTED) @@ -165,7 +155,7 @@ async def handle_cancelled(event: BaseEvent) -> None: dispatcher=self.dispatcher, default_key_strategy="event_based", # Use event ID for deduplication default_ttl_seconds=7200, # 2 hours TTL for coordinator events - enable_for_all_handlers=True # Enable idempotency for ALL handlers + enable_for_all_handlers=True, # Enable idempotency for ALL handlers ) logger.info("COORDINATOR: Event handlers registered with idempotency protection") @@ -201,20 +191,18 @@ async def stop(self) -> None: await self.queue_manager.stop() # Close idempotency manager - if hasattr(self, 'idempotency_manager') and self.idempotency_manager: + if hasattr(self, "idempotency_manager") and self.idempotency_manager: await self.idempotency_manager.close() - logger.info( - f"ExecutionCoordinator service stopped. " - f"Active executions: {len(self._active_executions)}" - ) + logger.info(f"ExecutionCoordinator service stopped. Active executions: {len(self._active_executions)}") async def _route_execution_event(self, event: BaseEvent) -> None: """Route execution events to appropriate handlers based on event type""" logger.info( f"COORDINATOR: Routing execution event - type: {event.event_type}, " f"id: {event.event_id}, " - f"actual class: {type(event).__name__}") + f"actual class: {type(event).__name__}" + ) if event.event_type == EventType.EXECUTION_REQUESTED: await self._handle_execution_requested(event) # type: ignore @@ -232,10 +220,7 @@ async def _route_execution_result(self, event: BaseEvent) -> None: else: logger.debug(f"Ignoring execution result event type: {event.event_type}") - async def _handle_execution_requested( - self, - event: ExecutionRequestedEvent - ) -> None: + async def _handle_execution_requested(self, event: ExecutionRequestedEvent) -> None: """Handle execution requested event - add to queue for processing""" logger.info(f"HANDLER CALLED: _handle_execution_requested for event {event.event_id}") start_time = time.time() @@ -263,15 +248,10 @@ async def _handle_execution_requested( self.metrics.record_coordinator_scheduling_duration(duration) self.metrics.record_coordinator_execution_scheduled("queued") - logger.info( - f"Execution {event.execution_id} added to queue at position {position}" - ) + logger.info(f"Execution {event.execution_id} added to queue at position {position}") except Exception as e: - logger.error( - f"Failed to handle execution request {event.execution_id}: {e}", - exc_info=True - ) + logger.error(f"Failed to handle execution request {event.execution_id}: {e}", exc_info=True) self.metrics.record_coordinator_execution_scheduled("error") async def _handle_execution_cancelled(self, event: ExecutionCancelledEvent) -> None: @@ -343,9 +323,7 @@ async def _schedule_execution(self, event: ExecutionRequestedEvent) -> None: try: # Check if already active (shouldn't happen, but be safe) if event.execution_id in 
self._active_executions: - logger.warning( - f"Execution {event.execution_id} already active, skipping" - ) + logger.warning(f"Execution {event.execution_id} already active, skipping") return # Request resource allocation @@ -354,18 +332,13 @@ async def _schedule_execution(self, event: ExecutionRequestedEvent) -> None: event.language, requested_cpu=None, # Use defaults for now requested_memory_mb=None, - requested_gpu=0 + requested_gpu=0, ) if not allocation: # No resources available, requeue - await self.queue_manager.requeue_execution( - event, - increment_retry=False - ) - logger.info( - f"No resources available for {event.execution_id}, requeued" - ) + await self.queue_manager.requeue_execution(event, increment_retry=False) + logger.info(f"No resources available for {event.execution_id}, requeued") return # Track allocation @@ -379,13 +352,15 @@ async def _schedule_execution(self, event: ExecutionRequestedEvent) -> None: await self._publish_execution_started(event) logger.info(f"Successfully published ExecutionStartedEvent for {event.execution_id}") except Exception as publish_error: - logger.error(f"Failed to publish ExecutionStartedEvent for {event.execution_id}: {publish_error}", - exc_info=True) + logger.error( + f"Failed to publish ExecutionStartedEvent for {event.execution_id}: {publish_error}", + exc_info=True, + ) raise # Track metrics queue_time = start_time - event.timestamp.timestamp() - priority = getattr(event, 'priority', QueuePriority.NORMAL.value) + priority = getattr(event, "priority", QueuePriority.NORMAL.value) self.metrics.record_coordinator_queue_time(queue_time, QueuePriority(priority).name) scheduling_duration = time.time() - start_time @@ -400,10 +375,7 @@ async def _schedule_execution(self, event: ExecutionRequestedEvent) -> None: ) except Exception as e: - logger.error( - f"Failed to schedule execution {event.execution_id}: {e}", - exc_info=True - ) + logger.error(f"Failed to schedule execution {event.execution_id}: {e}", exc_info=True) # Release any allocated resources if event.execution_id in self._execution_resources: @@ -431,10 +403,7 @@ async def _build_command_metadata(self, request: ExecutionRequestedEvent) -> Eve correlation_id=request.metadata.correlation_id, ) - async def _publish_execution_started( - self, - request: ExecutionRequestedEvent - ) -> None: + async def _publish_execution_started(self, request: ExecutionRequestedEvent) -> None: """Send CreatePodCommandEvent to k8s-worker via SAGA_COMMANDS topic""" metadata = await self._build_command_metadata(request) @@ -456,17 +425,9 @@ async def _publish_execution_started( metadata=metadata, ) - await self.producer.produce( - event_to_produce=create_pod_cmd, - key=request.execution_id - ) + await self.producer.produce(event_to_produce=create_pod_cmd, key=request.execution_id) - async def _publish_execution_accepted( - self, - request: ExecutionRequestedEvent, - position: int, - priority: int - ) -> None: + async def _publish_execution_accepted(self, request: ExecutionRequestedEvent, position: int, priority: int) -> None: """Publish execution accepted event to notify that request was valid and queued""" logger.info(f"Publishing ExecutionAcceptedEvent for execution {request.execution_id}") @@ -475,17 +436,13 @@ async def _publish_execution_accepted( queue_position=position, estimated_wait_seconds=None, # Could calculate based on queue analysis priority=priority, - metadata=request.metadata + metadata=request.metadata, ) await self.producer.produce(event_to_produce=event) 
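# Illustrative sketch (not part of the change set above): several produce()
# calls in this coordinator are keyed by the execution id
# (key=request.execution_id). With Kafka's default partitioner, records that
# share a key hash to the same partition, so all events for one execution are
# consumed in order. The raw confluent-kafka calls below are a simplified
# stand-in for the project's UnifiedProducer; topic and broker names are made up.
import json

from confluent_kafka import Producer

producer = Producer({"bootstrap.servers": "localhost:9092"})


def publish_execution_event(execution_id: str, payload: dict[str, object]) -> None:
    # Same execution_id -> same key -> same partition -> per-execution ordering.
    producer.produce(
        "execution-events",
        key=execution_id.encode("utf-8"),
        value=json.dumps(payload).encode("utf-8"),
    )


publish_execution_event("exec-123", {"event_type": "ExecutionAccepted", "queue_position": 1})
publish_execution_event("exec-123", {"event_type": "ExecutionStarted"})
producer.flush()  # block until outstanding messages are delivered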
logger.info(f"ExecutionAcceptedEvent published for {request.execution_id}") - async def _publish_queue_full( - self, - request: ExecutionRequestedEvent, - error: str - ) -> None: + async def _publish_queue_full(self, request: ExecutionRequestedEvent, error: str) -> None: """Publish queue full event""" # Get queue stats for context queue_stats = await self.queue_manager.get_queue_stats() @@ -500,14 +457,9 @@ async def _publish_queue_full( error_message=error, ) - await self.producer.produce(event_to_produce=event, - key=request.execution_id) + await self.producer.produce(event_to_produce=event, key=request.execution_id) - async def _publish_scheduling_failed( - self, - request: ExecutionRequestedEvent, - error: str - ) -> None: + async def _publish_scheduling_failed(self, request: ExecutionRequestedEvent, error: str) -> None: """Publish scheduling failed event""" # Get resource stats for context resource_stats = await self.resource_manager.get_resource_stats() @@ -517,15 +469,14 @@ async def _publish_scheduling_failed( error_type=ExecutionErrorType.SYSTEM_ERROR, exit_code=-1, stderr=f"Failed to schedule execution: {error}. " - f"Available resources: CPU={resource_stats.available.cpu_cores}, " - f"Memory={resource_stats.available.memory_mb}MB", + f"Available resources: CPU={resource_stats.available.cpu_cores}, " + f"Memory={resource_stats.available.memory_mb}MB", resource_usage=ResourceUsageDomain.from_dict({}), metadata=request.metadata, - error_message=error + error_message=error, ) - await self.producer.produce(event_to_produce=event, - key=request.execution_id) + await self.producer.produce(event_to_produce=event, key=request.execution_id) async def get_status(self) -> dict[str, Any]: """Get coordinator status""" @@ -533,7 +484,7 @@ async def get_status(self) -> dict[str, Any]: "running": self._running, "active_executions": len(self._active_executions), "queue_stats": await self.queue_manager.get_queue_stats(), - "resource_stats": await self.resource_manager.get_resource_stats() + "resource_stats": await self.resource_manager.get_resource_stats(), } @@ -549,22 +500,14 @@ async def run_coordinator() -> None: config = ProducerConfig(bootstrap_servers=settings.KAFKA_BOOTSTRAP_SERVERS) producer = UnifiedProducer(config, schema_registry_manager) - db_client: AsyncIOMotorClient = AsyncIOMotorClient( - settings.MONGODB_URL, - tz_aware=True, - serverSelectionTimeoutMS=5000 - ) + db_client: DBClient = AsyncIOMotorClient(settings.MONGODB_URL, tz_aware=True, serverSelectionTimeoutMS=5000) db_name = settings.DATABASE_NAME database = db_client[db_name] await SchemaManager(database).apply_all() logger.info("Creating event store for coordinator...") - event_store = create_event_store( - db=database, - schema_registry=schema_registry_manager, - ttl_days=90 - ) + event_store = create_event_store(db=database, schema_registry=schema_registry_manager, ttl_days=90) exec_repo = ExecutionRepository(database) r = redis.Redis( diff --git a/backend/app/services/coordinator/queue_manager.py b/backend/app/services/coordinator/queue_manager.py index 8b4d2277..d1a9ccd1 100644 --- a/backend/app/services/coordinator/queue_manager.py +++ b/backend/app/services/coordinator/queue_manager.py @@ -41,10 +41,10 @@ def age_seconds(self) -> float: class QueueManager: def __init__( - self, - max_queue_size: int = 10000, - max_executions_per_user: int = 100, - stale_timeout_seconds: int = 3600, + self, + max_queue_size: int = 10000, + max_executions_per_user: int = 100, + stale_timeout_seconds: int = 3600, ) -> None: self.metrics 
= get_coordinator_metrics() self.max_queue_size = max_queue_size @@ -55,7 +55,7 @@ def __init__( self._queue_lock = asyncio.Lock() self._user_execution_count: Dict[str, int] = defaultdict(int) self._execution_users: Dict[str, str] = {} - self._cleanup_task: asyncio.Task | None = None + self._cleanup_task: asyncio.Task[None] | None = None self._running = False async def start(self) -> None: @@ -82,9 +82,7 @@ async def stop(self) -> None: logger.info(f"Queue manager stopped. Final queue size: {len(self._queue)}") async def add_execution( - self, - event: ExecutionRequestedEvent, - priority: QueuePriority | None = None + self, event: ExecutionRequestedEvent, priority: QueuePriority | None = None ) -> Tuple[bool, int | None, str | None]: async with self._queue_lock: if len(self._queue) >= self.max_queue_size: @@ -98,11 +96,7 @@ async def add_execution( if priority is None: priority = QueuePriority(event.priority) - queued = QueuedExecution( - priority=priority.value, - timestamp=time.time(), - event=event - ) + queued = QueuedExecution(priority=priority.value, timestamp=time.time(), event=event) heapq.heappush(self._queue, queued) self._track_execution(event.execution_id, user_id) @@ -172,24 +166,18 @@ async def get_queue_stats(self) -> Dict[str, Any]: priority_counts[priority_name] += 1 user_counts[queued.user_id] += 1 - top_users = dict(sorted( - user_counts.items(), - key=lambda x: x[1], - reverse=True - )[:10]) + top_users = dict(sorted(user_counts.items(), key=lambda x: x[1], reverse=True)[:10]) return { "total_size": len(self._queue), "priority_distribution": dict(priority_counts), "top_users": top_users, "max_queue_size": self.max_queue_size, - "utilization_percent": (len(self._queue) / self.max_queue_size) * 100 + "utilization_percent": (len(self._queue) / self.max_queue_size) * 100, } async def requeue_execution( - self, - event: ExecutionRequestedEvent, - increment_retry: bool = True + self, event: ExecutionRequestedEvent, increment_retry: bool = True ) -> Tuple[bool, int | None, str | None]: def _next_lower(p: QueuePriority) -> QueuePriority: order = [ @@ -240,10 +228,8 @@ def _record_removal(self, reason: str) -> None: def _record_wait_time(self, queued: QueuedExecution) -> None: self.metrics.record_queue_wait_time_by_priority( - queued.age_seconds, - QueuePriority(queued.priority).name, - "default" - ) + queued.age_seconds, QueuePriority(queued.priority).name, "default" + ) def _update_add_metrics(self, priority: QueuePriority) -> None: # Deprecated in favor of single execution queue depth metric diff --git a/backend/app/services/coordinator/resource_manager.py b/backend/app/services/coordinator/resource_manager.py index 59e069b7..8bfe9478 100644 --- a/backend/app/services/coordinator/resource_manager.py +++ b/backend/app/services/coordinator/resource_manager.py @@ -9,6 +9,7 @@ @dataclass class ResourceAllocation: """Resource allocation for an execution""" + cpu_cores: float memory_mb: int gpu_count: int = 0 @@ -27,6 +28,7 @@ def memory_bytes(self) -> int: @dataclass class ResourcePool: """Available resource pool""" + total_cpu_cores: float total_memory_mb: int total_gpu_count: int @@ -48,6 +50,7 @@ class ResourcePool: @dataclass class ResourceGroup: """Resource group with usage information""" + cpu_cores: float memory_mb: int gpu_count: int @@ -56,6 +59,7 @@ class ResourceGroup: @dataclass class ResourceStats: """Resource statistics""" + total: ResourceGroup available: ResourceGroup allocated: ResourceGroup @@ -67,6 +71,7 @@ class ResourceStats: @dataclass class 
ResourceAllocationInfo: """Information about a resource allocation""" + execution_id: str cpu_cores: float memory_mb: int @@ -79,11 +84,11 @@ class ResourceManager: """Manages resource allocation for executions""" def __init__( - self, - total_cpu_cores: float = 32.0, - total_memory_mb: int = 65536, # 64GB - total_gpu_count: int = 0, - overcommit_factor: float = 1.2, # Allow 20% overcommit + self, + total_cpu_cores: float = 32.0, + total_memory_mb: int = 65536, # 64GB + total_gpu_count: int = 0, + overcommit_factor: float = 1.2, # Allow 20% overcommit ): self.metrics = get_coordinator_metrics() self.pool = ResourcePool( @@ -92,7 +97,7 @@ def __init__( total_gpu_count=total_gpu_count, available_cpu_cores=total_cpu_cores * overcommit_factor, available_memory_mb=int(total_memory_mb * overcommit_factor), - available_gpu_count=total_gpu_count + available_gpu_count=total_gpu_count, ) # Adjust minimum reserve thresholds proportionally for small pools. @@ -126,16 +131,16 @@ def __init__( self._update_metrics() async def request_allocation( - self, - execution_id: str, - language: str, - requested_cpu: float | None = None, - requested_memory_mb: int | None = None, - requested_gpu: int = 0 + self, + execution_id: str, + language: str, + requested_cpu: float | None = None, + requested_memory_mb: int | None = None, + requested_gpu: int = 0, ) -> ResourceAllocation | None: """ Request resource allocation for execution - + Returns: ResourceAllocation if successful, None if resources unavailable """ @@ -148,19 +153,13 @@ async def request_allocation( # Determine requested resources if requested_cpu is None or requested_memory_mb is None: # Use defaults based on language - default = self.default_allocations.get( - language, - ResourceAllocation(cpu_cores=0.5, memory_mb=512) - ) + default = self.default_allocations.get(language, ResourceAllocation(cpu_cores=0.5, memory_mb=512)) requested_cpu = requested_cpu or default.cpu_cores requested_memory_mb = requested_memory_mb or default.memory_mb # Apply limits requested_cpu = min(requested_cpu, self.pool.max_cpu_per_execution) - requested_memory_mb = min( - requested_memory_mb, - self.pool.max_memory_per_execution_mb - ) + requested_memory_mb = min(requested_memory_mb, self.pool.max_memory_per_execution_mb) requested_gpu = min(requested_gpu, self.pool.max_gpu_per_execution) # Check availability (considering minimum reserves) @@ -168,9 +167,11 @@ async def request_allocation( memory_after = self.pool.available_memory_mb - requested_memory_mb gpu_after = self.pool.available_gpu_count - requested_gpu - if (cpu_after < self.pool.min_available_cpu_cores or - memory_after < self.pool.min_available_memory_mb or - gpu_after < 0): + if ( + cpu_after < self.pool.min_available_cpu_cores + or memory_after < self.pool.min_available_memory_mb + or gpu_after < 0 + ): logger.warning( f"Insufficient resources for execution {execution_id}. 
" f"Requested: {requested_cpu} CPU, {requested_memory_mb}MB RAM, " @@ -181,9 +182,7 @@ async def request_allocation( # Create allocation allocation = ResourceAllocation( - cpu_cores=requested_cpu, - memory_mb=requested_memory_mb, - gpu_count=requested_gpu + cpu_cores=requested_cpu, memory_mb=requested_memory_mb, gpu_count=requested_gpu ) # Update pool @@ -238,21 +237,18 @@ async def get_allocation(self, execution_id: str) -> ResourceAllocation | None: async with self._allocation_lock: return self._allocations.get(execution_id) - async def can_allocate( - self, - cpu_cores: float, - memory_mb: int, - gpu_count: int = 0 - ) -> bool: + async def can_allocate(self, cpu_cores: float, memory_mb: int, gpu_count: int = 0) -> bool: """Check if resources can be allocated""" async with self._allocation_lock: cpu_after = self.pool.available_cpu_cores - cpu_cores memory_after = self.pool.available_memory_mb - memory_mb gpu_after = self.pool.available_gpu_count - gpu_count - return (cpu_after >= self.pool.min_available_cpu_cores and - memory_after >= self.pool.min_available_memory_mb and - gpu_after >= 0) + return ( + cpu_after >= self.pool.min_available_cpu_cores + and memory_after >= self.pool.min_available_memory_mb + and gpu_after >= 0 + ) async def get_resource_stats(self) -> ResourceStats: """Get resource statistics""" @@ -262,34 +258,30 @@ async def get_resource_stats(self) -> ResourceStats: allocated_gpu = self.pool.total_gpu_count - self.pool.available_gpu_count gpu_percent = (allocated_gpu / self.pool.total_gpu_count * 100) if self.pool.total_gpu_count > 0 else 0 - + return ResourceStats( total=ResourceGroup( cpu_cores=self.pool.total_cpu_cores, memory_mb=self.pool.total_memory_mb, - gpu_count=self.pool.total_gpu_count + gpu_count=self.pool.total_gpu_count, ), available=ResourceGroup( cpu_cores=self.pool.available_cpu_cores, memory_mb=self.pool.available_memory_mb, - gpu_count=self.pool.available_gpu_count - ), - allocated=ResourceGroup( - cpu_cores=allocated_cpu, - memory_mb=allocated_memory, - gpu_count=allocated_gpu + gpu_count=self.pool.available_gpu_count, ), + allocated=ResourceGroup(cpu_cores=allocated_cpu, memory_mb=allocated_memory, gpu_count=allocated_gpu), utilization={ "cpu_percent": (allocated_cpu / self.pool.total_cpu_cores * 100), "memory_percent": (allocated_memory / self.pool.total_memory_mb * 100), - "gpu_percent": gpu_percent + "gpu_percent": gpu_percent, }, allocation_count=len(self._allocations), limits={ "max_cpu_per_execution": self.pool.max_cpu_per_execution, "max_memory_per_execution_mb": self.pool.max_memory_per_execution_mb, - "max_gpu_per_execution": self.pool.max_gpu_per_execution - } + "max_gpu_per_execution": self.pool.max_gpu_per_execution, + }, ) async def get_allocations_by_resource_usage(self) -> List[ResourceAllocationInfo]: @@ -297,35 +289,34 @@ async def get_allocations_by_resource_usage(self) -> List[ResourceAllocationInfo async with self._allocation_lock: allocations = [] for exec_id, allocation in self._allocations.items(): - allocations.append(ResourceAllocationInfo( - execution_id=str(exec_id), - cpu_cores=allocation.cpu_cores, - memory_mb=allocation.memory_mb, - gpu_count=allocation.gpu_count, - cpu_percentage=(allocation.cpu_cores / self.pool.total_cpu_cores * 100), - memory_percentage=(allocation.memory_mb / self.pool.total_memory_mb * 100) - )) + allocations.append( + ResourceAllocationInfo( + execution_id=str(exec_id), + cpu_cores=allocation.cpu_cores, + memory_mb=allocation.memory_mb, + gpu_count=allocation.gpu_count, + 
cpu_percentage=(allocation.cpu_cores / self.pool.total_cpu_cores * 100), + memory_percentage=(allocation.memory_mb / self.pool.total_memory_mb * 100), + ) + ) # Sort by total resource usage - allocations.sort( - key=lambda x: x.cpu_percentage + x.memory_percentage, - reverse=True - ) + allocations.sort(key=lambda x: x.cpu_percentage + x.memory_percentage, reverse=True) return allocations def _update_metrics(self) -> None: """Update metrics""" - cpu_usage = (self.pool.total_cpu_cores - self.pool.available_cpu_cores) + cpu_usage = self.pool.total_cpu_cores - self.pool.available_cpu_cores cpu_percent = cpu_usage / self.pool.total_cpu_cores * 100 self.metrics.update_resource_usage("cpu", cpu_percent) - + memory_usage = self.pool.total_memory_mb - self.pool.available_memory_mb memory_percent = memory_usage / self.pool.total_memory_mb * 100 self.metrics.update_resource_usage("memory", memory_percent) - + gpu_usage = self.pool.total_gpu_count - self.pool.available_gpu_count gpu_percent = gpu_usage / max(1, self.pool.total_gpu_count) * 100 self.metrics.update_resource_usage("gpu", gpu_percent) - + self.metrics.update_coordinator_active_executions(len(self._allocations)) diff --git a/backend/app/services/event_bus.py b/backend/app/services/event_bus.py index 0c223a66..a0449c5f 100644 --- a/backend/app/services/event_bus.py +++ b/backend/app/services/event_bus.py @@ -16,18 +16,29 @@ from app.settings import get_settings +@dataclass +class EventBusEvent: + """Represents an event on the event bus.""" + + id: str + event_type: str + timestamp: str + payload: dict[str, Any] + + @dataclass class Subscription: """Represents a single event subscription.""" + id: str = field(default_factory=lambda: str(uuid4())) pattern: str = "" - handler: Callable = field(default=lambda: None) + handler: Callable[[EventBusEvent], Any] = field(default=lambda _: None) class EventBus(LifecycleEnabled): """ Hybrid event bus with Kafka backing and local in-memory distribution. 
- + Supports pattern-based subscriptions using wildcards: - execution.* - matches all execution events - execution.123.* - matches all events for execution 123 @@ -42,10 +53,10 @@ def __init__(self) -> None: self._subscriptions: dict[str, Subscription] = {} # id -> Subscription self._pattern_index: dict[str, set[str]] = {} # pattern -> set of subscription ids self._running = False - self._consumer_task: Optional[asyncio.Task] = None + self._consumer_task: Optional[asyncio.Task[None]] = None self._lock = asyncio.Lock() self._topic = f"{self.settings.KAFKA_TOPIC_PREFIX}{KafkaTopic.EVENT_BUS_STREAM}" - self._executor: Optional[Callable] = None # Will store the executor function + self._executor: Optional[Callable[..., Any]] = None # Will store the executor function async def start(self) -> None: """Start the event bus with Kafka backing.""" @@ -62,21 +73,25 @@ async def start(self) -> None: async def _initialize_kafka(self) -> None: """Initialize Kafka producer and consumer.""" # Producer setup - self.producer = Producer({ - 'bootstrap.servers': self.settings.KAFKA_BOOTSTRAP_SERVERS, - 'client.id': f'event-bus-producer-{uuid4()}', - 'linger.ms': 10, - 'batch.size': 16384 - }) + self.producer = Producer( + { + "bootstrap.servers": self.settings.KAFKA_BOOTSTRAP_SERVERS, + "client.id": f"event-bus-producer-{uuid4()}", + "linger.ms": 10, + "batch.size": 16384, + } + ) # Consumer setup - self.consumer = Consumer({ - 'bootstrap.servers': self.settings.KAFKA_BOOTSTRAP_SERVERS, - 'group.id': f"event-bus-{uuid4()}", - 'auto.offset.reset': 'latest', - 'enable.auto.commit': True, - 'client.id': f'event-bus-consumer-{uuid4()}' - }) + self.consumer = Consumer( + { + "bootstrap.servers": self.settings.KAFKA_BOOTSTRAP_SERVERS, + "group.id": f"event-bus-{uuid4()}", + "auto.offset.reset": "latest", + "enable.auto.commit": True, + "client.id": f"event-bus-consumer-{uuid4()}", + } + ) self.consumer.subscribe([self._topic]) # Store the executor function for sync operations @@ -118,7 +133,7 @@ async def _cleanup(self) -> None: async def publish(self, event_type: str, data: dict[str, Any]) -> None: """ Publish an event to Kafka and local subscribers. 
- + Args: event_type: Event type (e.g., "execution.123.started") data: Event data payload @@ -129,8 +144,8 @@ async def publish(self, event_type: str, data: dict[str, Any]) -> None: if self.producer: try: # Serialize and send message asynchronously - value = json.dumps(event).encode('utf-8') - key = event_type.encode('utf-8') if event_type else None + value = json.dumps(vars(event)).encode("utf-8") + key = event_type.encode("utf-8") if event_type else None # Use executor to avoid blocking if self._executor: @@ -147,23 +162,23 @@ async def publish(self, event_type: str, data: dict[str, Any]) -> None: # Publish to local subscribers for immediate handling await self._distribute_event(event_type, event) - def _create_event(self, event_type: str, data: dict[str, Any]) -> dict[str, Any]: + def _create_event(self, event_type: str, data: dict[str, Any]) -> EventBusEvent: """Create a standardized event object.""" - return { - "id": str(uuid4()), - "event_type": event_type, - "timestamp": datetime.now(timezone.utc).isoformat(), - "payload": data - } - - async def subscribe(self, pattern: str, handler: Callable) -> str: + return EventBusEvent( + id=str(uuid4()), + event_type=event_type, + timestamp=datetime.now(timezone.utc).isoformat(), + payload=data, + ) + + async def subscribe(self, pattern: str, handler: Callable[[EventBusEvent], Any]) -> str: """ Subscribe to events matching a pattern. - + Args: pattern: Event pattern with wildcards (e.g., "execution.*") handler: Async function to handle matching events - + Returns: Subscription ID for later unsubscribe """ @@ -184,7 +199,7 @@ async def subscribe(self, pattern: str, handler: Callable) -> str: logger.debug(f"Created subscription {subscription.id} for pattern: {pattern}") return subscription.id - async def unsubscribe(self, pattern: str, handler: Callable) -> None: + async def unsubscribe(self, pattern: str, handler: Callable[[EventBusEvent], Any]) -> None: """Unsubscribe a specific handler from a pattern.""" async with self._lock: # Find subscription with matching pattern and handler @@ -218,7 +233,7 @@ async def _remove_subscription(self, subscription_id: str) -> None: logger.debug(f"Removed subscription {subscription_id} for pattern: {pattern}") - async def _distribute_event(self, event_type: str, event: dict[str, Any]) -> None: + async def _distribute_event(self, event_type: str, event: EventBusEvent) -> None: """Distribute event to all matching local subscribers.""" # Find matching subscriptions matching_handlers = await self._find_matching_handlers(event_type) @@ -228,8 +243,7 @@ async def _distribute_event(self, event_type: str, event: dict[str, Any]) -> Non # Execute all handlers concurrently results = await asyncio.gather( - *(self._invoke_handler(handler, event) for handler in matching_handlers), - return_exceptions=True + *(self._invoke_handler(handler, event) for handler in matching_handlers), return_exceptions=True ) # Log any errors @@ -237,20 +251,18 @@ async def _distribute_event(self, event_type: str, event: dict[str, Any]) -> Non if isinstance(result, Exception): logger.error(f"Handler failed for event {event_type}: {result}") - async def _find_matching_handlers(self, event_type: str) -> list[Callable]: + async def _find_matching_handlers(self, event_type: str) -> list[Callable[[EventBusEvent], Any]]: """Find all handlers matching the event type.""" async with self._lock: - handlers: list[Callable] = [] + handlers: list[Callable[[EventBusEvent], Any]] = [] for pattern, sub_ids in self._pattern_index.items(): if 
fnmatch.fnmatch(event_type, pattern): handlers.extend( - self._subscriptions[sub_id].handler - for sub_id in sub_ids - if sub_id in self._subscriptions + self._subscriptions[sub_id].handler for sub_id in sub_ids if sub_id in self._subscriptions ) return handlers - async def _invoke_handler(self, handler: Callable, event: dict[str, Any]) -> None: + async def _invoke_handler(self, handler: Callable[[EventBusEvent], Any], event: EventBusEvent) -> None: """Invoke a single handler, handling both sync and async.""" if asyncio.iscoroutinefunction(handler): await handler(event) @@ -284,9 +296,14 @@ async def _kafka_listener(self) -> None: try: # Deserialize message - event = json.loads(msg.value().decode('utf-8')) - event_type = event.get("event_type", "") - await self._distribute_event(event_type, event) + event_dict = json.loads(msg.value().decode("utf-8")) + event = EventBusEvent( + id=event_dict.get("id", ""), + event_type=event_dict.get("event_type", ""), + timestamp=event_dict.get("timestamp", ""), + payload=event_dict.get("payload", {}), + ) + await self._distribute_event(event.event_type, event) except Exception as e: logger.error(f"Error processing Kafka message: {e}") @@ -310,7 +327,7 @@ async def get_statistics(self) -> dict[str, Any]: "total_patterns": len(self._pattern_index), "total_subscriptions": len(self._subscriptions), "kafka_enabled": self.producer is not None, - "running": self._running + "running": self._running, } diff --git a/backend/app/services/event_replay/replay_service.py b/backend/app/services/event_replay/replay_service.py index ab043b2e..8d88f356 100644 --- a/backend/app/services/event_replay/replay_service.py +++ b/backend/app/services/event_replay/replay_service.py @@ -20,19 +20,18 @@ class EventReplayService: - def __init__( - self, - repository: ReplayRepository, - producer: UnifiedProducer, - event_store: EventStore, + self, + repository: ReplayRepository, + producer: UnifiedProducer, + event_store: EventStore, ) -> None: self._sessions: Dict[str, ReplaySessionState] = {} - self._active_tasks: Dict[str, asyncio.Task] = {} + self._active_tasks: Dict[str, asyncio.Task[None]] = {} self._repository = repository self._producer = producer self._event_store = event_store - self._callbacks: Dict[ReplayTarget, Callable] = {} + self._callbacks: Dict[ReplayTarget, Callable[..., Any]] = {} self._file_locks: Dict[str, asyncio.Lock] = {} self._metrics = ReplayMetrics() logger.info("Event replay service initialized") @@ -41,10 +40,7 @@ async def create_replay_session(self, config: ReplayConfig) -> str: state = ReplaySessionState(session_id=str(uuid4()), config=config) self._sessions[state.session_id] = state - logger.info( - f"Created replay session {state.session_id} " - f"type={config.replay_type} target={config.target}" - ) + logger.info(f"Created replay session {state.session_id} type={config.replay_type} target={config.target}") return state.session_id @@ -70,13 +66,13 @@ async def _run_replay(self, session: ReplaySessionState) -> None: try: with trace_span( - name="event_replay.session", - kind=SpanKind.INTERNAL, - attributes={ - "replay.session_id": str(session.session_id), - "replay.type": session.config.replay_type, - "replay.target": session.config.target, - } + name="event_replay.session", + kind=SpanKind.INTERNAL, + attributes={ + "replay.session_id": str(session.session_id), + "replay.type": session.config.replay_type, + "replay.target": session.config.target, + }, ): await self._prepare_session(session) @@ -97,15 +93,9 @@ async def _run_replay(self, session: 
ReplaySessionState) -> None: async def _prepare_session(self, session: ReplaySessionState) -> None: total_count = await self._repository.count_events(session.config.filter) - session.total_events = min( - total_count, - session.config.max_events or total_count - ) + session.total_events = min(total_count, session.config.max_events or total_count) - logger.info( - f"Replay session {session.session_id} will process " - f"{session.total_events} events" - ) + logger.info(f"Replay session {session.session_id} will process {session.total_events} events") async def _handle_progress_callback(self, session: ReplaySessionState) -> None: cb = session.config.get_progress_callback() @@ -135,37 +125,23 @@ async def _complete_session(self, session: ReplaySessionState, start_time: float ) async def _handle_session_error(self, session: ReplaySessionState, error: Exception) -> None: - logger.error( - f"Replay session {session.session_id} failed: {error}", - exc_info=True - ) + logger.error(f"Replay session {session.session_id} failed: {error}", exc_info=True) session.status = ReplayStatus.FAILED session.completed_at = datetime.now(timezone.utc) - session.errors.append({ - "timestamp": datetime.now(timezone.utc).isoformat(), - "error": str(error), - "type": type(error).__name__ - }) + session.errors.append( + {"timestamp": datetime.now(timezone.utc).isoformat(), "error": str(error), "type": type(error).__name__} + ) self._metrics.record_replay_error(type(error).__name__) await self._update_session_in_db(session) - async def _apply_replay_delay( - self, - session: ReplaySessionState, - event: BaseEvent - ) -> None: + async def _apply_replay_delay(self, session: ReplaySessionState, event: BaseEvent) -> None: if session.last_event_at and session.config.speed_multiplier < 100: time_diff = (event.timestamp - session.last_event_at).total_seconds() delay = time_diff / session.config.speed_multiplier if delay > 0: await asyncio.sleep(delay) - def _update_replay_metrics( - self, - session: ReplaySessionState, - event: BaseEvent, - success: bool - ) -> None: + def _update_replay_metrics(self, session: ReplaySessionState, event: BaseEvent, success: bool) -> None: if success: session.replayed_events += 1 status = "success" @@ -175,25 +151,14 @@ def _update_replay_metrics( self._metrics.record_event_replayed(session.config.replay_type, event.event_type, status) - async def _handle_replay_error( - self, - session: ReplaySessionState, - event: BaseEvent, - error: Exception - ) -> None: + async def _handle_replay_error(self, session: ReplaySessionState, event: BaseEvent, error: Exception) -> None: logger.error(f"Failed to replay event {event.event_id}: {error}") session.failed_events += 1 - session.errors.append({ - "timestamp": datetime.now(timezone.utc).isoformat(), - "event_id": str(event.event_id), - "error": str(error) - }) - - async def _replay_to_kafka( - self, - session: ReplaySessionState, - event: BaseEvent - ) -> bool: + session.errors.append( + {"timestamp": datetime.now(timezone.utc).isoformat(), "event_id": str(event.event_id), "error": str(error)} + ) + + async def _replay_to_kafka(self, session: ReplaySessionState, event: BaseEvent) -> bool: config = session.config if not config.preserve_timestamps: event.timestamp = datetime.now(timezone.utc) @@ -202,39 +167,27 @@ async def _replay_to_kafka( await self._producer.produce(event_to_produce=event) return True - async def _replay_to_callback( - self, - event: BaseEvent, - session: ReplaySessionState - ) -> bool: + async def _replay_to_callback(self, event: 
BaseEvent, session: ReplaySessionState) -> bool: callback = self._callbacks.get(ReplayTarget.CALLBACK) if callback: await callback(event, session) return True return False - async def _replay_to_file( - self, - event: BaseEvent, - file_path: str | None - ) -> bool: + async def _replay_to_file(self, event: BaseEvent, file_path: str | None) -> bool: if not file_path: logger.error("No target file path specified") return False await self._write_event_to_file(event, file_path) return True - async def _fetch_event_batches( - self, - session: ReplaySessionState - ) -> AsyncIterator[List[BaseEvent]]: + async def _fetch_event_batches(self, session: ReplaySessionState) -> AsyncIterator[List[BaseEvent]]: logger.info(f"Fetching events for session {session.session_id}") events_processed = 0 max_events = session.config.max_events async for batch_docs in self._repository.fetch_events( - filter=session.config.filter, - batch_size=session.config.batch_size + filter=session.config.filter, batch_size=session.config.batch_size ): batch: List[BaseEvent] = [] for doc in batch_docs: @@ -252,19 +205,15 @@ async def _fetch_event_batches( if max_events and events_processed >= max_events: break - async def _process_batch( - self, - session: ReplaySessionState, - batch: List[BaseEvent] - ) -> None: + async def _process_batch(self, session: ReplaySessionState, batch: List[BaseEvent]) -> None: with trace_span( - name="event_replay.process_batch", - kind=SpanKind.INTERNAL, - attributes={ - "replay.session_id": str(session.session_id), - "replay.batch.count": len(batch), - "replay.target": session.config.target, - }, + name="event_replay.process_batch", + kind=SpanKind.INTERNAL, + attributes={ + "replay.session_id": str(session.session_id), + "replay.batch.count": len(batch), + "replay.target": session.config.target, + }, ): for event in batch: if session.status != ReplayStatus.RUNNING: @@ -284,11 +233,7 @@ async def _process_batch( session.last_event_at = event.timestamp await self._update_session_in_db(session) - async def _replay_event( - self, - session: ReplaySessionState, - event: BaseEvent - ) -> bool: + async def _replay_event(self, session: ReplaySessionState, event: BaseEvent) -> bool: config = session.config attempts = config.retry_attempts if config.retry_failed else 1 @@ -308,35 +253,22 @@ async def _replay_event( except Exception as e: logger.error(f"Failed to replay event (attempt {attempt + 1}/{attempts}): {e}") if attempt < attempts - 1: - await asyncio.sleep(min(2 ** attempt, 10)) + await asyncio.sleep(min(2**attempt, 10)) continue return False - async def _write_event_to_file( - self, - event: BaseEvent, - file_path: str - ) -> None: + async def _write_event_to_file(self, event: BaseEvent, file_path: str) -> None: if file_path not in self._file_locks: self._file_locks[file_path] = asyncio.Lock() async with self._file_locks[file_path]: loop = asyncio.get_event_loop() - await loop.run_in_executor( - None, - self._write_to_file_sync, - event, - file_path - ) + await loop.run_in_executor(None, self._write_to_file_sync, event, file_path) - def _write_to_file_sync( - self, - event: BaseEvent, - file_path: str - ) -> None: - with open(file_path, 'a') as f: - f.write(json.dumps(event.model_dump(), default=str) + '\n') + def _write_to_file_sync(self, event: BaseEvent, file_path: str) -> None: + with open(file_path, "a") as f: + f.write(json.dumps(event.model_dump(), default=str) + "\n") async def pause_replay(self, session_id: str) -> None: session = self._sessions.get(session_id) @@ -372,11 +304,7 @@ async def 
cancel_replay(self, session_id: str) -> None: def get_session(self, session_id: str) -> ReplaySessionState | None: return self._sessions.get(session_id) - def list_sessions( - self, - status: ReplayStatus | None = None, - limit: int = 100 - ) -> List[ReplaySessionState]: + def list_sessions(self, status: ReplayStatus | None = None, limit: int = 100) -> List[ReplaySessionState]: sessions = list(self._sessions.values()) if status: @@ -385,32 +313,18 @@ def list_sessions( sessions.sort(key=lambda s: s.created_at, reverse=True) return sessions[:limit] - def register_callback( - self, - target: ReplayTarget, - callback: Callable[[BaseEvent, ReplaySessionState], Any] - ) -> None: + def register_callback(self, target: ReplayTarget, callback: Callable[[BaseEvent, ReplaySessionState], Any]) -> None: self._callbacks[target] = callback - async def cleanup_old_sessions( - self, - older_than_hours: int = 24 - ) -> int: + async def cleanup_old_sessions(self, older_than_hours: int = 24) -> int: cutoff_time = datetime.now(timezone.utc) - timedelta(hours=older_than_hours) removed = 0 - completed_statuses = { - ReplayStatus.COMPLETED, - ReplayStatus.FAILED, - ReplayStatus.CANCELLED - } + completed_statuses = {ReplayStatus.COMPLETED, ReplayStatus.FAILED, ReplayStatus.CANCELLED} for session_id in list(self._sessions.keys()): session = self._sessions[session_id] - if ( - session.status in completed_statuses - and session.created_at < cutoff_time - ): + if session.status in completed_statuses and session.created_at < cutoff_time: del self._sessions[session_id] removed += 1 @@ -429,9 +343,6 @@ async def _update_session_in_db(self, session: ReplaySessionState) -> None: ) # Note: last_event_at is not in ReplaySessionUpdate # If needed, add it to the domain model - await self._repository.update_replay_session( - session_id=session.session_id, - updates=session_update - ) + await self._repository.update_replay_session(session_id=session.session_id, updates=session_update) except Exception as e: logger.error(f"Failed to update session in database: {e}") diff --git a/backend/app/services/event_service.py b/backend/app/services/event_service.py index bcb64516..506f1cee 100644 --- a/backend/app/services/event_service.py +++ b/backend/app/services/event_service.py @@ -4,6 +4,7 @@ from pymongo import ASCENDING, DESCENDING from app.db.repositories.event_repository import EventRepository +from app.domain.enums.common import SortOrder from app.domain.enums.user import UserRole from app.domain.events import ( Event, @@ -27,11 +28,11 @@ def _build_user_filter(self, user_id: str, user_role: UserRole) -> dict[str, obj return {"metadata.user_id": user_id} async def get_execution_events( - self, - execution_id: str, - user_id: str, - user_role: UserRole, - include_system_events: bool = False, + self, + execution_id: str, + user_id: str, + user_role: UserRole, + include_system_events: bool = False, ) -> List[Event] | None: events = await self.repository.get_events_by_aggregate(aggregate_id=execution_id, limit=1000) if not events: @@ -52,14 +53,14 @@ async def get_execution_events( return events async def get_user_events_paginated( - self, - user_id: str, - event_types: List[str] | None = None, - start_time: datetime | None = None, - end_time: datetime | None = None, - limit: int = 100, - skip: int = 0, - sort_order: str = "desc", + self, + user_id: str, + event_types: List[str] | None = None, + start_time: datetime | None = None, + end_time: datetime | None = None, + limit: int = 100, + skip: int = 0, + sort_order: str = "desc", ) 
-> EventListResult: return await self.repository.get_user_events_paginated( user_id=user_id, @@ -72,14 +73,14 @@ async def get_user_events_paginated( ) async def query_events_advanced( - self, - user_id: str, - user_role: UserRole, - filters: EventFilter, - sort_by: str = "timestamp", - sort_order: Any = "desc", - limit: int = 100, - skip: int = 0, + self, + user_id: str, + user_role: UserRole, + filters: EventFilter, + sort_by: str = "timestamp", + sort_order: SortOrder = SortOrder.DESC, + limit: int = 100, + skip: int = 0, ) -> EventListResult | None: # Access control if filters.user_id and filters.user_id != user_id and user_role != UserRole.ADMIN: @@ -98,11 +99,11 @@ async def query_events_advanced( "stored_at": "stored_at", } sort_field = field_map.get(sort_by, "timestamp") - direction = DESCENDING if str(sort_order).lower() == "desc" else ASCENDING + direction = DESCENDING if sort_order == SortOrder.DESC else ASCENDING # Pagination and sorting from request return await self.repository.query_events_generic( - query=query, # type: ignore[assignment] + query=query, sort_field=sort_field, sort_direction=direction, skip=skip, @@ -110,12 +111,12 @@ async def query_events_advanced( ) async def get_events_by_correlation( - self, - correlation_id: str, - user_id: str, - user_role: UserRole, - include_all_users: bool = False, - limit: int = 100, + self, + correlation_id: str, + user_id: str, + user_role: UserRole, + include_all_users: bool = False, + limit: int = 100, ) -> List[Event]: events = await self.repository.get_events_by_correlation(correlation_id=correlation_id, limit=limit) if not include_all_users or user_role != UserRole.ADMIN: @@ -123,12 +124,12 @@ async def get_events_by_correlation( return events async def get_event_statistics( - self, - user_id: str, - user_role: UserRole, - start_time: datetime | None = None, - end_time: datetime | None = None, - include_all_users: bool = False, + self, + user_id: str, + user_role: UserRole, + start_time: datetime | None = None, + end_time: datetime | None = None, + include_all_users: bool = False, ) -> EventStatistics: match = {} if include_all_users else self._build_user_filter(user_id, user_role) return await self.repository.get_event_statistics_filtered( @@ -138,10 +139,10 @@ async def get_event_statistics( ) async def get_event( - self, - event_id: str, - user_id: str, - user_role: UserRole, + self, + event_id: str, + user_id: str, + user_role: UserRole, ) -> Event | None: event = await self.repository.get_event(event_id) if not event: @@ -152,11 +153,11 @@ async def get_event( return event async def aggregate_events( - self, - user_id: str, - user_role: UserRole, - pipeline: List[Dict[str, Any]], - limit: int = 100, + self, + user_id: str, + user_role: UserRole, + pipeline: List[Dict[str, Any]], + limit: int = 100, ) -> EventAggregationResult: user_filter = self._build_user_filter(user_id, user_role) new_pipeline = list(pipeline) @@ -168,18 +169,18 @@ async def aggregate_events( return await self.repository.aggregate_events(new_pipeline, limit=limit) async def list_event_types( - self, - user_id: str, - user_role: UserRole, + self, + user_id: str, + user_role: UserRole, ) -> List[str]: match = self._build_user_filter(user_id, user_role) return await self.repository.list_event_types(match=match) async def delete_event_with_archival( - self, - event_id: str, - deleted_by: str, - deletion_reason: str = "Admin deletion via API", + self, + event_id: str, + deleted_by: str, + deletion_reason: str = "Admin deletion via API", ) -> Event | None: 
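The query_events_advanced hunk above replaces the loosely typed sort_order default with the SortOrder enum and resolves the PyMongo sort direction against a whitelisted field_map. A minimal sketch of that resolution, assuming a simple str-backed SortOrder (the real enum lives in app.domain.enums.common and may differ) and an illustrative subset of the field whitelist:

from enum import Enum

from pymongo import ASCENDING, DESCENDING


class SortOrder(str, Enum):
    # Assumed shape of app.domain.enums.common.SortOrder; the real definition may differ.
    ASC = "asc"
    DESC = "desc"


# Illustrative whitelist of API sort keys -> stored field names; the service's field_map has its own entries.
FIELD_MAP = {"timestamp": "timestamp", "event_type": "event_type", "stored_at": "stored_at"}


def resolve_sort(sort_by: str, sort_order: SortOrder) -> tuple[str, int]:
    """Translate API-level sort parameters into a (field, direction) pair for MongoDB."""
    field = FIELD_MAP.get(sort_by, "timestamp")  # unknown keys fall back to timestamp
    direction = DESCENDING if sort_order == SortOrder.DESC else ASCENDING
    return field, direction


print(resolve_sort("event_type", SortOrder.DESC))  # -> ('event_type', -1)

Comparing against SortOrder.DESC directly also drops the str(sort_order).lower() normalisation the old code needed.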
return await self.repository.delete_event_with_archival( event_id=event_id, @@ -191,10 +192,10 @@ async def get_aggregate_replay_info(self, aggregate_id: str) -> EventReplayInfo return await self.repository.get_aggregate_replay_info(aggregate_id) async def get_events_by_aggregate( - self, - aggregate_id: str, - event_types: List[str] | None = None, - limit: int = 100, + self, + aggregate_id: str, + event_types: List[str] | None = None, + limit: int = 100, ) -> list[Event]: return await self.repository.get_events_by_aggregate( aggregate_id=aggregate_id, diff --git a/backend/app/services/execution_service.py b/backend/app/services/execution_service.py index ae9e7ff2..cd0204dc 100644 --- a/backend/app/services/execution_service.py +++ b/backend/app/services/execution_service.py @@ -10,7 +10,7 @@ from app.db.repositories.execution_repository import ExecutionRepository from app.domain.enums.events import EventType from app.domain.enums.execution import ExecutionStatus -from app.domain.execution import DomainExecution, ExecutionResultDomain, ResourceUsageDomain +from app.domain.execution import DomainExecution, ExecutionResultDomain, ResourceLimitsDomain, ResourceUsageDomain from app.events.core import UnifiedProducer from app.events.event_store import EventStore from app.infrastructure.kafka.events.base import BaseEvent @@ -33,22 +33,22 @@ class ExecutionService: """ Unified execution service that orchestrates code execution through events. - + This service creates execution records and publishes events to Kafka, where specialized workers handle the actual execution in isolated environments. Results are updated asynchronously through event processing. """ def __init__( - self, - execution_repo: ExecutionRepository, - producer: UnifiedProducer, - event_store: EventStore, - settings: Settings, + self, + execution_repo: ExecutionRepository, + producer: UnifiedProducer, + event_store: EventStore, + settings: Settings, ) -> None: """ Initialize execution service. - + Args: execution_repo: Repository for execution data persistence. producer: Kafka producer for publishing events. @@ -70,33 +70,33 @@ def _track_active_execution(self) -> Generator[None, None, None]: # noqa: D401 finally: self.metrics.decrement_active_executions() - async def get_k8s_resource_limits(self) -> dict[str, Any]: - return { - "cpu_limit": self.settings.K8S_POD_CPU_LIMIT, - "memory_limit": self.settings.K8S_POD_MEMORY_LIMIT, - "cpu_request": self.settings.K8S_POD_CPU_REQUEST, - "memory_request": self.settings.K8S_POD_MEMORY_REQUEST, - "execution_timeout": self.settings.K8S_POD_EXECUTION_TIMEOUT, - "supported_runtimes": self.settings.SUPPORTED_RUNTIMES, - } + async def get_k8s_resource_limits(self) -> ResourceLimitsDomain: + return ResourceLimitsDomain( + cpu_limit=self.settings.K8S_POD_CPU_LIMIT, + memory_limit=self.settings.K8S_POD_MEMORY_LIMIT, + cpu_request=self.settings.K8S_POD_CPU_REQUEST, + memory_request=self.settings.K8S_POD_MEMORY_REQUEST, + execution_timeout=self.settings.K8S_POD_EXECUTION_TIMEOUT, + supported_runtimes=self.settings.SUPPORTED_RUNTIMES, + ) async def get_example_scripts(self) -> dict[str, str]: return self.settings.EXAMPLE_SCRIPTS def _create_event_metadata( - self, - user_id: str | None = None, - client_ip: str | None = None, - user_agent: str | None = None, + self, + user_id: str | None = None, + client_ip: str | None = None, + user_agent: str | None = None, ) -> EventMetadata: """ Create standardized event metadata. - + Args: user_id: User identifier. client_ip: Client IP address. 
user_agent: User agent string. - + Returns: EventMetadata instance. """ @@ -112,20 +112,20 @@ def _create_event_metadata( ) async def execute_script( - self, - script: str, - user_id: str, - *, - client_ip: str | None, - user_agent: str | None, - lang: str = "python", - lang_version: str = "3.11", - priority: int = 5, - timeout_override: int | None = None, + self, + script: str, + user_id: str, + *, + client_ip: str | None, + user_agent: str | None, + lang: str = "python", + lang_version: str = "3.11", + priority: int = 5, + timeout_override: int | None = None, ) -> DomainExecution: """ Execute a script by creating an execution record and publishing an event. - + Args: script: The code to execute. lang: Programming language. @@ -133,10 +133,10 @@ async def execute_script( user_id: ID of the user requesting execution. priority: Execution priority (1-10, lower is higher priority). timeout_override: Override default timeout in seconds. - + Returns: DomainExecution record with queued status. - + Raises: IntegrationException: If validation fails or event publishing fails. """ @@ -152,7 +152,7 @@ async def execute_script( "script_length": len(script), "priority": priority, "timeout_override": timeout_override, - } + }, ) runtime_cfg = RUNTIME_REGISTRY[lang][lang_version] @@ -177,7 +177,7 @@ async def execute_script( "lang_version": lang_version, "user_id": user_id, "script_length": len(script), - } + }, ) # Metadata and event @@ -222,15 +222,11 @@ async def execute_script( "execution_id": str(created_execution.execution_id), "status": created_execution.status, "duration_seconds": duration, - } + }, ) return created_execution - async def _update_execution_error( - self, - execution_id: str, - error_message: str - ) -> None: + async def _update_execution_error(self, execution_id: str, error_message: str) -> None: result = ExecutionResultDomain( execution_id=execution_id, status=ExecutionStatus.ERROR, @@ -245,30 +241,24 @@ async def _update_execution_error( async def get_execution_result(self, execution_id: str) -> DomainExecution: """ Get execution result from database. - + In the event-driven architecture, results are updated asynchronously by worker services processing events. This method simply retrieves the current state from the database. - + Args: execution_id: UUID of the execution. - + Returns: Current execution state. - + Raises: IntegrationException: If execution not found. """ execution = await self.execution_repo.get_execution(execution_id) if not execution: - logger.warning( - "Execution not found", - extra={"execution_id": execution_id} - ) - raise IntegrationException( - status_code=404, - detail=f"Execution {execution_id} not found" - ) + logger.warning("Execution not found", extra={"execution_id": execution_id}) + raise IntegrationException(status_code=404, detail=f"Execution {execution_id} not found") logger.info( "Execution result retrieved successfully", @@ -280,33 +270,30 @@ async def get_execution_result(self, execution_id: str) -> DomainExecution: "has_output": bool(execution.stdout), "has_errors": bool(execution.stderr), "resource_usage": execution.resource_usage, - } + }, ) return execution async def get_execution_events( - self, - execution_id: str, - event_types: EventFilter = None, - limit: int = 100, + self, + execution_id: str, + event_types: EventFilter = None, + limit: int = 100, ) -> list[BaseEvent]: """ Get all events for an execution from the event store. - + Args: execution_id: UUID of the execution. event_types: Filter by specific event types. 
limit: Maximum number of events to return. - + Returns: List of events for the execution. """ # Use the correct method name - get_execution_events instead of get_events_by_execution - events = await self.event_store.get_execution_events( - execution_id=execution_id, - event_types=event_types - ) + events = await self.event_store.get_execution_events(execution_id=execution_id, event_types=event_types) # Apply limit if we got more events than requested if len(events) > limit: @@ -318,24 +305,24 @@ async def get_execution_events( "execution_id": execution_id, "event_count": len(events), "event_types": event_types, - } + }, ) return events async def get_user_executions( - self, - user_id: UserId, - status: ExecutionStatus | None = None, - lang: str | None = None, - start_time: datetime | None = None, - end_time: datetime | None = None, - limit: int = 50, - skip: int = 0, + self, + user_id: UserId, + status: ExecutionStatus | None = None, + lang: str | None = None, + start_time: datetime | None = None, + end_time: datetime | None = None, + limit: int = 50, + skip: int = 0, ) -> list[DomainExecution]: """ Get executions for a specific user with optional filters. - + Args: user_id: User identifier. status: Filter by execution status. @@ -344,17 +331,14 @@ async def get_user_executions( end_time: Filter by end time. limit: Maximum number of results. skip: Number of results to skip. - + Returns: List of executions matching filters. """ query = self._build_user_query(user_id, status, lang, start_time, end_time) executions = await self.execution_repo.get_executions( - query=query, - limit=limit, - skip=skip, - sort=[("created_at", -1)] + query=query, limit=limit, skip=skip, sort=[("created_at", -1)] ) logger.debug( @@ -364,29 +348,29 @@ async def get_user_executions( "filters": {k: v for k, v in query.items() if k != "user_id"}, "limit": limit, "skip": skip, - } + }, ) return executions async def count_user_executions( - self, - user_id: UserId, - status: ExecutionStatus | None = None, - lang: str | None = None, - start_time: datetime | None = None, - end_time: datetime | None = None, + self, + user_id: UserId, + status: ExecutionStatus | None = None, + lang: str | None = None, + start_time: datetime | None = None, + end_time: datetime | None = None, ) -> int: """ Count executions for a specific user with optional filters. - + Args: user_id: User identifier. status: Filter by execution status. lang: Filter by language. start_time: Filter by start time. end_time: Filter by end time. - + Returns: Count of executions matching filters. """ @@ -394,23 +378,23 @@ async def count_user_executions( return await self.execution_repo.count_executions(query) def _build_user_query( - self, - user_id: UserId, - status: ExecutionStatus | None = None, - lang: str | None = None, - start_time: datetime | None = None, - end_time: datetime | None = None, + self, + user_id: UserId, + status: ExecutionStatus | None = None, + lang: str | None = None, + start_time: datetime | None = None, + end_time: datetime | None = None, ) -> ExecutionQuery: """ Build MongoDB query for user executions. - + Args: user_id: User identifier. status: Filter by execution status. lang: Filter by language. start_time: Filter by start time. end_time: Filter by end time. - + Returns: MongoDB query dictionary. """ @@ -435,10 +419,10 @@ def _build_user_query( async def delete_execution(self, execution_id: str) -> bool: """ Delete an execution and publish deletion event. - + Args: execution_id: UUID of execution to delete. 
- + Returns: True if deletion successful. """ @@ -449,10 +433,7 @@ async def delete_execution(self, execution_id: str) -> bool: logger.warning(f"Execution {execution_id} not found for deletion") raise ServiceError("Execution not found", status_code=404) - logger.info( - "Deleted execution", - extra={"execution_id": execution_id} - ) + logger.info("Deleted execution", extra={"execution_id": execution_id}) await self._publish_deletion_event(execution_id) @@ -461,44 +442,36 @@ async def delete_execution(self, execution_id: str) -> bool: async def _publish_deletion_event(self, execution_id: str) -> None: """ Publish execution deletion/cancellation event. - + Args: execution_id: UUID of deleted execution. """ metadata = self._create_event_metadata() event = ExecutionCancelledEvent( - execution_id=execution_id, - reason="user_requested", - cancelled_by=metadata.user_id, - metadata=metadata + execution_id=execution_id, reason="user_requested", cancelled_by=metadata.user_id, metadata=metadata ) - await self.producer.produce( - event_to_produce=event, - key=execution_id - ) + await self.producer.produce(event_to_produce=event, key=execution_id) logger.info( "Published cancellation event", extra={ "execution_id": execution_id, "event_id": str(event.event_id), - } + }, ) async def get_execution_stats( - self, - user_id: UserId | None = None, - time_range: TimeRange = (None, None) + self, user_id: UserId | None = None, time_range: TimeRange = (None, None) ) -> ExecutionStats: """ Get execution statistics. - + Args: user_id: Optional user filter. time_range: Tuple of (start_time, end_time). - + Returns: Dictionary containing execution statistics. """ @@ -507,23 +480,19 @@ async def get_execution_stats( # Get executions for stats executions = await self.execution_repo.get_executions( query=query, - limit=1000 # Reasonable limit for stats + limit=1000, # Reasonable limit for stats ) return self._calculate_stats(executions) - def _build_stats_query( - self, - user_id: UserId | None, - time_range: TimeRange - ) -> ExecutionQuery: + def _build_stats_query(self, user_id: UserId | None, time_range: TimeRange) -> ExecutionQuery: """ Build query for statistics. - + Args: user_id: Optional user filter. time_range: Tuple of (start_time, end_time). - + Returns: MongoDB query dictionary. """ @@ -546,10 +515,10 @@ def _build_stats_query( def _calculate_stats(self, executions: list[DomainExecution]) -> ExecutionStats: """ Calculate statistics from executions. - + Args: executions: List of executions to analyze. - + Returns: Statistics dictionary. 
""" diff --git a/backend/app/services/grafana_alert_processor.py b/backend/app/services/grafana_alert_processor.py index 5689157e..64e0181f 100644 --- a/backend/app/services/grafana_alert_processor.py +++ b/backend/app/services/grafana_alert_processor.py @@ -45,11 +45,7 @@ def map_severity(cls, severity_str: str, alert_status: str | None) -> Notificati @classmethod def extract_title(cls, alert: GrafanaAlertItem) -> str: """Extract title from alert labels or annotations.""" - return ( - (alert.labels or {}).get("alertname") - or (alert.annotations or {}).get("title") - or cls.DEFAULT_TITLE - ) + return (alert.labels or {}).get("alertname") or (alert.annotations or {}).get("title") or cls.DEFAULT_TITLE @classmethod def build_message(cls, alert: GrafanaAlertItem) -> str: @@ -64,12 +60,7 @@ def build_message(cls, alert: GrafanaAlertItem) -> str: return summary or description or cls.DEFAULT_MESSAGE @classmethod - def build_metadata( - cls, - alert: GrafanaAlertItem, - webhook: GrafanaWebhook, - severity: str - ) -> dict[str, Any]: + def build_metadata(cls, alert: GrafanaAlertItem, webhook: GrafanaWebhook, severity: str) -> dict[str, Any]: """Build metadata dictionary for the notification.""" return { "grafana_status": alert.status or webhook.status, @@ -79,10 +70,10 @@ def build_metadata( } async def process_single_alert( - self, - alert: GrafanaAlertItem, - webhook: GrafanaWebhook, - correlation_id: str, + self, + alert: GrafanaAlertItem, + webhook: GrafanaWebhook, + correlation_id: str, ) -> tuple[bool, str | None]: """Process a single Grafana alert. @@ -112,18 +103,10 @@ async def process_single_alert( except Exception as e: error_msg = f"Failed to process Grafana alert: {e}" - logger.error( - error_msg, - extra={"correlation_id": correlation_id}, - exc_info=True - ) + logger.error(error_msg, extra={"correlation_id": correlation_id}, exc_info=True) return False, error_msg - async def process_webhook( - self, - webhook_payload: GrafanaWebhook, - correlation_id: str - ) -> tuple[int, list[str]]: + async def process_webhook(self, webhook_payload: GrafanaWebhook, correlation_id: str) -> tuple[int, list[str]]: """Process all alerts in a Grafana webhook. 
Args: @@ -147,9 +130,7 @@ async def process_webhook( ) for alert in alerts: - success, error_msg = await self.process_single_alert( - alert, webhook_payload, correlation_id - ) + success, error_msg = await self.process_single_alert(alert, webhook_payload, correlation_id) if success: processed_count += 1 elif error_msg: diff --git a/backend/app/services/idempotency/__init__.py b/backend/app/services/idempotency/__init__.py index 7ce275ed..82af12f0 100644 --- a/backend/app/services/idempotency/__init__.py +++ b/backend/app/services/idempotency/__init__.py @@ -17,5 +17,5 @@ "create_idempotency_manager", "IdempotentEventHandler", "idempotent_handler", - "IdempotentConsumerWrapper" + "IdempotentConsumerWrapper", ] diff --git a/backend/app/services/idempotency/idempotency_manager.py b/backend/app/services/idempotency/idempotency_manager.py index ec26f6bf..f06467c1 100644 --- a/backend/app/services/idempotency/idempotency_manager.py +++ b/backend/app/services/idempotency/idempotency_manager.py @@ -88,11 +88,7 @@ async def close(self) -> None: logger.info("Closed idempotency manager") def _generate_key( - self, - event: BaseEvent, - key_strategy: str, - custom_key: str | None = None, - fields: set[str] | None = None + self, event: BaseEvent, key_strategy: str, custom_key: str | None = None, fields: set[str] | None = None ) -> str: if key_strategy == "event_based": key = IdempotencyKeyStrategy.event_based(event) @@ -104,15 +100,13 @@ def _generate_key( raise ValueError(f"Invalid key strategy: {key_strategy}") return f"{self.config.key_prefix}:{key}" - - async def check_and_reserve( - self, - event: BaseEvent, - key_strategy: str = "event_based", - custom_key: str | None = None, - ttl_seconds: int | None = None, - fields: set[str] | None = None, + self, + event: BaseEvent, + key_strategy: str = "event_based", + custom_key: str | None = None, + ttl_seconds: int | None = None, + fields: set[str] | None = None, ) -> IdempotencyResult: full_key = self._generate_key(event, key_strategy, custom_key, fields) ttl = ttl_seconds or self.config.default_ttl_seconds @@ -126,10 +120,10 @@ async def check_and_reserve( return await self._create_new_key(full_key, event, ttl) async def _handle_existing_key( - self, - existing: IdempotencyRecord, - full_key: str, - event_type: str, + self, + existing: IdempotencyRecord, + full_key: str, + event_type: str, ) -> IdempotencyResult: status = existing.status if status == IdempotencyStatus.PROCESSING: @@ -149,10 +143,10 @@ async def _handle_existing_key( ) async def _handle_processing_key( - self, - existing: IdempotencyRecord, - full_key: str, - event_type: str, + self, + existing: IdempotencyRecord, + full_key: str, + event_type: str, ) -> IdempotencyResult: created_at = existing.created_at now = datetime.now(timezone.utc) @@ -162,12 +156,18 @@ async def _handle_processing_key( existing.created_at = now existing.status = IdempotencyStatus.PROCESSING await self._repo.update_record(existing) - return IdempotencyResult(is_duplicate=False, status=IdempotencyStatus.PROCESSING, created_at=now, - key=full_key) + return IdempotencyResult( + is_duplicate=False, status=IdempotencyStatus.PROCESSING, created_at=now, key=full_key + ) self.metrics.record_idempotency_duplicate_blocked(event_type) - return IdempotencyResult(is_duplicate=True, status=IdempotencyStatus.PROCESSING, created_at=created_at, - has_cached_result=existing.result_json is not None, key=full_key) + return IdempotencyResult( + is_duplicate=True, + status=IdempotencyStatus.PROCESSING, + created_at=created_at, + 
has_cached_result=existing.result_json is not None, + key=full_key, + ) async def _create_new_key(self, full_key: str, event: BaseEvent, ttl: int) -> IdempotencyResult: created_at = datetime.now(timezone.utc) @@ -181,24 +181,26 @@ async def _create_new_key(self, full_key: str, event: BaseEvent, ttl: int) -> Id ttl_seconds=ttl, ) await self._repo.insert_processing(record) - return IdempotencyResult(is_duplicate=False, status=IdempotencyStatus.PROCESSING, created_at=created_at, - key=full_key) + return IdempotencyResult( + is_duplicate=False, status=IdempotencyStatus.PROCESSING, created_at=created_at, key=full_key + ) except DuplicateKeyError: # Race: someone inserted the same key concurrently — treat as existing existing = await self._repo.find_by_key(full_key) if existing: return await self._handle_existing_key(existing, full_key, event.event_type) # If for some reason it's still not found, allow processing - return IdempotencyResult(is_duplicate=False, status=IdempotencyStatus.PROCESSING, created_at=created_at, - key=full_key) + return IdempotencyResult( + is_duplicate=False, status=IdempotencyStatus.PROCESSING, created_at=created_at, key=full_key + ) async def _update_key_status( - self, - full_key: str, - existing: IdempotencyRecord, - status: IdempotencyStatus, - cached_json: str | None = None, - error: str | None = None, + self, + full_key: str, + existing: IdempotencyRecord, + status: IdempotencyStatus, + cached_json: str | None = None, + error: str | None = None, ) -> bool: created_at = existing.created_at completed_at = datetime.now(timezone.utc) @@ -216,11 +218,11 @@ async def _update_key_status( return (await self._repo.update_record(existing)) > 0 async def mark_completed( - self, - event: BaseEvent, - key_strategy: str = "event_based", - custom_key: str | None = None, - fields: set[str] | None = None + self, + event: BaseEvent, + key_strategy: str = "event_based", + custom_key: str | None = None, + fields: set[str] | None = None, ) -> bool: full_key = self._generate_key(event, key_strategy, custom_key, fields) try: @@ -235,28 +237,29 @@ async def mark_completed( return await self._update_key_status(full_key, existing, IdempotencyStatus.COMPLETED, cached_json=None) async def mark_failed( - self, - event: BaseEvent, - error: str, - key_strategy: str = "event_based", - custom_key: str | None = None, - fields: set[str] | None = None + self, + event: BaseEvent, + error: str, + key_strategy: str = "event_based", + custom_key: str | None = None, + fields: set[str] | None = None, ) -> bool: full_key = self._generate_key(event, key_strategy, custom_key, fields) existing = await self._repo.find_by_key(full_key) if not existing: logger.warning(f"Idempotency key {full_key} not found when marking failed") return False - return await self._update_key_status(full_key, existing, IdempotencyStatus.FAILED, cached_json=None, - error=error) + return await self._update_key_status( + full_key, existing, IdempotencyStatus.FAILED, cached_json=None, error=error + ) async def mark_completed_with_json( - self, - event: BaseEvent, - cached_json: str, - key_strategy: str = "event_based", - custom_key: str | None = None, - fields: set[str] | None = None + self, + event: BaseEvent, + cached_json: str, + key_strategy: str = "event_based", + custom_key: str | None = None, + fields: set[str] | None = None, ) -> bool: full_key = self._generate_key(event, key_strategy, custom_key, fields) existing = await self._repo.find_by_key(full_key) @@ -265,19 +268,20 @@ async def mark_completed_with_json( return False 
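The hunks above all serve the same reserve, handle, then mark cycle that IdempotencyManager exposes (check_and_reserve, mark_completed, mark_failed, with the DuplicateKeyError branch treating a concurrent insert as an existing key). A condensed usage sketch of that cycle; manager, event, and handle are stand-ins for the injected manager, the incoming BaseEvent, and the real business handler, and only methods visible in this diff are called:

import logging
from typing import Any, Awaitable, Callable

logger = logging.getLogger(__name__)


async def process_once(
    manager: Any,                              # stand-in for the injected IdempotencyManager
    event: Any,                                # stand-in for a BaseEvent
    handle: Callable[[Any], Awaitable[None]],  # the real business handler
) -> None:
    """Reserve the key, run the handler, then record the outcome."""
    result = await manager.check_and_reserve(event, key_strategy="event_based")
    if result.is_duplicate:
        # Repeated or concurrent delivery: skip the handler, as the wrapper below does.
        logger.info("Duplicate %s (%s), skipping", event.event_type, event.event_id)
        return
    try:
        await handle(event)
        await manager.mark_completed(event, key_strategy="event_based")
    except Exception as exc:
        await manager.mark_failed(event, error=str(exc), key_strategy="event_based")
        raise

The IdempotentEventHandler wrapper further down in middleware.py wraps exactly this sequence around each registered handler, adding custom key strategies, TTLs, and an optional on_duplicate callback.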
return await self._update_key_status(full_key, existing, IdempotencyStatus.COMPLETED, cached_json=cached_json) - async def get_cached_json(self, event: BaseEvent, key_strategy: str, custom_key: str | None, - fields: set[str] | None = None) -> str: + async def get_cached_json( + self, event: BaseEvent, key_strategy: str, custom_key: str | None, fields: set[str] | None = None + ) -> str: full_key = self._generate_key(event, key_strategy, custom_key, fields) existing = await self._repo.find_by_key(full_key) assert existing and existing.result_json is not None, "Invariant: cached result must exist when requested" return existing.result_json async def remove( - self, - event: BaseEvent, - key_strategy: str = "event_based", - custom_key: str | None = None, - fields: set[str] | None = None + self, + event: BaseEvent, + key_strategy: str = "event_based", + custom_key: str | None = None, + fields: set[str] | None = None, ) -> bool: full_key = self._generate_key(event, key_strategy, custom_key, fields) try: @@ -311,8 +315,8 @@ async def _update_stats_loop(self) -> None: def create_idempotency_manager( - *, - repository: IdempotencyRepoProtocol, - config: IdempotencyConfig | None = None, + *, + repository: IdempotencyRepoProtocol, + config: IdempotencyConfig | None = None, ) -> IdempotencyManager: return IdempotencyManager(config or IdempotencyConfig(), repository) diff --git a/backend/app/services/idempotency/middleware.py b/backend/app/services/idempotency/middleware.py index 465c8f9a..b31afe7b 100644 --- a/backend/app/services/idempotency/middleware.py +++ b/backend/app/services/idempotency/middleware.py @@ -15,15 +15,15 @@ class IdempotentEventHandler: """Wrapper for event handlers with idempotency support""" def __init__( - self, - handler: Callable[[BaseEvent], Awaitable[None]], - idempotency_manager: IdempotencyManager, - key_strategy: str = "event_based", - custom_key_func: Callable[[BaseEvent], str] | None = None, - fields: Set[str] | None = None, - ttl_seconds: int | None = None, - cache_result: bool = True, - on_duplicate: Callable | None = None + self, + handler: Callable[[BaseEvent], Awaitable[None]], + idempotency_manager: IdempotencyManager, + key_strategy: str = "event_based", + custom_key_func: Callable[[BaseEvent], str] | None = None, + fields: Set[str] | None = None, + ttl_seconds: int | None = None, + cache_result: bool = True, + on_duplicate: Callable[[BaseEvent, Any], Any] | None = None, ): self.handler = handler self.idempotency_manager = idempotency_manager @@ -36,8 +36,10 @@ def __init__( async def __call__(self, event: BaseEvent) -> None: """Process event with idempotency check""" - logger.info(f"IdempotentEventHandler called for event {event.event_type}, " - f"id={event.event_id}, handler={self.handler.__name__}") + logger.info( + f"IdempotentEventHandler called for event {event.event_type}, " + f"id={event.event_id}, handler={self.handler.__name__}" + ) # Generate custom key if function provided custom_key = None if self.key_strategy == "custom" and self.custom_key_func: @@ -49,14 +51,13 @@ async def __call__(self, event: BaseEvent) -> None: key_strategy=self.key_strategy, custom_key=custom_key, ttl_seconds=self.ttl_seconds, - fields=self.fields + fields=self.fields, ) if idempotency_result.is_duplicate: # Handle duplicate logger.info( - f"Duplicate event detected: {event.event_type} ({event.event_id}), " - f"status: {idempotency_result.status}" + f"Duplicate event detected: {event.event_type} ({event.event_id}), status: {idempotency_result.status}" ) # Call duplicate 
handler if provided @@ -76,32 +77,25 @@ async def __call__(self, event: BaseEvent) -> None: # Mark as completed await self.idempotency_manager.mark_completed( - event=event, - key_strategy=self.key_strategy, - custom_key=custom_key, - fields=self.fields + event=event, key_strategy=self.key_strategy, custom_key=custom_key, fields=self.fields ) except Exception as e: # Mark as failed await self.idempotency_manager.mark_failed( - event=event, - error=str(e), - key_strategy=self.key_strategy, - custom_key=custom_key, - fields=self.fields + event=event, error=str(e), key_strategy=self.key_strategy, custom_key=custom_key, fields=self.fields ) raise def idempotent_handler( - idempotency_manager: IdempotencyManager, - key_strategy: str = "event_based", - custom_key_func: Callable[[BaseEvent], str] | None = None, - fields: Set[str] | None = None, - ttl_seconds: int | None = None, - cache_result: bool = True, - on_duplicate: Callable | None = None + idempotency_manager: IdempotencyManager, + key_strategy: str = "event_based", + custom_key_func: Callable[[BaseEvent], str] | None = None, + fields: Set[str] | None = None, + ttl_seconds: int | None = None, + cache_result: bool = True, + on_duplicate: Callable[[BaseEvent, Any], Any] | None = None, ) -> Callable[[Callable[[BaseEvent], Awaitable[None]]], Callable[[BaseEvent], Awaitable[None]]]: """Decorator for making event handlers idempotent""" @@ -114,7 +108,7 @@ def decorator(func: Callable[[BaseEvent], Awaitable[None]]) -> Callable[[BaseEve fields=fields, ttl_seconds=ttl_seconds, cache_result=cache_result, - on_duplicate=on_duplicate + on_duplicate=on_duplicate, ) return handler # IdempotentEventHandler is already callable with the right signature @@ -125,13 +119,13 @@ class IdempotentConsumerWrapper: """Wrapper for Kafka consumer with automatic idempotency""" def __init__( - self, - consumer: UnifiedConsumer, - idempotency_manager: IdempotencyManager, - dispatcher: EventDispatcher, - default_key_strategy: str = "event_based", - default_ttl_seconds: int = 3600, - enable_for_all_handlers: bool = True + self, + consumer: UnifiedConsumer, + idempotency_manager: IdempotencyManager, + dispatcher: EventDispatcher, + default_key_strategy: str = "event_based", + default_ttl_seconds: int = 3600, + enable_for_all_handlers: bool = True, ): self.consumer = consumer self.idempotency_manager = idempotency_manager @@ -139,12 +133,14 @@ def __init__( self.default_key_strategy = default_key_strategy self.default_ttl_seconds = default_ttl_seconds self.enable_for_all_handlers = enable_for_all_handlers - self._original_handlers: Dict[EventType, list] = {} + self._original_handlers: Dict[EventType, list[Callable[[BaseEvent], Awaitable[None]]]] = {} def make_handlers_idempotent(self) -> None: """Wrap all registered handlers with idempotency""" - logger.info(f"make_handlers_idempotent called: enable_for_all={self.enable_for_all_handlers}, " - f"dispatcher={self.dispatcher is not None}") + logger.info( + f"make_handlers_idempotent called: enable_for_all={self.enable_for_all_handlers}, " + f"dispatcher={self.dispatcher is not None}" + ) if not self.enable_for_all_handlers or not self.dispatcher: logger.warning("Skipping handler wrapping - conditions not met") return @@ -162,27 +158,28 @@ def make_handlers_idempotent(self) -> None: handler=handler, idempotency_manager=self.idempotency_manager, key_strategy=self.default_key_strategy, - ttl_seconds=self.default_ttl_seconds + ttl_seconds=self.default_ttl_seconds, ) wrapped_handlers.append(wrapped) # Replace handlers using public 
API - logger.info(f"Replacing {len(handlers)} handlers for {event_type} " - f"with {len(wrapped_handlers)} wrapped handlers") + logger.info( + f"Replacing {len(handlers)} handlers for {event_type} with {len(wrapped_handlers)} wrapped handlers" + ) self.dispatcher.replace_handlers(event_type, wrapped_handlers) - + logger.info("Handler wrapping complete") def subscribe_idempotent_handler( - self, - event_type: str, - handler: Callable, - key_strategy: str | None = None, - custom_key_func: Callable[[BaseEvent], str] | None = None, - fields: Set[str] | None = None, - ttl_seconds: int | None = None, - cache_result: bool = True, - on_duplicate: Callable | None = None + self, + event_type: str, + handler: Callable[[BaseEvent], Awaitable[None]], + key_strategy: str | None = None, + custom_key_func: Callable[[BaseEvent], str] | None = None, + fields: Set[str] | None = None, + ttl_seconds: int | None = None, + cache_result: bool = True, + on_duplicate: Callable[[BaseEvent, Any], Any] | None = None, ) -> None: """Subscribe an idempotent handler for specific event type""" # Create the idempotent handler wrapper @@ -194,48 +191,50 @@ def subscribe_idempotent_handler( fields=fields, ttl_seconds=ttl_seconds or self.default_ttl_seconds, cache_result=cache_result, - on_duplicate=on_duplicate + on_duplicate=on_duplicate, ) # Create an async handler that processes the message async def async_handler(message: Any) -> Any: logger.info(f"IDEMPOTENT HANDLER CALLED for {event_type}") - + # Extract event from confluent-kafka Message - if not hasattr(message, 'value'): + if not hasattr(message, "value"): logger.error(f"Received non-Message object for {event_type}: {type(message)}") return None - + # Debug log to check message details - logger.info(f"Handler for {event_type} - Message type: {type(message)}, " - f"has key: {hasattr(message, 'key')}, " - f"has topic: {hasattr(message, 'topic')}") - + logger.info( + f"Handler for {event_type} - Message type: {type(message)}, " + f"has key: {hasattr(message, 'key')}, " + f"has topic: {hasattr(message, 'topic')}" + ) + raw_value = message.value() - + # Debug the raw value logger.info(f"Raw value extracted: {raw_value[:100] if raw_value else 'None or empty'}") - + # Handle tombstone messages (null value for log compaction) if raw_value is None: logger.warning(f"Received empty message for {event_type} - tombstone or consumed value") return None - + # Handle empty messages if not raw_value: logger.warning(f"Received empty message for {event_type} - empty bytes") return None - + try: # Deserialize using schema registry if available event = self.consumer._schema_registry.deserialize_event(raw_value, message.topic()) if not event: logger.error(f"Failed to deserialize event for {event_type}") return None - + # Call the idempotent wrapper directly in async context await idempotent_wrapper(event) - + logger.debug(f"Successfully processed {event_type} event: {event.event_id}") return None except Exception as e: @@ -247,6 +246,7 @@ async def async_handler(message: Any) -> Any: # Create wrapper for EventDispatcher async def dispatch_handler(event: BaseEvent) -> None: await idempotent_wrapper(event) + self.dispatcher.register(EventType(event_type))(dispatch_handler) else: # Fallback to direct consumer registration if no dispatcher diff --git a/backend/app/services/idempotency/redis_repository.py b/backend/app/services/idempotency/redis_repository.py index d33c9704..c8d18624 100644 --- a/backend/app/services/idempotency/redis_repository.py +++ 
b/backend/app/services/idempotency/redis_repository.py @@ -59,7 +59,7 @@ def _doc_to_record(self, doc: Dict[str, Any]) -> IdempotencyRecord: event_id=str(doc.get("event_id", "")), created_at=created_at, # type: ignore[arg-type] ttl_seconds=int(doc.get("ttl_seconds", 0) or 0), - completed_at=completed_at, # type: ignore[arg-type] + completed_at=completed_at, processing_duration_ms=doc.get("processing_duration_ms"), error=doc.get("error"), result_json=doc.get("result"), @@ -138,4 +138,4 @@ async def aggregate_status_counts(self, key_prefix: str) -> dict[str, int]: return counts async def health_check(self) -> None: - await self._r.execute_command("PING") + await self._r.execute_command("PING") # type: ignore[no-untyped-call] diff --git a/backend/app/services/k8s_worker/config.py b/backend/app/services/k8s_worker/config.py index 295f3355..dad0f6f3 100644 --- a/backend/app/services/k8s_worker/config.py +++ b/backend/app/services/k8s_worker/config.py @@ -6,7 +6,6 @@ @dataclass class K8sWorkerConfig: - # Kafka settings kafka_bootstrap_servers: str | None = None consumer_group: str = "kubernetes-worker" diff --git a/backend/app/services/k8s_worker/pod_builder.py b/backend/app/services/k8s_worker/pod_builder.py index 144ef441..c4db7a48 100644 --- a/backend/app/services/k8s_worker/pod_builder.py +++ b/backend/app/services/k8s_worker/pod_builder.py @@ -22,25 +22,17 @@ def build_pod_manifest(self, command: CreatePodCommandEvent) -> k8s_client.V1Pod user_id=command.metadata.user_id, language=command.language, correlation_id=command.metadata.correlation_id, - saga_id=command.saga_id + saga_id=command.saga_id, ) - return k8s_client.V1Pod( - api_version="v1", - kind="Pod", - metadata=metadata, - spec=pod_spec - ) + return k8s_client.V1Pod(api_version="v1", kind="Pod", metadata=metadata, spec=pod_spec) def build_config_map( - self, - command: CreatePodCommandEvent, - script_content: str, - entrypoint_content: str + self, command: CreatePodCommandEvent, script_content: str, entrypoint_content: str ) -> k8s_client.V1ConfigMap: """Build ConfigMap for script and entrypoint""" execution_id = command.execution_id - + return k8s_client.V1ConfigMap( api_version="v1", kind="ConfigMap", @@ -51,21 +43,18 @@ def build_config_map( "app": "integr8s", "component": "execution-script", "execution-id": execution_id, - "saga-id": command.saga_id - } + "saga-id": command.saga_id, + }, ), - data={ - command.runtime_filename: script_content, - "entrypoint.sh": entrypoint_content - } + data={command.runtime_filename: script_content, "entrypoint.sh": entrypoint_content}, ) def _build_container(self, command: CreatePodCommandEvent) -> k8s_client.V1Container: execution_id = command.execution_id # Timeout is enforced by activeDeadlineSeconds on the pod spec - container_command = ['/bin/sh', '/entry/entrypoint.sh'] + command.runtime_command - + container_command = ["/bin/sh", "/entry/entrypoint.sh"] + command.runtime_command + # Get resources - prefer command values, fallback to config cpu_request = command.cpu_request or self.config.default_cpu_request memory_request = command.memory_request or self.config.default_memory_request @@ -80,16 +69,16 @@ def _build_container(self, command: CreatePodCommandEvent) -> k8s_client.V1Conta k8s_client.V1VolumeMount(name="script-volume", mount_path="/scripts", read_only=True), k8s_client.V1VolumeMount(name="entrypoint-volume", mount_path="/entry", read_only=True), k8s_client.V1VolumeMount(name="output-volume", mount_path="/output"), - k8s_client.V1VolumeMount(name="tmp-volume", 
mount_path="/tmp") # nosec B108: K8s EmptyDir mounted inside container; not host /tmp + k8s_client.V1VolumeMount(name="tmp-volume", mount_path="/tmp"), # nosec B108: K8s EmptyDir mounted inside container; not host /tmp ], resources=k8s_client.V1ResourceRequirements( requests={"cpu": cpu_request, "memory": memory_request}, - limits={"cpu": cpu_limit, "memory": memory_limit} + limits={"cpu": cpu_limit, "memory": memory_limit}, ), env=[ k8s_client.V1EnvVar(name="EXECUTION_ID", value=execution_id), - k8s_client.V1EnvVar(name="OUTPUT_PATH", value="/output") - ] + k8s_client.V1EnvVar(name="OUTPUT_PATH", value="/output"), + ], ) # SECURITY: Always enforce strict security context @@ -99,7 +88,7 @@ def _build_container(self, command: CreatePodCommandEvent) -> k8s_client.V1Conta run_as_group=1000, read_only_root_filesystem=True, # Always read-only filesystem allow_privilege_escalation=False, - capabilities=k8s_client.V1Capabilities(drop=["ALL"]) + capabilities=k8s_client.V1Capabilities(drop=["ALL"]), ) container.security_context.seccomp_profile = k8s_client.V1SeccompProfile(type="RuntimeDefault") container.stdin = False @@ -108,9 +97,7 @@ def _build_container(self, command: CreatePodCommandEvent) -> k8s_client.V1Conta return container def _build_pod_spec( - self, - container: k8s_client.V1Container, - command: CreatePodCommandEvent + self, container: k8s_client.V1Container, command: CreatePodCommandEvent ) -> k8s_client.V1PodSpec: """Build pod specification""" execution_id = command.execution_id @@ -125,42 +112,27 @@ def _build_pod_spec( name="script-volume", config_map=k8s_client.V1ConfigMapVolumeSource( name=f"script-{execution_id}", - items=[ - k8s_client.V1KeyToPath( - key=command.runtime_filename, - path=command.runtime_filename - ) - ] - ) + items=[k8s_client.V1KeyToPath(key=command.runtime_filename, path=command.runtime_filename)], + ), ), k8s_client.V1Volume( name="entrypoint-volume", config_map=k8s_client.V1ConfigMapVolumeSource( name=f"script-{execution_id}", - items=[ - k8s_client.V1KeyToPath( - key="entrypoint.sh", - path="entrypoint.sh" - ) - ] - ) + items=[k8s_client.V1KeyToPath(key="entrypoint.sh", path="entrypoint.sh")], + ), ), k8s_client.V1Volume( - name="output-volume", - empty_dir=k8s_client.V1EmptyDirVolumeSource(size_limit="10Mi") + name="output-volume", empty_dir=k8s_client.V1EmptyDirVolumeSource(size_limit="10Mi") ), - k8s_client.V1Volume( - name="tmp-volume", - empty_dir=k8s_client.V1EmptyDirVolumeSource(size_limit="10Mi") - ) + k8s_client.V1Volume(name="tmp-volume", empty_dir=k8s_client.V1EmptyDirVolumeSource(size_limit="10Mi")), ], - # Critical security boundaries (not network-related) - enable_service_links=False, # Defense in depth - no service discovery - automount_service_account_token=False, # CRITICAL: No K8s API access - host_network=False, # CRITICAL: No host network namespace - host_pid=False, # CRITICAL: No host PID namespace - host_ipc=False # CRITICAL: No host IPC namespace + enable_service_links=False, # Defense in depth - no service discovery + automount_service_account_token=False, # CRITICAL: No K8s API access + host_network=False, # CRITICAL: No host network namespace + host_pid=False, # CRITICAL: No host PID namespace + host_ipc=False, # CRITICAL: No host IPC namespace ) spec.security_context = k8s_client.V1PodSecurityContext( @@ -169,9 +141,7 @@ def _build_pod_spec( run_as_group=1000, fs_group=1000, fs_group_change_policy="OnRootMismatch", - seccomp_profile=k8s_client.V1SeccompProfile( - type="RuntimeDefault" - ) + 
seccomp_profile=k8s_client.V1SeccompProfile(type="RuntimeDefault"), ) return spec @@ -183,23 +153,18 @@ def _build_pod_metadata( user_id: str | None, language: str, correlation_id: str | None = None, - saga_id: str | None = None + saga_id: str | None = None, ) -> k8s_client.V1ObjectMeta: """Build pod metadata with correlation and saga tracking""" - labels = { - "app": "integr8s", - "component": "executor", - "execution-id": execution_id, - "language": language - } + labels = {"app": "integr8s", "component": "executor", "execution-id": execution_id, "language": language} if user_id: labels["user-id"] = user_id[:63] # K8s label value limit - + # Add correlation_id if provided (truncate to K8s label limit) if correlation_id: labels["correlation-id"] = correlation_id[:63] - + # Add saga_id if provided (truncate to K8s label limit) if saga_id: labels["saga-id"] = saga_id[:63] @@ -207,18 +172,13 @@ def _build_pod_metadata( annotations = { "integr8s.io/execution-id": execution_id, "integr8s.io/created-by": "kubernetes-worker", - "integr8s.io/language": language + "integr8s.io/language": language, } - + if correlation_id: annotations["integr8s.io/correlation-id"] = correlation_id - + if saga_id: annotations["integr8s.io/saga-id"] = saga_id - return k8s_client.V1ObjectMeta( - name=name, - namespace=self.namespace, - labels=labels, - annotations=annotations - ) + return k8s_client.V1ObjectMeta(name=name, namespace=self.namespace, labels=labels, annotations=annotations) diff --git a/backend/app/services/k8s_worker/worker.py b/backend/app/services/k8s_worker/worker.py index cb08e2dd..66bee135 100644 --- a/backend/app/services/k8s_worker/worker.py +++ b/backend/app/services/k8s_worker/worker.py @@ -9,8 +9,9 @@ from kubernetes import client as k8s_client from kubernetes import config as k8s_config from kubernetes.client.rest import ApiException -from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorDatabase +from motor.motor_asyncio import AsyncIOMotorClient +from app.core.database_context import Database, DBClient from app.core.lifecycle import LifecycleEnabled from app.core.logging import logger from app.core.metrics import ExecutionMetrics, KubernetesMetrics @@ -46,7 +47,7 @@ class KubernetesWorker(LifecycleEnabled): """ Worker service that creates Kubernetes pods from execution events. - + This service: 1. Consumes ExecutionStarted events from Kafka 2. Creates ConfigMaps with script content @@ -55,20 +56,22 @@ class KubernetesWorker(LifecycleEnabled): 5. 
Publishes PodCreated events """ - def __init__(self, - config: K8sWorkerConfig, - database: AsyncIOMotorDatabase, - producer: UnifiedProducer, - schema_registry_manager: SchemaRegistryManager, - event_store: EventStore, - idempotency_manager: IdempotencyManager): + def __init__( + self, + config: K8sWorkerConfig, + database: Database, + producer: UnifiedProducer, + schema_registry_manager: SchemaRegistryManager, + event_store: EventStore, + idempotency_manager: IdempotencyManager, + ): self.metrics = KubernetesMetrics() self.execution_metrics = ExecutionMetrics() self.config = config or K8sWorkerConfig() settings = get_settings() self.kafka_servers = self.config.kafka_bootstrap_servers or settings.KAFKA_BOOTSTRAP_SERVERS - self._db: AsyncIOMotorDatabase = database + self._db: Database = database self._event_store = event_store # Kubernetes clients @@ -100,8 +103,9 @@ async def start(self) -> None: logger.info("DEBUG: About to initialize Kubernetes client") if self.config.namespace == "default": - raise RuntimeError("KubernetesWorker namespace 'default' is forbidden. " - "Set K8S_NAMESPACE to a dedicated namespace.") + raise RuntimeError( + "KubernetesWorker namespace 'default' is forbidden. Set K8S_NAMESPACE to a dedicated namespace." + ) # Initialize Kubernetes client self._initialize_kubernetes_client() @@ -115,7 +119,7 @@ async def start(self) -> None: consumer_config = ConsumerConfig( bootstrap_servers=self.kafka_servers, group_id=f"{self.config.consumer_group}.{get_settings().KAFKA_GROUP_SUFFIX}", - enable_auto_commit=False + enable_auto_commit=False, ) # Create dispatcher and register handlers for saga commands @@ -124,10 +128,7 @@ async def start(self) -> None: self.dispatcher.register_handler(EventType.DELETE_POD_COMMAND, self._handle_delete_pod_command_wrapper) # Create consumer with dispatcher - self.consumer = UnifiedConsumer( - consumer_config, - event_dispatcher=self.dispatcher - ) + self.consumer = UnifiedConsumer(consumer_config, event_dispatcher=self.dispatcher) # Wrap consumer with idempotency - use content hash for pod commands self.idempotent_consumer = IdempotentConsumerWrapper( @@ -136,7 +137,7 @@ async def start(self) -> None: dispatcher=self.dispatcher, default_key_strategy="content_hash", # Hash execution_id + script for deduplication default_ttl_seconds=3600, # 1 hour TTL for pod creation events - enable_for_all_handlers=True # Enable idempotency for all handlers + enable_for_all_handlers=True, # Enable idempotency for all handlers ) # Start the consumer with idempotency - listen to saga commands topic @@ -235,10 +236,7 @@ async def _handle_delete_pod_command_wrapper(self, event: BaseEvent) -> None: logger.info(f"Processing delete_pod_command for execution {event.execution_id} from saga {event.saga_id}") await self._handle_delete_pod_command(event) - async def _handle_create_pod_command( - self, - command: CreatePodCommandEvent - ) -> None: + async def _handle_create_pod_command(self, command: CreatePodCommandEvent) -> None: """Handle create pod command from saga orchestrator""" execution_id = command.execution_id @@ -250,10 +248,7 @@ async def _handle_create_pod_command( # Create pod asynchronously asyncio.create_task(self._create_pod_for_execution(command)) - async def _handle_delete_pod_command( - self, - command: DeletePodCommandEvent - ) -> None: + async def _handle_delete_pod_command(self, command: DeletePodCommandEvent) -> None: """Handle delete pod command from saga orchestrator (compensation)""" execution_id = command.execution_id logger.info(f"Deleting 
pod for execution {execution_id} due to: {command.reason}") @@ -266,7 +261,7 @@ async def _handle_delete_pod_command( self.v1.delete_namespaced_pod, name=pod_name, namespace=self.config.namespace, - grace_period_seconds=30 + grace_period_seconds=30, ) logger.info(f"Successfully deleted pod {pod_name}") @@ -274,9 +269,7 @@ async def _handle_delete_pod_command( configmap_name = f"script-{execution_id}" if self.v1: await asyncio.to_thread( - self.v1.delete_namespaced_config_map, - name=configmap_name, - namespace=self.config.namespace + self.v1.delete_namespaced_config_map, name=configmap_name, namespace=self.config.namespace ) logger.info(f"Successfully deleted ConfigMap {configmap_name}") @@ -306,9 +299,7 @@ async def _create_pod_for_execution(self, command: CreatePodCommandEvent) -> Non # Create ConfigMap config_map = self.pod_builder.build_config_map( - command=command, - script_content=script_content, - entrypoint_content=entrypoint_content + command=command, script_content=script_content, entrypoint_content=entrypoint_content ) await self._create_config_map(config_map) @@ -330,10 +321,7 @@ async def _create_pod_for_execution(self, command: CreatePodCommandEvent) -> Non ) except Exception as e: - logger.error( - f"Failed to create pod for execution {execution_id}: {e}", - exc_info=True - ) + logger.error(f"Failed to create pod for execution {execution_id}: {e}", exc_info=True) # Update metrics self.metrics.record_k8s_pod_created("failed", "unknown") @@ -374,9 +362,7 @@ async def _create_config_map(self, config_map: k8s_client.V1ConfigMap) -> None: raise RuntimeError("Kubernetes client not initialized") try: await asyncio.to_thread( - self.v1.create_namespaced_config_map, - namespace=self.config.namespace, - body=config_map + self.v1.create_namespaced_config_map, namespace=self.config.namespace, body=config_map ) self.metrics.record_k8s_config_map_created("success") logger.debug(f"Created ConfigMap {config_map.metadata.name}") @@ -393,11 +379,7 @@ async def _create_pod(self, pod: k8s_client.V1Pod) -> None: if not self.v1: raise RuntimeError("Kubernetes client not initialized") try: - await asyncio.to_thread( - self.v1.create_namespaced_pod, - namespace=self.config.namespace, - body=pod - ) + await asyncio.to_thread(self.v1.create_namespaced_pod, namespace=self.config.namespace, body=pod) logger.debug(f"Created Pod {pod.metadata.name}") except ApiException as e: if e.status == 409: # Already exists @@ -405,11 +387,7 @@ async def _create_pod(self, pod: k8s_client.V1Pod) -> None: else: raise - async def _publish_execution_started( - self, - command: CreatePodCommandEvent, - pod: k8s_client.V1Pod - ) -> None: + async def _publish_execution_started(self, command: CreatePodCommandEvent, pod: k8s_client.V1Pod) -> None: """Publish execution started event""" event = ExecutionStartedEvent( execution_id=command.execution_id, @@ -417,24 +395,20 @@ async def _publish_execution_started( pod_name=pod.metadata.name, node_name=pod.spec.node_name, container_id=None, # Will be set when container actually starts - metadata=command.metadata + metadata=command.metadata, ) if not self.producer: logger.error("Producer not initialized") return await self.producer.produce(event_to_produce=event) - async def _publish_pod_created( - self, - command: CreatePodCommandEvent, - pod: k8s_client.V1Pod - ) -> None: + async def _publish_pod_created(self, command: CreatePodCommandEvent, pod: k8s_client.V1Pod) -> None: """Publish pod created event""" event = PodCreatedEvent( execution_id=command.execution_id, 
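# Illustrative sketch (not from the patch): the worker wraps the blocking
# kubernetes client in asyncio.to_thread and treats HTTP 409 Conflict as
# "already exists", which makes pod creation safe to retry. Helper name and
# namespace argument are assumptions for the example.
import asyncio

from kubernetes import client as k8s_client
from kubernetes.client.rest import ApiException


async def create_pod_idempotently(v1: k8s_client.CoreV1Api, pod: k8s_client.V1Pod, namespace: str) -> None:
    try:
        # Run the synchronous API call in a worker thread so the event loop is not blocked.
        await asyncio.to_thread(v1.create_namespaced_pod, namespace=namespace, body=pod)
    except ApiException as e:
        if e.status == 409:  # pod already exists: fine on redelivery or replay
            return
        raise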
pod_name=pod.metadata.name, namespace=pod.metadata.namespace, - metadata=command.metadata + metadata=command.metadata, ) if not self.producer: @@ -442,11 +416,7 @@ async def _publish_pod_created( return await self.producer.produce(event_to_produce=event) - async def _publish_pod_creation_failed( - self, - command: CreatePodCommandEvent, - error: str - ) -> None: + async def _publish_pod_creation_failed(self, command: CreatePodCommandEvent, error: str) -> None: """Publish pod creation failed event""" event = ExecutionFailedEvent( execution_id=command.execution_id, @@ -471,8 +441,8 @@ async def get_status(self) -> dict[str, Any]: "config": { "namespace": self.config.namespace, "max_concurrent_pods": self.config.max_concurrent_pods, - "enable_network_policies": True - } + "enable_network_policies": True, + }, } async def ensure_image_pre_puller_daemonset(self) -> None: @@ -487,22 +457,20 @@ async def ensure_image_pre_puller_daemonset(self) -> None: try: init_containers = [] - all_images = { - config.image - for lang in RUNTIME_REGISTRY.values() - for config in lang.values() - } + all_images = {config.image for lang in RUNTIME_REGISTRY.values() for config in lang.values()} for i, image_ref in enumerate(sorted(list(all_images))): - sanitized_image_ref = image_ref.split('/')[-1].replace(':', '-').replace('.', '-').replace('_', '-') + sanitized_image_ref = image_ref.split("/")[-1].replace(":", "-").replace(".", "-").replace("_", "-") logger.info(f"DAEMONSET: before: {image_ref} -> {sanitized_image_ref}") container_name = f"pull-{i}-{sanitized_image_ref}" - init_containers.append({ - "name": container_name, - "image": image_ref, - "command": ["/bin/sh", "-c", f'echo "Image {image_ref} pulled."'], - "imagePullPolicy": "Always", - }) + init_containers.append( + { + "name": container_name, + "image": image_ref, + "command": ["/bin/sh", "-c", f'echo "Image {image_ref} pulled."'], + "imagePullPolicy": "Always", + } + ) manifest: dict[str, Any] = { "apiVersion": "apps/v1", @@ -514,24 +482,21 @@ async def ensure_image_pre_puller_daemonset(self) -> None: "metadata": {"labels": {"name": daemonset_name}}, "spec": { "initContainers": init_containers, - "containers": [{ - "name": "pause", - "image": "registry.k8s.io/pause:3.9" - }], - "tolerations": [{"operator": "Exists"}] - } + "containers": [{"name": "pause", "image": "registry.k8s.io/pause:3.9"}], + "tolerations": [{"operator": "Exists"}], + }, }, - "updateStrategy": {"type": "RollingUpdate"} - } + "updateStrategy": {"type": "RollingUpdate"}, + }, } try: - await asyncio.to_thread(self.apps_v1.read_namespaced_daemon_set, name=daemonset_name, - namespace=namespace) + await asyncio.to_thread( + self.apps_v1.read_namespaced_daemon_set, name=daemonset_name, namespace=namespace + ) logger.info(f"DaemonSet '{daemonset_name}' exists. 
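# Illustrative sketch (not from the patch): the pre-puller DaemonSet builds one
# init container per runtime image so every node pulls it ahead of time, then
# parks on a pause container. Sanitisation mirrors the rules shown above; the
# example image names are made up.
def build_init_containers(images: set[str]) -> list[dict[str, object]]:
    containers: list[dict[str, object]] = []
    for i, image_ref in enumerate(sorted(images)):
        # Container names must be DNS-1123 labels: drop the registry path, replace ':', '.', '_'.
        sanitized = image_ref.split("/")[-1].replace(":", "-").replace(".", "-").replace("_", "-")
        containers.append(
            {
                "name": f"pull-{i}-{sanitized}",
                "image": image_ref,
                "command": ["/bin/sh", "-c", f'echo "Image {image_ref} pulled."'],
                "imagePullPolicy": "Always",
            }
        )
    return containers


# build_init_containers({"python:3.12-slim", "node:22-alpine"})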
Replacing to ensure it is up-to-date.") await asyncio.to_thread( - self.apps_v1.replace_namespaced_daemon_set, - name=daemonset_name, namespace=namespace, body=manifest + self.apps_v1.replace_namespaced_daemon_set, name=daemonset_name, namespace=namespace, body=manifest ) logger.info(f"DaemonSet '{daemonset_name}' replaced successfully.") except ApiException as e: @@ -556,11 +521,7 @@ async def run_kubernetes_worker() -> None: logger.info("Initializing database connection...") settings = get_settings() - db_client: AsyncIOMotorClient = AsyncIOMotorClient( - settings.MONGODB_URL, - tz_aware=True, - serverSelectionTimeoutMS=5000 - ) + db_client: DBClient = AsyncIOMotorClient(settings.MONGODB_URL, tz_aware=True, serverSelectionTimeoutMS=5000) db_name = settings.DATABASE_NAME database = db_client[db_name] await db_client.admin.command("ping") @@ -601,7 +562,7 @@ async def run_kubernetes_worker() -> None: producer=producer, schema_registry_manager=schema_registry_manager, event_store=event_store, - idempotency_manager=idem_manager + idempotency_manager=idem_manager, ) def signal_handler(sig: int, frame: Any) -> None: diff --git a/backend/app/services/kafka_event_service.py b/backend/app/services/kafka_event_service.py index 148f55d6..875a480b 100644 --- a/backend/app/services/kafka_event_service.py +++ b/backend/app/services/kafka_event_service.py @@ -21,35 +21,30 @@ class KafkaEventService: - - def __init__( - self, - event_repository: EventRepository, - kafka_producer: UnifiedProducer - ): + def __init__(self, event_repository: EventRepository, kafka_producer: UnifiedProducer): self.event_repository = event_repository self.kafka_producer = kafka_producer self.metrics = get_event_metrics() self.settings = get_settings() async def publish_event( - self, - event_type: str, - payload: Dict[str, Any], - aggregate_id: str | None, - correlation_id: str | None = None, - metadata: EventMetadata | None = None, + self, + event_type: str, + payload: Dict[str, Any], + aggregate_id: str | None, + correlation_id: str | None = None, + metadata: EventMetadata | None = None, ) -> str: """ Publish an event to Kafka and store an audit copy via the repository - + Args: event_type: Type of event (e.g., "execution.requested") payload: Event-specific data aggregate_id: ID of the aggregate root correlation_id: ID for correlating related events metadata: Event metadata (service/user/trace/IP). If None, service fills minimal defaults. 
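# Illustrative sketch (not from the patch): publish_event persists an audit copy
# through the repository before producing to Kafka with string headers. The
# Protocols below are simplified stand-ins for the project's repository and
# producer interfaces, kept only to make the ordering explicit.
from typing import Any, Protocol


class AuditStore(Protocol):
    async def store_event(self, event: Any) -> Any: ...


class EventProducer(Protocol):
    async def produce(self, event_to_produce: Any, key: str | None, headers: dict[str, str]) -> None: ...


async def publish(store: AuditStore, producer: EventProducer, event: Any, aggregate_id: str | None) -> str:
    await store.store_event(event)  # durable audit copy first
    headers = {"event_type": str(event.event_type), "correlation_id": event.correlation_id or ""}
    await producer.produce(event_to_produce=event, key=aggregate_id, headers=headers)
    return str(event.event_id)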
- + Returns: Event ID of published event """ @@ -81,7 +76,7 @@ async def publish_event( timestamp=timestamp, aggregate_id=aggregate_id, metadata=event_metadata, - payload=payload + payload=payload, ) _ = await self.event_repository.store_event(event) @@ -99,7 +94,7 @@ async def publish_event( "timestamp": timestamp, "aggregate_id": aggregate_id, "metadata": event_metadata, - **payload # Include event-specific payload fields + **payload, # Include event-specific payload fields } # Create the typed event instance @@ -109,7 +104,7 @@ async def publish_event( headers: Dict[str, str] = { "event_type": event_type, "correlation_id": event.correlation_id or "", - "service": event_metadata.service_name + "service": event_metadata.service_name, } # Add trace context @@ -122,11 +117,7 @@ async def publish_event( headers = inject_trace_context(headers) # Publish to Kafka - await self.kafka_producer.produce( - event_to_produce=kafka_event, - key=aggregate_id, - headers=headers - ) + await self.kafka_producer.produce(event_to_produce=kafka_event, key=aggregate_id, headers=headers) self.metrics.record_event_published(event_type) @@ -146,12 +137,12 @@ async def publish_event( return kafka_event.event_id async def publish_execution_event( - self, - event_type: str, - execution_id: str, - status: str, - metadata: EventMetadata | None = None, - error_message: str | None = None, + self, + event_type: str, + execution_id: str, + status: str, + metadata: EventMetadata | None = None, + error_message: str | None = None, ) -> str: """Publish execution-related event using provided metadata (no framework coupling).""" logger.info( @@ -160,13 +151,10 @@ async def publish_execution_event( "event_type": event_type, "execution_id": execution_id, "status": status, - } + }, ) - payload = { - "execution_id": execution_id, - "status": status - } + payload = {"execution_id": execution_id, "status": status} if error_message: payload["error_message"] = error_message @@ -184,26 +172,22 @@ async def publish_execution_event( "event_type": event_type, "execution_id": execution_id, "event_id": event_id, - } + }, ) return event_id async def publish_pod_event( - self, - event_type: str, - pod_name: str, - execution_id: str, - namespace: str = "integr8scode", - status: str | None = None, - metadata: EventMetadata | None = None, + self, + event_type: str, + pod_name: str, + execution_id: str, + namespace: str = "integr8scode", + status: str | None = None, + metadata: EventMetadata | None = None, ) -> str: """Publish pod-related event""" - payload = { - "pod_name": pod_name, - "execution_id": execution_id, - "namespace": namespace - } + payload = {"pod_name": pod_name, "execution_id": execution_id, "namespace": namespace} if status: payload["status"] = status diff --git a/backend/app/services/notification_service.py b/backend/app/services/notification_service.py index 7fa3ec84..68d68674 100644 --- a/backend/app/services/notification_service.py +++ b/backend/app/services/notification_service.py @@ -52,6 +52,7 @@ class ServiceState(StringEnum): """Service lifecycle states.""" + IDLE = auto() INITIALIZING = auto() RUNNING = auto() @@ -62,15 +63,16 @@ class ServiceState(StringEnum): @dataclass class ThrottleCache: """Manages notification throttling with time windows.""" + _entries: dict[str, list[datetime]] = field(default_factory=dict) _lock: asyncio.Lock = field(default_factory=asyncio.Lock) async def check_throttle( - self, - user_id: str, - severity: NotificationSeverity, - window_hours: int, - max_per_hour: int, + self, + user_id: str, 
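# Illustrative sketch (not from the patch): ThrottleCache is a per-key sliding
# window -- timestamps older than the window are pruned, and the remainder is
# compared against the cap. Standalone version with slightly different naming
# (allow() returns True when the notification may be sent).
import asyncio
from datetime import datetime, timedelta, timezone


class SlidingWindowThrottle:
    def __init__(self) -> None:
        self._entries: dict[str, list[datetime]] = {}
        self._lock = asyncio.Lock()

    async def allow(self, key: str, window_hours: int, max_per_window: int) -> bool:
        now = datetime.now(timezone.utc)
        cutoff = now - timedelta(hours=window_hours)
        async with self._lock:
            recent = [ts for ts in self._entries.get(key, []) if ts > cutoff]
            if len(recent) >= max_per_window:
                self._entries[key] = recent
                return False  # over the limit: throttle
            recent.append(now)
            self._entries[key] = recent
            return True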
+ severity: NotificationSeverity, + window_hours: int, + max_per_hour: int, ) -> bool: """Check if notification should be throttled.""" key = f"{user_id}:{severity}" @@ -82,10 +84,7 @@ async def check_throttle( self._entries[key] = [] # Clean old entries - self._entries[key] = [ - ts for ts in self._entries[key] - if ts > window_start - ] + self._entries[key] = [ts for ts in self._entries[key] if ts > window_start] # Check limit if len(self._entries[key]) >= max_per_hour: @@ -109,13 +108,13 @@ class SystemConfig: class NotificationService: def __init__( - self, - notification_repository: NotificationRepository, - event_service: KafkaEventService, - event_bus_manager: EventBusManager, - schema_registry_manager: SchemaRegistryManager, - sse_bus: SSERedisBus, - settings: Settings, + self, + notification_repository: NotificationRepository, + event_service: KafkaEventService, + event_bus_manager: EventBusManager, + schema_registry_manager: SchemaRegistryManager, + sse_bus: SSERedisBus, + settings: Settings, ) -> None: self.repository = notification_repository self.event_service = event_service @@ -141,15 +140,15 @@ def __init__( extra={ "repository": type(notification_repository).__name__, "event_service": type(event_service).__name__, - "schema_registry": type(schema_registry_manager).__name__ - } + "schema_registry": type(schema_registry_manager).__name__, + }, ) # Channel handlers mapping self._channel_handlers: dict[NotificationChannel, ChannelHandler] = { NotificationChannel.IN_APP: self._send_in_app, NotificationChannel.WEBHOOK: self._send_webhook, - NotificationChannel.SLACK: self._send_slack + NotificationChannel.SLACK: self._send_slack, } @property @@ -214,7 +213,7 @@ async def _subscribe_to_events(self) -> None: group_id=f"{GroupId.NOTIFICATION_SERVICE}.{get_settings().KAFKA_GROUP_SUFFIX}", max_poll_records=10, enable_auto_commit=True, - auto_offset_reset="latest" # Only process new events + auto_offset_reset="latest", # Only process new events ) execution_results_topic = get_topic_for_event(EventType.EXECUTION_COMPLETED) @@ -230,10 +229,7 @@ async def _subscribe_to_events(self) -> None: self._dispatcher.register_handler(EventType.EXECUTION_TIMEOUT, self._handle_execution_event) # Create consumer with dispatcher - self._consumer = UnifiedConsumer( - consumer_config, - event_dispatcher=self._dispatcher - ) + self._consumer = UnifiedConsumer(consumer_config, event_dispatcher=self._dispatcher) # Start consumer await self._consumer.start([execution_results_topic]) @@ -246,16 +242,16 @@ async def _subscribe_to_events(self) -> None: logger.info("Notification service subscribed to execution events") async def create_notification( - self, - user_id: str, - subject: str, - body: str, - tags: list[str], - severity: NotificationSeverity = NotificationSeverity.MEDIUM, - channel: NotificationChannel = NotificationChannel.IN_APP, - scheduled_for: datetime | None = None, - action_url: str | None = None, - metadata: NotificationContext | None = None, + self, + user_id: str, + subject: str, + body: str, + tags: list[str], + severity: NotificationSeverity = NotificationSeverity.MEDIUM, + channel: NotificationChannel = NotificationChannel.IN_APP, + scheduled_for: datetime | None = None, + action_url: str | None = None, + metadata: NotificationContext | None = None, ) -> DomainNotification: if not tags: raise ServiceError("tags must be a non-empty list", status_code=422) @@ -267,19 +263,21 @@ async def create_notification( "severity": str(severity), "tags": list(tags), "scheduled": scheduled_for is not 
None, - } + }, ) # Check throttling if await self._throttle_cache.check_throttle( - user_id, - severity, - window_hours=self.settings.NOTIF_THROTTLE_WINDOW_HOURS, - max_per_hour=self.settings.NOTIF_THROTTLE_MAX_PER_HOUR, + user_id, + severity, + window_hours=self.settings.NOTIF_THROTTLE_WINDOW_HOURS, + max_per_hour=self.settings.NOTIF_THROTTLE_MAX_PER_HOUR, ): - error_msg = (f"Notification rate limit exceeded for user {user_id}. " - f"Max {self.settings.NOTIF_THROTTLE_MAX_PER_HOUR} " - f"per {self.settings.NOTIF_THROTTLE_WINDOW_HOURS} hour(s)") + error_msg = ( + f"Notification rate limit exceeded for user {user_id}. " + f"Max {self.settings.NOTIF_THROTTLE_MAX_PER_HOUR} " + f"per {self.settings.NOTIF_THROTTLE_WINDOW_HOURS} hour(s)" + ) logger.warning(error_msg) # Throttling is a client-driven rate issue raise ServiceError(error_msg, status_code=429) @@ -295,7 +293,7 @@ async def create_notification( tags=tags, scheduled_for=scheduled_for, status=NotificationStatus.PENDING, - metadata=metadata or {} + metadata=metadata or {}, ) # Save to database @@ -310,7 +308,7 @@ async def create_notification( "user_id": user_id, "severity": str(severity), "tags": notification.tags, - } + }, ) asyncio.create_task(self._deliver_notification(notification)) @@ -318,21 +316,22 @@ async def create_notification( return notification async def create_system_notification( - self, - title: str, - message: str, - severity: NotificationSeverity = NotificationSeverity.MEDIUM, - tags: list[str] | None = None, - metadata: dict[str, object] | None = None, - target_users: list[str] | None = None, - target_roles: list[UserRole] | None = None, + self, + title: str, + message: str, + severity: NotificationSeverity = NotificationSeverity.MEDIUM, + tags: list[str] | None = None, + metadata: dict[str, object] | None = None, + target_users: list[str] | None = None, + target_roles: list[UserRole] | None = None, ) -> SystemNotificationStats: """Create system notifications with streamlined control flow. Returns stats with totals and created/failed/throttled counts. 
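# Illustrative sketch (not from the patch): the system-notification fan-out runs
# one coroutine per user, serially for small batches and via asyncio.gather
# behind a semaphore for larger ones. The per-user work below is a placeholder.
import asyncio


async def fan_out(user_ids: list[str], max_concurrency: int = 10, serial_threshold: int = 20) -> dict[str, int]:
    sem = asyncio.Semaphore(max_concurrency)

    async def worker(uid: str) -> str:
        async with sem:
            await asyncio.sleep(0)  # stand-in for "create notification for uid"
            return "created"

    if len(user_ids) <= serial_threshold:
        results = [await worker(u) for u in user_ids]
    else:
        results = await asyncio.gather(*(worker(u) for u in user_ids))
    return {"created": results.count("created"), "failed": results.count("failed")}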
""" - cfg = SystemConfig(severity=severity, - throttle_exempt=(severity in (NotificationSeverity.HIGH, NotificationSeverity.URGENT))) + cfg = SystemConfig( + severity=severity, throttle_exempt=(severity in (NotificationSeverity.HIGH, NotificationSeverity.URGENT)) + ) base_context: NotificationContext = {"message": message, **(metadata or {})} users = await self._resolve_targets(target_users, target_roles) @@ -345,8 +344,8 @@ async def worker(uid: str) -> str: async with sem: return await self._create_system_for_user(uid, cfg, title, base_context, tags or ["system"]) - results = [await worker(u) for u in users] if len(users) <= 20 else await asyncio.gather( - *(worker(u) for u in users) + results = ( + [await worker(u) for u in users] if len(users) <= 20 else await asyncio.gather(*(worker(u) for u in users)) ) created = sum(1 for r in results if r == "created") @@ -368,9 +367,9 @@ async def worker(uid: str) -> str: return {"total_users": len(users), "created": created, "failed": failed, "throttled": throttled} async def _resolve_targets( - self, - target_users: list[str] | None, - target_roles: list[UserRole] | None, + self, + target_users: list[str] | None, + target_roles: list[UserRole] | None, ) -> list[str]: if target_users: return target_users @@ -379,12 +378,12 @@ async def _resolve_targets( return await self.repository.get_active_users(days=30) async def _create_system_for_user( - self, - user_id: str, - cfg: SystemConfig, - title: str, - base_context: NotificationContext, - tags: list[str], + self, + user_id: str, + cfg: SystemConfig, + title: str, + base_context: NotificationContext, + tags: list[str], ) -> str: try: if not cfg.throttle_exempt: @@ -412,24 +411,21 @@ async def _create_system_for_user( return "failed" async def _send_in_app( - self, - notification: DomainNotification, - subscription: DomainNotificationSubscription + self, notification: DomainNotification, subscription: DomainNotificationSubscription ) -> None: """Send in-app notification via SSE bus (fan-out to connected clients).""" await self._publish_notification_sse(notification) async def _send_webhook( - self, - notification: DomainNotification, - subscription: DomainNotificationSubscription + self, notification: DomainNotification, subscription: DomainNotificationSubscription ) -> None: """Send webhook notification.""" webhook_url = notification.webhook_url or subscription.webhook_url if not webhook_url: raise ValueError( f"No webhook URL configured for user {notification.user_id} on channel {notification.channel}. " - f"Configure in notification settings.") + f"Configure in notification settings." 
+ ) payload = { "notification_id": str(notification.notification_id), @@ -451,8 +447,8 @@ async def _send_webhook( extra={ "notification_id": str(notification.notification_id), "payload_size": len(str(payload)), - "webhook_url": webhook_url - } + "webhook_url": webhook_url, + }, ) add_span_attributes( @@ -463,61 +459,51 @@ async def _send_webhook( } ) async with httpx.AsyncClient() as client: - response = await client.post( - webhook_url, - json=payload, - headers=headers, - timeout=30.0 - ) + response = await client.post(webhook_url, json=payload, headers=headers, timeout=30.0) response.raise_for_status() logger.debug( "Webhook delivered successfully", extra={ "notification_id": str(notification.notification_id), "status_code": response.status_code, - "response_time_ms": int(response.elapsed.total_seconds() * 1000) - } + "response_time_ms": int(response.elapsed.total_seconds() * 1000), + }, ) - async def _send_slack( - self, - notification: DomainNotification, - subscription: DomainNotificationSubscription - ) -> None: + async def _send_slack(self, notification: DomainNotification, subscription: DomainNotificationSubscription) -> None: """Send Slack notification.""" if not subscription.slack_webhook: raise ValueError( f"No Slack webhook URL configured for user {notification.user_id}. " - f"Please configure Slack integration in notification settings.") + f"Please configure Slack integration in notification settings." + ) # Format message for Slack slack_message: SlackMessage = { "text": notification.subject, - "attachments": [{ - "color": self._get_slack_color(notification.severity), - "text": notification.body, - "footer": "Integr8sCode Notifications", - "ts": int(notification.created_at.timestamp()) - }] + "attachments": [ + { + "color": self._get_slack_color(notification.severity), + "text": notification.body, + "footer": "Integr8sCode Notifications", + "ts": int(notification.created_at.timestamp()), + } + ], } # Add action button if URL provided if notification.action_url: attachments = slack_message.get("attachments", []) if attachments and isinstance(attachments, list): - attachments[0]["actions"] = [{ - "type": "button", - "text": "View Details", - "url": notification.action_url - }] + attachments[0]["actions"] = [{"type": "button", "text": "View Details", "url": notification.action_url}] logger.debug( "Sending Slack notification", extra={ "notification_id": str(notification.notification_id), "has_action": notification.action_url is not None, - "priority_color": self._get_slack_color(notification.severity) - } + "priority_color": self._get_slack_color(notification.severity), + }, ) add_span_attributes( @@ -527,18 +513,11 @@ async def _send_slack( } ) async with httpx.AsyncClient() as client: - response = await client.post( - subscription.slack_webhook, - json=slack_message, - timeout=30.0 - ) + response = await client.post(subscription.slack_webhook, json=slack_message, timeout=30.0) response.raise_for_status() logger.debug( "Slack notification delivered successfully", - extra={ - "notification_id": str(notification.notification_id), - "status_code": response.status_code - } + extra={"notification_id": str(notification.notification_id), "status_code": response.status_code}, ) def _get_slack_color(self, priority: NotificationSeverity) -> str: @@ -631,8 +610,9 @@ async def _handle_execution_completed_typed(self, event: ExecutionCompletedEvent return title = f"Execution Completed: {event.execution_id}" - body = (f"Your execution completed successfully. 
" - f"Duration: {event.resource_usage.execution_time_wall_seconds:.2f}s.") + body = ( + f"Your execution completed successfully. Duration: {event.resource_usage.execution_time_wall_seconds:.2f}s." + ) await self.create_notification( user_id=user_id, subject=title, @@ -640,7 +620,8 @@ async def _handle_execution_completed_typed(self, event: ExecutionCompletedEvent severity=NotificationSeverity.MEDIUM, tags=["execution", "completed", ENTITY_EXECUTION_TAG, f"exec:{event.execution_id}"], metadata=event.model_dump( - exclude={"metadata", "event_type", "event_version", "timestamp", "aggregate_id", "topic"}), + exclude={"metadata", "event_type", "event_version", "timestamp", "aggregate_id", "topic"} + ), ) async def _handle_execution_event(self, event: BaseEvent) -> None: @@ -692,11 +673,7 @@ async def mark_as_read(self, user_id: str, notification_id: str) -> bool: if success: await event_bus.publish( "notifications.read", - { - "notification_id": str(notification_id), - "user_id": user_id, - "read_at": datetime.now(UTC).isoformat() - } + {"notification_id": str(notification_id), "user_id": user_id, "read_at": datetime.now(UTC).isoformat()}, ) else: raise ServiceError("Notification not found", status_code=404) @@ -708,14 +685,14 @@ async def get_unread_count(self, user_id: str) -> int: return await self.repository.get_unread_count(user_id) async def list_notifications( - self, - user_id: str, - status: NotificationStatus | None = None, - limit: int = 20, - offset: int = 0, - include_tags: list[str] | None = None, - exclude_tags: list[str] | None = None, - tag_prefix: str | None = None, + self, + user_id: str, + status: NotificationStatus | None = None, + limit: int = 20, + offset: int = 0, + include_tags: list[str] | None = None, + exclude_tags: list[str] | None = None, + tag_prefix: str | None = None, ) -> DomainNotificationListResult: """List notifications with pagination.""" # Get notifications @@ -731,26 +708,21 @@ async def list_notifications( # Get counts total, unread_count = await asyncio.gather( - self.repository.count_notifications(user_id, {"status": status}), - self.get_unread_count(user_id) + self.repository.count_notifications(user_id, {"status": status}), self.get_unread_count(user_id) ) - return DomainNotificationListResult( - notifications=notifications, - total=total, - unread_count=unread_count - ) + return DomainNotificationListResult(notifications=notifications, total=total, unread_count=unread_count) async def update_subscription( - self, - user_id: str, - channel: NotificationChannel, - enabled: bool, - webhook_url: str | None = None, - slack_webhook: str | None = None, - severities: list[NotificationSeverity] | None = None, - include_tags: list[str] | None = None, - exclude_tags: list[str] | None = None, + self, + user_id: str, + channel: NotificationChannel, + enabled: bool, + webhook_url: str | None = None, + slack_webhook: str | None = None, + severities: list[NotificationSeverity] | None = None, + include_tags: list[str] | None = None, + exclude_tags: list[str] | None = None, ) -> DomainNotificationSubscription: """Update notification subscription preferences.""" # Validate channel-specific requirements @@ -800,12 +772,7 @@ async def mark_all_as_read(self, user_id: str) -> int: event_bus = await self.event_bus_manager.get_event_bus() if count > 0: await event_bus.publish( - "notifications.all_read", - { - "user_id": user_id, - "count": count, - "read_at": datetime.now(UTC).isoformat() - } + "notifications.all_read", {"user_id": user_id, "count": count, "read_at": 
datetime.now(UTC).isoformat()} ) return count @@ -814,11 +781,7 @@ async def get_subscriptions(self, user_id: str) -> dict[str, DomainNotificationS """Get all notification subscriptions for a user.""" return await self.repository.get_all_subscriptions(user_id) - async def delete_notification( - self, - user_id: str, - notification_id: str - ) -> bool: + async def delete_notification(self, user_id: str, notification_id: str) -> bool: """Delete a notification.""" deleted = await self.repository.delete_notification(str(notification_id), user_id) if not deleted: @@ -840,9 +803,7 @@ async def _publish_notification_sse(self, notification: DomainNotification) -> N await self.sse_bus.publish_notification(notification.user_id, payload) async def _should_skip_notification( - self, - notification: DomainNotification, - subscription: DomainNotificationSubscription | None + self, notification: DomainNotification, subscription: DomainNotificationSubscription | None ) -> str | None: """Check if notification should be skipped based on subscription filters. @@ -858,10 +819,7 @@ async def _should_skip_notification( ) if subscription.include_tags and not any(tag in subscription.include_tags for tag in (notification.tags or [])): - return ( - f"Notification tags {notification.tags} " - f"not in include list for {notification.channel}" - ) + return f"Notification tags {notification.tags} not in include list for {notification.channel}" if subscription.exclude_tags and any(tag in subscription.exclude_tags for tag in (notification.tags or [])): return f"Notification tags {notification.tags} excluded by preferences for {notification.channel}" @@ -883,14 +841,11 @@ async def _deliver_notification(self, notification: DomainNotification) -> None: "channel": str(notification.channel), "severity": str(notification.severity), "tags": list(notification.tags or []), - } + }, ) # Check user subscription for the channel - subscription = await self.repository.get_subscription( - notification.user_id, - notification.channel - ) + subscription = await self.repository.get_subscription(notification.user_id, notification.channel) # Check if notification should be skipped skip_reason = await self._should_skip_notification(notification, subscription) @@ -928,13 +883,14 @@ async def _deliver_notification(self, notification: DomainNotification) -> None: extra={ "notification_id": str(notification.notification_id), "channel": str(notification.channel), - "delivery_time_ms": int(delivery_time * 1000) - } + "delivery_time_ms": int(delivery_time * 1000), + }, ) # Metrics (use tag string or severity) - self.metrics.record_notification_sent(str(notification.severity), channel=str(notification.channel), - severity=str(notification.severity)) + self.metrics.record_notification_sent( + str(notification.severity), channel=str(notification.channel), severity=str(notification.severity) + ) self.metrics.record_notification_delivery_time(delivery_time, str(notification.severity)) except Exception as e: @@ -944,13 +900,13 @@ async def _deliver_notification(self, notification: DomainNotification) -> None: "error_type": type(e).__name__, "error_message": str(e), "retry_count": notification.retry_count, - "max_retries": notification.max_retries + "max_retries": notification.max_retries, } logger.error( f"Failed to deliver notification {notification.notification_id}: {str(e)}", extra=error_details, - exc_info=True + exc_info=True, ) notification.status = NotificationStatus.FAILED @@ -966,7 +922,7 @@ async def _deliver_notification(self, notification: 
DomainNotification) -> None: logger.info( f"Scheduled retry {notification.retry_count}/{notification.max_retries} " f"for {notification.notification_id}", - extra={"retry_at": retry_time.isoformat()} + extra={"retry_at": retry_time.isoformat()}, ) await self.repository.update_notification(notification) diff --git a/backend/app/services/pod_monitor/event_mapper.py b/backend/app/services/pod_monitor/event_mapper.py index 5706f837..56eb0966 100644 --- a/backend/app/services/pod_monitor/event_mapper.py +++ b/backend/app/services/pod_monitor/event_mapper.py @@ -30,6 +30,7 @@ @dataclass(frozen=True) class PodContext: """Immutable context for pod event processing""" + pod: k8s_client.V1Pod execution_id: str metadata: EventMetadata @@ -40,6 +41,7 @@ class PodContext: @dataclass(frozen=True) class PodLogs: """Parsed pod logs and execution results""" + stdout: str = "" stderr: str = "" exit_code: int | None = None @@ -96,17 +98,11 @@ def map_pod_event(self, pod: k8s_client.V1Pod, event_type: str) -> EventList: # Skip duplicate events if pod.metadata and self._is_duplicate(pod.metadata.name, phase): - logger.debug( - f"POD-EVENT: duplicate ignored name={pod.metadata.name} phase={phase}" - ) + logger.debug(f"POD-EVENT: duplicate ignored name={pod.metadata.name} phase={phase}") return [] ctx = PodContext( - pod=pod, - execution_id=execution_id, - metadata=self._create_metadata(pod), - phase=phase, - event_type=event_type + pod=pod, execution_id=execution_id, metadata=self._create_metadata(pod), phase=phase, event_type=event_type ) logger.info( f"POD-EVENT: ctx execution_id={ctx.execution_id} phase={ctx.phase} " @@ -133,27 +129,21 @@ def map_pod_event(self, pod: k8s_client.V1Pod, event_type: str) -> EventList: and pod.metadata and prior_phase == "Pending" ): - logger.debug( - f"POD-EVENT: skipping running map due to empty statuses after Pending exec={execution_id}" - ) + logger.debug(f"POD-EVENT: skipping running map due to empty statuses after Pending exec={execution_id}") return events # Phase-based mappers for mapper in self._phase_mappers.get(phase, []): if event := mapper(ctx): mapper_name = getattr(mapper, "__name__", repr(mapper)) - logger.info( - f"POD-EVENT: phase-map {mapper_name} -> {event.event_type} exec={ctx.execution_id}" - ) + logger.info(f"POD-EVENT: phase-map {mapper_name} -> {event.event_type} exec={ctx.execution_id}") events.append(event) # Event type mappers for mapper in self._event_type_mappers.get(event_type, []): if event := mapper(ctx): mapper_name = getattr(mapper, "__name__", repr(mapper)) - logger.info( - f"POD-EVENT: type-map {mapper_name} -> {event.event_type} exec={ctx.execution_id}" - ) + logger.info(f"POD-EVENT: type-map {mapper_name} -> {event.event_type} exec={ctx.execution_id}") events.append(event) return events @@ -165,25 +155,19 @@ def _extract_execution_id(self, pod: k8s_client.V1Pod) -> str | None: # Try labels first if pod.metadata.labels and (exec_id := pod.metadata.labels.get("execution-id")): - logger.debug( - f"POD-EVENT: extracted exec-id from label name={pod.metadata.name} exec_id={exec_id}" - ) + logger.debug(f"POD-EVENT: extracted exec-id from label name={pod.metadata.name} exec_id={exec_id}") return str(exec_id) - + # Try annotations if pod.metadata.annotations and (exec_id := pod.metadata.annotations.get("integr8s.io/execution-id")): - logger.debug( - f"POD-EVENT: extracted exec-id from annotation name={pod.metadata.name} exec_id={exec_id}" - ) + logger.debug(f"POD-EVENT: extracted exec-id from annotation name={pod.metadata.name} exec_id={exec_id}") 
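# Illustrative sketch (not from the patch): the execution id is resolved with a
# three-step fallback -- label, then annotation, then the "exec-" name prefix.
# Condensed over plain dicts and strings:
def extract_execution_id(labels: dict[str, str], annotations: dict[str, str], pod_name: str) -> str | None:
    if exec_id := labels.get("execution-id"):  # label value (may be truncated to 63 chars)
        return exec_id
    if exec_id := annotations.get("integr8s.io/execution-id"):  # annotation keeps the full id
        return exec_id
    if pod_name.startswith("exec-"):  # naming convention as a last resort
        return pod_name[len("exec-"):]
    return None


# extract_execution_id({}, {}, "exec-1234") == "1234"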
return str(exec_id) - + # Try pod name pattern if pod.metadata.name and pod.metadata.name.startswith("exec-"): - logger.debug( - f"POD-EVENT: extracted exec-id from name pattern name={pod.metadata.name}" - ) + logger.debug(f"POD-EVENT: extracted exec-id from name pattern name={pod.metadata.name}") return str(pod.metadata.name[5:]) - + return None def _create_metadata(self, pod: k8s_client.V1Pod) -> EventMetadata: @@ -193,21 +177,15 @@ def _create_metadata(self, pod: k8s_client.V1Pod) -> EventMetadata: # Try to get correlation_id from annotations first (full value), # then labels (potentially truncated) - correlation_id = ( - annotations.get("integr8s.io/correlation-id") or - labels.get("correlation-id") or - "" - ) + correlation_id = annotations.get("integr8s.io/correlation-id") or labels.get("correlation-id") or "" md = EventMetadata( user_id=labels.get("user-id"), service_name=GroupId.POD_MONITOR, service_version="1.0.0", - correlation_id=correlation_id - ) - logger.info( - f"POD-EVENT: metadata user_id={md.user_id} corr={md.correlation_id} name={pod.metadata.name}" + correlation_id=correlation_id, ) + logger.info(f"POD-EVENT: metadata user_id={md.user_id} corr={md.correlation_id} name={pod.metadata.name}") return md def _is_duplicate(self, pod_name: str, phase: PodPhase) -> bool: @@ -225,9 +203,7 @@ def _map_scheduled(self, ctx: PodContext) -> PodScheduledEvent | None: # Find PodScheduled condition scheduled_condition = next( - (c for c in ctx.pod.status.conditions - if c.type == "PodScheduled" and c.status == "True"), - None + (c for c in ctx.pod.status.conditions if c.type == "PodScheduled" and c.status == "True"), None ) if not scheduled_condition: @@ -237,7 +213,7 @@ def _map_scheduled(self, ctx: PodContext) -> PodScheduledEvent | None: execution_id=ctx.execution_id, pod_name=ctx.pod.metadata.name, node_name=ctx.pod.spec.node_name or "pending", - metadata=ctx.metadata + metadata=ctx.metadata, ) logger.debug(f"POD-EVENT: mapped scheduled -> {evt.event_type} exec={ctx.execution_id}") return evt @@ -253,7 +229,7 @@ def _map_running(self, ctx: PodContext) -> PodRunningEvent | None: "name": status.name, "ready": str(status.ready), "restart_count": str(status.restart_count), - "state": self._format_container_state(status.state) + "state": self._format_container_state(status.state), } for status in (ctx.pod.status.container_statuses or []) ] @@ -262,7 +238,7 @@ def _map_running(self, ctx: PodContext) -> PodRunningEvent | None: execution_id=ctx.execution_id, pod_name=ctx.pod.metadata.name, container_statuses=json.dumps(container_statuses), # Serialize as JSON string - metadata=ctx.metadata + metadata=ctx.metadata, ) logger.debug(f"POD-EVENT: mapped running -> {evt.event_type} exec={ctx.execution_id}") return evt @@ -283,11 +259,9 @@ def _map_completed(self, ctx: PodContext) -> ExecutionCompletedEvent | None: stdout=logs.stdout, stderr=logs.stderr, resource_usage=logs.resource_usage or ResourceUsageDomain.from_dict({}), - metadata=ctx.metadata - ) - logger.info( - f"POD-EVENT: mapped completed exec={ctx.execution_id} exit_code={exit_code}" + metadata=ctx.metadata, ) + logger.info(f"POD-EVENT: mapped completed exec={ctx.execution_id} exit_code={exit_code}") return evt def _map_failed_or_completed(self, ctx: PodContext) -> BaseEvent | None: @@ -308,8 +282,11 @@ def _map_failed(self, ctx: PodContext) -> ExecutionFailedEvent | None: # If no stderr from logs but we have an error message, use it as stderr stderr = logs.stderr if logs.stderr else error_info.message # Ensure exit_code is populated 
(fallback to logs or generic non-zero) - exit_code = error_info.exit_code if error_info.exit_code is not None \ + exit_code = ( + error_info.exit_code + if error_info.exit_code is not None else (logs.exit_code if logs.exit_code is not None else 1) + ) evt = ExecutionFailedEvent( execution_id=ctx.execution_id, @@ -320,7 +297,7 @@ def _map_failed(self, ctx: PodContext) -> ExecutionFailedEvent | None: stderr=stderr, error_message=stderr, resource_usage=logs.resource_usage or ResourceUsageDomain.from_dict({}), - metadata=ctx.metadata + metadata=ctx.metadata, ) logger.info( f"POD-EVENT: mapped failed exec={ctx.execution_id} error_type={error_info.error_type} " @@ -341,7 +318,7 @@ def _map_terminated(self, ctx: PodContext) -> PodTerminatedEvent | None: exit_code=terminated.exit_code, reason=terminated.reason or "Terminated", message=getattr(terminated, "message", None), - metadata=ctx.metadata + metadata=ctx.metadata, ) logger.info( f"POD-EVENT: mapped terminated exec={ctx.execution_id} reason={terminated.reason} " @@ -361,11 +338,9 @@ def _check_timeout(self, ctx: PodContext) -> ExecutionTimeoutEvent | None: stdout=logs.stdout, stderr=logs.stderr, resource_usage=logs.resource_usage or ResourceUsageDomain.from_dict({}), - metadata=ctx.metadata - ) - logger.info( - f"POD-EVENT: mapped timeout exec={ctx.execution_id} adl={ctx.pod.spec.active_deadline_seconds}" + metadata=ctx.metadata, ) + logger.info(f"POD-EVENT: mapped timeout exec={ctx.execution_id} adl={ctx.pod.spec.active_deadline_seconds}") return evt def _get_main_container(self, pod: k8s_client.V1Pod) -> k8s_client.V1ContainerStatus | None: @@ -380,9 +355,7 @@ def _all_containers_succeeded(self, pod: k8s_client.V1Pod) -> bool: return False return all( - status.state and - status.state.terminated and - status.state.terminated.exit_code == 0 + status.state and status.state.terminated and status.state.terminated.exit_code == 0 for status in pod.status.container_statuses ) @@ -403,6 +376,7 @@ def _format_container_state(self, state: k8s_client.V1ContainerState | None) -> @dataclass class FailureInfo: """Pod failure analysis result""" + message: str error_type: ExecutionErrorType exit_code: int | None = None @@ -412,7 +386,7 @@ def _analyze_failure(self, pod: k8s_client.V1Pod) -> FailureInfo: # Default failure info default = self.FailureInfo( message=(pod.status.message if pod.status else None) or "Pod failed", - error_type=ExecutionErrorType.SYSTEM_ERROR + error_type=ExecutionErrorType.SYSTEM_ERROR, ) if not pod.status: @@ -421,12 +395,11 @@ def _analyze_failure(self, pod: k8s_client.V1Pod) -> FailureInfo: # Check for resource limits if pod.status.reason == "Evicted": return self.FailureInfo( - message="Pod evicted due to resource constraints", - error_type=ExecutionErrorType.RESOURCE_LIMIT + message="Pod evicted due to resource constraints", error_type=ExecutionErrorType.RESOURCE_LIMIT ) # Check container statuses - for status in (pod.status.container_statuses or []): + for status in pod.status.container_statuses or []: # Terminated container if status.state and status.state.terminated: terminated = status.state.terminated @@ -437,7 +410,7 @@ def _analyze_failure(self, pod: k8s_client.V1Pod) -> FailureInfo: return self.FailureInfo( message=term_msg or status_msg or f"Container exited with code {terminated.exit_code}", error_type=ExecutionErrorType.SCRIPT_ERROR, - exit_code=terminated.exit_code + exit_code=terminated.exit_code, ) # Waiting container @@ -451,15 +424,13 @@ def _analyze_failure(self, pod: k8s_client.V1Pod) -> FailureInfo: if 
error_type := error_type_map.get(waiting.reason): return self.FailureInfo( - message=waiting.message or f"Container waiting: {waiting.reason}", - error_type=error_type + message=waiting.message or f"Container waiting: {waiting.reason}", error_type=error_type ) # Check for OOM if "OOMKilled" in (pod.status.message or ""): return self.FailureInfo( - message="Container killed due to out of memory", - error_type=ExecutionErrorType.RESOURCE_LIMIT + message="Container killed due to out of memory", error_type=ExecutionErrorType.RESOURCE_LIMIT ) return default @@ -472,8 +443,7 @@ def _extract_logs(self, pod: k8s_client.V1Pod) -> PodLogs: # Check if any container terminated has_terminated = any( - status.state and status.state.terminated - for status in (pod.status.container_statuses if pod.status else []) + status.state and status.state.terminated for status in (pod.status.container_statuses if pod.status else []) ) if not has_terminated: @@ -482,14 +452,12 @@ def _extract_logs(self, pod: k8s_client.V1Pod) -> PodLogs: try: logs = self._k8s_api.read_namespaced_pod_log( - name=pod.metadata.name, - namespace=pod.metadata.namespace or "integr8scode", - tail_lines=10000 + name=pod.metadata.name, namespace=pod.metadata.namespace or "integr8scode", tail_lines=10000 ) - + if not logs: return PodLogs() - + # Try to parse executor JSON return self._parse_executor_output(logs) @@ -506,7 +474,7 @@ def _parse_executor_output(self, logs: str) -> PodLogs: return result # Try line by line - for line in logs_stripped.split('\n'): + for line in logs_stripped.split("\n"): if result := self._try_parse_json(line.strip()): return result @@ -516,7 +484,7 @@ def _parse_executor_output(self, logs: str) -> PodLogs: def _try_parse_json(self, text: str) -> PodLogs | None: """Try to parse text as executor JSON output""" - if not (text.startswith('{') and text.endswith('}')): + if not (text.startswith("{") and text.endswith("}")): return None data = ast.literal_eval(text) @@ -524,7 +492,7 @@ def _try_parse_json(self, text: str) -> PodLogs | None: stdout=data.get("stdout", ""), stderr=data.get("stderr", ""), exit_code=data.get("exit_code", 0), - resource_usage=ResourceUsageDomain.from_dict(data.get("resource_usage", {})) + resource_usage=ResourceUsageDomain.from_dict(data.get("resource_usage", {})), ) def _log_extraction_error(self, pod_name: str, error: str) -> None: @@ -538,7 +506,6 @@ def _log_extraction_error(self, pod_name: str, error: str) -> None: else: logger.warning(f"Failed to extract logs from pod {pod_name}: {error}") - def clear_cache(self) -> None: """Clear event cache""" self._event_cache.clear() diff --git a/backend/app/services/pod_monitor/monitor.py b/backend/app/services/pod_monitor/monitor.py index 889cfe70..97fbbb4f 100644 --- a/backend/app/services/pod_monitor/monitor.py +++ b/backend/app/services/pod_monitor/monitor.py @@ -41,6 +41,7 @@ class WatchEventType(StringEnum): """Kubernetes watch event types.""" + ADDED = "ADDED" MODIFIED = "MODIFIED" DELETED = "DELETED" @@ -48,6 +49,7 @@ class WatchEventType(StringEnum): class MonitorState(StringEnum): """Pod monitor states.""" + IDLE = auto() RUNNING = auto() STOPPING = auto() @@ -56,6 +58,7 @@ class MonitorState(StringEnum): class ErrorType(StringEnum): """Error types for metrics.""" + RESOURCE_VERSION_EXPIRED = auto() API_ERROR = auto() UNEXPECTED = auto() @@ -65,6 +68,7 @@ class ErrorType(StringEnum): @dataclass(frozen=True, slots=True) class WatchContext: """Immutable context for watch operations.""" + namespace: str label_selector: str field_selector: str 
| None @@ -75,6 +79,7 @@ class WatchContext: @dataclass(frozen=True, slots=True) class PodEvent: """Immutable pod event data.""" + event_type: WatchEventType pod: k8s_client.V1Pod resource_version: ResourceVersion | None @@ -83,6 +88,7 @@ class PodEvent: @dataclass(frozen=True, slots=True) class ReconciliationResult: """Result of state reconciliation.""" + missing_pods: set[PodName] extra_pods: set[PodName] duration_seconds: float @@ -93,12 +99,7 @@ class ReconciliationResult: class EventPublisher(Protocol): """Protocol for event publishing.""" - async def send_event( - self, - event: BaseEvent, - topic: str, - key: str | None = None - ) -> bool: + async def send_event(self, event: BaseEvent, topic: str, key: str | None = None) -> bool: """Send an event to a topic.""" ... @@ -113,12 +114,7 @@ class UnifiedProducerAdapter: def __init__(self, producer: UnifiedProducer) -> None: self._producer = producer - async def send_event( - self, - event: BaseEvent, - topic: str, - key: str | None = None - ) -> bool: + async def send_event(self, event: BaseEvent, topic: str, key: str | None = None) -> bool: """Send event and return success status.""" try: await self._producer.produce(event_to_produce=event, key=key) @@ -136,22 +132,20 @@ async def is_healthy(self) -> bool: class PodMonitor(LifecycleEnabled): """ Monitors Kubernetes pods and publishes lifecycle events. - + This service watches pods with specific labels using the K8s watch API, maps Kubernetes events to application events, and publishes them to Kafka. """ - def __init__(self, config: PodMonitorConfig, producer: UnifiedProducer, - k8s_clients: K8sClients | None = None) -> None: + def __init__( + self, config: PodMonitorConfig, producer: UnifiedProducer, k8s_clients: K8sClients | None = None + ) -> None: """Initialize the pod monitor.""" self.config = config or PodMonitorConfig() settings = get_settings() # Kafka configuration - self.kafka_servers = ( - self.config.kafka_bootstrap_servers or - settings.KAFKA_BOOTSTRAP_SERVERS - ) + self.kafka_servers = self.config.kafka_bootstrap_servers or settings.KAFKA_BOOTSTRAP_SERVERS # Kubernetes clients (initialized on start) self._v1: k8s_client.CoreV1Api | None = None @@ -269,8 +263,7 @@ async def _watch_pods(self) -> None: case 410: # Gone - resource version too old logger.warning("Resource version expired, resetting watch") self._last_resource_version = None - self._metrics.record_pod_monitor_watch_error( - str(ErrorType.RESOURCE_VERSION_EXPIRED.value)) + self._metrics.record_pod_monitor_watch_error(str(ErrorType.RESOURCE_VERSION_EXPIRED.value)) case _: logger.error(f"API error in watch: {e}") self._metrics.record_pod_monitor_watch_error(str(ErrorType.API_ERROR.value)) @@ -291,13 +284,10 @@ async def _watch_pod_events(self) -> None: label_selector=self.config.label_selector, field_selector=self.config.field_selector, timeout_seconds=self.config.watch_timeout_seconds, - resource_version=self._last_resource_version + resource_version=self._last_resource_version, ) - logger.info( - f"Starting pod watch with selector: {context.label_selector}, " - f"namespace: {context.namespace}" - ) + logger.info(f"Starting pod watch with selector: {context.label_selector}, namespace: {context.namespace}") # Create watch stream kwargs = { @@ -316,10 +306,7 @@ async def _watch_pod_events(self) -> None: if not self._watch or not self._v1: raise RuntimeError("Watch or API not initialized") - stream = self._watch.stream( - self._v1.list_namespaced_pod, - **kwargs - ) + stream = 
self._watch.stream(self._v1.list_namespaced_pod, **kwargs) try: for event in stream: @@ -345,13 +332,11 @@ async def _process_raw_event(self, raw_event: KubeEvent) -> None: try: # Parse event event = PodEvent( - event_type=WatchEventType(raw_event['type'].upper()), - pod=raw_event['object'], + event_type=WatchEventType(raw_event["type"].upper()), + pod=raw_event["object"], resource_version=( - raw_event['object'].metadata.resource_version - if raw_event['object'].metadata - else None - ) + raw_event["object"].metadata.resource_version if raw_event["object"].metadata else None + ), ) await self._process_pod_event(event) @@ -386,10 +371,7 @@ async def _process_pod_event(self, event: PodEvent) -> None: self._metrics.update_pod_monitor_pods_watched(len(self._tracked_pods)) # Map to application events - app_events = self._event_mapper.map_pod_event( - event.pod, - event.event_type - ) + app_events = self._event_mapper.map_pod_event(event.pod, event.event_type) # Publish events for app_event in app_events: @@ -411,11 +393,7 @@ async def _process_pod_event(self, event: PodEvent) -> None: logger.error(f"Error processing pod event: {e}", exc_info=True) self._metrics.record_pod_monitor_watch_error(str(ErrorType.PROCESSING_ERROR.value)) - async def _publish_event( - self, - event: BaseEvent, - pod: k8s_client.V1Pod - ) -> None: + async def _publish_event(self, event: BaseEvent, pod: k8s_client.V1Pod) -> None: """Publish event to Kafka.""" try: # Get proper topic from event type mapping @@ -427,12 +405,9 @@ async def _publish_event( event.metadata.correlation_id = pod.metadata.labels.get("execution-id") # Get execution ID from event if it has one - execution_id = getattr(event, 'execution_id', None) or event.aggregate_id + execution_id = getattr(event, "execution_id", None) or event.aggregate_id - logger.info( - f"Publishing event {event.event_type} to topic {topic} " - f"for execution_id: {execution_id}" - ) + logger.info(f"Publishing event {event.event_type} to topic {topic} for execution_id: {execution_id}") # Check producer health if not await self._producer.is_healthy(): @@ -461,17 +436,13 @@ async def _handle_watch_error(self) -> None: if self._reconnect_attempts > self.config.max_reconnect_attempts: logger.error( - f"Max reconnect attempts ({self.config.max_reconnect_attempts}) " - f"exceeded, stopping pod monitor" + f"Max reconnect attempts ({self.config.max_reconnect_attempts}) exceeded, stopping pod monitor" ) self._state = MonitorState.STOPPING return # Calculate exponential backoff - backoff = min( - self.config.watch_reconnect_delay * (2 ** (self._reconnect_attempts - 1)), - MAX_BACKOFF_SECONDS - ) + backoff = min(self.config.watch_reconnect_delay * (2 ** (self._reconnect_attempts - 1)), MAX_BACKOFF_SECONDS) logger.info( f"Reconnecting watch in {backoff}s " @@ -511,13 +482,11 @@ async def _reconcile_state(self) -> ReconciliationResult: extra_pods=set(), duration_seconds=time.time() - start_time, success=False, - error="K8s API not initialized" + error="K8s API not initialized", ) pods = await asyncio.to_thread( - self._v1.list_namespaced_pod, - namespace=self.config.namespace, - label_selector=self.config.label_selector + self._v1.list_namespaced_pod, namespace=self.config.namespace, label_selector=self.config.label_selector ) # Get current pod names @@ -532,9 +501,7 @@ async def _reconcile_state(self) -> ReconciliationResult: if pod.metadata.name in missing_pods: logger.info(f"Reconciling missing pod: {pod.metadata.name}") event = PodEvent( - event_type=WatchEventType.ADDED, - pod=pod, - 
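# Illustrative sketch (not from the patch): watch reconnects use capped
# exponential backoff, base_delay * 2**(attempt - 1) clamped to a maximum.
# The base delay and cap below are arbitrary example values.
def reconnect_backoff(attempt: int, base_delay: float = 5.0, max_backoff: float = 300.0) -> float:
    return min(base_delay * (2 ** (attempt - 1)), max_backoff)


# attempt 1 -> 5s, 2 -> 10s, 3 -> 20s, 4 -> 40s, 5 -> 80s, 6 -> 160s, 7+ -> capped at 300s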
resource_version=pod.metadata.resource_version + event_type=WatchEventType.ADDED, pod=pod, resource_version=pod.metadata.resource_version ) await self._process_pod_event(event) @@ -550,10 +517,7 @@ async def _reconcile_state(self) -> ReconciliationResult: duration = time.time() - start_time return ReconciliationResult( - missing_pods=missing_pods, - extra_pods=extra_pods, - duration_seconds=duration, - success=True + missing_pods=missing_pods, extra_pods=extra_pods, duration_seconds=duration, success=True ) except Exception as e: @@ -565,7 +529,7 @@ async def _reconcile_state(self) -> ReconciliationResult: extra_pods=set(), duration_seconds=time.time() - start_time, success=False, - error=str(e) + error=str(e), ) def _log_reconciliation_result(self, result: ReconciliationResult) -> None: @@ -577,10 +541,7 @@ def _log_reconciliation_result(self, result: ReconciliationResult) -> None: f"{len(result.extra_pods)} extra pods" ) else: - logger.error( - f"Reconciliation failed after {result.duration_seconds:.2f}s: " - f"{result.error}" - ) + logger.error(f"Reconciliation failed after {result.duration_seconds:.2f}s: {result.error}") async def get_status(self) -> StatusDict: """Get monitor status.""" @@ -592,16 +553,16 @@ async def get_status(self) -> StatusDict: "config": { "namespace": self.config.namespace, "label_selector": self.config.label_selector, - "enable_reconciliation": self.config.enable_state_reconciliation - } + "enable_reconciliation": self.config.enable_state_reconciliation, + }, } @asynccontextmanager async def create_pod_monitor( - config: PodMonitorConfig, - producer: UnifiedProducer, - k8s_clients: K8sClients | None = None, + config: PodMonitorConfig, + producer: UnifiedProducer, + k8s_clients: K8sClients | None = None, ) -> AsyncIterator[PodMonitor]: """Create and manage a pod monitor instance.""" monitor = PodMonitor(config=config, producer=producer, k8s_clients=k8s_clients) diff --git a/backend/app/services/rate_limit_service.py b/backend/app/services/rate_limit_service.py index 47f01629..e4b8d9e5 100644 --- a/backend/app/services/rate_limit_service.py +++ b/backend/app/services/rate_limit_service.py @@ -31,8 +31,8 @@ def __init__(self, redis_client: redis.Redis, settings: Settings, metrics: "Rate self.metrics = metrics # Patterns to match IDs and replace with * - self._uuid_pattern = re.compile(r'[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}') - self._id_pattern = re.compile(r'/[0-9a-zA-Z]{20,}(?=/|$)') + self._uuid_pattern = re.compile(r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}") + self._id_pattern = re.compile(r"/[0-9a-zA-Z]{20,}(?=/|$)") def _index_key(self, user_id: str) -> str: """Key of the Redis set that indexes all per-user rate limit state keys.""" @@ -43,8 +43,8 @@ async def _register_user_key(self, user_id: str, key: str) -> None: _ = await cast(Awaitable[int], self.redis.sadd(self._index_key(user_id), key)) def _normalize_endpoint(self, endpoint: str) -> str: - normalized = self._uuid_pattern.sub('*', endpoint) - normalized = self._id_pattern.sub('/*', normalized) + normalized = self._uuid_pattern.sub("*", endpoint) + normalized = self._id_pattern.sub("/*", normalized) return normalized @contextmanager @@ -75,11 +75,9 @@ def _labels(self, ctx: "RateLimitService._Context") -> dict[str, str]: "algorithm": ctx.algorithm.value, } if ctx.rule is not None: - labels.update({ - "group": ctx.rule.group.value, - "priority": str(ctx.rule.priority), - "multiplier": str(ctx.multiplier) - }) + labels.update( + 
{"group": ctx.rule.group.value, "priority": str(ctx.rule.priority), "multiplier": str(ctx.multiplier)} + ) return labels def _unlimited(self, algo: RateLimitAlgorithm = RateLimitAlgorithm.SLIDING_WINDOW) -> RateLimitStatus: @@ -106,10 +104,7 @@ def _prepare_config(self, config: RateLimitConfig) -> None: user_limit.rules.sort(key=lambda r: r.priority, reverse=True) async def check_rate_limit( - self, - user_id: str, - endpoint: str, - config: Optional[RateLimitConfig] = None + self, user_id: str, endpoint: str, config: Optional[RateLimitConfig] = None ) -> RateLimitStatus: start_time = time.time() # Tracing attributes added at end of check @@ -123,9 +118,14 @@ async def check_rate_limit( try: if not self.settings.RATE_LIMIT_ENABLED: # Track request when rate limiting is disabled - self.metrics.requests_total.add(1, {"authenticated": str(ctx.authenticated).lower(), - "endpoint": ctx.normalized_endpoint, - "algorithm": "disabled"}) + self.metrics.requests_total.add( + 1, + { + "authenticated": str(ctx.authenticated).lower(), + "endpoint": ctx.normalized_endpoint, + "algorithm": "disabled", + }, + ) return self._unlimited() if config is None: @@ -142,17 +142,27 @@ async def check_rate_limit( user_config = config.user_overrides.get(str(user_id)) if user_config and user_config.bypass_rate_limit: self.metrics.bypass.add(1, {"endpoint": ctx.normalized_endpoint}) - self.metrics.requests_total.add(1, {"authenticated": str(ctx.authenticated).lower(), - "endpoint": ctx.normalized_endpoint, - "algorithm": "bypassed"}) + self.metrics.requests_total.add( + 1, + { + "authenticated": str(ctx.authenticated).lower(), + "endpoint": ctx.normalized_endpoint, + "algorithm": "bypassed", + }, + ) return self._unlimited() # Find matching rule rule = self._find_matching_rule(endpoint, user_config, config) if not rule: - self.metrics.requests_total.add(1, {"authenticated": str(ctx.authenticated).lower(), - "endpoint": ctx.normalized_endpoint, - "algorithm": "no_limit"}) + self.metrics.requests_total.add( + 1, + { + "authenticated": str(ctx.authenticated).lower(), + "endpoint": ctx.normalized_endpoint, + "algorithm": "no_limit", + }, + ) return self._unlimited() # Apply user multiplier if exists @@ -162,13 +172,19 @@ async def check_rate_limit( ctx.algorithm = rule.algorithm # Track total requests with algorithm - self.metrics.requests_total.add(1, {"authenticated": str(ctx.authenticated).lower(), - "endpoint": ctx.normalized_endpoint, - "algorithm": rule.algorithm.value}) + self.metrics.requests_total.add( + 1, + { + "authenticated": str(ctx.authenticated).lower(), + "endpoint": ctx.normalized_endpoint, + "algorithm": rule.algorithm.value, + }, + ) # Record window size - self.metrics.window_size.record(rule.window_seconds, {"endpoint": ctx.normalized_endpoint, - "algorithm": rule.algorithm.value}) + self.metrics.window_size.record( + rule.window_seconds, {"endpoint": ctx.normalized_endpoint, "algorithm": rule.algorithm.value} + ) # Check rate limit based on algorithm (avoid duplicate branches) timer_attrs = { @@ -207,17 +223,13 @@ async def check_rate_limit( ) return status finally: - self.metrics.check_duration.record((time.time() - start_time) * 1000, - {"endpoint": ctx.normalized_endpoint, - "authenticated": str(ctx.authenticated).lower()}) + self.metrics.check_duration.record( + (time.time() - start_time) * 1000, + {"endpoint": ctx.normalized_endpoint, "authenticated": str(ctx.authenticated).lower()}, + ) async def _check_sliding_window( - self, - user_id: str, - endpoint: str, - limit: int, - window_seconds: 
int, - rule: RateLimitRule + self, user_id: str, endpoint: str, limit: int, window_seconds: int, rule: RateLimitRule ) -> RateLimitStatus: key = f"{self.prefix}sw:{user_id}:{endpoint}" await self._register_user_key(user_id, key) @@ -226,8 +238,7 @@ async def _check_sliding_window( normalized_endpoint = self._normalize_endpoint(endpoint) - with self._timer(self.metrics.redis_duration, {"operation": "sliding_window", - "endpoint": normalized_endpoint}): + with self._timer(self.metrics.redis_duration, {"operation": "sliding_window", "endpoint": normalized_endpoint}): pipe = self.redis.pipeline() pipe.zremrangebyscore(key, 0, window_start) pipe.zadd(key, {str(now): now}) @@ -255,7 +266,7 @@ async def _check_sliding_window( reset_at=datetime.fromtimestamp(now + retry_after, timezone.utc), retry_after=retry_after, matched_rule=rule.endpoint_pattern, - algorithm=RateLimitAlgorithm.SLIDING_WINDOW + algorithm=RateLimitAlgorithm.SLIDING_WINDOW, ) return RateLimitStatus( @@ -265,17 +276,11 @@ async def _check_sliding_window( reset_at=datetime.fromtimestamp(now + window_seconds, timezone.utc), retry_after=None, matched_rule=rule.endpoint_pattern, - algorithm=RateLimitAlgorithm.SLIDING_WINDOW + algorithm=RateLimitAlgorithm.SLIDING_WINDOW, ) async def _check_token_bucket( - self, - user_id: str, - endpoint: str, - limit: int, - window_seconds: int, - burst_multiplier: float, - rule: RateLimitRule + self, user_id: str, endpoint: str, limit: int, window_seconds: int, burst_multiplier: float, rule: RateLimitRule ) -> RateLimitStatus: key = f"{self.prefix}tb:{user_id}:{endpoint}" max_tokens = int(limit * burst_multiplier) @@ -288,8 +293,9 @@ async def _check_token_bucket( await self._register_user_key(user_id, key) # Get current bucket state - with self._timer(self.metrics.redis_duration, {"operation": "token_bucket_get", - "endpoint": normalized_endpoint}): + with self._timer( + self.metrics.redis_duration, {"operation": "token_bucket_get", "endpoint": normalized_endpoint} + ): bucket_data = await self.redis.get(key) if bucket_data: @@ -306,23 +312,22 @@ async def _check_token_bucket( last_refill = now # Record token bucket metrics - self.metrics.token_bucket_tokens.record(tokens, { - "endpoint": normalized_endpoint, - }) - self.metrics.token_bucket_refill_rate.record(refill_rate, { - "endpoint": normalized_endpoint - }) + self.metrics.token_bucket_tokens.record( + tokens, + { + "endpoint": normalized_endpoint, + }, + ) + self.metrics.token_bucket_refill_rate.record(refill_rate, {"endpoint": normalized_endpoint}) # Try to consume a token if tokens >= 1: tokens -= 1 - bucket = { - "tokens": tokens, - "last_refill": now - } + bucket = {"tokens": tokens, "last_refill": now} - with self._timer(self.metrics.redis_duration, {"operation": "token_bucket_set", - "endpoint": normalized_endpoint}): + with self._timer( + self.metrics.redis_duration, {"operation": "token_bucket_set", "endpoint": normalized_endpoint} + ): await self.redis.setex(key, window_seconds * 2, json.dumps(bucket)) return RateLimitStatus( @@ -332,7 +337,7 @@ async def _check_token_bucket( reset_at=datetime.fromtimestamp(now + window_seconds, timezone.utc), retry_after=None, matched_rule=rule.endpoint_pattern, - algorithm=RateLimitAlgorithm.TOKEN_BUCKET + algorithm=RateLimitAlgorithm.TOKEN_BUCKET, ) else: # Calculate when next token will be available @@ -345,14 +350,11 @@ async def _check_token_bucket( reset_at=datetime.fromtimestamp(now + retry_after, timezone.utc), retry_after=retry_after, matched_rule=rule.endpoint_pattern, - 
algorithm=RateLimitAlgorithm.TOKEN_BUCKET + algorithm=RateLimitAlgorithm.TOKEN_BUCKET, ) def _find_matching_rule( - self, - endpoint: str, - user_config: Optional[UserRateLimit], - global_config: RateLimitConfig + self, endpoint: str, user_config: Optional[UserRateLimit], global_config: RateLimitConfig ) -> Optional[RateLimitRule]: rules = [] @@ -388,7 +390,7 @@ async def _get_config(self) -> RateLimitConfig: await self.redis.setex( config_key, 300, # Cache for 5 minutes - mapper.model_dump_json(config) + mapper.model_dump_json(config), ) # Prepare for fast matching @@ -422,11 +424,7 @@ async def update_config(self, config: RateLimitConfig) -> None: self.metrics.custom_users.record(custom_users_count) self.metrics.bypass_users.record(bypass_users_count) - async def update_user_rate_limit( - self, - user_id: str, - user_limit: UserRateLimit - ) -> None: + async def update_user_rate_limit(self, user_id: str, user_limit: UserRateLimit) -> None: config = await self._get_config() config.user_overrides[str(user_id)] = user_limit await self.update_config(config) @@ -436,9 +434,9 @@ async def get_user_rate_limit(self, user_id: str) -> Optional[UserRateLimit]: return config.user_overrides.get(str(user_id)) async def get_user_rate_limit_summary( - self, - user_id: str, - config: Optional[RateLimitConfig] = None, + self, + user_id: str, + config: Optional[RateLimitConfig] = None, ) -> UserRateLimitSummary: """Return a summary for the user's rate limit configuration with sensible defaults. @@ -486,7 +484,7 @@ async def reset_user_limits(self, user_id: str) -> None: await self.redis.delete(*keys) await self.redis.delete(index_key) - async def get_usage_stats(self, user_id: str) -> dict: + async def get_usage_stats(self, user_id: str) -> dict[str, dict[str, object]]: stats: dict[str, dict[str, object]] = {} index_key = self._index_key(user_id) keys = await cast(Awaitable[set[Any]], self.redis.smembers(index_key)) diff --git a/backend/app/services/replay_service.py b/backend/app/services/replay_service.py index 34333ace..916f3d76 100644 --- a/backend/app/services/replay_service.py +++ b/backend/app/services/replay_service.py @@ -19,11 +19,7 @@ class ReplayService: """Service for managing replay sessions and providing business logic""" - def __init__( - self, - repository: ReplayRepository, - event_replay_service: EventReplayService - ) -> None: + def __init__(self, repository: ReplayRepository, event_replay_service: EventReplayService) -> None: self.repository = repository self.event_replay_service = event_replay_service @@ -51,8 +47,9 @@ async def start_session(self, session_id: str) -> ReplayOperationResult: await self.repository.update_session_status(session_id, ReplayStatus.RUNNING) - return ReplayOperationResult(session_id=session_id, status=ReplayStatus.RUNNING, - message="Replay session started") + return ReplayOperationResult( + session_id=session_id, status=ReplayStatus.RUNNING, message="Replay session started" + ) except ValueError as e: raise ServiceError(str(e), status_code=404) from e @@ -67,8 +64,9 @@ async def pause_session(self, session_id: str) -> ReplayOperationResult: await self.repository.update_session_status(session_id, ReplayStatus.PAUSED) - return ReplayOperationResult(session_id=session_id, status=ReplayStatus.PAUSED, - message="Replay session paused") + return ReplayOperationResult( + session_id=session_id, status=ReplayStatus.PAUSED, message="Replay session paused" + ) except ValueError as e: raise ServiceError(str(e), status_code=404) from e @@ -83,8 +81,9 @@ async def 
resume_session(self, session_id: str) -> ReplayOperationResult: await self.repository.update_session_status(session_id, ReplayStatus.RUNNING) - return ReplayOperationResult(session_id=session_id, status=ReplayStatus.RUNNING, - message="Replay session resumed") + return ReplayOperationResult( + session_id=session_id, status=ReplayStatus.RUNNING, message="Replay session resumed" + ) except ValueError as e: raise ServiceError(str(e), status_code=404) from e @@ -99,8 +98,9 @@ async def cancel_session(self, session_id: str) -> ReplayOperationResult: await self.repository.update_session_status(session_id, ReplayStatus.CANCELLED) - return ReplayOperationResult(session_id=session_id, status=ReplayStatus.CANCELLED, - message="Replay session cancelled") + return ReplayOperationResult( + session_id=session_id, status=ReplayStatus.CANCELLED, message="Replay session cancelled" + ) except ValueError as e: raise ServiceError(str(e), status_code=404) from e @@ -108,11 +108,7 @@ async def cancel_session(self, session_id: str) -> ReplayOperationResult: logger.error(f"Failed to cancel replay session: {e}") raise ServiceError(str(e), status_code=500) from e - def list_sessions( - self, - status: ReplayStatus | None = None, - limit: int = 100 - ) -> List[ReplaySessionState]: + def list_sessions(self, status: ReplayStatus | None = None, limit: int = 100) -> List[ReplaySessionState]: """List replay sessions with optional filtering (domain objects).""" return self.event_replay_service.list_sessions(status=status, limit=limit) @@ -144,5 +140,3 @@ async def cleanup_old_sessions(self, older_than_hours: int = 24) -> CleanupRespo except Exception as e: logger.error(f"Failed to cleanup old sessions: {e}") raise ServiceError(str(e), status_code=500) from e - - diff --git a/backend/app/services/result_processor/processor.py b/backend/app/services/result_processor/processor.py index 8138004e..1755fa3d 100644 --- a/backend/app/services/result_processor/processor.py +++ b/backend/app/services/result_processor/processor.py @@ -35,6 +35,7 @@ class ProcessingState(StringEnum): """Processing state enumeration.""" + IDLE = auto() PROCESSING = auto() STOPPED = auto() @@ -50,7 +51,7 @@ class ResultProcessorConfig(BaseModel): default_factory=lambda: [ KafkaTopic.EXECUTION_COMPLETED, KafkaTopic.EXECUTION_FAILED, - KafkaTopic.EXECUTION_TIMEOUT + KafkaTopic.EXECUTION_TIMEOUT, ] ) result_topic: KafkaTopic = Field(default=KafkaTopic.EXECUTION_RESULTS) @@ -62,10 +63,7 @@ class ResultProcessor(LifecycleEnabled): """Service for processing execution completion events and storing results.""" def __init__( - self, - execution_repo: ExecutionRepository, - producer: UnifiedProducer, - idempotency_manager: IdempotencyManager + self, execution_repo: ExecutionRepository, producer: UnifiedProducer, idempotency_manager: IdempotencyManager ) -> None: """Initialize the result processor.""" self.config = ResultProcessorConfig() @@ -128,17 +126,14 @@ async def _create_consumer(self) -> IdempotentConsumerWrapper: group_id=f"{self.config.consumer_group}.{settings.KAFKA_GROUP_SUFFIX}", max_poll_records=1, enable_auto_commit=True, - auto_offset_reset="earliest" + auto_offset_reset="earliest", ) # Create consumer with schema registry and dispatcher if not self._dispatcher: raise RuntimeError("Event dispatcher not initialized") - base_consumer = UnifiedConsumer( - consumer_config, - event_dispatcher=self._dispatcher - ) + base_consumer = UnifiedConsumer(consumer_config, event_dispatcher=self._dispatcher) wrapper = IdempotentConsumerWrapper( 
consumer=base_consumer, idempotency_manager=self._idempotency_manager, @@ -169,8 +164,7 @@ async def _handle_completed(self, event: ExecutionCompletedEvent) -> None: exec_obj = await self._execution_repo.get_execution(event.execution_id) if exec_obj is None: - raise ServiceError(message=f"Execution {event.execution_id} not found", - status_code=404) + raise ServiceError(message=f"Execution {event.execution_id} not found", status_code=404) lang_and_version = f"{exec_obj.lang}-{exec_obj.lang_version}" @@ -188,8 +182,7 @@ async def _handle_completed(self, event: ExecutionCompletedEvent) -> None: memory_limit_mib = int(settings_limit.rstrip("Mi")) # TODO: Less brittle acquisition of limit memory_percent = (memory_mib / memory_limit_mib) * 100 self._metrics.memory_utilization_percent.record( - memory_percent, - attributes={"lang_and_version": lang_and_version} + memory_percent, attributes={"lang_and_version": lang_and_version} ) result = ExecutionResultDomain( @@ -215,8 +208,7 @@ async def _handle_failed(self, event: ExecutionFailedEvent) -> None: # Fetch execution to get language and version for metrics exec_obj = await self._execution_repo.get_execution(event.execution_id) if exec_obj is None: - raise ServiceError(message=f"Execution {event.execution_id} not found", - status_code=404) + raise ServiceError(message=f"Execution {event.execution_id} not found", status_code=404) self._metrics.record_error(event.error_type) lang_and_version = f"{exec_obj.lang}-{exec_obj.lang_version}" @@ -244,8 +236,7 @@ async def _handle_timeout(self, event: ExecutionTimeoutEvent) -> None: exec_obj = await self._execution_repo.get_execution(event.execution_id) if exec_obj is None: - raise ServiceError(message=f"Execution {event.execution_id} not found", - status_code=404) + raise ServiceError(message=f"Execution {event.execution_id} not found", status_code=404) self._metrics.record_error(ExecutionErrorType.TIMEOUT) lang_and_version = f"{exec_obj.lang}-{exec_obj.lang_version}" @@ -286,16 +277,9 @@ async def _publish_result_stored(self, result: ExecutionResultDomain) -> None: ), ) - await self._producer.produce( - event_to_produce=event, - key=result.execution_id - ) + await self._producer.produce(event_to_produce=event, key=result.execution_id) - async def _publish_result_failed( - self, - execution_id: str, - error_message: str - ) -> None: + async def _publish_result_failed(self, execution_id: str, error_message: str) -> None: """Publish result processing failed event.""" event = ResultFailedEvent( @@ -307,10 +291,7 @@ async def _publish_result_failed( ), ) - await self._producer.produce( - event_to_produce=event, - key=execution_id - ) + await self._producer.produce(event_to_produce=event, key=execution_id) async def get_status(self) -> dict[str, Any]: """Get processor status.""" diff --git a/backend/app/services/result_processor/resource_cleaner.py b/backend/app/services/result_processor/resource_cleaner.py index b01fa253..2fb5c4c4 100644 --- a/backend/app/services/result_processor/resource_cleaner.py +++ b/backend/app/services/result_processor/resource_cleaner.py @@ -45,12 +45,12 @@ async def initialize(self) -> None: raise ServiceError(f"Kubernetes initialization failed: {e}") from e async def cleanup_pod_resources( - self, - pod_name: str, - namespace: str = "integr8scode", - execution_id: str | None = None, - timeout: int = 60, - delete_pvcs: bool = False, + self, + pod_name: str, + namespace: str = "integr8scode", + execution_id: str | None = None, + timeout: int = 60, + delete_pvcs: bool = False, ) -> 
None: """Clean up all resources associated with a pod""" await self.initialize() @@ -62,21 +62,14 @@ async def cleanup_pod_resources( *( [ self._delete_configmaps(execution_id, namespace), - *( - [self._delete_pvcs(execution_id, namespace)] - if delete_pvcs - else [] - ), + *([self._delete_pvcs(execution_id, namespace)] if delete_pvcs else []), ] if execution_id else [] ), ] - await asyncio.wait_for( - asyncio.gather(*tasks, return_exceptions=True), - timeout=timeout - ) + await asyncio.wait_for(asyncio.gather(*tasks, return_exceptions=True), timeout=timeout) logger.info(f"Successfully cleaned up resources for pod: {pod_name}") @@ -91,24 +84,13 @@ async def _delete_pod(self, pod_name: str, namespace: str) -> None: """Delete a pod""" if not self.v1: raise ServiceError("Kubernetes client not initialized") - + try: loop = asyncio.get_event_loop() - await loop.run_in_executor( - None, - self.v1.read_namespaced_pod, - pod_name, - namespace - ) + await loop.run_in_executor(None, self.v1.read_namespaced_pod, pod_name, namespace) await loop.run_in_executor( - None, - partial( - self.v1.delete_namespaced_pod, - pod_name, - namespace, - grace_period_seconds=30 - ) + None, partial(self.v1.delete_namespaced_pod, pod_name, namespace, grace_period_seconds=30) ) logger.info(f"Deleted pod: {pod_name}") @@ -124,64 +106,50 @@ async def _delete_configmaps(self, execution_id: str, namespace: str) -> None: """Delete ConfigMaps for an execution""" if not self.v1: raise ServiceError("Kubernetes client not initialized") - + await self._delete_labeled_resources( execution_id, namespace, self.v1.list_namespaced_config_map, self.v1.delete_namespaced_config_map, - "ConfigMap" + "ConfigMap", ) - async def _delete_pvcs(self, execution_id: str, namespace: str) -> None: """Delete PersistentVolumeClaims for an execution""" if not self.v1: raise ServiceError("Kubernetes client not initialized") - + await self._delete_labeled_resources( execution_id, namespace, self.v1.list_namespaced_persistent_volume_claim, self.v1.delete_namespaced_persistent_volume_claim, - "PVC" + "PVC", ) async def _delete_labeled_resources( - self, - execution_id: str, - namespace: str, - list_func: Any, - delete_func: Any, - resource_type: str + self, execution_id: str, namespace: str, list_func: Any, delete_func: Any, resource_type: str ) -> None: """Generic function to delete labeled resources""" try: loop = asyncio.get_event_loop() label_selector = f"execution-id={execution_id}" - resources = await loop.run_in_executor( - None, - partial(list_func, namespace, label_selector=label_selector) - ) + resources = await loop.run_in_executor(None, partial(list_func, namespace, label_selector=label_selector)) for resource in resources.items: - await loop.run_in_executor( - None, - delete_func, - resource.metadata.name, - namespace - ) + await loop.run_in_executor(None, delete_func, resource.metadata.name, namespace) logger.info(f"Deleted {resource_type}: {resource.metadata.name}") except ApiException as e: logger.error(f"Failed to delete {resource_type}s: {e}") async def cleanup_orphaned_resources( - self, - namespace: str = "integr8scode", - max_age_hours: int = 24, - dry_run: bool = False, + self, + namespace: str = "integr8scode", + max_age_hours: int = 24, + dry_run: bool = False, ) -> ResourceDict: """Clean up orphaned resources older than specified age""" await self.initialize() @@ -204,32 +172,23 @@ async def cleanup_orphaned_resources( raise ServiceError(f"Orphaned resource cleanup failed: {e}") from e async def _cleanup_orphaned_pods( - self, - 
namespace: str, - cutoff_time: datetime, - cleaned: ResourceDict, - dry_run: bool + self, namespace: str, cutoff_time: datetime, cleaned: ResourceDict, dry_run: bool ) -> None: """Clean up orphaned pods""" if not self.v1: raise ServiceError("Kubernetes client not initialized") - + loop = asyncio.get_event_loop() pods = await loop.run_in_executor( - None, - partial( - self.v1.list_namespaced_pod, - namespace, - label_selector="app=integr8s" - ) + None, partial(self.v1.list_namespaced_pod, namespace, label_selector="app=integr8s") ) terminal_phases = {"Succeeded", "Failed", "Unknown"} for pod in pods.items: if ( - pod.metadata.creation_timestamp.replace(tzinfo=timezone.utc) < cutoff_time - and pod.status.phase in terminal_phases + pod.metadata.creation_timestamp.replace(tzinfo=timezone.utc) < cutoff_time + and pod.status.phase in terminal_phases ): cleaned["pods"].append(pod.metadata.name) @@ -240,24 +199,15 @@ async def _cleanup_orphaned_pods( logger.error(f"Failed to delete orphaned pod {pod.metadata.name}: {e}") async def _cleanup_orphaned_configmaps( - self, - namespace: str, - cutoff_time: datetime, - cleaned: ResourceDict, - dry_run: bool + self, namespace: str, cutoff_time: datetime, cleaned: ResourceDict, dry_run: bool ) -> None: """Clean up orphaned ConfigMaps""" if not self.v1: raise ServiceError("Kubernetes client not initialized") - + loop = asyncio.get_event_loop() configmaps = await loop.run_in_executor( - None, - partial( - self.v1.list_namespaced_config_map, - namespace, - label_selector="app=integr8s" - ) + None, partial(self.v1.list_namespaced_config_map, namespace, label_selector="app=integr8s") ) for cm in configmaps.items: @@ -267,10 +217,7 @@ async def _cleanup_orphaned_configmaps( if not dry_run: try: await loop.run_in_executor( - None, - self.v1.delete_namespaced_config_map, - cm.metadata.name, - namespace + None, self.v1.delete_namespaced_config_map, cm.metadata.name, namespace ) except Exception as e: logger.error(f"Failed to delete orphaned ConfigMap {cm.metadata.name}: {e}") @@ -289,10 +236,9 @@ async def get_resource_usage(self, namespace: str = "default") -> CountDict: try: if not self.v1: raise ServiceError("Kubernetes client not initialized") - + pods = await loop.run_in_executor( - None, - partial(self.v1.list_namespaced_pod, namespace, label_selector=label_selector) + None, partial(self.v1.list_namespaced_pod, namespace, label_selector=label_selector) ) pod_count = len(pods.items) except Exception as e: @@ -303,10 +249,9 @@ async def get_resource_usage(self, namespace: str = "default") -> CountDict: try: if not self.v1: raise ServiceError("Kubernetes client not initialized") - + configmaps = await loop.run_in_executor( - None, - partial(self.v1.list_namespaced_config_map, namespace, label_selector=label_selector) + None, partial(self.v1.list_namespaced_config_map, namespace, label_selector=label_selector) ) configmap_count = len(configmaps.items) except Exception as e: @@ -317,10 +262,12 @@ async def get_resource_usage(self, namespace: str = "default") -> CountDict: try: if not self.networking_v1: raise ServiceError("Kubernetes networking client not initialized") - + policies = await loop.run_in_executor( None, - partial(self.networking_v1.list_namespaced_network_policy, namespace, label_selector=label_selector) + partial( + self.networking_v1.list_namespaced_network_policy, namespace, label_selector=label_selector + ), ) policy_count = len(policies.items) except Exception as e: diff --git a/backend/app/services/saga/base_saga.py 
b/backend/app/services/saga/base_saga.py index 40a4b96a..6e64a17a 100644 --- a/backend/app/services/saga/base_saga.py +++ b/backend/app/services/saga/base_saga.py @@ -1,4 +1,5 @@ from abc import ABC, abstractmethod +from typing import Any from app.domain.enums.events import EventType from app.services.saga.saga_step import SagaStep @@ -6,7 +7,7 @@ class BaseSaga(ABC): """Base class for saga implementations. - + All saga implementations should inherit from this class and implement the required abstract methods to define their workflow. """ @@ -15,7 +16,7 @@ class BaseSaga(ABC): @abstractmethod def get_name(cls) -> str: """Get the unique name of this saga. - + Returns: String identifier for this saga type """ @@ -25,16 +26,16 @@ def get_name(cls) -> str: @abstractmethod def get_trigger_events(cls) -> list[EventType]: """Get event types that trigger this saga. - + Returns: List of event types that should start this saga """ pass @abstractmethod - def get_steps(self) -> list[SagaStep]: + def get_steps(self) -> list[SagaStep[Any]]: """Get saga steps in execution order. - + Returns: Ordered list of steps to execute for this saga """ diff --git a/backend/app/services/saga/execution_saga.py b/backend/app/services/saga/execution_saga.py index e38fd1ab..479d7c99 100644 --- a/backend/app/services/saga/execution_saga.py +++ b/backend/app/services/saga/execution_saga.py @@ -1,5 +1,5 @@ import logging -from typing import Optional +from typing import Any, Optional from app.db.repositories.resource_allocation_repository import ResourceAllocationRepository from app.domain.enums.events import EventType @@ -179,17 +179,14 @@ async def execute(self, context: SagaContext, event: ExecutionRequestedEvent) -> metadata=EventMetadata( service_name="saga-orchestrator", service_version="1.0.0", - user_id=event.metadata.user_id if event.metadata else "system" - ) + user_id=event.metadata.user_id if event.metadata else "system", + ), ) # Publish command to saga_commands topic if not self.producer: raise RuntimeError("Producer dependency not injected") - await self.producer.produce( - event_to_produce=create_pod_cmd, - key=execution_id - ) + await self.producer.produce(event_to_produce=create_pod_cmd, key=execution_id) context.set("pod_creation_triggered", True) logger.info(f"CreatePodCommandEvent published for execution {execution_id}") @@ -237,6 +234,7 @@ def get_compensation(self) -> CompensationStep | None: # Compensation Steps + class ReleaseResourcesCompensation(CompensationStep): """Release allocated resources""" @@ -317,17 +315,12 @@ async def compensate(self, context: SagaContext) -> bool: execution_id=execution_id, reason="Saga compensation due to failure", metadata=EventMetadata( - service_name="saga-orchestrator", - service_version="1.0.0", - user_id=context.get("user_id", "system") - ) + service_name="saga-orchestrator", service_version="1.0.0", user_id=context.get("user_id", "system") + ), ) - await self.producer.produce( - event_to_produce=delete_pod_cmd, - key=execution_id - ) - + await self.producer.produce(event_to_produce=delete_pod_cmd, key=execution_id) + logger.info(f"DeletePodCommandEvent published for {execution_id}") return True @@ -349,7 +342,7 @@ def get_trigger_events(cls) -> list[EventType]: """Get events that trigger this saga""" return [EventType.EXECUTION_REQUESTED] - def get_steps(self) -> list[SagaStep]: + def get_steps(self) -> list[SagaStep[Any]]: """Get saga steps in order""" alloc_repo = getattr(self, "_alloc_repo", None) producer = getattr(self, "_producer", None) diff --git 
a/backend/app/services/saga/saga_orchestrator.py b/backend/app/services/saga/saga_orchestrator.py index 7fca1d72..c72a0624 100644 --- a/backend/app/services/saga/saga_orchestrator.py +++ b/backend/app/services/saga/saga_orchestrator.py @@ -33,13 +33,13 @@ class SagaOrchestrator(LifecycleEnabled): """Orchestrates saga execution and compensation""" def __init__( - self, - config: SagaConfig, - saga_repository: SagaRepository, - producer: UnifiedProducer, - event_store: EventStore, - idempotency_manager: IdempotencyManager, - resource_allocation_repository: ResourceAllocationRepository, + self, + config: SagaConfig, + saga_repository: SagaRepository, + producer: UnifiedProducer, + event_store: EventStore, + idempotency_manager: IdempotencyManager, + resource_allocation_repository: ResourceAllocationRepository, ): self.config = config self._sagas: dict[str, type[BaseSaga]] = {} @@ -51,7 +51,7 @@ def __init__( self._repo: SagaRepository = saga_repository self._alloc_repo: ResourceAllocationRepository = resource_allocation_repository self._running = False - self._tasks: list[asyncio.Task] = [] + self._tasks: list[asyncio.Task[None]] = [] def register_saga(self, saga_class: type[BaseSaga]) -> None: self._sagas[saga_class.get_name()] = saga_class @@ -171,8 +171,10 @@ async def _handle_event(self, event: BaseEvent) -> None: def _should_trigger_saga(self, saga_class: type[BaseSaga], event: BaseEvent) -> bool: trigger_event_types = saga_class.get_trigger_events() should_trigger = event.event_type in trigger_event_types - logger.debug(f"Saga {saga_class.get_name()} triggers on {trigger_event_types}, " - f"event is {event.event_type}, should trigger: {should_trigger}") + logger.debug( + f"Saga {saga_class.get_name()} triggers on {trigger_event_types}, " + f"event is {event.event_type}, should trigger: {should_trigger}" + ) return should_trigger async def _start_saga(self, saga_name: str, trigger_event: BaseEvent) -> str | None: @@ -224,11 +226,11 @@ async def _start_saga(self, saga_name: str, trigger_event: BaseEvent) -> str | N return instance.saga_id async def _execute_saga( - self, - saga: BaseSaga, - instance: Saga, - context: SagaContext, - trigger_event: BaseEvent, + self, + saga: BaseSaga, + instance: Saga, + context: SagaContext, + trigger_event: BaseEvent, ) -> None: """Execute saga steps""" tracer = get_tracer() @@ -393,10 +395,10 @@ async def get_execution_sagas(self, execution_id: str) -> list[Saga]: async def cancel_saga(self, saga_id: str) -> bool: """Cancel a running saga and trigger compensation. - + Args: saga_id: The ID of the saga to cancel - + Returns: True if cancelled successfully, False otherwise """ @@ -480,7 +482,7 @@ async def cancel_saga(self, saga_id: str) -> bool: async def _publish_saga_cancelled_event(self, saga_instance: Saga) -> None: """Publish saga cancelled event. - + Args: saga_instance: The cancelled saga instance """ @@ -514,20 +516,20 @@ async def _publish_saga_cancelled_event(self, saga_instance: Saga) -> None: def create_saga_orchestrator( - saga_repository: SagaRepository, - producer: UnifiedProducer, - event_store: EventStore, - idempotency_manager: IdempotencyManager, - resource_allocation_repository: ResourceAllocationRepository, - config: SagaConfig, + saga_repository: SagaRepository, + producer: UnifiedProducer, + event_store: EventStore, + idempotency_manager: IdempotencyManager, + resource_allocation_repository: ResourceAllocationRepository, + config: SagaConfig, ) -> SagaOrchestrator: """Factory function to create a saga orchestrator. 
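# Illustrative sketch (not part of the patch): how the orchestrator's saga
# registry and trigger matching fit together. BaseSaga exposes class-level
# get_name()/get_trigger_events(), register_saga() keys the class by name, and
# _should_trigger_saga() is a membership test on the trigger list. The EventType
# values and the ToySaga class below are simplified stand-ins, not project code.
from dataclasses import dataclass
from enum import Enum


class EventType(str, Enum):
    EXECUTION_REQUESTED = "execution.requested"
    EXECUTION_COMPLETED = "execution.completed"


@dataclass
class Event:
    event_type: EventType


class ToySaga:
    @classmethod
    def get_name(cls) -> str:
        return "execution_saga"

    @classmethod
    def get_trigger_events(cls) -> list[EventType]:
        return [EventType.EXECUTION_REQUESTED]


registry: dict[str, type[ToySaga]] = {ToySaga.get_name(): ToySaga}


def should_trigger(saga_cls: type[ToySaga], event: Event) -> bool:
    # Mirrors _should_trigger_saga: the saga starts only for its declared triggers.
    return event.event_type in saga_cls.get_trigger_events()


assert should_trigger(ToySaga, Event(EventType.EXECUTION_REQUESTED))
assert not should_trigger(ToySaga, Event(EventType.EXECUTION_COMPLETED))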
- + Args: producer: Kafka producer instance event_store: Event store instance for event sourcing config: Optional saga configuration (uses defaults if not provided) - + Returns: A new saga orchestrator instance """ diff --git a/backend/app/services/saga/saga_service.py b/backend/app/services/saga/saga_service.py index b09291ff..fd2f1594 100644 --- a/backend/app/services/saga/saga_service.py +++ b/backend/app/services/saga/saga_service.py @@ -14,12 +14,7 @@ class SagaService: """Service for saga business logic and orchestration.""" - def __init__( - self, - saga_repo: SagaRepository, - execution_repo: ExecutionRepository, - orchestrator: SagaOrchestrator - ): + def __init__(self, saga_repo: SagaRepository, execution_repo: ExecutionRepository, orchestrator: SagaOrchestrator): self.saga_repo = saga_repo self.execution_repo = execution_repo self.orchestrator = orchestrator @@ -29,15 +24,11 @@ def __init__( extra={ "saga_repo": type(saga_repo).__name__, "execution_repo": type(execution_repo).__name__, - "orchestrator": type(orchestrator).__name__ - } + "orchestrator": type(orchestrator).__name__, + }, ) - async def check_execution_access( - self, - execution_id: str, - user: User - ) -> bool: + async def check_execution_access(self, execution_id: str, user: User) -> bool: """Check if user has access to an execution.""" # Admins have access to all executions if user.role == UserRole.ADMIN: @@ -50,20 +41,13 @@ async def check_execution_access( logger.debug( f"Access denied for user {user.user_id} to execution {execution_id}", - extra={"user_role": user.role, "execution_exists": execution is not None} + extra={"user_role": user.role, "execution_exists": execution is not None}, ) return False - async def get_saga_with_access_check( - self, - saga_id: str, - user: User - ) -> Saga: + async def get_saga_with_access_check(self, saga_id: str, user: User) -> Saga: """Get saga with access control.""" - logger.debug( - f"Getting saga {saga_id} for user {user.user_id}", - extra={"user_role": user.role} - ) + logger.debug(f"Getting saga {saga_id} for user {user.user_id}", extra={"user_role": user.role}) saga = await self.saga_repo.get_saga(saga_id) if not saga: @@ -73,40 +57,25 @@ async def get_saga_with_access_check( # Check access permissions if not await self.check_execution_access(saga.execution_id, user): logger.warning( - f"Access denied for user {user.user_id} to saga {saga_id}", - extra={"execution_id": saga.execution_id} - ) - raise SagaAccessDeniedError( - f"Access denied - you don't have access to execution {saga.execution_id}" + f"Access denied for user {user.user_id} to saga {saga_id}", extra={"execution_id": saga.execution_id} ) + raise SagaAccessDeniedError(f"Access denied - you don't have access to execution {saga.execution_id}") return saga - async def get_execution_sagas( - self, - execution_id: str, - user: User, - state: SagaState | None = None - ) -> list[Saga]: + async def get_execution_sagas(self, execution_id: str, user: User, state: SagaState | None = None) -> list[Saga]: """Get sagas for an execution with access control.""" # Check access to execution if not await self.check_execution_access(execution_id, user): logger.warning( - f"Access denied for user {user.user_id} to execution {execution_id}", - extra={"user_role": user.role} - ) - raise SagaAccessDeniedError( - f"Access denied - no access to execution {execution_id}" + f"Access denied for user {user.user_id} to execution {execution_id}", extra={"user_role": user.role} ) + raise SagaAccessDeniedError(f"Access denied - no 
access to execution {execution_id}") return await self.saga_repo.get_sagas_by_execution(execution_id, state) async def list_user_sagas( - self, - user: User, - state: SagaState | None = None, - limit: int = 100, - skip: int = 0 + self, user: User, state: SagaState | None = None, limit: int = 100, skip: int = 0 ) -> SagaListResult: """List sagas accessible by user.""" saga_filter = SagaFilter(state=state) @@ -117,56 +86,40 @@ async def list_user_sagas( saga_filter.execution_ids = user_execution_ids logger.debug( f"Filtering sagas for user {user.user_id}", - extra={"execution_count": len(user_execution_ids) if user_execution_ids else 0} + extra={"execution_count": len(user_execution_ids) if user_execution_ids else 0}, ) # Get sagas from repository result = await self.saga_repo.list_sagas(saga_filter, limit, skip) logger.debug( f"Listed {len(result.sagas)} sagas for user {user.user_id}", - extra={"total": result.total, "state_filter": str(state) if state else None} + extra={"total": result.total, "state_filter": str(state) if state else None}, ) return result - async def cancel_saga( - self, - saga_id: str, - user: User - ) -> bool: + async def cancel_saga(self, saga_id: str, user: User) -> bool: """Cancel a saga with permission check.""" - logger.info( - f"User {user.user_id} requesting cancellation of saga {saga_id}", - extra={"user_role": user.role} - ) + logger.info(f"User {user.user_id} requesting cancellation of saga {saga_id}", extra={"user_role": user.role}) # Get saga with access check saga = await self.get_saga_with_access_check(saga_id, user) # Check if saga can be cancelled if saga.state not in [SagaState.RUNNING, SagaState.CREATED]: raise SagaInvalidStateError( - f"Cannot cancel saga in {saga.state} state. " - f"Only RUNNING or CREATED sagas can be cancelled." + f"Cannot cancel saga in {saga.state} state. Only RUNNING or CREATED sagas can be cancelled." 
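# Illustrative sketch (not part of the patch): the cancellation guard from
# SagaService.cancel_saga reduced to a standalone check. The SagaState values
# and the ValueError here are simplified stand-ins for the project's enum and
# SagaInvalidStateError.
from enum import Enum


class SagaState(str, Enum):
    CREATED = "created"
    RUNNING = "running"
    COMPLETED = "completed"
    CANCELLED = "cancelled"


CANCELLABLE_STATES = {SagaState.RUNNING, SagaState.CREATED}


def ensure_cancellable(state: SagaState) -> None:
    # Only RUNNING or CREATED sagas may be cancelled; anything else is rejected.
    if state not in CANCELLABLE_STATES:
        raise ValueError(f"Cannot cancel saga in {state.value} state.")


ensure_cancellable(SagaState.RUNNING)        # passes silently
# ensure_cancellable(SagaState.COMPLETED)    # would raise ValueError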
) # Use orchestrator to cancel success = await self.orchestrator.cancel_saga(saga_id) if success: logger.info( - f"User {user.user_id} cancelled saga {saga_id}", - extra={"user_role": user.role, "saga_id": saga_id} + f"User {user.user_id} cancelled saga {saga_id}", extra={"user_role": user.role, "saga_id": saga_id} ) else: - logger.error( - f"Failed to cancel saga {saga_id} for user {user.user_id}", - extra={"saga_id": saga_id} - ) + logger.error(f"Failed to cancel saga {saga_id} for user {user.user_id}", extra={"saga_id": saga_id}) return success - async def get_saga_statistics( - self, - user: User, - include_all: bool = False - ) -> dict[str, object]: + async def get_saga_statistics(self, user: User, include_all: bool = False) -> dict[str, object]: """Get saga statistics.""" saga_filter = None @@ -177,23 +130,18 @@ async def get_saga_statistics( return await self.saga_repo.get_saga_statistics(saga_filter) - async def get_saga_status_from_orchestrator( - self, - saga_id: str, - user: User - ) -> Saga | None: + async def get_saga_status_from_orchestrator(self, saga_id: str, user: User) -> Saga | None: """Get saga status from orchestrator with fallback to database.""" logger.debug(f"Getting live saga status for {saga_id}") # Try orchestrator first for live status saga = await self.orchestrator.get_saga_status(saga_id) if saga: - # Check access if not await self.check_execution_access(saga.execution_id, user): logger.warning( f"Access denied for user {user.user_id} to live saga {saga_id}", - extra={"execution_id": saga.execution_id} + extra={"execution_id": saga.execution_id}, ) raise SagaAccessDeniedError(f"Access denied - no access to execution {saga.execution_id}") diff --git a/backend/app/services/saga/saga_step.py b/backend/app/services/saga/saga_step.py index bf07fe0f..e03d0e22 100644 --- a/backend/app/services/saga/saga_step.py +++ b/backend/app/services/saga/saga_step.py @@ -8,7 +8,7 @@ logger = logging.getLogger(__name__) -T = TypeVar('T', bound=BaseEvent) +T = TypeVar("T", bound=BaseEvent) class SagaContext: @@ -35,7 +35,7 @@ def add_event(self, event: BaseEvent) -> None: """Add event to context""" self.events.append(event) - def add_compensation(self, compensation: 'CompensationStep') -> None: + def add_compensation(self, compensation: "CompensationStep") -> None: """Add compensation step""" self.compensations.append(compensation) @@ -80,7 +80,7 @@ def __init__(self, name: str): async def execute(self, context: SagaContext, event: T) -> bool: """ Execute the saga step - + Returns: True if step succeeded and saga should continue False if step failed and compensation should start @@ -88,7 +88,7 @@ async def execute(self, context: SagaContext, event: T) -> bool: pass @abstractmethod - def get_compensation(self) -> Optional['CompensationStep']: + def get_compensation(self) -> Optional["CompensationStep"]: """Get compensation step for this action""" pass @@ -110,7 +110,7 @@ def __init__(self, name: str): async def compensate(self, context: SagaContext) -> bool: """ Execute compensation logic - + Returns: True if compensation succeeded False if compensation failed diff --git a/backend/app/services/saved_script_service.py b/backend/app/services/saved_script_service.py index 9dbde36a..d36e6e96 100644 --- a/backend/app/services/saved_script_service.py +++ b/backend/app/services/saved_script_service.py @@ -1,4 +1,3 @@ - from app.core.exceptions import ServiceError from app.core.logging import logger from app.db.repositories import SavedScriptRepository @@ -14,9 +13,7 @@ def __init__(self, 
saved_script_repo: SavedScriptRepository): self.saved_script_repo = saved_script_repo async def create_saved_script( - self, - saved_script_create: DomainSavedScriptCreate, - user_id: str + self, saved_script_create: DomainSavedScriptCreate, user_id: str ) -> DomainSavedScript: logger.info( "Creating new saved script", @@ -39,11 +36,7 @@ async def create_saved_script( ) return created_script - async def get_saved_script( - self, - script_id: str, - user_id: str - ) -> DomainSavedScript: + async def get_saved_script(self, script_id: str, user_id: str) -> DomainSavedScript: logger.info( "Retrieving saved script", extra={ @@ -68,10 +61,7 @@ async def get_saved_script( return script async def update_saved_script( - self, - script_id: str, - user_id: str, - update_data: DomainSavedScriptUpdate + self, script_id: str, user_id: str, update_data: DomainSavedScriptUpdate ) -> DomainSavedScript: logger.info( "Updating saved script", @@ -94,11 +84,7 @@ async def update_saved_script( ) return updated_script - async def delete_saved_script( - self, - script_id: str, - user_id: str - ) -> None: + async def delete_saved_script(self, script_id: str, user_id: str) -> None: logger.info( "Deleting saved script", extra={ @@ -114,10 +100,7 @@ async def delete_saved_script( extra={"script_id": script_id, "user_id": user_id}, ) - async def list_saved_scripts( - self, - user_id: str - ) -> list[DomainSavedScript]: + async def list_saved_scripts(self, user_id: str) -> list[DomainSavedScript]: logger.info( "Listing saved scripts", extra={ diff --git a/backend/app/services/sse/redis_bus.py b/backend/app/services/sse/redis_bus.py index 074c33a6..bb3dae5a 100644 --- a/backend/app/services/sse/redis_bus.py +++ b/backend/app/services/sse/redis_bus.py @@ -31,16 +31,15 @@ async def close(self) -> None: try: await self._pubsub.unsubscribe(self._channel) finally: - await self._pubsub.aclose() + await self._pubsub.aclose() # type: ignore[no-untyped-call] class SSERedisBus: """Redis-backed pub/sub bus for SSE event fan-out across workers.""" - def __init__(self, - redis_client: redis.Redis, - exec_prefix: str = "sse:exec:", - notif_prefix: str = "sse:notif:") -> None: + def __init__( + self, redis_client: redis.Redis, exec_prefix: str = "sse:exec:", notif_prefix: str = "sse:notif:" + ) -> None: self._redis = redis_client self._exec_prefix = exec_prefix self._notif_prefix = notif_prefix diff --git a/backend/app/services/sse/sse_service.py b/backend/app/services/sse/sse_service.py index 4ab388a3..63f6bc57 100644 --- a/backend/app/services/sse/sse_service.py +++ b/backend/app/services/sse/sse_service.py @@ -17,7 +17,6 @@ class SSEService: - # Only result_stored should terminate the stream; other terminal-ish # execution events precede the final persisted result and must not close # the connection prematurely. 
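# Illustrative sketch (not part of the patch) of the rule stated in the SSEService
# comment above: execution events such as completed/failed/timeout are forwarded
# to the client but must not end the generator; only the persisted "result_stored"
# event is terminal. Event names here are plain strings standing in for BaseEvent types.
import asyncio
from collections.abc import AsyncIterator


async def stream(events: list[str]) -> AsyncIterator[str]:
    for event_type in events:
        yield event_type                    # every event reaches the client
        if event_type == "result_stored":   # the final persisted result
            break                           # close the stream only now


async def main() -> None:
    seen = [e async for e in stream(["execution_completed", "result_stored", "late_event"])]
    assert seen == ["execution_completed", "result_stored"]


asyncio.run(main())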
@@ -44,30 +43,28 @@ def __init__( self.metrics = get_connection_metrics() self.heartbeat_interval = getattr(settings, "SSE_HEARTBEAT_INTERVAL", 30) - async def create_execution_stream( - self, - execution_id: str, - user_id: str - ) -> AsyncGenerator[Dict[str, Any], None]: + async def create_execution_stream(self, execution_id: str, user_id: str) -> AsyncGenerator[Dict[str, Any], None]: connection_id = f"sse_{execution_id}_{datetime.now(timezone.utc).timestamp()}" - + shutdown_event = await self.shutdown_manager.register_connection(execution_id, connection_id) if shutdown_event is None: - yield self._format_event("error", { - "error": "Server is shutting down", - "timestamp": datetime.now(timezone.utc).isoformat() - }) + yield self._format_event( + "error", {"error": "Server is shutting down", "timestamp": datetime.now(timezone.utc).isoformat()} + ) return subscription = None try: # Start opening subscription concurrently, then yield handshake sub_task = asyncio.create_task(self.sse_bus.open_subscription(execution_id)) - yield self._format_event("connected", { - "execution_id": execution_id, - "timestamp": datetime.now(timezone.utc).isoformat(), - "connection_id": connection_id - }) + yield self._format_event( + "connected", + { + "execution_id": execution_id, + "timestamp": datetime.now(timezone.utc).isoformat(), + "connection_id": connection_id, + }, + ) # Complete Redis subscription after handshake logger.info(f"Opening Redis subscription for execution {execution_id}") @@ -91,7 +88,7 @@ async def create_execution_stream( include_heartbeat=False, ): yield event_data - + finally: if subscription is not None: await subscription.close() @@ -108,20 +105,22 @@ async def _stream_events_redis( last_heartbeat = datetime.now(timezone.utc) while True: if shutdown_event.is_set(): - yield self._format_event("shutdown", { - "message": "Server is shutting down", - "grace_period": 30, - "timestamp": datetime.now(timezone.utc).isoformat() - }) + yield self._format_event( + "shutdown", + { + "message": "Server is shutting down", + "grace_period": 30, + "timestamp": datetime.now(timezone.utc).isoformat(), + }, + ) break now = datetime.now(timezone.utc) if include_heartbeat and (now - last_heartbeat).total_seconds() >= self.heartbeat_interval: - yield self._format_event("heartbeat", { - "execution_id": execution_id, - "timestamp": now.isoformat(), - "message": "SSE connection active" - }) + yield self._format_event( + "heartbeat", + {"execution_id": execution_id, "timestamp": now.isoformat(), "message": "SSE connection active"}, + ) last_heartbeat = now msg = await subscription.get(timeout=0.5) @@ -184,20 +183,20 @@ async def _stream_events_redis( # Ignore malformed messages continue - async def create_notification_stream( - self, - user_id: str - ) -> AsyncGenerator[Dict[str, Any], None]: + async def create_notification_stream(self, user_id: str) -> AsyncGenerator[Dict[str, Any], None]: subscription = None try: # Start opening subscription concurrently, then yield handshake sub_task = asyncio.create_task(self.sse_bus.open_notification_subscription(user_id)) - yield self._format_event("connected", { - "message": "Connected to notification stream", - "user_id": user_id, - "timestamp": datetime.now(timezone.utc).isoformat() - }) + yield self._format_event( + "connected", + { + "message": "Connected to notification stream", + "user_id": user_id, + "timestamp": datetime.now(timezone.utc).isoformat(), + }, + ) # Complete Redis subscription after handshake subscription = await sub_task @@ -207,11 +206,10 @@ 
async def create_notification_stream( # Heartbeat now = datetime.now(timezone.utc) if (now - last_heartbeat).total_seconds() >= self.heartbeat_interval: - yield self._format_event("heartbeat", { - "timestamp": now.isoformat(), - "user_id": user_id, - "message": "Notification stream active" - }) + yield self._format_event( + "heartbeat", + {"timestamp": now.isoformat(), "user_id": user_id, "message": "Notification stream active"}, + ) last_heartbeat = now # Forward notification messages as SSE data @@ -236,7 +234,7 @@ async def get_health_status(self) -> SSEHealthDomain: active_consumers=router_stats["num_consumers"], max_connections_per_user=5, shutdown=self.shutdown_manager.get_shutdown_status(), - timestamp=datetime.now(timezone.utc) + timestamp=datetime.now(timezone.utc), ) async def _event_to_sse_format(self, event: BaseEvent, execution_id: str) -> Dict[str, Any]: diff --git a/backend/app/services/sse/sse_shutdown_manager.py b/backend/app/services/sse/sse_shutdown_manager.py index e865495b..87011d2b 100644 --- a/backend/app/services/sse/sse_shutdown_manager.py +++ b/backend/app/services/sse/sse_shutdown_manager.py @@ -1,15 +1,17 @@ import asyncio import time from enum import Enum -from typing import Any, Dict, Set +from typing import Dict, Set from app.core.logging import logger from app.core.metrics.context import get_connection_metrics +from app.domain.sse import ShutdownStatus from app.services.sse.kafka_redis_bridge import SSEKafkaRedisBridge class ShutdownPhase(Enum): """Phases of SSE shutdown process""" + READY = "ready" NOTIFYING = "notifying" # Notify connections of impending shutdown DRAINING = "draining" # Wait for connections to close gracefully @@ -20,22 +22,19 @@ class ShutdownPhase(Enum): class SSEShutdownManager: """ Manages graceful shutdown of SSE connections. - + Works alongside the SSEKafkaRedisBridge to: - Track active SSE connections - Notify clients about shutdown - Coordinate graceful disconnection - Ensure clean resource cleanup - + The router handles Kafka consumer shutdown while this manager handles SSE client connection lifecycle. """ def __init__( - self, - drain_timeout: float = 30.0, - notification_timeout: float = 5.0, - force_close_timeout: float = 10.0 + self, drain_timeout: float = 30.0, notification_timeout: float = 5.0, force_close_timeout: float = 10.0 ): self.drain_timeout = drain_timeout self.notification_timeout = notification_timeout @@ -70,11 +69,7 @@ def set_router(self, router: "SSEKafkaRedisBridge") -> None: """Set the router reference for shutdown coordination.""" self._router = router - async def register_connection( - self, - execution_id: str, - connection_id: str - ) -> asyncio.Event | None: + async def register_connection(self, execution_id: str, connection_id: str) -> asyncio.Event | None: """ Register a new SSE connection. 
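# Illustrative sketch (not part of the patch): the contract between
# SSEShutdownManager.register_connection and the stream generators. A None
# return means the server is shutting down and new streams must be refused;
# otherwise the returned asyncio.Event is checked on every loop iteration so an
# in-flight stream can emit a shutdown message and stop. ToyShutdownManager is
# a simplified stand-in, not the project class.
import asyncio


class ToyShutdownManager:
    def __init__(self) -> None:
        self._shutting_down = False
        self._shutdown_event = asyncio.Event()

    def register_connection(self) -> asyncio.Event | None:
        # Refuse new connections once shutdown has started.
        return None if self._shutting_down else self._shutdown_event

    def initiate_shutdown(self) -> None:
        self._shutting_down = True
        self._shutdown_event.set()


async def main() -> None:
    mgr = ToyShutdownManager()
    ev = mgr.register_connection()
    assert ev is not None and not ev.is_set()
    mgr.initiate_shutdown()
    assert ev.is_set()                         # existing streams see the signal
    assert mgr.register_connection() is None   # new streams are rejected


asyncio.run(main())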
@@ -218,10 +213,7 @@ async def _drain_connections(self) -> None: while remaining > 0 and (time.time() - start_time) < self.drain_timeout: # Wait for connections to close try: - await asyncio.wait_for( - self._drain_complete_event.wait(), - timeout=check_interval - ) + await asyncio.wait_for(self._drain_complete_event.wait(), timeout=check_interval) break # All connections drained except asyncio.TimeoutError: pass @@ -270,20 +262,20 @@ def is_shutting_down(self) -> bool: """Check if shutdown is in progress""" return self._shutdown_initiated - def get_shutdown_status(self) -> Dict[str, Any]: + def get_shutdown_status(self) -> ShutdownStatus: """Get current shutdown status""" - status: Dict[str, Any] = { - "phase": self._phase.value, - "initiated": self._shutdown_initiated, - "complete": self._shutdown_complete, - "active_connections": sum(len(conns) for conns in self._active_connections.values()), - "draining_connections": len(self._draining_connections), - } - + duration = None if self._shutdown_start_time: - status["duration"] = time.time() - self._shutdown_start_time - - return status + duration = time.time() - self._shutdown_start_time + + return ShutdownStatus( + phase=self._phase.value, + initiated=self._shutdown_initiated, + complete=self._shutdown_complete, + active_connections=sum(len(conns) for conns in self._active_connections.values()), + draining_connections=len(self._draining_connections), + duration=duration, + ) async def wait_for_shutdown(self, timeout: float | None = None) -> bool: """ @@ -296,10 +288,7 @@ async def wait_for_shutdown(self, timeout: float | None = None) -> bool: return True try: - await asyncio.wait_for( - self._wait_for_complete(), - timeout=timeout - ) + await asyncio.wait_for(self._wait_for_complete(), timeout=timeout) return True except asyncio.TimeoutError: return False @@ -311,22 +300,18 @@ async def _wait_for_complete(self) -> None: def create_sse_shutdown_manager( - drain_timeout: float = 30.0, - notification_timeout: float = 5.0, - force_close_timeout: float = 10.0 + drain_timeout: float = 30.0, notification_timeout: float = 5.0, force_close_timeout: float = 10.0 ) -> SSEShutdownManager: """Factory function to create an SSE shutdown manager. 
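# Illustrative sketch (not part of the patch): the shape of the typed status
# object that get_shutdown_status() now returns instead of a Dict[str, Any].
# The real model is app.domain.sse.ShutdownStatus; the dataclass and
# build_status() helper below are stand-ins showing the fields the diff
# populates and why duration stays None until a shutdown has started.
import time
from dataclasses import dataclass


@dataclass(frozen=True)
class ShutdownStatus:
    phase: str
    initiated: bool
    complete: bool
    active_connections: int
    draining_connections: int
    duration: float | None


def build_status(start_time: float | None, active: int, draining: int) -> ShutdownStatus:
    return ShutdownStatus(
        phase="draining" if start_time is not None else "ready",
        initiated=start_time is not None,
        complete=False,
        active_connections=active,
        draining_connections=draining,
        duration=(time.time() - start_time) if start_time is not None else None,
    )


print(build_status(start_time=None, active=3, draining=0))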
- + Args: drain_timeout: Time to wait for connections to close gracefully notification_timeout: Time to wait for shutdown notifications to be sent force_close_timeout: Time before force closing connections - + Returns: A new SSE shutdown manager instance """ return SSEShutdownManager( - drain_timeout=drain_timeout, - notification_timeout=notification_timeout, - force_close_timeout=force_close_timeout + drain_timeout=drain_timeout, notification_timeout=notification_timeout, force_close_timeout=force_close_timeout ) diff --git a/backend/app/services/user_settings_service.py b/backend/app/services/user_settings_service.py index b1d43b2b..93ac16e2 100644 --- a/backend/app/services/user_settings_service.py +++ b/backend/app/services/user_settings_service.py @@ -18,16 +18,12 @@ DomainUserSettings, DomainUserSettingsUpdate, ) -from app.services.event_bus import EventBusManager +from app.services.event_bus import EventBusEvent, EventBusManager from app.services.kafka_event_service import KafkaEventService class UserSettingsService: - def __init__( - self, - repository: UserSettingsRepository, - event_service: KafkaEventService - ) -> None: + def __init__(self, repository: UserSettingsRepository, event_service: KafkaEventService) -> None: self.repository = repository self.event_service = event_service # TTL+LRU cache for settings @@ -42,20 +38,14 @@ def __init__( logger.info( "UserSettingsService initialized", - extra={ - "cache_ttl_seconds": self._cache_ttl.total_seconds(), - "max_cache_size": self._max_cache_size - } + extra={"cache_ttl_seconds": self._cache_ttl.total_seconds(), "max_cache_size": self._max_cache_size}, ) async def get_user_settings(self, user_id: str) -> DomainUserSettings: """Get settings with cache; rebuild and cache on miss.""" if user_id in self._cache: cached = self._cache[user_id] - logger.debug( - f"Settings cache hit for user {user_id}", - extra={"cache_size": len(self._cache)} - ) + logger.debug(f"Settings cache hit for user {user_id}", extra={"cache_size": len(self._cache)}) return cached return await self.get_user_settings_fresh(user_id) @@ -65,9 +55,8 @@ async def initialize(self, event_bus_manager: EventBusManager) -> None: self._event_bus_manager = event_bus_manager bus = await event_bus_manager.get_event_bus() - async def _handle(evt: dict) -> None: - payload = evt.get("payload", {}) - uid = payload.get("user_id") + async def _handle(evt: EventBusEvent) -> None: + uid = evt.payload.get("user_id") if uid: # Use asyncio.to_thread for the sync operation to make it properly async await asyncio.to_thread(self.invalidate_cache, str(uid)) @@ -92,10 +81,7 @@ async def get_user_settings_fresh(self, user_id: str) -> DomainUserSettings: return settings async def update_user_settings( - self, - user_id: str, - updates: DomainUserSettingsUpdate, - reason: str | None = None + self, user_id: str, updates: DomainUserSettingsUpdate, reason: str | None = None ) -> DomainUserSettings: """Upsert provided fields into current settings, publish minimal event, and cache.""" s = await self.get_user_settings(user_id) @@ -224,41 +210,26 @@ async def update_user_settings( async def update_theme(self, user_id: str, theme: Theme) -> DomainUserSettings: """Update user's theme preference""" return await self.update_user_settings( - user_id, - DomainUserSettingsUpdate(theme=theme), - reason="User changed theme" + user_id, DomainUserSettingsUpdate(theme=theme), reason="User changed theme" ) async def update_notification_settings( - self, - user_id: str, - notifications: DomainNotificationSettings + 
self, user_id: str, notifications: DomainNotificationSettings ) -> DomainUserSettings: """Update notification preferences""" return await self.update_user_settings( user_id, DomainUserSettingsUpdate(notifications=notifications), - reason="User updated notification preferences" + reason="User updated notification preferences", ) - async def update_editor_settings( - self, - user_id: str, - editor: DomainEditorSettings - ) -> DomainUserSettings: + async def update_editor_settings(self, user_id: str, editor: DomainEditorSettings) -> DomainUserSettings: """Update editor preferences""" return await self.update_user_settings( - user_id, - DomainUserSettingsUpdate(editor=editor), - reason="User updated editor settings" + user_id, DomainUserSettingsUpdate(editor=editor), reason="User updated editor settings" ) - async def update_custom_setting( - self, - user_id: str, - key: str, - value: Any - ) -> DomainUserSettings: + async def update_custom_setting(self, user_id: str, key: str, value: Any) -> DomainUserSettings: """Update a custom setting""" current_settings = await self.get_user_settings(user_id) current_settings.custom_settings[key] = value @@ -266,14 +237,10 @@ async def update_custom_setting( return await self.update_user_settings( user_id, DomainUserSettingsUpdate(custom_settings=current_settings.custom_settings), - reason=f"Custom setting '{key}' updated" + reason=f"Custom setting '{key}' updated", ) - async def get_settings_history( - self, - user_id: str, - limit: int = 50 - ) -> List[DomainSettingsHistoryEntry]: + async def get_settings_history(self, user_id: str, limit: int = 50) -> List[DomainSettingsHistoryEntry]: """Get history from changed paths recorded in events.""" events = await self._get_settings_events(user_id, limit=limit) history: list[DomainSettingsHistoryEntry] = [] @@ -309,17 +276,10 @@ async def get_settings_history( ) return history - async def restore_settings_to_point( - self, - user_id: str, - timestamp: datetime - ) -> DomainUserSettings: + async def restore_settings_to_point(self, user_id: str, timestamp: datetime) -> DomainUserSettings: """Restore settings to a specific point in time""" # Get all events up to the timestamp - events = await self._get_settings_events( - user_id, - until=timestamp - ) + events = await self._get_settings_events(user_id, until=timestamp) # Rebuild settings from events settings = DomainUserSettings(user_id=user_id) @@ -345,26 +305,18 @@ async def restore_settings_to_point( return settings async def _get_settings_events( - self, - user_id: str, - since: datetime | None = None, - until: datetime | None = None, - limit: int | None = None + self, user_id: str, since: datetime | None = None, until: datetime | None = None, limit: int | None = None ) -> List[DomainSettingsEvent]: """Get settings-related events for a user""" event_types = [ EventType.USER_SETTINGS_UPDATED, EventType.USER_THEME_CHANGED, EventType.USER_NOTIFICATION_SETTINGS_UPDATED, - EventType.USER_EDITOR_SETTINGS_UPDATED + EventType.USER_EDITOR_SETTINGS_UPDATED, ] raw = await self.repository.get_settings_events( - user_id=user_id, - event_types=event_types, - since=since, - until=until, - limit=limit + user_id=user_id, event_types=event_types, since=since, until=until, limit=limit ) # map to domain out: list[DomainSettingsEvent] = [] @@ -384,7 +336,7 @@ def _apply_event(self, settings: DomainUserSettings, event: DomainSettingsEvent) if event.event_type == EventType.USER_THEME_CHANGED: new_theme = event.payload.get("new_theme") if new_theme: - settings.theme = 
Theme(new_theme) # type: ignore[arg-type] + settings.theme = Theme(new_theme) return settings upd = event.payload.get("updated") @@ -393,7 +345,7 @@ def _apply_event(self, settings: DomainUserSettings, event: DomainSettingsEvent) # Top-level if "theme" in upd: - settings.theme = Theme(upd["theme"]) # type: ignore[arg-type] + settings.theme = Theme(upd["theme"]) if "timezone" in upd: settings.timezone = upd["timezone"] if "date_format" in upd: @@ -431,18 +383,12 @@ def invalidate_cache(self, user_id: str) -> None: """Invalidate cached settings for a user""" removed = self._cache.pop(user_id, None) is not None if removed: - logger.debug( - f"Invalidated cache for user {user_id}", - extra={"cache_size": len(self._cache)} - ) + logger.debug(f"Invalidated cache for user {user_id}", extra={"cache_size": len(self._cache)}) def _add_to_cache(self, user_id: str, settings: DomainUserSettings) -> None: """Add settings to TTL+LRU cache.""" self._cache[user_id] = settings - logger.debug( - f"Cached settings for user {user_id}", - extra={"cache_size": len(self._cache)} - ) + logger.debug(f"Cached settings for user {user_id}", extra={"cache_size": len(self._cache)}) def get_cache_stats(self) -> dict[str, Any]: """Get cache statistics for monitoring.""" @@ -452,13 +398,13 @@ def get_cache_stats(self) -> dict[str, Any]: "expired_entries": 0, "cache_ttl_seconds": self._cache_ttl.total_seconds(), } - + async def reset_user_settings(self, user_id: str) -> None: """Reset user settings by deleting all data and cache.""" # Clear from cache self.invalidate_cache(user_id) - + # Delete from database await self.repository.delete_user_settings(user_id) - + logger.info(f"Reset settings for user {user_id}") diff --git a/backend/app/settings.py b/backend/app/settings.py index f0de8967..34b0383b 100644 --- a/backend/app/settings.py +++ b/backend/app/settings.py @@ -14,7 +14,7 @@ class Settings(BaseSettings): SECRET_KEY: str = Field( ..., # Actual key be loaded from .env file min_length=32, - description="Secret key for JWT token signing. Must be at least 32 characters." + description="Secret key for JWT token signing. 
Must be at least 32 characters.", ) ALGORITHM: str = "HS256" ACCESS_TOKEN_EXPIRE_MINUTES: int = 1440 # 24 hours @@ -37,20 +37,15 @@ class Settings(BaseSettings): K8S_POD_EXECUTION_TIMEOUT: int = 300 # in seconds K8S_POD_PRIORITY_CLASS_NAME: str | None = None - SUPPORTED_RUNTIMES: dict[str, list[str]] = Field( - default_factory=lambda: RUNTIME_MATRIX - ) - - EXAMPLE_SCRIPTS: dict[str, str] = Field( - default_factory=lambda: EXEC_EXAMPLE_SCRIPTS - ) + SUPPORTED_RUNTIMES: dict[str, list[str]] = Field(default_factory=lambda: RUNTIME_MATRIX) + EXAMPLE_SCRIPTS: dict[str, str] = Field(default_factory=lambda: EXEC_EXAMPLE_SCRIPTS) TESTING: bool = False # Event-Driven Design Configuration KAFKA_BOOTSTRAP_SERVERS: str = "kafka:29092" - KAFKA_GROUP_SUFFIX: str = "suff" # Suffix to append to consumer group IDs for test/parallel isolation + KAFKA_GROUP_SUFFIX: str = "suff" # Suffix to append to consumer group IDs for test/parallel isolation SCHEMA_REGISTRY_URL: str = "http://schema-registry:8081" SCHEMA_REGISTRY_AUTH: str | None = None # Format: "username:password" ENABLE_EVENT_STREAMING: bool = False @@ -61,7 +56,7 @@ class Settings(BaseSettings): KAFKA_ENABLE_AUTO_COMMIT: bool = True KAFKA_SESSION_TIMEOUT_MS: int = 30000 KAFKA_MAX_POLL_RECORDS: int = 500 - + # SSE Configuration SSE_CONSUMER_POOL_SIZE: int = 10 # Number of consumers in the partitioned pool SSE_HEARTBEAT_INTERVAL: int = 30 # Heartbeat interval in seconds for SSE - keep connection alive @@ -87,7 +82,7 @@ class Settings(BaseSettings): default=0.1, # 10% sampling by default ge=0.0, le=1.0, - description="Sampling rate for distributed tracing (0.0 to 1.0)" + description="Sampling rate for distributed tracing (0.0 to 1.0)", ) TRACING_SERVICE_NAME: str = "integr8scode-backend" TRACING_SERVICE_VERSION: str = "1.0.0" @@ -107,7 +102,7 @@ class Settings(BaseSettings): # WebSocket configuration WEBSOCKET_PING_INTERVAL: int = 30 WEBSOCKET_PING_TIMEOUT: int = 10 - + # Redis Configuration REDIS_HOST: str = "redis" REDIS_PORT: int = 6379 @@ -116,7 +111,7 @@ class Settings(BaseSettings): REDIS_SSL: bool = False REDIS_MAX_CONNECTIONS: int = 50 REDIS_DECODE_RESPONSES: bool = True - + # Rate Limiting Configuration RATE_LIMIT_ENABLED: bool = True RATE_LIMIT_DEFAULT_REQUESTS: int = 100 @@ -154,16 +149,13 @@ class Settings(BaseSettings): SECURE_COOKIES: bool = True # Can be overridden in .env for development # Logging configuration - LOG_LEVEL: str = Field( - default="DEBUG", - description="Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)" - ) + LOG_LEVEL: str = Field(default="DEBUG", description="Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)") model_config = SettingsConfigDict( env_file=".env", env_file_encoding="utf-8", case_sensitive=True, - extra="forbid" # Raise error on extra fields + extra="forbid", # Raise error on extra fields ) diff --git a/backend/scripts/create_topics.py b/backend/scripts/create_topics.py index 61f5c5f1..85094ae6 100755 --- a/backend/scripts/create_topics.py +++ b/backend/scripts/create_topics.py @@ -19,10 +19,12 @@ async def create_topics() -> None: settings = get_settings() # Create admin client - admin_client = AdminClient({ - 'bootstrap.servers': settings.KAFKA_BOOTSTRAP_SERVERS, - 'client.id': 'topic-creator', - }) + admin_client = AdminClient( + { + "bootstrap.servers": settings.KAFKA_BOOTSTRAP_SERVERS, + "client.id": "topic-creator", + } + ) try: logger.info(f"Connected to Kafka brokers: {settings.KAFKA_BOOTSTRAP_SERVERS}") @@ -46,20 +48,23 @@ async def create_topics() -> None: topic_name = 
f"{topic_prefix}{topic.value}" if topic_name not in existing_topics: # Get config from topic_configs - config = topic_configs.get(topic, { - "num_partitions": 3, - "replication_factor": 1, - "config": { - "retention.ms": "604800000", # 7 days - "compression.type": "gzip", - } - }) + config = topic_configs.get( + topic, + { + "num_partitions": 3, + "replication_factor": 1, + "config": { + "retention.ms": "604800000", # 7 days + "compression.type": "gzip", + }, + }, + ) new_topic = NewTopic( topic=topic_name, num_partitions=config.get("num_partitions", 3), replication_factor=config.get("replication_factor", 1), - config=config.get("config", {}) + config=config.get("config", {}), ) topics_to_create.append(new_topic) logger.info(f"Will create topic: {topic_name}") diff --git a/backend/scripts/seed_users.py b/backend/scripts/seed_users.py index d1d1c7ed..a7653c69 100755 --- a/backend/scripts/seed_users.py +++ b/backend/scripts/seed_users.py @@ -15,6 +15,7 @@ import asyncio import os from datetime import datetime, timezone +from typing import Any from bson import ObjectId from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorDatabase @@ -24,7 +25,7 @@ async def upsert_user( - db: AsyncIOMotorDatabase, + db: AsyncIOMotorDatabase[dict[str, Any]], username: str, email: str, password: str, @@ -38,28 +39,32 @@ async def upsert_user( print(f"User '{username}' exists - updating password, role={role}, is_superuser={is_superuser}") await db.users.update_one( {"username": username}, - {"$set": { + { + "$set": { + "hashed_password": pwd_context.hash(password), + "role": role, + "is_superuser": is_superuser, + "is_active": True, + "updated_at": datetime.now(timezone.utc), + } + }, + ) + else: + print(f"Creating user '{username}' (role={role}, is_superuser={is_superuser})") + await db.users.insert_one( + { + "_id": ObjectId(), + "user_id": str(ObjectId()), + "username": username, + "email": email, "hashed_password": pwd_context.hash(password), "role": role, - "is_superuser": is_superuser, "is_active": True, - "updated_at": datetime.now(timezone.utc) - }} + "is_superuser": is_superuser, + "created_at": datetime.now(timezone.utc), + "updated_at": datetime.now(timezone.utc), + } ) - else: - print(f"Creating user '{username}' (role={role}, is_superuser={is_superuser})") - await db.users.insert_one({ - "_id": ObjectId(), - "user_id": str(ObjectId()), - "username": username, - "email": email, - "hashed_password": pwd_context.hash(password), - "role": role, - "is_active": True, - "is_superuser": is_superuser, - "created_at": datetime.now(timezone.utc), - "updated_at": datetime.now(timezone.utc) - }) async def seed_users() -> None: @@ -68,7 +73,7 @@ async def seed_users() -> None: print(f"Connecting to MongoDB (database: {db_name})...") - client: AsyncIOMotorClient = AsyncIOMotorClient(mongodb_url) + client: AsyncIOMotorClient[dict[str, Any]] = AsyncIOMotorClient(mongodb_url) db = client[db_name] # Default user @@ -78,7 +83,7 @@ async def seed_users() -> None: email="user@integr8scode.com", password=os.getenv("DEFAULT_USER_PASSWORD", "user123"), role="user", - is_superuser=False + is_superuser=False, ) # Admin user @@ -88,7 +93,7 @@ async def seed_users() -> None: email="admin@integr8scode.com", password=os.getenv("ADMIN_USER_PASSWORD", "admin123"), role="admin", - is_superuser=True + is_superuser=True, ) print("\n" + "=" * 50) diff --git a/backend/tests/integration/events/test_consumer_min_e2e.py b/backend/tests/integration/events/test_consumer_min_e2e.py index 24ef1bc6..a228cc75 100644 --- 
a/backend/tests/integration/events/test_consumer_min_e2e.py +++ b/backend/tests/integration/events/test_consumer_min_e2e.py @@ -1,12 +1,9 @@ -import asyncio from uuid import uuid4 import pytest - from app.domain.enums.kafka import KafkaTopic from app.events.core import ConsumerConfig, EventDispatcher, UnifiedConsumer - pytestmark = [pytest.mark.integration, pytest.mark.kafka] @@ -18,7 +15,7 @@ async def test_consumer_start_status_seek_and_stop(): await c.start([KafkaTopic.EXECUTION_EVENTS]) try: st = c.get_status() - assert st["state"] == "running" and st["is_running"] is True + assert st.state == "running" and st.is_running is True # Exercise seek functions; don't force specific partition offsets await c.seek_to_beginning() await c.seek_to_end() diff --git a/backend/tests/integration/services/events/test_event_bus.py b/backend/tests/integration/services/events/test_event_bus.py index 08d81022..398300c0 100644 --- a/backend/tests/integration/services/events/test_event_bus.py +++ b/backend/tests/integration/services/events/test_event_bus.py @@ -1,25 +1,25 @@ -import asyncio import pytest -from app.services.event_bus import EventBusManager +from app.services.event_bus import EventBusEvent, EventBusManager +from tests.helpers.eventually import eventually pytestmark = pytest.mark.integration -from tests.helpers.eventually import eventually + @pytest.mark.asyncio async def test_event_bus_publish_subscribe(scope) -> None: # type: ignore[valid-type] manager: EventBusManager = await scope.get(EventBusManager) bus = await manager.get_event_bus() - received: list[dict] = [] + received: list[EventBusEvent] = [] - async def handler(event: dict) -> None: + async def handler(event: EventBusEvent) -> None: received.append(event) await bus.subscribe("test.*", handler) await bus.publish("test.created", {"x": 1}) async def _received(): - assert any(e.get("event_type") == "test.created" for e in received) + assert any(e.event_type == "test.created" for e in received) await eventually(_received, timeout=2.0, interval=0.05) diff --git a/backend/tests/integration/services/events/test_event_service_integration.py b/backend/tests/integration/services/events/test_event_service_integration.py index 7690d8c7..9d66de5d 100644 --- a/backend/tests/integration/services/events/test_event_service_integration.py +++ b/backend/tests/integration/services/events/test_event_service_integration.py @@ -4,6 +4,7 @@ from app.db.repositories import EventRepository from app.domain.events.event_models import EventFields, Event, EventFilter +from app.domain.enums.common import SortOrder from app.domain.enums.user import UserRole from app.infrastructure.kafka.events.metadata import EventMetadata from app.domain.enums.events import EventType @@ -35,7 +36,7 @@ async def test_event_service_access_and_queries(scope) -> None: # type: ignore[ assert any(ev.aggregate_id == "agg1" for ev in events_admin) # query_events_advanced: basic run (empty filters) should return a result structure - res = await svc.query_events_advanced("u1", UserRole.USER, filters=EventFilter(), sort_by="correlation_id", sort_order="asc") + res = await svc.query_events_advanced("u1", UserRole.USER, filters=EventFilter(), sort_by="correlation_id", sort_order=SortOrder.ASC) assert res is not None # get_events_by_correlation filters non-admin to their own user_id diff --git a/backend/tests/integration/services/execution/test_execution_service.py b/backend/tests/integration/services/execution/test_execution_service.py index f924d747..184a3494 100644 --- 
a/backend/tests/integration/services/execution/test_execution_service.py +++ b/backend/tests/integration/services/execution/test_execution_service.py @@ -1,5 +1,6 @@ import pytest +from app.domain.execution import ResourceLimitsDomain from app.services.execution_service import ExecutionService pytestmark = pytest.mark.integration @@ -9,7 +10,8 @@ async def test_execute_script_and_limits(scope) -> None: # type: ignore[valid-type] svc: ExecutionService = await scope.get(ExecutionService) limits = await svc.get_k8s_resource_limits() - assert set(limits.keys()) >= {"cpu_limit", "memory_limit", "supported_runtimes"} + assert isinstance(limits, ResourceLimitsDomain) + assert limits.cpu_limit and limits.memory_limit and limits.supported_runtimes ex = await svc.get_example_scripts() assert isinstance(ex, dict) diff --git a/backend/tests/integration/test_admin_routes.py b/backend/tests/integration/test_admin_routes.py index f2579fb4..54babedd 100644 --- a/backend/tests/integration/test_admin_routes.py +++ b/backend/tests/integration/test_admin_routes.py @@ -2,15 +2,14 @@ from uuid import uuid4 import pytest -from httpx import AsyncClient - from app.schemas_pydantic.admin_settings import ( - SystemSettings, ExecutionLimitsSchema, + MonitoringSettingsSchema, SecuritySettingsSchema, - MonitoringSettingsSchema + SystemSettings, ) from app.schemas_pydantic.admin_user_overview import AdminUserOverview +from httpx import AsyncClient @pytest.mark.integration @@ -334,8 +333,7 @@ async def test_admin_events_export_csv_and_json(self, client: AsyncClient, share # CSV export r_csv = await client.get("/api/v1/admin/events/export/csv?limit=10") - if r_csv.status_code != 200: - pytest.skip("CSV export not available in this environment") + assert r_csv.status_code == 200, f"CSV export failed: {r_csv.status_code} - {r_csv.text[:200]}" ct_csv = r_csv.headers.get("content-type", "") assert "text/csv" in ct_csv body_csv = r_csv.text @@ -344,8 +342,7 @@ async def test_admin_events_export_csv_and_json(self, client: AsyncClient, share # JSON export r_json = await client.get("/api/v1/admin/events/export/json?limit=10") - if r_json.status_code != 200: - pytest.skip("JSON export not available in this environment") + assert r_json.status_code == 200, f"JSON export failed: {r_json.status_code} - {r_json.text[:200]}" ct_json = r_json.headers.get("content-type", "") assert "application/json" in ct_json data = r_json.json() @@ -401,7 +398,7 @@ async def test_admin_user_rate_limits_and_password_reset(self, client: AsyncClie rl_put = await client.put(f"/api/v1/admin/users/{target_user_id}/rate-limits", json=update_payload) assert rl_put.status_code == 200 put_body = rl_put.json() - assert put_body.get("message") + assert put_body.get("updated") is True assert put_body.get("config", {}).get("user_id") == target_user_id # Reset rate limits diff --git a/backend/tests/unit/services/sse/test_sse_service.py b/backend/tests/unit/services/sse/test_sse_service.py index 240fa151..b768ddfc 100644 --- a/backend/tests/unit/services/sse/test_sse_service.py +++ b/backend/tests/unit/services/sse/test_sse_service.py @@ -8,7 +8,7 @@ from app.domain.enums.events import EventType from app.domain.execution import DomainExecution, ResourceUsageDomain -from app.domain.sse import SSEHealthDomain +from app.domain.sse import ShutdownStatus, SSEHealthDomain from app.services.sse.sse_service import SSEService @@ -76,8 +76,14 @@ async def unregister_connection(self, execution_id: str, connection_id: str): def is_shutting_down(self) -> bool: return 
self._initiated - def get_shutdown_status(self) -> dict[str, Any]: - return {"initiated": self._initiated, "phase": "ready"} + def get_shutdown_status(self) -> ShutdownStatus: + return ShutdownStatus( + phase="ready", + initiated=self._initiated, + complete=False, + active_connections=0, + draining_connections=0, + ) def initiate(self) -> None: self._initiated = True diff --git a/backend/uv.lock b/backend/uv.lock index 404fb583..5d057738 100644 --- a/backend/uv.lock +++ b/backend/uv.lock @@ -1678,65 +1678,63 @@ wheels = [ [[package]] name = "numpy" -version = "2.3.5" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/76/65/21b3bc86aac7b8f2862db1e808f1ea22b028e30a225a34a5ede9bf8678f2/numpy-2.3.5.tar.gz", hash = "sha256:784db1dcdab56bf0517743e746dfb0f885fc68d948aba86eeec2cba234bdf1c0", size = 20584950, upload-time = "2025-11-16T22:52:42.067Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/44/37/e669fe6cbb2b96c62f6bbedc6a81c0f3b7362f6a59230b23caa673a85721/numpy-2.3.5-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:74ae7b798248fe62021dbf3c914245ad45d1a6b0cb4a29ecb4b31d0bfbc4cc3e", size = 16733873, upload-time = "2025-11-16T22:49:49.84Z" }, - { url = "https://files.pythonhosted.org/packages/c5/65/df0db6c097892c9380851ab9e44b52d4f7ba576b833996e0080181c0c439/numpy-2.3.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ee3888d9ff7c14604052b2ca5535a30216aa0a58e948cdd3eeb8d3415f638769", size = 12259838, upload-time = "2025-11-16T22:49:52.863Z" }, - { url = "https://files.pythonhosted.org/packages/5b/e1/1ee06e70eb2136797abe847d386e7c0e830b67ad1d43f364dd04fa50d338/numpy-2.3.5-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:612a95a17655e213502f60cfb9bf9408efdc9eb1d5f50535cc6eb365d11b42b5", size = 5088378, upload-time = "2025-11-16T22:49:55.055Z" }, - { url = "https://files.pythonhosted.org/packages/6d/9c/1ca85fb86708724275103b81ec4cf1ac1d08f465368acfc8da7ab545bdae/numpy-2.3.5-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:3101e5177d114a593d79dd79658650fe28b5a0d8abeb8ce6f437c0e6df5be1a4", size = 6628559, upload-time = "2025-11-16T22:49:57.371Z" }, - { url = "https://files.pythonhosted.org/packages/74/78/fcd41e5a0ce4f3f7b003da85825acddae6d7ecb60cf25194741b036ca7d6/numpy-2.3.5-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8b973c57ff8e184109db042c842423ff4f60446239bd585a5131cc47f06f789d", size = 14250702, upload-time = "2025-11-16T22:49:59.632Z" }, - { url = "https://files.pythonhosted.org/packages/b6/23/2a1b231b8ff672b4c450dac27164a8b2ca7d9b7144f9c02d2396518352eb/numpy-2.3.5-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0d8163f43acde9a73c2a33605353a4f1bc4798745a8b1d73183b28e5b435ae28", size = 16606086, upload-time = "2025-11-16T22:50:02.127Z" }, - { url = "https://files.pythonhosted.org/packages/a0/c5/5ad26fbfbe2012e190cc7d5003e4d874b88bb18861d0829edc140a713021/numpy-2.3.5-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:51c1e14eb1e154ebd80e860722f9e6ed6ec89714ad2db2d3aa33c31d7c12179b", size = 16025985, upload-time = "2025-11-16T22:50:04.536Z" }, - { url = "https://files.pythonhosted.org/packages/d2/fa/dd48e225c46c819288148d9d060b047fd2a6fb1eb37eae25112ee4cb4453/numpy-2.3.5-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:b46b4ec24f7293f23adcd2d146960559aaf8020213de8ad1909dba6c013bf89c", size = 18542976, upload-time = "2025-11-16T22:50:07.557Z" }, - { url = 
"https://files.pythonhosted.org/packages/05/79/ccbd23a75862d95af03d28b5c6901a1b7da4803181513d52f3b86ed9446e/numpy-2.3.5-cp312-cp312-win32.whl", hash = "sha256:3997b5b3c9a771e157f9aae01dd579ee35ad7109be18db0e85dbdbe1de06e952", size = 6285274, upload-time = "2025-11-16T22:50:10.746Z" }, - { url = "https://files.pythonhosted.org/packages/2d/57/8aeaf160312f7f489dea47ab61e430b5cb051f59a98ae68b7133ce8fa06a/numpy-2.3.5-cp312-cp312-win_amd64.whl", hash = "sha256:86945f2ee6d10cdfd67bcb4069c1662dd711f7e2a4343db5cecec06b87cf31aa", size = 12782922, upload-time = "2025-11-16T22:50:12.811Z" }, - { url = "https://files.pythonhosted.org/packages/78/a6/aae5cc2ca78c45e64b9ef22f089141d661516856cf7c8a54ba434576900d/numpy-2.3.5-cp312-cp312-win_arm64.whl", hash = "sha256:f28620fe26bee16243be2b7b874da327312240a7cdc38b769a697578d2100013", size = 10194667, upload-time = "2025-11-16T22:50:16.16Z" }, - { url = "https://files.pythonhosted.org/packages/db/69/9cde09f36da4b5a505341180a3f2e6fadc352fd4d2b7096ce9778db83f1a/numpy-2.3.5-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:d0f23b44f57077c1ede8c5f26b30f706498b4862d3ff0a7298b8411dd2f043ff", size = 16728251, upload-time = "2025-11-16T22:50:19.013Z" }, - { url = "https://files.pythonhosted.org/packages/79/fb/f505c95ceddd7027347b067689db71ca80bd5ecc926f913f1a23e65cf09b/numpy-2.3.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:aa5bc7c5d59d831d9773d1170acac7893ce3a5e130540605770ade83280e7188", size = 12254652, upload-time = "2025-11-16T22:50:21.487Z" }, - { url = "https://files.pythonhosted.org/packages/78/da/8c7738060ca9c31b30e9301ee0cf6c5ffdbf889d9593285a1cead337f9a5/numpy-2.3.5-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:ccc933afd4d20aad3c00bcef049cb40049f7f196e0397f1109dba6fed63267b0", size = 5083172, upload-time = "2025-11-16T22:50:24.562Z" }, - { url = "https://files.pythonhosted.org/packages/a4/b4/ee5bb2537fb9430fd2ef30a616c3672b991a4129bb1c7dcc42aa0abbe5d7/numpy-2.3.5-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:afaffc4393205524af9dfa400fa250143a6c3bc646c08c9f5e25a9f4b4d6a903", size = 6622990, upload-time = "2025-11-16T22:50:26.47Z" }, - { url = "https://files.pythonhosted.org/packages/95/03/dc0723a013c7d7c19de5ef29e932c3081df1c14ba582b8b86b5de9db7f0f/numpy-2.3.5-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9c75442b2209b8470d6d5d8b1c25714270686f14c749028d2199c54e29f20b4d", size = 14248902, upload-time = "2025-11-16T22:50:28.861Z" }, - { url = "https://files.pythonhosted.org/packages/f5/10/ca162f45a102738958dcec8023062dad0cbc17d1ab99d68c4e4a6c45fb2b/numpy-2.3.5-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:11e06aa0af8c0f05104d56450d6093ee639e15f24ecf62d417329d06e522e017", size = 16597430, upload-time = "2025-11-16T22:50:31.56Z" }, - { url = "https://files.pythonhosted.org/packages/2a/51/c1e29be863588db58175175f057286900b4b3327a1351e706d5e0f8dd679/numpy-2.3.5-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ed89927b86296067b4f81f108a2271d8926467a8868e554eaf370fc27fa3ccaf", size = 16024551, upload-time = "2025-11-16T22:50:34.242Z" }, - { url = "https://files.pythonhosted.org/packages/83/68/8236589d4dbb87253d28259d04d9b814ec0ecce7cb1c7fed29729f4c3a78/numpy-2.3.5-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:51c55fe3451421f3a6ef9a9c1439e82101c57a2c9eab9feb196a62b1a10b58ce", size = 18533275, upload-time = "2025-11-16T22:50:37.651Z" }, - { url = 
"https://files.pythonhosted.org/packages/40/56/2932d75b6f13465239e3b7b7e511be27f1b8161ca2510854f0b6e521c395/numpy-2.3.5-cp313-cp313-win32.whl", hash = "sha256:1978155dd49972084bd6ef388d66ab70f0c323ddee6f693d539376498720fb7e", size = 6277637, upload-time = "2025-11-16T22:50:40.11Z" }, - { url = "https://files.pythonhosted.org/packages/0c/88/e2eaa6cffb115b85ed7c7c87775cb8bcf0816816bc98ca8dbfa2ee33fe6e/numpy-2.3.5-cp313-cp313-win_amd64.whl", hash = "sha256:00dc4e846108a382c5869e77c6ed514394bdeb3403461d25a829711041217d5b", size = 12779090, upload-time = "2025-11-16T22:50:42.503Z" }, - { url = "https://files.pythonhosted.org/packages/8f/88/3f41e13a44ebd4034ee17baa384acac29ba6a4fcc2aca95f6f08ca0447d1/numpy-2.3.5-cp313-cp313-win_arm64.whl", hash = "sha256:0472f11f6ec23a74a906a00b48a4dcf3849209696dff7c189714511268d103ae", size = 10194710, upload-time = "2025-11-16T22:50:44.971Z" }, - { url = "https://files.pythonhosted.org/packages/13/cb/71744144e13389d577f867f745b7df2d8489463654a918eea2eeb166dfc9/numpy-2.3.5-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:414802f3b97f3c1eef41e530aaba3b3c1620649871d8cb38c6eaff034c2e16bd", size = 16827292, upload-time = "2025-11-16T22:50:47.715Z" }, - { url = "https://files.pythonhosted.org/packages/71/80/ba9dc6f2a4398e7f42b708a7fdc841bb638d353be255655498edbf9a15a8/numpy-2.3.5-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:5ee6609ac3604fa7780e30a03e5e241a7956f8e2fcfe547d51e3afa5247ac47f", size = 12378897, upload-time = "2025-11-16T22:50:51.327Z" }, - { url = "https://files.pythonhosted.org/packages/2e/6d/db2151b9f64264bcceccd51741aa39b50150de9b602d98ecfe7e0c4bff39/numpy-2.3.5-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:86d835afea1eaa143012a2d7a3f45a3adce2d7adc8b4961f0b362214d800846a", size = 5207391, upload-time = "2025-11-16T22:50:54.542Z" }, - { url = "https://files.pythonhosted.org/packages/80/ae/429bacace5ccad48a14c4ae5332f6aa8ab9f69524193511d60ccdfdc65fa/numpy-2.3.5-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:30bc11310e8153ca664b14c5f1b73e94bd0503681fcf136a163de856f3a50139", size = 6721275, upload-time = "2025-11-16T22:50:56.794Z" }, - { url = "https://files.pythonhosted.org/packages/74/5b/1919abf32d8722646a38cd527bc3771eb229a32724ee6ba340ead9b92249/numpy-2.3.5-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1062fde1dcf469571705945b0f221b73928f34a20c904ffb45db101907c3454e", size = 14306855, upload-time = "2025-11-16T22:50:59.208Z" }, - { url = "https://files.pythonhosted.org/packages/a5/87/6831980559434973bebc30cd9c1f21e541a0f2b0c280d43d3afd909b66d0/numpy-2.3.5-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ce581db493ea1a96c0556360ede6607496e8bf9b3a8efa66e06477267bc831e9", size = 16657359, upload-time = "2025-11-16T22:51:01.991Z" }, - { url = "https://files.pythonhosted.org/packages/dd/91/c797f544491ee99fd00495f12ebb7802c440c1915811d72ac5b4479a3356/numpy-2.3.5-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:cc8920d2ec5fa99875b670bb86ddeb21e295cb07aa331810d9e486e0b969d946", size = 16093374, upload-time = "2025-11-16T22:51:05.291Z" }, - { url = "https://files.pythonhosted.org/packages/74/a6/54da03253afcbe7a72785ec4da9c69fb7a17710141ff9ac5fcb2e32dbe64/numpy-2.3.5-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:9ee2197ef8c4f0dfe405d835f3b6a14f5fee7782b5de51ba06fb65fc9b36e9f1", size = 18594587, upload-time = "2025-11-16T22:51:08.585Z" }, - { url = 
"https://files.pythonhosted.org/packages/80/e9/aff53abbdd41b0ecca94285f325aff42357c6b5abc482a3fcb4994290b18/numpy-2.3.5-cp313-cp313t-win32.whl", hash = "sha256:70b37199913c1bd300ff6e2693316c6f869c7ee16378faf10e4f5e3275b299c3", size = 6405940, upload-time = "2025-11-16T22:51:11.541Z" }, - { url = "https://files.pythonhosted.org/packages/d5/81/50613fec9d4de5480de18d4f8ef59ad7e344d497edbef3cfd80f24f98461/numpy-2.3.5-cp313-cp313t-win_amd64.whl", hash = "sha256:b501b5fa195cc9e24fe102f21ec0a44dffc231d2af79950b451e0d99cea02234", size = 12920341, upload-time = "2025-11-16T22:51:14.312Z" }, - { url = "https://files.pythonhosted.org/packages/bb/ab/08fd63b9a74303947f34f0bd7c5903b9c5532c2d287bead5bdf4c556c486/numpy-2.3.5-cp313-cp313t-win_arm64.whl", hash = "sha256:a80afd79f45f3c4a7d341f13acbe058d1ca8ac017c165d3fa0d3de6bc1a079d7", size = 10262507, upload-time = "2025-11-16T22:51:16.846Z" }, - { url = "https://files.pythonhosted.org/packages/ba/97/1a914559c19e32d6b2e233cf9a6a114e67c856d35b1d6babca571a3e880f/numpy-2.3.5-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:bf06bc2af43fa8d32d30fae16ad965663e966b1a3202ed407b84c989c3221e82", size = 16735706, upload-time = "2025-11-16T22:51:19.558Z" }, - { url = "https://files.pythonhosted.org/packages/57/d4/51233b1c1b13ecd796311216ae417796b88b0616cfd8a33ae4536330748a/numpy-2.3.5-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:052e8c42e0c49d2575621c158934920524f6c5da05a1d3b9bab5d8e259e045f0", size = 12264507, upload-time = "2025-11-16T22:51:22.492Z" }, - { url = "https://files.pythonhosted.org/packages/45/98/2fe46c5c2675b8306d0b4a3ec3494273e93e1226a490f766e84298576956/numpy-2.3.5-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:1ed1ec893cff7040a02c8aa1c8611b94d395590d553f6b53629a4461dc7f7b63", size = 5093049, upload-time = "2025-11-16T22:51:25.171Z" }, - { url = "https://files.pythonhosted.org/packages/ce/0e/0698378989bb0ac5f1660c81c78ab1fe5476c1a521ca9ee9d0710ce54099/numpy-2.3.5-cp314-cp314-macosx_14_0_x86_64.whl", hash = "sha256:2dcd0808a421a482a080f89859a18beb0b3d1e905b81e617a188bd80422d62e9", size = 6626603, upload-time = "2025-11-16T22:51:27Z" }, - { url = "https://files.pythonhosted.org/packages/5e/a6/9ca0eecc489640615642a6cbc0ca9e10df70df38c4d43f5a928ff18d8827/numpy-2.3.5-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:727fd05b57df37dc0bcf1a27767a3d9a78cbbc92822445f32cc3436ba797337b", size = 14262696, upload-time = "2025-11-16T22:51:29.402Z" }, - { url = "https://files.pythonhosted.org/packages/c8/f6/07ec185b90ec9d7217a00eeeed7383b73d7e709dae2a9a021b051542a708/numpy-2.3.5-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fffe29a1ef00883599d1dc2c51aa2e5d80afe49523c261a74933df395c15c520", size = 16597350, upload-time = "2025-11-16T22:51:32.167Z" }, - { url = "https://files.pythonhosted.org/packages/75/37/164071d1dde6a1a84c9b8e5b414fa127981bad47adf3a6b7e23917e52190/numpy-2.3.5-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:8f7f0e05112916223d3f438f293abf0727e1181b5983f413dfa2fefc4098245c", size = 16040190, upload-time = "2025-11-16T22:51:35.403Z" }, - { url = "https://files.pythonhosted.org/packages/08/3c/f18b82a406b04859eb026d204e4e1773eb41c5be58410f41ffa511d114ae/numpy-2.3.5-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:2e2eb32ddb9ccb817d620ac1d8dae7c3f641c1e5f55f531a33e8ab97960a75b8", size = 18536749, upload-time = "2025-11-16T22:51:39.698Z" }, - { url = 
"https://files.pythonhosted.org/packages/40/79/f82f572bf44cf0023a2fe8588768e23e1592585020d638999f15158609e1/numpy-2.3.5-cp314-cp314-win32.whl", hash = "sha256:66f85ce62c70b843bab1fb14a05d5737741e74e28c7b8b5a064de10142fad248", size = 6335432, upload-time = "2025-11-16T22:51:42.476Z" }, - { url = "https://files.pythonhosted.org/packages/a3/2e/235b4d96619931192c91660805e5e49242389742a7a82c27665021db690c/numpy-2.3.5-cp314-cp314-win_amd64.whl", hash = "sha256:e6a0bc88393d65807d751a614207b7129a310ca4fe76a74e5c7da5fa5671417e", size = 12919388, upload-time = "2025-11-16T22:51:45.275Z" }, - { url = "https://files.pythonhosted.org/packages/07/2b/29fd75ce45d22a39c61aad74f3d718e7ab67ccf839ca8b60866054eb15f8/numpy-2.3.5-cp314-cp314-win_arm64.whl", hash = "sha256:aeffcab3d4b43712bb7a60b65f6044d444e75e563ff6180af8f98dd4b905dfd2", size = 10476651, upload-time = "2025-11-16T22:51:47.749Z" }, - { url = "https://files.pythonhosted.org/packages/17/e1/f6a721234ebd4d87084cfa68d081bcba2f5cfe1974f7de4e0e8b9b2a2ba1/numpy-2.3.5-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:17531366a2e3a9e30762c000f2c43a9aaa05728712e25c11ce1dbe700c53ad41", size = 16834503, upload-time = "2025-11-16T22:51:50.443Z" }, - { url = "https://files.pythonhosted.org/packages/5c/1c/baf7ffdc3af9c356e1c135e57ab7cf8d247931b9554f55c467efe2c69eff/numpy-2.3.5-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:d21644de1b609825ede2f48be98dfde4656aefc713654eeee280e37cadc4e0ad", size = 12381612, upload-time = "2025-11-16T22:51:53.609Z" }, - { url = "https://files.pythonhosted.org/packages/74/91/f7f0295151407ddc9ba34e699013c32c3c91944f9b35fcf9281163dc1468/numpy-2.3.5-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:c804e3a5aba5460c73955c955bdbd5c08c354954e9270a2c1565f62e866bdc39", size = 5210042, upload-time = "2025-11-16T22:51:56.213Z" }, - { url = "https://files.pythonhosted.org/packages/2e/3b/78aebf345104ec50dd50a4d06ddeb46a9ff5261c33bcc58b1c4f12f85ec2/numpy-2.3.5-cp314-cp314t-macosx_14_0_x86_64.whl", hash = "sha256:cc0a57f895b96ec78969c34f682c602bf8da1a0270b09bc65673df2e7638ec20", size = 6724502, upload-time = "2025-11-16T22:51:58.584Z" }, - { url = "https://files.pythonhosted.org/packages/02/c6/7c34b528740512e57ef1b7c8337ab0b4f0bddf34c723b8996c675bc2bc91/numpy-2.3.5-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:900218e456384ea676e24ea6a0417f030a3b07306d29d7ad843957b40a9d8d52", size = 14308962, upload-time = "2025-11-16T22:52:01.698Z" }, - { url = "https://files.pythonhosted.org/packages/80/35/09d433c5262bc32d725bafc619e095b6a6651caf94027a03da624146f655/numpy-2.3.5-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:09a1bea522b25109bf8e6f3027bd810f7c1085c64a0c7ce050c1676ad0ba010b", size = 16655054, upload-time = "2025-11-16T22:52:04.267Z" }, - { url = "https://files.pythonhosted.org/packages/7a/ab/6a7b259703c09a88804fa2430b43d6457b692378f6b74b356155283566ac/numpy-2.3.5-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:04822c00b5fd0323c8166d66c701dc31b7fbd252c100acd708c48f763968d6a3", size = 16091613, upload-time = "2025-11-16T22:52:08.651Z" }, - { url = "https://files.pythonhosted.org/packages/c2/88/330da2071e8771e60d1038166ff9d73f29da37b01ec3eb43cb1427464e10/numpy-2.3.5-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:d6889ec4ec662a1a37eb4b4fb26b6100841804dac55bd9df579e326cdc146227", size = 18591147, upload-time = "2025-11-16T22:52:11.453Z" }, - { url = 
"https://files.pythonhosted.org/packages/51/41/851c4b4082402d9ea860c3626db5d5df47164a712cb23b54be028b184c1c/numpy-2.3.5-cp314-cp314t-win32.whl", hash = "sha256:93eebbcf1aafdf7e2ddd44c2923e2672e1010bddc014138b229e49725b4d6be5", size = 6479806, upload-time = "2025-11-16T22:52:14.641Z" }, - { url = "https://files.pythonhosted.org/packages/90/30/d48bde1dfd93332fa557cff1972fbc039e055a52021fbef4c2c4b1eefd17/numpy-2.3.5-cp314-cp314t-win_amd64.whl", hash = "sha256:c8a9958e88b65c3b27e22ca2a076311636850b612d6bbfb76e8d156aacde2aaf", size = 13105760, upload-time = "2025-11-16T22:52:17.975Z" }, - { url = "https://files.pythonhosted.org/packages/2d/fd/4b5eb0b3e888d86aee4d198c23acec7d214baaf17ea93c1adec94c9518b9/numpy-2.3.5-cp314-cp314t-win_arm64.whl", hash = "sha256:6203fdf9f3dc5bdaed7319ad8698e685c7a3be10819f41d32a0723e611733b42", size = 10545459, upload-time = "2025-11-16T22:52:20.55Z" }, +version = "2.4.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a4/7a/6a3d14e205d292b738db449d0de649b373a59edb0d0b4493821d0a3e8718/numpy-2.4.0.tar.gz", hash = "sha256:6e504f7b16118198f138ef31ba24d985b124c2c469fe8467007cf30fd992f934", size = 20685720, upload-time = "2025-12-20T16:18:19.023Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8b/ff/f6400ffec95de41c74b8e73df32e3fff1830633193a7b1e409be7fb1bb8c/numpy-2.4.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:2a8b6bb8369abefb8bd1801b054ad50e02b3275c8614dc6e5b0373c305291037", size = 16653117, upload-time = "2025-12-20T16:16:06.709Z" }, + { url = "https://files.pythonhosted.org/packages/fd/28/6c23e97450035072e8d830a3c411bf1abd1f42c611ff9d29e3d8f55c6252/numpy-2.4.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2e284ca13d5a8367e43734148622caf0b261b275673823593e3e3634a6490f83", size = 12369711, upload-time = "2025-12-20T16:16:08.758Z" }, + { url = "https://files.pythonhosted.org/packages/bc/af/acbef97b630ab1bb45e6a7d01d1452e4251aa88ce680ac36e56c272120ec/numpy-2.4.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:49ff32b09f5aa0cd30a20c2b39db3e669c845589f2b7fc910365210887e39344", size = 5198355, upload-time = "2025-12-20T16:16:10.902Z" }, + { url = "https://files.pythonhosted.org/packages/c1/c8/4e0d436b66b826f2e53330adaa6311f5cac9871a5b5c31ad773b27f25a74/numpy-2.4.0-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:36cbfb13c152b1c7c184ddac43765db8ad672567e7bafff2cc755a09917ed2e6", size = 6545298, upload-time = "2025-12-20T16:16:12.607Z" }, + { url = "https://files.pythonhosted.org/packages/ef/27/e1f5d144ab54eac34875e79037011d511ac57b21b220063310cb96c80fbc/numpy-2.4.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:35ddc8f4914466e6fc954c76527aa91aa763682a4f6d73249ef20b418fe6effb", size = 14398387, upload-time = "2025-12-20T16:16:14.257Z" }, + { url = "https://files.pythonhosted.org/packages/67/64/4cb909dd5ab09a9a5d086eff9586e69e827b88a5585517386879474f4cf7/numpy-2.4.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dc578891de1db95b2a35001b695451767b580bb45753717498213c5ff3c41d63", size = 16363091, upload-time = "2025-12-20T16:16:17.32Z" }, + { url = "https://files.pythonhosted.org/packages/9d/9c/8efe24577523ec6809261859737cf117b0eb6fdb655abdfdc81b2e468ce4/numpy-2.4.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:98e81648e0b36e325ab67e46b5400a7a6d4a22b8a7c8e8bbfe20e7db7906bf95", size = 16176394, upload-time = "2025-12-20T16:16:19.524Z" }, + { url = 
"https://files.pythonhosted.org/packages/61/f0/1687441ece7b47a62e45a1f82015352c240765c707928edd8aef875d5951/numpy-2.4.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:d57b5046c120561ba8fa8e4030fbb8b822f3063910fa901ffadf16e2b7128ad6", size = 18287378, upload-time = "2025-12-20T16:16:22.866Z" }, + { url = "https://files.pythonhosted.org/packages/d3/6f/f868765d44e6fc466467ed810ba9d8d6db1add7d4a748abfa2a4c99a3194/numpy-2.4.0-cp312-cp312-win32.whl", hash = "sha256:92190db305a6f48734d3982f2c60fa30d6b5ee9bff10f2887b930d7b40119f4c", size = 5955432, upload-time = "2025-12-20T16:16:25.06Z" }, + { url = "https://files.pythonhosted.org/packages/d4/b5/94c1e79fcbab38d1ca15e13777477b2914dd2d559b410f96949d6637b085/numpy-2.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:680060061adb2d74ce352628cb798cfdec399068aa7f07ba9fb818b2b3305f98", size = 12306201, upload-time = "2025-12-20T16:16:26.979Z" }, + { url = "https://files.pythonhosted.org/packages/70/09/c39dadf0b13bb0768cd29d6a3aaff1fb7c6905ac40e9aaeca26b1c086e06/numpy-2.4.0-cp312-cp312-win_arm64.whl", hash = "sha256:39699233bc72dd482da1415dcb06076e32f60eddc796a796c5fb6c5efce94667", size = 10308234, upload-time = "2025-12-20T16:16:29.417Z" }, + { url = "https://files.pythonhosted.org/packages/a7/0d/853fd96372eda07c824d24adf02e8bc92bb3731b43a9b2a39161c3667cc4/numpy-2.4.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:a152d86a3ae00ba5f47b3acf3b827509fd0b6cb7d3259665e63dafbad22a75ea", size = 16649088, upload-time = "2025-12-20T16:16:31.421Z" }, + { url = "https://files.pythonhosted.org/packages/e3/37/cc636f1f2a9f585434e20a3e6e63422f70bfe4f7f6698e941db52ea1ac9a/numpy-2.4.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:39b19251dec4de8ff8496cd0806cbe27bf0684f765abb1f4809554de93785f2d", size = 12364065, upload-time = "2025-12-20T16:16:33.491Z" }, + { url = "https://files.pythonhosted.org/packages/ed/69/0b78f37ca3690969beee54103ce5f6021709134e8020767e93ba691a72f1/numpy-2.4.0-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:009bd0ea12d3c784b6639a8457537016ce5172109e585338e11334f6a7bb88ee", size = 5192640, upload-time = "2025-12-20T16:16:35.636Z" }, + { url = "https://files.pythonhosted.org/packages/1d/2a/08569f8252abf590294dbb09a430543ec8f8cc710383abfb3e75cc73aeda/numpy-2.4.0-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:5fe44e277225fd3dff6882d86d3d447205d43532c3627313d17e754fb3905a0e", size = 6541556, upload-time = "2025-12-20T16:16:37.276Z" }, + { url = "https://files.pythonhosted.org/packages/93/e9/a949885a4e177493d61519377952186b6cbfdf1d6002764c664ba28349b5/numpy-2.4.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f935c4493eda9069851058fa0d9e39dbf6286be690066509305e52912714dbb2", size = 14396562, upload-time = "2025-12-20T16:16:38.953Z" }, + { url = "https://files.pythonhosted.org/packages/99/98/9d4ad53b0e9ef901c2ef1d550d2136f5ac42d3fd2988390a6def32e23e48/numpy-2.4.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8cfa5f29a695cb7438965e6c3e8d06e0416060cf0d709c1b1c1653a939bf5c2a", size = 16351719, upload-time = "2025-12-20T16:16:41.503Z" }, + { url = "https://files.pythonhosted.org/packages/28/de/5f3711a38341d6e8dd619f6353251a0cdd07f3d6d101a8fd46f4ef87f895/numpy-2.4.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ba0cb30acd3ef11c94dc27fbfba68940652492bc107075e7ffe23057f9425681", size = 16176053, upload-time = "2025-12-20T16:16:44.552Z" }, + { url = 
"https://files.pythonhosted.org/packages/2a/5b/2a3753dc43916501b4183532e7ace862e13211042bceafa253afb5c71272/numpy-2.4.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:60e8c196cd82cbbd4f130b5290007e13e6de3eca79f0d4d38014769d96a7c475", size = 18277859, upload-time = "2025-12-20T16:16:47.174Z" }, + { url = "https://files.pythonhosted.org/packages/2c/c5/a18bcdd07a941db3076ef489d036ab16d2bfc2eae0cf27e5a26e29189434/numpy-2.4.0-cp313-cp313-win32.whl", hash = "sha256:5f48cb3e88fbc294dc90e215d86fbaf1c852c63dbdb6c3a3e63f45c4b57f7344", size = 5953849, upload-time = "2025-12-20T16:16:49.554Z" }, + { url = "https://files.pythonhosted.org/packages/4f/f1/719010ff8061da6e8a26e1980cf090412d4f5f8060b31f0c45d77dd67a01/numpy-2.4.0-cp313-cp313-win_amd64.whl", hash = "sha256:a899699294f28f7be8992853c0c60741f16ff199205e2e6cdca155762cbaa59d", size = 12302840, upload-time = "2025-12-20T16:16:51.227Z" }, + { url = "https://files.pythonhosted.org/packages/f5/5a/b3d259083ed8b4d335270c76966cb6cf14a5d1b69e1a608994ac57a659e6/numpy-2.4.0-cp313-cp313-win_arm64.whl", hash = "sha256:9198f447e1dc5647d07c9a6bbe2063cc0132728cc7175b39dbc796da5b54920d", size = 10308509, upload-time = "2025-12-20T16:16:53.313Z" }, + { url = "https://files.pythonhosted.org/packages/31/01/95edcffd1bb6c0633df4e808130545c4f07383ab629ac7e316fb44fff677/numpy-2.4.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:74623f2ab5cc3f7c886add4f735d1031a1d2be4a4ae63c0546cfd74e7a31ddf6", size = 12491815, upload-time = "2025-12-20T16:16:55.496Z" }, + { url = "https://files.pythonhosted.org/packages/59/ea/5644b8baa92cc1c7163b4b4458c8679852733fa74ca49c942cfa82ded4e0/numpy-2.4.0-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:0804a8e4ab070d1d35496e65ffd3cf8114c136a2b81f61dfab0de4b218aacfd5", size = 5320321, upload-time = "2025-12-20T16:16:57.468Z" }, + { url = "https://files.pythonhosted.org/packages/26/4e/e10938106d70bc21319bd6a86ae726da37edc802ce35a3a71ecdf1fdfe7f/numpy-2.4.0-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:02a2038eb27f9443a8b266a66911e926566b5a6ffd1a689b588f7f35b81e7dc3", size = 6641635, upload-time = "2025-12-20T16:16:59.379Z" }, + { url = "https://files.pythonhosted.org/packages/b3/8d/a8828e3eaf5c0b4ab116924df82f24ce3416fa38d0674d8f708ddc6c8aac/numpy-2.4.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1889b3a3f47a7b5bee16bc25a2145bd7cb91897f815ce3499db64c7458b6d91d", size = 14456053, upload-time = "2025-12-20T16:17:01.768Z" }, + { url = "https://files.pythonhosted.org/packages/68/a1/17d97609d87d4520aa5ae2dcfb32305654550ac6a35effb946d303e594ce/numpy-2.4.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:85eef4cb5625c47ee6425c58a3502555e10f45ee973da878ac8248ad58c136f3", size = 16401702, upload-time = "2025-12-20T16:17:04.235Z" }, + { url = "https://files.pythonhosted.org/packages/18/32/0f13c1b2d22bea1118356b8b963195446f3af124ed7a5adfa8fdecb1b6ca/numpy-2.4.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:6dc8b7e2f4eb184b37655195f421836cfae6f58197b67e3ffc501f1333d993fa", size = 16242493, upload-time = "2025-12-20T16:17:06.856Z" }, + { url = "https://files.pythonhosted.org/packages/ae/23/48f21e3d309fbc137c068a1475358cbd3a901b3987dcfc97a029ab3068e2/numpy-2.4.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:44aba2f0cafd287871a495fb3163408b0bd25bbce135c6f621534a07f4f7875c", size = 18324222, upload-time = "2025-12-20T16:17:09.392Z" }, + { url = 
"https://files.pythonhosted.org/packages/ac/52/41f3d71296a3dcaa4f456aaa3c6fc8e745b43d0552b6bde56571bb4b4a0f/numpy-2.4.0-cp313-cp313t-win32.whl", hash = "sha256:20c115517513831860c573996e395707aa9fb691eb179200125c250e895fcd93", size = 6076216, upload-time = "2025-12-20T16:17:11.437Z" }, + { url = "https://files.pythonhosted.org/packages/35/ff/46fbfe60ab0710d2a2b16995f708750307d30eccbb4c38371ea9e986866e/numpy-2.4.0-cp313-cp313t-win_amd64.whl", hash = "sha256:b48e35f4ab6f6a7597c46e301126ceba4c44cd3280e3750f85db48b082624fa4", size = 12444263, upload-time = "2025-12-20T16:17:13.182Z" }, + { url = "https://files.pythonhosted.org/packages/a3/e3/9189ab319c01d2ed556c932ccf55064c5d75bb5850d1df7a482ce0badead/numpy-2.4.0-cp313-cp313t-win_arm64.whl", hash = "sha256:4d1cfce39e511069b11e67cd0bd78ceff31443b7c9e5c04db73c7a19f572967c", size = 10378265, upload-time = "2025-12-20T16:17:15.211Z" }, + { url = "https://files.pythonhosted.org/packages/ab/ed/52eac27de39d5e5a6c9aadabe672bc06f55e24a3d9010cd1183948055d76/numpy-2.4.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:c95eb6db2884917d86cde0b4d4cf31adf485c8ec36bf8696dd66fa70de96f36b", size = 16647476, upload-time = "2025-12-20T16:17:17.671Z" }, + { url = "https://files.pythonhosted.org/packages/77/c0/990ce1b7fcd4e09aeaa574e2a0a839589e4b08b2ca68070f1acb1fea6736/numpy-2.4.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:65167da969cd1ec3a1df31cb221ca3a19a8aaa25370ecb17d428415e93c1935e", size = 12374563, upload-time = "2025-12-20T16:17:20.216Z" }, + { url = "https://files.pythonhosted.org/packages/37/7c/8c5e389c6ae8f5fd2277a988600d79e9625db3fff011a2d87ac80b881a4c/numpy-2.4.0-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:3de19cfecd1465d0dcf8a5b5ea8b3155b42ed0b639dba4b71e323d74f2a3be5e", size = 5203107, upload-time = "2025-12-20T16:17:22.47Z" }, + { url = "https://files.pythonhosted.org/packages/e6/94/ca5b3bd6a8a70a5eec9a0b8dd7f980c1eff4b8a54970a9a7fef248ef564f/numpy-2.4.0-cp314-cp314-macosx_14_0_x86_64.whl", hash = "sha256:6c05483c3136ac4c91b4e81903cb53a8707d316f488124d0398499a4f8e8ef51", size = 6538067, upload-time = "2025-12-20T16:17:24.001Z" }, + { url = "https://files.pythonhosted.org/packages/79/43/993eb7bb5be6761dde2b3a3a594d689cec83398e3f58f4758010f3b85727/numpy-2.4.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:36667db4d6c1cea79c8930ab72fadfb4060feb4bfe724141cd4bd064d2e5f8ce", size = 14411926, upload-time = "2025-12-20T16:17:25.822Z" }, + { url = "https://files.pythonhosted.org/packages/03/75/d4c43b61de473912496317a854dac54f1efec3eeb158438da6884b70bb90/numpy-2.4.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9a818668b674047fd88c4cddada7ab8f1c298812783e8328e956b78dc4807f9f", size = 16354295, upload-time = "2025-12-20T16:17:28.308Z" }, + { url = "https://files.pythonhosted.org/packages/b8/0a/b54615b47ee8736a6461a4bb6749128dd3435c5a759d5663f11f0e9af4ac/numpy-2.4.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:1ee32359fb7543b7b7bd0b2f46294db27e29e7bbdf70541e81b190836cd83ded", size = 16190242, upload-time = "2025-12-20T16:17:30.993Z" }, + { url = "https://files.pythonhosted.org/packages/98/ce/ea207769aacad6246525ec6c6bbd66a2bf56c72443dc10e2f90feed29290/numpy-2.4.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:e493962256a38f58283de033d8af176c5c91c084ea30f15834f7545451c42059", size = 18280875, upload-time = "2025-12-20T16:17:33.327Z" }, + { url = 
"https://files.pythonhosted.org/packages/17/ef/ec409437aa962ea372ed601c519a2b141701683ff028f894b7466f0ab42b/numpy-2.4.0-cp314-cp314-win32.whl", hash = "sha256:6bbaebf0d11567fa8926215ae731e1d58e6ec28a8a25235b8a47405d301332db", size = 6002530, upload-time = "2025-12-20T16:17:35.729Z" }, + { url = "https://files.pythonhosted.org/packages/5f/4a/5cb94c787a3ed1ac65e1271b968686521169a7b3ec0b6544bb3ca32960b0/numpy-2.4.0-cp314-cp314-win_amd64.whl", hash = "sha256:3d857f55e7fdf7c38ab96c4558c95b97d1c685be6b05c249f5fdafcbd6f9899e", size = 12435890, upload-time = "2025-12-20T16:17:37.599Z" }, + { url = "https://files.pythonhosted.org/packages/48/a0/04b89db963af9de1104975e2544f30de89adbf75b9e75f7dd2599be12c79/numpy-2.4.0-cp314-cp314-win_arm64.whl", hash = "sha256:bb50ce5fb202a26fd5404620e7ef820ad1ab3558b444cb0b55beb7ef66cd2d63", size = 10591892, upload-time = "2025-12-20T16:17:39.649Z" }, + { url = "https://files.pythonhosted.org/packages/53/e5/d74b5ccf6712c06c7a545025a6a71bfa03bdc7e0568b405b0d655232fd92/numpy-2.4.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:355354388cba60f2132df297e2d53053d4063f79077b67b481d21276d61fc4df", size = 12494312, upload-time = "2025-12-20T16:17:41.714Z" }, + { url = "https://files.pythonhosted.org/packages/c2/08/3ca9cc2ddf54dfee7ae9a6479c071092a228c68aef08252aa08dac2af002/numpy-2.4.0-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:1d8f9fde5f6dc1b6fc34df8162f3b3079365468703fee7f31d4e0cc8c63baed9", size = 5322862, upload-time = "2025-12-20T16:17:44.145Z" }, + { url = "https://files.pythonhosted.org/packages/87/74/0bb63a68394c0c1e52670cfff2e309afa41edbe11b3327d9af29e4383f34/numpy-2.4.0-cp314-cp314t-macosx_14_0_x86_64.whl", hash = "sha256:e0434aa22c821f44eeb4c650b81c7fbdd8c0122c6c4b5a576a76d5a35625ecd9", size = 6644986, upload-time = "2025-12-20T16:17:46.203Z" }, + { url = "https://files.pythonhosted.org/packages/06/8f/9264d9bdbcf8236af2823623fe2f3981d740fc3461e2787e231d97c38c28/numpy-2.4.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:40483b2f2d3ba7aad426443767ff5632ec3156ef09742b96913787d13c336471", size = 14457958, upload-time = "2025-12-20T16:17:48.017Z" }, + { url = "https://files.pythonhosted.org/packages/8c/d9/f9a69ae564bbc7236a35aa883319364ef5fd41f72aa320cc1cbe66148fe2/numpy-2.4.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d9e6a7664ddd9746e20b7325351fe1a8408d0a2bf9c63b5e898290ddc8f09544", size = 16398394, upload-time = "2025-12-20T16:17:50.409Z" }, + { url = "https://files.pythonhosted.org/packages/34/c7/39241501408dde7f885d241a98caba5421061a2c6d2b2197ac5e3aa842d8/numpy-2.4.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:ecb0019d44f4cdb50b676c5d0cb4b1eae8e15d1ed3d3e6639f986fc92b2ec52c", size = 16241044, upload-time = "2025-12-20T16:17:52.661Z" }, + { url = "https://files.pythonhosted.org/packages/7c/95/cae7effd90e065a95e59fe710eeee05d7328ed169776dfdd9f789e032125/numpy-2.4.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:d0ffd9e2e4441c96a9c91ec1783285d80bf835b677853fc2770a89d50c1e48ac", size = 18321772, upload-time = "2025-12-20T16:17:54.947Z" }, + { url = "https://files.pythonhosted.org/packages/96/df/3c6c279accd2bfb968a76298e5b276310bd55d243df4fa8ac5816d79347d/numpy-2.4.0-cp314-cp314t-win32.whl", hash = "sha256:77f0d13fa87036d7553bf81f0e1fe3ce68d14c9976c9851744e4d3e91127e95f", size = 6148320, upload-time = "2025-12-20T16:17:57.249Z" }, + { url = 
"https://files.pythonhosted.org/packages/92/8d/f23033cce252e7a75cae853d17f582e86534c46404dea1c8ee094a9d6d84/numpy-2.4.0-cp314-cp314t-win_amd64.whl", hash = "sha256:b1f5b45829ac1848893f0ddf5cb326110604d6df96cdc255b0bf9edd154104d4", size = 12623460, upload-time = "2025-12-20T16:17:58.963Z" }, + { url = "https://files.pythonhosted.org/packages/a4/4f/1f8475907d1a7c4ef9020edf7f39ea2422ec896849245f00688e4b268a71/numpy-2.4.0-cp314-cp314t-win_arm64.whl", hash = "sha256:23a3e9d1a6f360267e8fbb38ba5db355a6a7e9be71d7fce7ab3125e88bb646c8", size = 10661799, upload-time = "2025-12-20T16:18:01.078Z" }, ] [[package]] diff --git a/backend/workers/dlq_processor.py b/backend/workers/dlq_processor.py index 9bf400fd..a3725c9e 100644 --- a/backend/workers/dlq_processor.py +++ b/backend/workers/dlq_processor.py @@ -2,12 +2,13 @@ import signal from typing import Optional +from app.core.database_context import Database, DBClient from app.core.logging import logger from app.dlq import DLQMessage, RetryPolicy, RetryStrategy from app.dlq.manager import DLQManager, create_dlq_manager from app.domain.enums.kafka import KafkaTopic from app.settings import get_settings -from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorDatabase +from motor.motor_asyncio import AsyncIOMotorClient def _configure_retry_policies(manager: DLQManager) -> None: @@ -39,8 +40,9 @@ def _configure_retry_policies(manager: DLQManager) -> None: ) manager.set_retry_policy( "websocket-events", - RetryPolicy(topic="websocket-events", strategy=RetryStrategy.FIXED_INTERVAL, - max_retries=10, base_delay_seconds=10), + RetryPolicy( + topic="websocket-events", strategy=RetryStrategy.FIXED_INTERVAL, max_retries=10, base_delay_seconds=10 + ), ) manager.default_retry_policy = RetryPolicy( topic="default", @@ -54,9 +56,11 @@ def _configure_retry_policies(manager: DLQManager) -> None: def _configure_filters(manager: DLQManager, testing: bool) -> None: if not testing: + def filter_test_events(message: DLQMessage) -> bool: event_id = message.event.event_id or "" return not event_id.startswith("test-") + manager.add_filter(filter_test_events) def filter_old_messages(message: DLQMessage) -> bool: @@ -96,13 +100,13 @@ async def alert_on_discard(message: DLQMessage, reason: str) -> None: async def main() -> None: settings = get_settings() - db_client: AsyncIOMotorClient = AsyncIOMotorClient( + db_client: DBClient = AsyncIOMotorClient( settings.MONGODB_URL, tz_aware=True, serverSelectionTimeoutMS=5000, ) db_name = settings.DATABASE_NAME - database: AsyncIOMotorDatabase = db_client[db_name] + database: Database = db_client[db_name] await db_client.admin.command("ping") logger.info(f"Connected to database: {db_name}") @@ -126,6 +130,7 @@ def signal_handler(signum: int, frame: object | None) -> None: signal.signal(signal.SIGTERM, signal_handler) from contextlib import AsyncExitStack + async with AsyncExitStack() as stack: await stack.enter_async_context(manager) stack.callback(db_client.close) diff --git a/backend/workers/run_coordinator.py b/backend/workers/run_coordinator.py index 777d6c36..b97bf6db 100644 --- a/backend/workers/run_coordinator.py +++ b/backend/workers/run_coordinator.py @@ -16,10 +16,7 @@ def main() -> None: setup_logger() # Configure root logger for worker - logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' - ) + logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") logger = logging.getLogger(__name__) logger.info("Starting 
ExecutionCoordinator worker...") @@ -31,7 +28,7 @@ def main() -> None: service_name=GroupId.EXECUTION_COORDINATOR, service_version=settings.TRACING_SERVICE_VERSION, enable_console_exporter=False, - sampling_rate=settings.TRACING_SAMPLING_RATE + sampling_rate=settings.TRACING_SAMPLING_RATE, ) logger.info("Tracing initialized for ExecutionCoordinator") diff --git a/backend/workers/run_event_replay.py b/backend/workers/run_event_replay.py index 8b2de419..3d34ddf5 100644 --- a/backend/workers/run_event_replay.py +++ b/backend/workers/run_event_replay.py @@ -2,6 +2,7 @@ import logging from contextlib import AsyncExitStack +from app.core.database_context import DBClient from app.core.logging import setup_logger from app.core.tracing import init_tracing from app.db.repositories.replay_repository import ReplayRepository @@ -35,11 +36,7 @@ async def run_replay_service() -> None: settings = get_settings() # Create database connection - db_client: AsyncIOMotorClient = AsyncIOMotorClient( - settings.MONGODB_URL, - tz_aware=True, - serverSelectionTimeoutMS=5000 - ) + db_client: DBClient = AsyncIOMotorClient(settings.MONGODB_URL, tz_aware=True, serverSelectionTimeoutMS=5000) db_name = settings.DATABASE_NAME database = db_client[db_name] @@ -52,9 +49,7 @@ async def run_replay_service() -> None: # Initialize services schema_registry = SchemaRegistryManager() - producer_config = ProducerConfig( - bootstrap_servers=settings.KAFKA_BOOTSTRAP_SERVERS - ) + producer_config = ProducerConfig(bootstrap_servers=settings.KAFKA_BOOTSTRAP_SERVERS) producer = UnifiedProducer(producer_config, schema_registry) # Create event store @@ -68,11 +63,7 @@ async def run_replay_service() -> None: replay_repository = ReplayRepository(database) # Create replay service - replay_service = EventReplayService( - repository=replay_repository, - producer=producer, - event_store=event_store - ) + replay_service = EventReplayService(repository=replay_repository, producer=producer, event_store=event_store) logger.info("Event replay service initialized") async with AsyncExitStack() as stack: @@ -80,12 +71,14 @@ async def run_replay_service() -> None: stack.callback(db_client.close) task = asyncio.create_task(cleanup_task(replay_service)) + async def _cancel_task() -> None: task.cancel() try: await task except asyncio.CancelledError: pass + stack.push_async_callback(_cancel_task) await asyncio.Event().wait() @@ -97,10 +90,7 @@ def main() -> None: setup_logger() # Configure root logger for worker - logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' - ) + logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") logger = logging.getLogger(__name__) logger.info("Starting Event Replay Service...") @@ -112,7 +102,7 @@ def main() -> None: service_name="event-replay", service_version=settings.TRACING_SERVICE_VERSION, enable_console_exporter=False, - sampling_rate=settings.TRACING_SAMPLING_RATE + sampling_rate=settings.TRACING_SAMPLING_RATE, ) logger.info("Tracing initialized for Event Replay Service") diff --git a/backend/workers/run_k8s_worker.py b/backend/workers/run_k8s_worker.py index e21cd1df..47c297a5 100644 --- a/backend/workers/run_k8s_worker.py +++ b/backend/workers/run_k8s_worker.py @@ -16,10 +16,7 @@ def main() -> None: setup_logger() # Configure root logger for worker - logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' - ) + logging.basicConfig(level=logging.INFO, 
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") logger = logging.getLogger(__name__) logger.info("Starting KubernetesWorker...") @@ -31,7 +28,7 @@ def main() -> None: service_name=GroupId.K8S_WORKER, service_version=settings.TRACING_SERVICE_VERSION, enable_console_exporter=False, - sampling_rate=settings.TRACING_SAMPLING_RATE + sampling_rate=settings.TRACING_SAMPLING_RATE, ) logger.info("Tracing initialized for KubernetesWorker") diff --git a/backend/workers/run_pod_monitor.py b/backend/workers/run_pod_monitor.py index dd0c41da..74bfb1be 100644 --- a/backend/workers/run_pod_monitor.py +++ b/backend/workers/run_pod_monitor.py @@ -16,10 +16,7 @@ def main() -> None: setup_logger() # Configure root logger for worker - logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' - ) + logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") logger = logging.getLogger(__name__) logger.info("Starting PodMonitor worker...") @@ -31,7 +28,7 @@ def main() -> None: service_name=GroupId.POD_MONITOR, service_version=settings.TRACING_SERVICE_VERSION, enable_console_exporter=False, - sampling_rate=settings.TRACING_SAMPLING_RATE + sampling_rate=settings.TRACING_SAMPLING_RATE, ) logger.info("Tracing initialized for PodMonitor Service") diff --git a/backend/workers/run_result_processor.py b/backend/workers/run_result_processor.py index 9a8c4510..21468431 100644 --- a/backend/workers/run_result_processor.py +++ b/backend/workers/run_result_processor.py @@ -14,10 +14,7 @@ def main() -> None: setup_logger() # Configure root logger for worker - logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' - ) + logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") logger = logging.getLogger(__name__) logger.info("Starting ResultProcessor worker...") @@ -29,7 +26,7 @@ def main() -> None: service_name=GroupId.RESULT_PROCESSOR, service_version=settings.TRACING_SERVICE_VERSION, enable_console_exporter=False, - sampling_rate=settings.TRACING_SAMPLING_RATE + sampling_rate=settings.TRACING_SAMPLING_RATE, ) logger.info("Tracing initialized for ResultProcessor Service") diff --git a/backend/workers/run_saga_orchestrator.py b/backend/workers/run_saga_orchestrator.py index 241a2456..819a297b 100644 --- a/backend/workers/run_saga_orchestrator.py +++ b/backend/workers/run_saga_orchestrator.py @@ -2,6 +2,7 @@ import logging import redis.asyncio as redis +from app.core.database_context import DBClient from app.core.logging import setup_logger from app.core.tracing import init_tracing from app.db.repositories.resource_allocation_repository import ResourceAllocationRepository @@ -26,11 +27,7 @@ async def run_saga_orchestrator() -> None: logger = logging.getLogger(__name__) # Create database connection - db_client: AsyncIOMotorClient = AsyncIOMotorClient( - settings.MONGODB_URL, - tz_aware=True, - serverSelectionTimeoutMS=5000 - ) + db_client: DBClient = AsyncIOMotorClient(settings.MONGODB_URL, tz_aware=True, serverSelectionTimeoutMS=5000) db_name = settings.DATABASE_NAME database = db_client[db_name] @@ -48,19 +45,13 @@ async def run_saga_orchestrator() -> None: # Initialize Kafka producer logger.info("Initializing Kafka producer...") - producer_config = ProducerConfig( - bootstrap_servers=settings.KAFKA_BOOTSTRAP_SERVERS - ) + producer_config = ProducerConfig(bootstrap_servers=settings.KAFKA_BOOTSTRAP_SERVERS) producer = 
UnifiedProducer(producer_config, schema_registry_manager) await producer.start() # Create event store (schema ensured separately) logger.info("Creating event store...") - event_store = create_event_store( - db=database, - schema_registry=schema_registry_manager, - ttl_days=90 - ) + event_store = create_event_store(db=database, schema_registry=schema_registry_manager, ttl_days=90) # Create repository and idempotency manager (Redis-backed) saga_repository = SagaRepository(database) @@ -130,10 +121,7 @@ def main() -> None: setup_logger() # Configure root logger for worker - logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' - ) + logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") logger = logging.getLogger(__name__) logger.info("Starting Saga Orchestrator worker...") @@ -145,7 +133,7 @@ def main() -> None: service_name=GroupId.SAGA_ORCHESTRATOR, service_version=settings.TRACING_SERVICE_VERSION, enable_console_exporter=False, - sampling_rate=settings.TRACING_SAMPLING_RATE + sampling_rate=settings.TRACING_SAMPLING_RATE, ) logger.info("Tracing initialized for Saga Orchestrator Service")
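
For reference, the worker entrypoints touched above (run_coordinator, run_event_replay, run_k8s_worker, run_pod_monitor, run_result_processor, run_saga_orchestrator) all converge on the same bootstrap shape after this patch: setup_logger(), a single-line logging.basicConfig call, then tracing initialisation driven by settings. The sketch below is illustrative only, not part of the patch; it assumes the import paths shown in the hunks (app.core.logging.setup_logger, app.core.tracing.init_tracing, app.settings.get_settings), assumes the keyword arguments shown in the hunks belong to init_tracing, and uses the literal "example-worker" in place of the GroupId enum members the real entrypoints pass.

import logging

from app.core.logging import setup_logger
from app.core.tracing import init_tracing
from app.settings import get_settings


def main() -> None:
    # Application-level logger setup, then the single-line root-logger format
    # this patch standardises across all worker entrypoints.
    setup_logger()
    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    logger = logging.getLogger(__name__)
    logger.info("Starting example worker...")

    settings = get_settings()
    init_tracing(
        service_name="example-worker",  # real entrypoints pass a GroupId member here (assumption for this sketch)
        service_version=settings.TRACING_SERVICE_VERSION,
        enable_console_exporter=False,
        sampling_rate=settings.TRACING_SAMPLING_RATE,
    )
    logger.info("Tracing initialized for example worker")


if __name__ == "__main__":
    main()

The async workers (run_event_replay, run_saga_orchestrator) wrap this same bootstrap around an asyncio entrypoint and use an AsyncExitStack for teardown of the Mongo client, producer, and background tasks, as the hunks above show.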