From 8d85d177b83a679ead06b936b33ce0715d047aa3 Mon Sep 17 00:00:00 2001 From: HardMax71 Date: Sun, 4 Jan 2026 23:53:18 +0100 Subject: [PATCH 1/2] added more docs --- docs/architecture/authentication.md | 2 +- docs/architecture/overview.md | 12 +- docs/architecture/rate-limiting.md | 16 +- docs/architecture/runtime-registry.md | 49 ++++-- docs/components/dead-letter-queue.md | 8 + docs/components/saga/resource-allocation.md | 7 + docs/components/schema-manager.md | 8 + docs/components/sse/execution-sse-flow.md | 7 + docs/components/sse/sse-architecture.md | 9 + .../sse/sse-partitioned-architecture.md | 7 + docs/components/workers/coordinator.md | 74 ++++++++ docs/components/workers/dlq_processor.md | 65 +++++++ docs/components/workers/event_replay.md | 53 ++++++ docs/components/workers/index.md | 59 +++++++ docs/components/workers/k8s_worker.md | 63 +++++++ docs/components/workers/pod_monitor.md | 53 ++++-- docs/components/workers/result_processor.md | 32 +++- docs/components/workers/saga_orchestrator.md | 65 +++++++ docs/frontend/routing.md | 164 +++++++++++++++++ docs/index.md | 24 ++- docs/reference/api-reference.md | 163 ++++++++++++++++- docs/reference/environment-variables.md | 165 ++++++++++++++++++ mkdocs.yml | 8 + 23 files changed, 1069 insertions(+), 44 deletions(-) create mode 100644 docs/components/workers/coordinator.md create mode 100644 docs/components/workers/dlq_processor.md create mode 100644 docs/components/workers/event_replay.md create mode 100644 docs/components/workers/index.md create mode 100644 docs/components/workers/k8s_worker.md create mode 100644 docs/components/workers/saga_orchestrator.md create mode 100644 docs/frontend/routing.md create mode 100644 docs/reference/environment-variables.md diff --git a/docs/architecture/authentication.md b/docs/architecture/authentication.md index 7b97f63c..59000304 100644 --- a/docs/architecture/authentication.md +++ b/docs/architecture/authentication.md @@ -60,7 +60,7 @@ JWTs are signed with HS256 using a secret key from settings: ``` The token payload contains the username in the `sub` claim and an expiration time. Token lifetime is configured via -`ACCESS_TOKEN_EXPIRE_MINUTES` (default: 30 minutes). +`ACCESS_TOKEN_EXPIRE_MINUTES` (default: 24 hours / 1440 minutes). ### CSRF Validation diff --git a/docs/architecture/overview.md b/docs/architecture/overview.md index 2997a652..8d85e7bd 100644 --- a/docs/architecture/overview.md +++ b/docs/architecture/overview.md @@ -1,8 +1,16 @@ # Architecture overview In this file, you can find broad description of main components of project architecture. -Preciser info about peculiarities of separate components (SSE, Kafka topics, DLQ, ..) are in -the [Components](../components/dead-letter-queue.md) section. +For details on specific components, see: + +- [SSE Architecture](../components/sse/sse-architecture.md) +- [Dead Letter Queue](../components/dead-letter-queue.md) +- [Kafka Topics](kafka-topic-architecture.md) +- [Workers](../components/workers/pod_monitor.md) + +!!! note "Event Streaming" + Kafka event streaming is disabled by default (`ENABLE_EVENT_STREAMING=false`). Set this to `true` in your + environment to enable the full event-driven architecture. ## System overview diff --git a/docs/architecture/rate-limiting.md b/docs/architecture/rate-limiting.md index 9d16fbbe..dfd8d1a1 100644 --- a/docs/architecture/rate-limiting.md +++ b/docs/architecture/rate-limiting.md @@ -66,6 +66,10 @@ The platform ships with default rate limits organized by endpoint group. 
Higher Execution endpoints have the strictest limits since they spawn Kubernetes pods. The catch-all API rule (priority 1) applies to any endpoint not matching a more specific pattern. +!!! note "WebSocket rule" + The `/api/v1/ws` pattern is reserved for future WebSocket support. The platform currently uses Server-Sent Events + (SSE) for real-time updates via `/api/v1/events/*`. + ## Middleware Integration The `RateLimitMiddleware` intercepts all HTTP requests, extracts the user identifier, and checks against the configured @@ -135,10 +139,14 @@ Configuration is cached in Redis for 5 minutes to reduce database load while all Rate limiting is controlled by environment variables: -| Variable | Default | Description | -|---------------------------|--------------|---------------------------------------| -| `RATE_LIMIT_ENABLED` | `true` | Enable/disable rate limiting globally | -| `RATE_LIMIT_REDIS_PREFIX` | `ratelimit:` | Redis key prefix for isolation | +| Variable | Default | Description | +|---------------------------|------------------|------------------------------------------------------| +| `RATE_LIMIT_ENABLED` | `true` | Enable/disable rate limiting globally | +| `RATE_LIMIT_REDIS_PREFIX` | `rate_limit:` | Redis key prefix for isolation | +| `RATE_LIMIT_ALGORITHM` | `sliding_window` | Algorithm to use (`sliding_window` or `token_bucket`)| +| `RATE_LIMIT_DEFAULT_REQUESTS` | `100` | Default request limit | +| `RATE_LIMIT_DEFAULT_WINDOW` | `60` | Default window in seconds | +| `RATE_LIMIT_BURST_MULTIPLIER` | `1.5` | Burst multiplier for token bucket | The system gracefully degrades when Redis is unavailable—requests are allowed through rather than failing closed. diff --git a/docs/architecture/runtime-registry.md b/docs/architecture/runtime-registry.md index ba66b62b..45e87f0b 100644 --- a/docs/architecture/runtime-registry.md +++ b/docs/architecture/runtime-registry.md @@ -77,23 +77,50 @@ The example scripts intentionally use features that may not work on older versio compatibility. For instance, Python's match statement (3.10+), Node's `Promise.withResolvers()` (22+), and Go's `clear()` function (1.21+). -## API Endpoint +## API Endpoints -The `/api/v1/languages` endpoint returns the available runtimes: +The runtime information is available via two endpoints: + +### GET /api/v1/k8s-limits + +Returns resource limits and supported runtimes: + +```json +{ + "cpu_limit": "1000m", + "memory_limit": "128Mi", + "cpu_request": "1000m", + "memory_request": "128Mi", + "execution_timeout": 300, + "supported_runtimes": { + "python": {"versions": ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"], "file_ext": "py"}, + "node": {"versions": ["18", "20", "22"], "file_ext": "js"}, + "ruby": {"versions": ["3.1", "3.2", "3.3"], "file_ext": "rb"}, + "go": {"versions": ["1.20", "1.21", "1.22"], "file_ext": "go"}, + "bash": {"versions": ["5.1", "5.2", "5.3"], "file_ext": "sh"} + } +} +``` + +### GET /api/v1/example-scripts + +Returns example scripts for each language: ```json { - "python": {"versions": ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"], "file_ext": "py"}, - "node": {"versions": ["18", "20", "22"], "file_ext": "js"}, - "ruby": {"versions": ["3.1", "3.2", "3.3"], "file_ext": "rb"}, - "go": {"versions": ["1.20", "1.21", "1.22"], "file_ext": "go"}, - "bash": {"versions": ["5.1", "5.2", "5.3"], "file_ext": "sh"} + "scripts": { + "python": "# Python example script...", + "node": "// Node.js example script...", + "ruby": "# Ruby example script...", + "go": "package main...", + "bash": "#!/bin/bash..." 
+ } } ``` ## Key Files -| File | Purpose | -|----------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------| -| [`runtime_registry.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/runtime_registry.py) | Language specifications and runtime config generation | -| [`api/routes/languages.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/api/routes/languages.py) | API endpoint for available languages | +| File | Purpose | +|----------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------| +| [`runtime_registry.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/runtime_registry.py) | Language specifications and runtime config generation | +| [`api/routes/execution.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/api/routes/execution.py) | API endpoints including k8s-limits and example-scripts | diff --git a/docs/components/dead-letter-queue.md b/docs/components/dead-letter-queue.md index dbe8c99e..0cdc59cd 100644 --- a/docs/components/dead-letter-queue.md +++ b/docs/components/dead-letter-queue.md @@ -88,3 +88,11 @@ If sending to DLQ fails (extremely rare - would mean Kafka is down), the produce The system is designed to be resilient but not perfect. In catastrophic scenarios, you still have Kafka's built-in durability and the ability to replay topics from the beginning if needed. +## Key files + +| File | Purpose | +|--------------------------------------------------------------------------------------------------------------------------|------------------------| +| [`dlq_processor.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/workers/dlq_processor.py) | DLQ processor worker | +| [`manager.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/dlq/manager.py) | DLQ management logic | +| [`unified_producer.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/events/unified_producer.py) | `send_to_dlq()` method | +| [`dlq.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/api/routes/dlq.py) | Admin API routes | diff --git a/docs/components/saga/resource-allocation.md b/docs/components/saga/resource-allocation.md index 0a542455..976b15dd 100644 --- a/docs/components/saga/resource-allocation.md +++ b/docs/components/saga/resource-allocation.md @@ -104,3 +104,10 @@ if active_count >= 100: # <- adjust this value ``` Future improvements could make this configurable per-language or dynamically adjustable based on cluster capacity. 
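To make the cap concrete, here is a minimal sketch of what a per-language limit could look like. The `LANGUAGE_LIMITS` mapping and `count_active` helper are hypothetical illustrations, not part of the current codebase:

```python
# Hypothetical per-language cap; the current code uses a single
# hard-coded limit of 100. All names here are illustrative.
LANGUAGE_LIMITS: dict[str, int] = {"python": 60, "go": 30, "bash": 10}
DEFAULT_LIMIT = 100

async def can_allocate(lang: str, count_active) -> bool:
    limit = LANGUAGE_LIMITS.get(lang, DEFAULT_LIMIT)
    active = await count_active(lang)  # e.g. a per-language MongoDB count
    return active < limit
```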
+ +## Key files + +| File | Purpose | +|-----------------------------------------------------------------------------------------------------------------------------------------------|--------------------------| +| [`execution_saga.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/services/saga/execution_saga.py) | Saga with allocation step | +| [`resource_allocation_repository.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/db/repositories/resource_allocation_repository.py) | MongoDB operations | diff --git a/docs/components/schema-manager.md b/docs/components/schema-manager.md index 626a237c..9b22fc15 100644 --- a/docs/components/schema-manager.md +++ b/docs/components/schema-manager.md @@ -33,3 +33,11 @@ During API startup, the `lifespan` function in `dishka_lifespan.py` gets the dat To force a specific MongoDB migration to run again, delete its document from `schema_versions`. To start fresh, point the app at a new database. Migrations are designed to be additive; the system doesn't support automatic rollbacks. If you need to undo a migration in production, you'll have to drop indexes or modify validators manually. For Kafka schemas, the registry keeps all versions. If you break compatibility and need to start over, delete the subject from the registry (either via REST API or the registry's UI if available) and let the app re-register on next startup. + +## Key files + +| File | Purpose | +|--------------------------------------------------------------------------------------------------------------------------------|----------------------------| +| [`schema_manager.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/db/schema/schema_manager.py) | MongoDB migrations | +| [`schema_registry.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/events/schema/schema_registry.py) | Kafka Avro serialization | +| [`dishka_lifespan.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/dishka_lifespan.py) | Startup initialization | diff --git a/docs/components/sse/execution-sse-flow.md b/docs/components/sse/execution-sse-flow.md index 8156f981..a81f057d 100644 --- a/docs/components/sse/execution-sse-flow.md +++ b/docs/components/sse/execution-sse-flow.md @@ -9,3 +9,10 @@ The SSE router maintains a small pool of Kafka consumers and routes only the eve Using `result_stored` as the terminal signal removes artificial waiting. Earlier iterations ended the SSE stream on `execution_completed`/`failed`/`timeout` and slept on the server to "give Mongo time" to commit. That pause is unnecessary once the stream ends only after the result processor confirms persistence. This approach preserves clean attribution and ordering. The coordinator enriches pod creation commands with user information so pods are labeled correctly. The pod monitor converts Kubernetes phases into domain events. Timeout classification is deterministic: any pod finishing with `reason=DeadlineExceeded` results in an `execution_timeout` event. The result processor is the single writer of terminal state, so the UI never races the database — when the browser sees `result_stored`, the result is already present. 
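For illustration, a minimal Python client that waits for the terminal event might look like the sketch below. It assumes a local deployment with a self-signed certificate; the endpoint path matches the API reference, everything else is illustrative:

```python
# Sketch: stream execution events and stop on `result_stored`.
import httpx

async def wait_for_result(execution_id: str) -> None:
    url = f"https://localhost/api/v1/events/executions/{execution_id}"
    async with httpx.AsyncClient(verify=False) as client:  # self-signed cert
        async with client.stream("GET", url) as response:
            event_type = ""
            async for line in response.aiter_lines():
                if line.startswith("event:"):
                    event_type = line.split(":", 1)[1].strip()
                elif line.startswith("data:") and event_type == "result_stored":
                    return  # result is already persisted; safe to fetch now
```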
+ +## Related docs + +- [SSE Architecture](sse-architecture.md) — overall SSE design and components +- [SSE Partitioned Router](sse-partitioned-architecture.md) — consumer pool and scaling +- [Result Processor](../workers/result_processor.md) — terminal event handling +- [Pod Monitor](../workers/pod_monitor.md) — Kubernetes event translation diff --git a/docs/components/sse/sse-architecture.md b/docs/components/sse/sse-architecture.md index 64c0bf47..b6174701 100644 --- a/docs/components/sse/sse-architecture.md +++ b/docs/components/sse/sse-architecture.md @@ -82,3 +82,12 @@ Key metrics: `sse.connections.active`, `sse.messages.sent.total`, `sse.connectio ## Why not WebSockets? WebSockets were initially implemented but removed because SSE is sufficient for server-to-client communication, simpler connection management, better proxy compatibility (many corporate proxies block WebSockets), excellent browser support with automatic reconnection, and works great with HTTP/2 multiplexing. + +## Key files + +| File | Purpose | +|-----------------------------------------------------------------------------------------------------------------------------------|------------------------| +| [`sse_service.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/services/sse/sse_service.py) | Client connections | +| [`kafka_redis_bridge.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/services/sse/kafka_redis_bridge.py) | Kafka-to-Redis routing | +| [`redis_bus.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/services/sse/redis_bus.py) | Redis pub/sub wrapper | +| [`sse_shutdown_manager.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/services/sse/sse_shutdown_manager.py) | Graceful shutdown | diff --git a/docs/components/sse/sse-partitioned-architecture.md b/docs/components/sse/sse-partitioned-architecture.md index 7d465503..7411fa49 100644 --- a/docs/components/sse/sse-partitioned-architecture.md +++ b/docs/components/sse/sse-partitioned-architecture.md @@ -33,3 +33,10 @@ Memory management uses configurable buffer limits: max size, TTL for expiration, The `SSEShutdownManager` complements the router. The router handles the data plane (Kafka consumers, event routing to Redis). The shutdown manager handles the control plane (tracking connections, coordinating graceful shutdown, notifying clients). When the server shuts down, SSE clients receive a shutdown event so they can display messages and attempt reconnection. The shutdown manager implements phased shutdown: notify clients, wait for graceful disconnection, force-close remaining connections. SSE connections register with the shutdown manager and monitor a shutdown event while streaming from Redis. When shutdown triggers, connections send shutdown messages and close gracefully. 
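Reduced to sketch form, the phased shutdown looks roughly like this (the method names on the connection objects are illustrative, not the actual `SSEShutdownManager` API):

```python
# Illustrative three-phase shutdown; real method names may differ.
import asyncio

async def shutdown(connections: set, grace_seconds: float = 10.0) -> None:
    for conn in list(connections):            # phase 1: notify clients
        await conn.send_event("shutdown", {"reconnect": True})
    loop = asyncio.get_running_loop()
    deadline = loop.time() + grace_seconds
    while connections and loop.time() < deadline:
        await asyncio.sleep(0.1)              # phase 2: wait for graceful exits
    for conn in list(connections):            # phase 3: force-close stragglers
        await conn.close()
```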
+ +## Key files + +| File | Purpose | +|-----------------------------------------------------------------------------------------------------------------------------------|----------------------| +| [`kafka_redis_bridge.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/services/sse/kafka_redis_bridge.py) | Consumer pool router | +| [`sse_shutdown_manager.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/services/sse/sse_shutdown_manager.py) | Connection tracking | diff --git a/docs/components/workers/coordinator.md b/docs/components/workers/coordinator.md new file mode 100644 index 00000000..1020c3b6 --- /dev/null +++ b/docs/components/workers/coordinator.md @@ -0,0 +1,74 @@ +# Coordinator + +The coordinator owns admission and queuing policy for executions. It decides which executions can proceed based on +available resources and enforces per-user limits to prevent any single user from monopolizing the system. + +```mermaid +graph LR + Kafka[(Kafka)] --> Coord[Coordinator] + Coord --> Queue[Priority Queue] + Coord --> Resources[Resource Pool] + Coord --> Accepted[Accepted Events] + Accepted --> Kafka +``` + +## How it works + +When an `ExecutionRequested` event arrives, the coordinator checks: + +1. Is the queue full? (max 10,000 pending) +2. Has this user exceeded their limit? (max 100 concurrent) +3. Are there enough CPU and memory resources? + +If all checks pass, the coordinator allocates resources and publishes `ExecutionAccepted`. Otherwise, the request +is either queued for later or rejected. + +The coordinator runs a background scheduling loop that continuously pulls from the priority queue and attempts to +schedule pending executions as resources become available. + +## Priority queue + +Executions are processed in priority order. Lower numeric values are processed first: + +```python +--8<-- "backend/app/services/coordinator/queue_manager.py:14:19" +``` + +When resources are unavailable, executions are requeued with reduced priority to prevent starvation. 
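A toy version of that requeue-with-aging behavior (the real implementation lives in `queue_manager.py`; this sketch only shows the idea, with illustrative names):

```python
# Toy aging requeue: lower numeric priority is scheduled first, so
# decrementing on each failed attempt moves waiting work forward.
import heapq
import itertools

_counter = itertools.count()  # tie-breaker for stable heap ordering
_queue: list[tuple[int, int, str]] = []

def enqueue(priority: int, execution_id: str) -> None:
    heapq.heappush(_queue, (priority, next(_counter), execution_id))

def requeue(priority: int, execution_id: str) -> None:
    enqueue(max(priority - 1, 0), execution_id)  # age toward the front
```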
+ +## Resource management + +The coordinator tracks a pool of CPU and memory resources: + +| Parameter | Default | Description | +|---------------------------|---------|----------------------------| +| `total_cpu_cores` | 32 | Total CPU pool | +| `total_memory_mb` | 65,536 | Total memory pool (64GB) | +| `overcommit_factor` | 1.2 | Allow 20% overcommit | +| `max_queue_size` | 10,000 | Maximum pending executions | +| `max_executions_per_user` | 100 | Per-user limit | +| `stale_timeout_seconds` | 3,600 | Stale execution timeout | + +## Topics + +- **Consumes**: `execution_events` (requested, completed, failed, cancelled) +- **Produces**: `execution_events` (accepted) + +## Key files + +| File | Purpose | +|--------------------------------------------------------------------------------------------------------------------------------|-------------------------------| +| [`run_coordinator.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/workers/run_coordinator.py) | Entry point | +| [`coordinator.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/services/coordinator/coordinator.py) | Main coordinator service | +| [`queue_manager.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/services/coordinator/queue_manager.py) | Priority queue implementation | +| [`resource_manager.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/services/coordinator/resource_manager.py) | Resource pool and allocation | + +## Deployment + +```yaml +coordinator: + build: + dockerfile: workers/Dockerfile.coordinator +``` + +Usually runs as a single replica. Leader election via Redis is available if scaling is needed. diff --git a/docs/components/workers/dlq_processor.md b/docs/components/workers/dlq_processor.md new file mode 100644 index 00000000..3e7e7497 --- /dev/null +++ b/docs/components/workers/dlq_processor.md @@ -0,0 +1,65 @@ +# DLQ Processor + +The DLQ (Dead Letter Queue) processor drains and retries dead-lettered messages with exponential backoff. When a +message fails processing too many times, it ends up in the DLQ for manual inspection and retry. + +For the full picture on how dead-lettering works, see [Dead Letter Queue](../dead-letter-queue.md). + +```mermaid +graph LR + DLQ[(Dead Letter Queue)] --> Processor[DLQ Processor] + Processor --> Original[(Original Topic)] + Processor --> Archive[(Archive)] +``` + +## How it works + +When a Kafka consumer fails to process a message after exhausting its retries, the message is sent to the dead letter +queue topic. The DLQ processor picks up these messages and applies a retry policy: + +1. Check if the message has exceeded max retry attempts +2. If not, wait for the backoff delay +3. Republish to the original topic +4. If successful, remove from DLQ +5. If max attempts exceeded, archive the message + +The processor uses exponential backoff — each retry waits longer than the previous one, up to a maximum delay. This +prevents overwhelming downstream services during outages. 
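The delay math itself is the standard doubling pattern, using the settings documented below (the real processor may also add jitter; this is just the core formula):

```python
# Core exponential backoff formula using the documented DLQ defaults.
def retry_delay(attempt: int, base: float = 60.0, cap: float = 3600.0) -> float:
    """Delay before retry `attempt` (1-based): 60s, 120s, 240s, ... capped at 1h."""
    return min(base * 2 ** (attempt - 1), cap)
```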
+ +## Configuration + +| Variable | Default | Description | +|--------------------------------|---------|-------------------------------| +| `DLQ_RETRY_MAX_ATTEMPTS` | 5 | Maximum retry attempts | +| `DLQ_RETRY_BASE_DELAY_SECONDS` | 60 | Base retry delay | +| `DLQ_RETRY_MAX_DELAY_SECONDS` | 3,600 | Maximum retry delay (1 hour) | +| `DLQ_RETENTION_DAYS` | 7 | Message retention | +| `DLQ_WARNING_THRESHOLD` | 100 | Threshold for warning alerts | +| `DLQ_CRITICAL_THRESHOLD` | 1,000 | Threshold for critical alerts | + +## Monitoring + +The DLQ can be monitored via the admin API: + +- `GET /api/v1/dlq/stats` — DLQ statistics by status, topic, event type +- `GET /api/v1/dlq/messages` — List DLQ messages with filtering +- `POST /api/v1/dlq/retry` — Manually retry messages +- `DELETE /api/v1/dlq/messages/{id}` — Discard a message + +## Key files + +| File | Purpose | +|-----------------------------------------------------------------------------------------------------------|----------------------| +| [`dlq_processor.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/workers/dlq_processor.py) | Entry point | +| [`manager.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/dlq/manager.py) | DLQ management logic | +| [`dlq.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/api/routes/dlq.py) | Admin API routes | + +## Deployment + +```yaml +dlq-processor: + build: + dockerfile: workers/Dockerfile.dlq_processor +``` + +Usually runs as a single replica. The processor is designed to handle periodic retries, not real-time processing. diff --git a/docs/components/workers/event_replay.md b/docs/components/workers/event_replay.md new file mode 100644 index 00000000..1695ba3c --- /dev/null +++ b/docs/components/workers/event_replay.md @@ -0,0 +1,53 @@ +# Event Replay + +The event replay worker re-emits stored events for debugging or rebuilding projections. It reads from the MongoDB +event store and republishes events to Kafka, optionally filtering by time range, event types, or aggregates. + +```mermaid +graph LR + Mongo[(MongoDB)] --> Replay[Event Replay] + Replay --> Kafka[(Kafka)] +``` + +## How it works + +Event replay is typically triggered via the admin API. You specify a replay configuration with filters: + +- Time range (start/end timestamps) +- Event types to include +- Specific aggregate IDs (like execution IDs) + +The worker reads matching events from the event store in chronological order and republishes them to their original +topics. Each replayed event includes a provenance marker so consumers can distinguish replays from live events. + +## Use cases + +**Debugging** — Replay events for a specific execution to understand what happened step by step. + +**Rebuilding projections** — If a read model gets corrupted or a new projection is added, replay historical events +to rebuild state. + +**Testing** — Replay production events in a test environment to verify behavior. + +## Dry-run mode + +The worker supports dry-run mode where events are loaded and validated but not actually published. This helps verify +filter criteria before running a real replay. 
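As an illustration, a replay configuration could look like the payload below. The field names are assumptions for the sake of the example; the authoritative schema lives in `replay_service.py`:

```python
# Illustrative replay request; exact field names are defined in the backend.
replay_config = {
    "start_time": "2026-01-01T00:00:00Z",
    "end_time": "2026-01-02T00:00:00Z",
    "event_types": ["execution_requested", "execution_completed"],
    "aggregate_ids": ["exec-1234"],  # replay a single execution's history
    "dry_run": True,                 # validate filters without publishing
}
```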
+ +## Key files + +| File | Purpose | +|----------------------------------------------------------------------------------------------------------------|---------------------------| +| [`run_event_replay.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/workers/run_event_replay.py) | Entry point | +| [`replay_service.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/services/replay_service.py) | Replay session management | +| [`event_store.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/events/event_store.py) | Event store queries | + +## Deployment + +```yaml +event-replay: + build: + dockerfile: workers/Dockerfile.event_replay +``` + +Typically runs on-demand with 0 replicas. Scale up when replay is needed, scale back down when complete. diff --git a/docs/components/workers/index.md b/docs/components/workers/index.md new file mode 100644 index 00000000..fe14d663 --- /dev/null +++ b/docs/components/workers/index.md @@ -0,0 +1,59 @@ +# Workers + +The execution pipeline is split across seven background workers, each running as a separate container. This separation +keeps concerns isolated - the API doesn't block waiting for pods to finish, the saga orchestrator doesn't care how pods +are created, and the result processor doesn't know anything about Kubernetes. + +Workers communicate through Kafka. Each publishes events when it completes work, and subscribes to events from upstream. +MongoDB and Redis provide shared state where needed. + +```mermaid +graph LR + API[Backend API] -->|execution_requested| Kafka[(Kafka)] + Kafka --> Coord[Coordinator] + Coord -->|execution_accepted| Kafka + Kafka --> Saga[Saga Orchestrator] + Saga -->|create_pod_command| Kafka + Kafka --> K8s[K8s Worker] + K8s -->|pod_created| Kafka + K8s --> Pod[Kubernetes Pod] + Pod --> PodMon[Pod Monitor] + PodMon -->|execution_completed| Kafka + Kafka --> Result[Result Processor] + Result --> Mongo[(MongoDB)] +``` + +## The workers + +| Worker | What it does | Entry point | +|-------------------------------------------|-----------------------------------------------------------|----------------------------| +| [Coordinator](coordinator.md) | Admits executions, manages the queue, allocates resources | `run_coordinator.py` | +| [Saga Orchestrator](saga_orchestrator.md) | Drives the execution state machine, issues pod commands | `run_saga_orchestrator.py` | +| [K8s Worker](k8s_worker.md) | Creates ConfigMaps and Pods with security hardening | `run_k8s_worker.py` | +| [Pod Monitor](pod_monitor.md) | Watches pods, translates K8s events to domain events | `run_pod_monitor.py` | +| [Result Processor](result_processor.md) | Persists execution results, cleans up resources | `run_result_processor.py` | +| [Event Replay](event_replay.md) | Re-emits historical events for debugging | `run_event_replay.py` | +| [DLQ Processor](dlq_processor.md) | Retries failed messages from the dead letter queue | `dlq_processor.py` | + +All entry points live in [`backend/workers/`](https://github.com/HardMax71/Integr8sCode/tree/main/backend/workers). + +## Running locally + +Docker Compose starts everything: + +```bash +./deploy.sh dev +``` + +For debugging a specific worker, run it directly: + +```bash +cd backend +python -m workers.run_coordinator +``` + +## Scaling + +Most workers can run as single replicas. The stateful ones (Coordinator, Saga Orchestrator) use event sourcing to +recover after restarts. 
The stateless ones (K8s Worker, Pod Monitor, Result Processor) can scale horizontally if +throughput becomes an issue. diff --git a/docs/components/workers/k8s_worker.md b/docs/components/workers/k8s_worker.md new file mode 100644 index 00000000..4513b40c --- /dev/null +++ b/docs/components/workers/k8s_worker.md @@ -0,0 +1,63 @@ +# K8s Worker + +The K8s worker materializes saga commands into Kubernetes resources. When the [Saga Orchestrator](saga_orchestrator.md) +decides a pod should be created, the K8s worker handles the actual creation with proper security hardening. + +```mermaid +graph LR + Kafka[(Kafka)] --> Worker[K8s Worker] + Worker --> K8s[Kubernetes API] + Worker --> Events[Execution Events] + Events --> Kafka +``` + +## How it works + +The worker consumes `CreatePodCommand` and `DeletePodCommand` events from the saga commands topic. For pod creation, +it builds a complete pod specification including: + +- A ConfigMap containing the user's script and an entrypoint script +- A Pod manifest with hardened security context +- Proper labels for tracking and network policy matching + +After creating resources, it publishes `PodCreated` and `ExecutionStarted` events so the rest of the system knows +the execution has begun. + +## Pod security + +Every pod is created with defense-in-depth security: + +| Control | Implementation | +|--------------------|-----------------------------------------------| +| Non-root execution | UID/GID 1000, no privilege escalation | +| Filesystem | Read-only root filesystem | +| Capabilities | All Linux capabilities dropped | +| Seccomp | RuntimeDefault profile | +| Network | Cilium deny-all egress policy (cluster-level) | +| DNS | Disabled | +| Service account | No token mounted | + +The worker refuses to run pods in the `default` namespace to ensure network policies are always applied. + +## Topics + +- **Consumes**: `saga_commands` +- **Produces**: `execution_events`, `pod_events` + +## Key files + +| File | Purpose | +|---------------------------------------------------------------------------------------------------------------------|---------------------| +| [`run_k8s_worker.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/workers/run_k8s_worker.py) | Entry point | +| [`worker.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/services/k8s_worker/worker.py) | Worker service | +| [`pod_builder.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/services/k8s_worker/pod_builder.py) | Pod spec generation | + +## Deployment + +```yaml +k8s-worker: + build: + dockerfile: workers/Dockerfile.k8s_worker +``` + +Scale based on pod creation throughput. Each instance needs access to the Kubernetes API via kubeconfig. diff --git a/docs/components/workers/pod_monitor.md b/docs/components/workers/pod_monitor.md index cf723f13..de6fcb89 100644 --- a/docs/components/workers/pod_monitor.md +++ b/docs/components/workers/pod_monitor.md @@ -1,20 +1,24 @@ -# Pod monitor +# Pod Monitor -## Overview +The pod monitor is a bridge between Kubernetes and the event system. It watches for pods created with specific labels +(execution pods) and translates what happens to them into events that the rest of the system understands. -The pod monitor is a bridge between Kubernetes and the event system. It watches for pods created with specific labels (execution pods) and translates what happens to them into events that the rest of the system understands. 
- -Think of it as a translator sitting between Kubernetes speaking its native language of pod phases and conditions, and the application speaking in terms of execution events. +Think of it as a translator sitting between Kubernetes speaking its native language of pod phases and conditions, +and the application speaking in terms of execution events. ## How it works -When an execution starts, the system creates a Kubernetes pod to run the user's code. The pod monitor watches the Kubernetes API using the watch mechanism — like a WebSocket connection that streams pod changes as they happen. - -As pods go through their lifecycle (getting scheduled, starting to run, completing or failing), the monitor captures these state changes and maps them to domain events. A pod becoming "Running" in Kubernetes becomes an `ExecutionRunningEvent`. A pod that completes becomes either `ExecutionCompletedEvent` or `ExecutionFailedEvent` depending on the exit code. +When an execution starts, the system creates a Kubernetes pod to run the user's code. The pod monitor watches the +Kubernetes API using the watch mechanism — like a WebSocket connection that streams pod changes as they happen. -The clever part is log extraction. When a pod terminates, the monitor fetches container logs, parses them to extract stdout, stderr, and resource usage metrics printed in a special JSON format, then includes all this data in the completion event. +As pods go through their lifecycle (getting scheduled, starting to run, completing or failing), the monitor captures +these state changes and maps them to domain events. A pod becoming "Running" in Kubernetes becomes an +`ExecutionRunningEvent`. A pod that completes becomes either `ExecutionCompletedEvent` or `ExecutionFailedEvent` +depending on the exit code. -## System flow +The clever part is log extraction. When a pod terminates, the monitor fetches container logs, parses them to extract +stdout, stderr, and resource usage metrics printed in a special JSON format, then includes all this data in the +completion event. ```mermaid graph LR @@ -26,10 +30,33 @@ graph LR Result --> DB[(MongoDB)] ``` -Without the pod monitor, there would be no way to know when executions finish or what their output was. It's the eyes and ears of the system for everything happening in Kubernetes. +Without the pod monitor, there would be no way to know when executions finish or what their output was. It's the eyes +and ears of the system for everything happening in Kubernetes. -The service runs as a standalone container because it needs to maintain a persistent watch connection to the Kubernetes API. If it was part of the main backend, every restart would break the watch and potentially miss pod events. +The service runs as a standalone container because it needs to maintain a persistent watch connection to the +Kubernetes API. If it was part of the main backend, every restart would break the watch and potentially miss +pod events. ## Reconciliation -Every five minutes, the monitor queries Kubernetes for all pods matching the labels and compares them to what it thinks it's tracking. This catches any events that might have been missed due to network issues or restarts — like doing a periodic inventory check to make sure nothing slipped through. +Every five minutes, the monitor queries Kubernetes for all pods matching the labels and compares them to what it +thinks it's tracking. 
This catches any events that might have been missed due to network issues or restarts — like +doing a periodic inventory check to make sure nothing slipped through. + +## Key files + +| File | Purpose | +|-----------------------------------------------------------------------------------------------------------------------|-----------------------| +| [`run_pod_monitor.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/workers/run_pod_monitor.py) | Entry point | +| [`monitor.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/services/pod_monitor/monitor.py) | Watch and reconcile | +| [`event_mapper.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/services/pod_monitor/event_mapper.py) | K8s → domain mapping | + +## Deployment + +```yaml +pod-monitor: + build: + dockerfile: workers/Dockerfile.pod_monitor +``` + +Runs as a single replica. The watch connection handles high throughput well. diff --git a/docs/components/workers/result_processor.md b/docs/components/workers/result_processor.md index b0e4cc66..dc24d130 100644 --- a/docs/components/workers/result_processor.md +++ b/docs/components/workers/result_processor.md @@ -1,8 +1,8 @@ -# Result processor +# Result Processor -## Overview - -The result processor consumes execution completion events from Kafka and persists results to MongoDB. It's the final step in the execution pipeline — once a pod finishes and the pod monitor publishes the outcome, the result processor stores everything and notifies downstream consumers. +The result processor consumes execution completion events from Kafka and persists results to MongoDB. It's the final +step in the execution pipeline — once a pod finishes and the [Pod Monitor](pod_monitor.md) publishes the outcome, +the result processor stores everything and notifies downstream consumers. ```mermaid graph LR @@ -15,16 +15,32 @@ graph LR ## What it does -When an `ExecutionCompletedEvent`, `ExecutionFailedEvent`, or `ExecutionTimeoutEvent` arrives, the processor stores the execution output (stdout, stderr, exit codes) in the `execution_results` collection and updates the execution status in the `executions` collection. It also records metrics like duration and memory usage. +When an `ExecutionCompletedEvent`, `ExecutionFailedEvent`, or `ExecutionTimeoutEvent` arrives, the processor stores +the execution output (stdout, stderr, exit codes) in the `execution_results` collection and updates the execution +status in the `executions` collection. It also records metrics like duration and memory usage. -After persisting, the processor publishes a `ResultStoredEvent` so downstream consumers (like SSE streams) know the execution has finished and results are available. +After persisting, the processor publishes a `ResultStoredEvent` so downstream consumers (like SSE streams) know the +execution has finished and results are available. -## Deployment +## Resource cleanup + +The processor also handles cleanup after executions complete. The [`resource_cleaner.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/services/result_processor/resource_cleaner.py) +module deletes ConfigMaps and pods that are no longer needed, keeping the Kubernetes namespace tidy. + +## Key files -The processor runs as a standalone container in the `result-processor-group` consumer group, subscribing to the `EXECUTION_RESULTS` topic. It depends on Kafka for event consumption, MongoDB for storage, and Schema Registry for event deserialization. 
+| File | Purpose | +|--------------------------------------------------------------------------------------------------------------------------------|------------------| +| [`run_result_processor.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/workers/run_result_processor.py) | Entry point | +| [`processor.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/services/result_processor/processor.py) | Result handling | +| [`resource_cleaner.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/services/result_processor/resource_cleaner.py) | K8s cleanup | + +## Deployment ```yaml result-processor: build: dockerfile: workers/Dockerfile.result_processor ``` + +Runs in the `result-processor-group` consumer group. Can scale horizontally if result throughput becomes a bottleneck. diff --git a/docs/components/workers/saga_orchestrator.md b/docs/components/workers/saga_orchestrator.md new file mode 100644 index 00000000..6507783c --- /dev/null +++ b/docs/components/workers/saga_orchestrator.md @@ -0,0 +1,65 @@ +# Saga Orchestrator + +The saga orchestrator is the stateful choreographer for the execution lifecycle. It coordinates multi-step workflows +by subscribing to execution events and publishing saga commands, ensuring that complex operations complete correctly +or get properly compensated on failure. + +```mermaid +graph LR + Kafka[(Kafka)] --> Saga[Saga Orchestrator] + Saga --> Commands[Saga Commands] + Commands --> Kafka + Saga --> Mongo[(MongoDB)] +``` + +## How it works + +When an execution request comes in, the saga orchestrator creates a new saga instance and drives it through its +lifecycle. Each saga tracks which steps have been completed and what compensation actions are needed if something +fails. + +The orchestrator issues commands like `CreatePodCommand` and `DeletePodCommand` to the [K8s Worker](k8s_worker.md). +It watches for responses and advances the saga state accordingly. If a step fails or times out, it triggers +compensation — like deleting a partially created pod. + +The clever part is idempotency. The orchestrator reconstructs saga state from events on restart, so it can resume +interrupted workflows without duplicate side effects. If a pod was already created, it won't try to create it again. 
+ +## Saga states + +```mermaid +stateDiagram-v2 + [*] --> Started: execution_requested + Started --> Running: pod_created + Running --> Completed: execution_completed + Running --> Failed: execution_failed + Running --> TimedOut: timeout + Failed --> Compensating: start_compensation + TimedOut --> Compensating: start_compensation + Compensating --> Compensated: cleanup_complete + Completed --> [*] + Compensated --> [*] +``` + +## Topics + +- **Consumes**: `execution_events`, saga-related topics +- **Produces**: `saga_commands` + +## Key files + +| File | Purpose | +|-------------------------------------------------------------------------------------------------------------------------|---------------------------| +| [`run_saga_orchestrator.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/workers/run_saga_orchestrator.py) | Entry point | +| [`saga_service.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/services/saga/saga_service.py) | Saga state management | +| [`execution_saga.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/services/saga/execution_saga.py) | Execution saga definition | + +## Deployment + +```yaml +saga-orchestrator: + build: + dockerfile: workers/Dockerfile.saga_orchestrator +``` + +The orchestrator runs as a single replica since it's stateful. Event sourcing allows recovery after restarts. diff --git a/docs/frontend/routing.md b/docs/frontend/routing.md new file mode 100644 index 00000000..97a44512 --- /dev/null +++ b/docs/frontend/routing.md @@ -0,0 +1,164 @@ +# Frontend Routing + +The frontend is a Svelte 5 single-page application using `@mateothegreat/svelte5-router` for client-side routing. +Routes are defined in `App.svelte` with authentication guards for protected pages. + +## Route Overview + +| Route | Component | Auth Required | Admin Required | Description | +|-------------------|------------------------|---------------|----------------|----------------------------| +| `/` | `Home.svelte` | No | No | Landing page with features | +| `/login` | `Login.svelte` | No | No | User login form | +| `/register` | `Register.svelte` | No | No | User registration form | +| `/privacy` | `Privacy.svelte` | No | No | Privacy policy | +| `/editor` | `Editor.svelte` | Yes | No | Code execution interface | +| `/settings` | `Settings.svelte` | Yes | No | User preferences | +| `/notifications` | `Notifications.svelte` | Yes | No | Notification center | +| `/admin/events` | `AdminEvents.svelte` | Yes | Yes | Event browser | +| `/admin/sagas` | `AdminSagas.svelte` | Yes | Yes | Saga monitoring | +| `/admin/users` | `AdminUsers.svelte` | Yes | Yes | User management | +| `/admin/settings` | `AdminSettings.svelte` | Yes | Yes | System settings | + +## Route Configuration + +Routes are defined in `src/App.svelte` using the router's declarative syntax: + +```typescript +--8<-- "frontend/src/App.svelte:69:85" +``` + +## Authentication Guard + +The `requireAuth` hook checks the `isAuthenticated` store before allowing navigation: + +```typescript +--8<-- "frontend/src/App.svelte:55:67" +``` + +When an unauthenticated user tries to access a protected route: + +1. The original URL is saved to `sessionStorage` +2. User is redirected to `/login` +3. After successful login, user is redirected back to the original URL + +## Admin Role Check + +Admin routes perform an additional role check in the component: + +```typescript +// In AdminEvents.svelte, AdminUsers.svelte, etc. 
+import { userRole } from '$stores/auth'; +import { get } from 'svelte/store'; + +if (get(userRole) !== 'ADMIN') { + goto('/editor'); +} +``` + +## Navigation + +### Programmatic Navigation + +Use the `goto` function for programmatic navigation: + +```typescript +import { goto } from '@mateothegreat/svelte5-router'; + +// Navigate to editor +goto('/editor'); + +// Navigate with redirect after login +sessionStorage.setItem('redirectAfterLogin', '/editor'); +goto('/login'); +``` + +### Link Navigation + +Use the `route` directive for link navigation: + +```svelte +Open Editor +Event Browser +``` + +## Route Components + +### Public Pages + +| Component | Path | Purpose | +|-------------------|------------------------------|-------------------------------------| +| `Home.svelte` | `src/routes/Home.svelte` | Landing page with features overview | +| `Login.svelte` | `src/routes/Login.svelte` | Login form with credential handling | +| `Register.svelte` | `src/routes/Register.svelte` | Registration form with validation | +| `Privacy.svelte` | `src/routes/Privacy.svelte` | Privacy policy content | + +### Protected Pages + +| Component | Path | Purpose | +|------------------------|-----------------------------------|----------------------------------| +| `Editor.svelte` | `src/routes/Editor.svelte` | Code editor with execution | +| `Settings.svelte` | `src/routes/Settings.svelte` | User preferences (theme, editor) | +| `Notifications.svelte` | `src/routes/Notifications.svelte` | Notification list and management | + +### Admin Pages + +| Component | Path | Purpose | +|------------------------|-----------------------------------------|----------------------------| +| `AdminEvents.svelte` | `src/routes/admin/AdminEvents.svelte` | Event browser with filters | +| `AdminSagas.svelte` | `src/routes/admin/AdminSagas.svelte` | Saga status monitoring | +| `AdminUsers.svelte` | `src/routes/admin/AdminUsers.svelte` | User CRUD and rate limits | +| `AdminSettings.svelte` | `src/routes/admin/AdminSettings.svelte` | System configuration | + +## State Management + +Routes interact with several Svelte stores: + +| Store | Location | Purpose | +|-------------------|-------------------------------|---------------------------------| +| `isAuthenticated` | `stores/auth.ts` | Authentication state | +| `username` | `stores/auth.ts` | Current username | +| `userRole` | `stores/auth.ts` | User role (USER/ADMIN) | +| `csrfToken` | `stores/auth.ts` | CSRF token for API calls | +| `theme` | `stores/theme.ts` | Current theme (light/dark/auto) | +| `notifications` | `stores/notificationStore.ts` | Notification list | + +## SSE Connections + +Protected routes may establish SSE connections for real-time updates: + +- **Editor**: Connects to `/api/v1/events/executions/{id}` during execution +- **Notifications**: Connects to `/api/v1/events/notifications/stream` + +Connections are managed in the component lifecycle: + +```typescript +onMount(() => { + // Establish SSE connection + const eventSource = new EventSource('/api/v1/events/notifications/stream'); + eventSource.onmessage = (event) => { + // Handle notification + }; + + return () => { + eventSource.close(); + }; +}); +``` + +## Deep Linking + +The router supports deep linking with proper URL handling: + +- Direct URL access works for all routes +- 404 handling redirects to home or shows error +- Query parameters are preserved across navigation + +## Key Files + +| File | Purpose | +|----------------------------------------|---------------------------------| +| 
`src/App.svelte` | Route definitions and layout | +| `src/stores/auth.ts` | Authentication state | +| `src/lib/auth-init.ts` | Auth initialization on app load | +| `src/components/Header.svelte` | Navigation links | +| `src/components/ProtectedRoute.svelte` | Route protection wrapper | diff --git a/docs/index.md b/docs/index.md index 721d932f..878d7fde 100644 --- a/docs/index.md +++ b/docs/index.md @@ -19,6 +19,10 @@ cd Integr8sCode | Backend API | `https://localhost:443` | — | | Grafana | `http://localhost:3000` | admin / admin123 | +!!! warning "Default credentials" + The default credentials above are for **development only**. In production, set secure passwords via environment + variables and disable the user-seed job. + Verify the backend is running: ```bash @@ -93,17 +97,29 @@ For detailed architecture diagrams, see the [Architecture](architecture/overview Complete REST and SSE endpoint documentation -- :material-cog: **[Components](components/dead-letter-queue.md)** +- :material-cog: **[Components](components/workers/index.md)** + + --- + + Workers, SSE, DLQ, and Schema management + +- :material-wrench: **[Operations](operations/deployment.md)** + + --- + + Deployment, tracing, and monitoring + +- :material-variable: **[Environment Variables](reference/environment-variables.md)** --- - SSE, Workers, DLQ, and Schema management + Complete configuration reference -- :material-wrench: **[Operations](operations/tracing.md)** +- :material-monitor: **[Frontend](frontend/routing.md)** --- - Tracing, metrics, monitoring, and troubleshooting + Frontend routing and components diff --git a/docs/reference/api-reference.md b/docs/reference/api-reference.md index 2a9b1d58..ab8a2b75 100644 --- a/docs/reference/api-reference.md +++ b/docs/reference/api-reference.md @@ -1,3 +1,164 @@ -# API reference +# API Reference + +The API is served at `/api/v1/` and uses cookie-based JWT authentication with CSRF protection. 
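A minimal client session looks roughly like the sketch below. The cookie name, CSRF header name, and request body fields are assumptions for illustration; check the backend's auth middleware and request schemas for the exact names:

```python
# Hedged sketch of the cookie + CSRF flow; names below are assumptions.
import httpx

with httpx.Client(base_url="https://localhost/api/v1", verify=False) as client:
    client.post("/auth/login", data={"username": "alice", "password": "secret"})
    csrf = client.cookies.get("csrf_token")  # cookie name may differ
    resp = client.post(
        "/execute",
        json={"script": "print('hi')", "lang": "python", "lang_version": "3.12"},
        headers={"X-CSRF-Token": csrf or ""},  # header name may differ
    )
    print(resp.json())
```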
+ +## Quick Reference + +### Authentication + +| Method | Endpoint | Description | +|--------|----------|-------------| +| POST | `/auth/login` | Login with username/password | +| POST | `/auth/register` | Register new user | +| POST | `/auth/logout` | Logout and clear cookies | +| GET | `/auth/verify-token` | Verify current token | +| GET | `/auth/me` | Get current user profile | + +### Execution + +| Method | Endpoint | Description | +|--------|----------|-------------| +| POST | `/execute` | Execute a script | +| GET | `/result/{execution_id}` | Get execution result | +| POST | `/{execution_id}/cancel` | Cancel running execution | +| POST | `/{execution_id}/retry` | Retry execution | +| DELETE | `/{execution_id}` | Delete execution (admin) | +| GET | `/user/executions` | List user's executions | +| GET | `/executions/{execution_id}/events` | Get execution events | +| GET | `/k8s-limits` | Get resource limits and runtimes | +| GET | `/example-scripts` | Get example scripts | + +### Events & SSE + +| Method | Endpoint | Description | +|--------|----------|-------------| +| GET | `/events/executions/{execution_id}` | SSE stream for execution | +| GET | `/events/notifications/stream` | SSE stream for notifications | +| GET | `/events/health` | SSE service health | +| GET | `/events/user` | Get user's events | +| GET | `/events/{event_id}` | Get specific event | +| POST | `/events/query` | Query events with filters | +| GET | `/events/correlation/{correlation_id}` | Get events by correlation ID | +| GET | `/events/statistics` | Event statistics | +| GET | `/events/types/list` | List event types | + +### Saved Scripts + +| Method | Endpoint | Description | +|--------|----------|-------------| +| GET | `/scripts` | List saved scripts | +| POST | `/scripts` | Create saved script | +| GET | `/scripts/{script_id}` | Get saved script | +| PUT | `/scripts/{script_id}` | Update saved script | +| DELETE | `/scripts/{script_id}` | Delete saved script | + +### Notifications + +| Method | Endpoint | Description | +|--------|----------|-------------| +| GET | `/notifications` | List notifications | +| PUT | `/notifications/{id}/read` | Mark as read | +| POST | `/notifications/mark-all-read` | Mark all as read | +| DELETE | `/notifications/{id}` | Delete notification | +| GET | `/notifications/subscriptions` | Get subscriptions | +| PUT | `/notifications/subscriptions/{channel}` | Update subscription | +| GET | `/notifications/unread-count` | Get unread count | + +### Sagas + +| Method | Endpoint | Description | +|--------|----------|-------------| +| GET | `/sagas/` | List sagas | +| GET | `/sagas/{saga_id}` | Get saga status | +| GET | `/sagas/execution/{execution_id}` | Get sagas for execution | +| POST | `/sagas/{saga_id}/cancel` | Cancel saga | + +### User Settings + +| Method | Endpoint | Description | +|--------|----------|-------------| +| GET | `/user-settings` | Get user settings | +| PUT | `/user-settings` | Update user settings | +| PUT | `/user-settings/theme` | Update theme | +| PUT | `/user-settings/editor` | Update editor settings | + +### Replay + +| Method | Endpoint | Description | +|--------|----------|-------------| +| POST | `/replay/sessions` | Create replay session | +| POST | `/replay/sessions/{id}/start` | Start session | +| POST | `/replay/sessions/{id}/pause` | Pause session | +| POST | `/replay/sessions/{id}/resume` | Resume session | +| POST | `/replay/sessions/{id}/cancel` | Cancel session | +| GET | `/replay/sessions` | List sessions | +| GET | `/replay/sessions/{id}` | Get 
session details | +| POST | `/replay/cleanup` | Clean up old sessions | + +### Dead Letter Queue + +| Method | Endpoint | Description | +|--------|----------|-------------| +| GET | `/dlq/stats` | DLQ statistics | +| GET | `/dlq/messages` | List DLQ messages | +| GET | `/dlq/messages/{event_id}` | Get DLQ message | +| POST | `/dlq/retry` | Retry messages | +| POST | `/dlq/retry-policy` | Set retry policy | +| DELETE | `/dlq/messages/{event_id}` | Discard message | +| GET | `/dlq/topics` | Get topics summary | + +### Health + +| Method | Endpoint | Description | +|--------|----------|-------------| +| GET | `/health/live` | Liveness probe | +| GET | `/health/ready` | Readiness probe | + +### Admin - Users + +| Method | Endpoint | Description | +|--------|----------|-------------| +| GET | `/admin/users/` | List all users | +| POST | `/admin/users/` | Create user | +| GET | `/admin/users/{user_id}` | Get user details | +| GET | `/admin/users/{user_id}/overview` | Get user overview | +| PUT | `/admin/users/{user_id}` | Update user | +| DELETE | `/admin/users/{user_id}` | Delete user | +| POST | `/admin/users/{user_id}/reset-password` | Reset password | +| GET | `/admin/users/{user_id}/rate-limits` | Get rate limits | +| PUT | `/admin/users/{user_id}/rate-limits` | Update rate limits | +| POST | `/admin/users/{user_id}/rate-limits/reset` | Reset rate limits | + +### Admin - Settings + +| Method | Endpoint | Description | +|--------|----------|-------------| +| GET | `/admin/settings/` | Get system settings | +| PUT | `/admin/settings/` | Update settings | +| POST | `/admin/settings/reset` | Reset to defaults | + +### Admin - Events + +| Method | Endpoint | Description | +|--------|----------|-------------| +| POST | `/admin/events/browse` | Browse events | +| GET | `/admin/events/stats` | Event statistics | +| GET | `/admin/events/{event_id}` | Get event detail | +| DELETE | `/admin/events/{event_id}` | Delete event | +| POST | `/admin/events/replay` | Replay events | +| GET | `/admin/events/replay/{id}/status` | Get replay status | +| GET | `/admin/events/export/csv` | Export as CSV | +| GET | `/admin/events/export/json` | Export as JSON | + +### Grafana Alerts + +| Method | Endpoint | Description | +|--------|----------|-------------| +| POST | `/alerts/grafana` | Receive Grafana webhook | +| GET | `/alerts/grafana/test` | Test endpoint | + +--- + +## Interactive Documentation diff --git a/docs/reference/environment-variables.md b/docs/reference/environment-variables.md new file mode 100644 index 00000000..c1888210 --- /dev/null +++ b/docs/reference/environment-variables.md @@ -0,0 +1,165 @@ +# Environment Variables Reference + +Complete reference of all environment variables used by the Integr8sCode backend. 
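Variables are read once at process start. A representative pattern for how they map onto typed settings (a sketch of the common pydantic-settings approach; the backend's actual `Settings` class may differ in detail):

```python
# Hedged sketch: typical pydantic-settings usage for the variables below.
from pydantic_settings import BaseSettings

class Settings(BaseSettings):
    PROJECT_NAME: str = "integr8scode"
    MONGODB_URL: str = "mongodb://mongo:27017/integr8scode"
    SECRET_KEY: str                          # required; no default
    ACCESS_TOKEN_EXPIRE_MINUTES: int = 1440  # 24 hours

settings = Settings()  # raises a validation error if SECRET_KEY is unset
```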
+ +## Core Configuration + +| Variable | Default | Description | +|-------------------------------|--------------------------------------|-------------------------------------| +| `PROJECT_NAME` | `integr8scode` | Application name | +| `DATABASE_NAME` | `integr8scode_db` | MongoDB database name | +| `MONGODB_URL` | `mongodb://mongo:27017/integr8scode` | MongoDB connection URL | +| `SECRET_KEY` | *required* | JWT signing key (min 32 characters) | +| `ALGORITHM` | `HS256` | JWT signing algorithm | +| `ACCESS_TOKEN_EXPIRE_MINUTES` | `1440` | Token lifetime (24 hours) | +| `API_V1_STR` | `/api/v1` | API version prefix | +| `APP_URL` | `https://integr8scode.cc` | Public application URL | + +## Server Configuration + +| Variable | Default | Description | +|-------------------|-------------------------|---------------------------| +| `SERVER_HOST` | `localhost` | Server bind address | +| `SERVER_PORT` | `443` | Server port | +| `SSL_KEYFILE` | `/app/certs/server.key` | SSL private key path | +| `SSL_CERTFILE` | `/app/certs/server.crt` | SSL certificate path | +| `WEB_CONCURRENCY` | `4` | Gunicorn worker count | +| `WEB_THREADS` | `1` | Threads per worker | +| `WEB_TIMEOUT` | `60` | Request timeout (seconds) | +| `WEB_BACKLOG` | `2048` | Connection backlog | + +## Kubernetes Configuration + +| Variable | Default | Description | +|----------------------------------|------------------|-----------------------------| +| `KUBERNETES_CONFIG_PATH` | `~/.kube/config` | Kubeconfig file path | +| `KUBERNETES_CA_CERTIFICATE_PATH` | *none* | Custom CA certificate path | +| `K8S_POD_CPU_LIMIT` | `1000m` | Pod CPU limit | +| `K8S_POD_MEMORY_LIMIT` | `128Mi` | Pod memory limit | +| `K8S_POD_CPU_REQUEST` | `1000m` | Pod CPU request | +| `K8S_POD_MEMORY_REQUEST` | `128Mi` | Pod memory request | +| `K8S_POD_EXECUTION_TIMEOUT` | `300` | Execution timeout (seconds) | +| `K8S_POD_PRIORITY_CLASS_NAME` | *none* | Pod priority class | + +## Kafka Configuration + +| Variable | Default | Description | +|----------------------------|-------------------------------|-------------------------------------| +| `KAFKA_BOOTSTRAP_SERVERS` | `kafka:29092` | Kafka broker addresses | +| `SCHEMA_REGISTRY_URL` | `http://schema-registry:8081` | Schema Registry URL | +| `SCHEMA_REGISTRY_AUTH` | *none* | Registry auth (`username:password`) | +| `ENABLE_EVENT_STREAMING` | `false` | Enable Kafka event streaming | +| `EVENT_RETENTION_DAYS` | `30` | Event retention period | +| `KAFKA_TOPIC_PREFIX` | `pref` | Topic name prefix | +| `KAFKA_GROUP_SUFFIX` | `suff` | Consumer group suffix | +| `KAFKA_CONSUMER_GROUP_ID` | `integr8scode-backend` | Default consumer group | +| `KAFKA_AUTO_OFFSET_RESET` | `earliest` | Offset reset policy | +| `KAFKA_ENABLE_AUTO_COMMIT` | `true` | Auto-commit offsets | +| `KAFKA_SESSION_TIMEOUT_MS` | `30000` | Session timeout | +| `KAFKA_MAX_POLL_RECORDS` | `500` | Max poll batch size | + +## Redis Configuration + +| Variable | Default | Description | +|--------------------------|---------|-----------------------------| +| `REDIS_HOST` | `redis` | Redis server host | +| `REDIS_PORT` | `6379` | Redis server port | +| `REDIS_DB` | `0` | Redis database number | +| `REDIS_PASSWORD` | *none* | Redis password | +| `REDIS_SSL` | `false` | Enable SSL/TLS | +| `REDIS_MAX_CONNECTIONS` | `50` | Connection pool size | +| `REDIS_DECODE_RESPONSES` | `true` | Decode responses to strings | + +## Rate Limiting Configuration + +| Variable | Default | Description | 
+|-------------------------------|------------------|------------------------------------------------| +| `RATE_LIMIT_ENABLED` | `true` | Enable rate limiting | +| `RATE_LIMITS` | `100/minute` | Default rate limit string | +| `RATE_LIMIT_DEFAULT_REQUESTS` | `100` | Default request limit | +| `RATE_LIMIT_DEFAULT_WINDOW` | `60` | Default window (seconds) | +| `RATE_LIMIT_BURST_MULTIPLIER` | `1.5` | Token bucket burst factor | +| `RATE_LIMIT_REDIS_PREFIX` | `rate_limit:` | Redis key prefix | +| `RATE_LIMIT_ALGORITHM` | `sliding_window` | Algorithm (`sliding_window` or `token_bucket`) | + +## SSE Configuration + +| Variable | Default | Description | +|--------------------------------------|---------|------------------------------| +| `SSE_CONSUMER_POOL_SIZE` | `10` | Kafka consumer pool size | +| `SSE_HEARTBEAT_INTERVAL` | `30` | Heartbeat interval (seconds) | +| `WEBSOCKET_PING_INTERVAL` | `30` | Connection ping interval | +| `WEBSOCKET_PING_TIMEOUT` | `10` | Ping timeout | +| `WEBSOCKET_MAX_CONNECTIONS_PER_USER` | `5` | Max connections per user | +| `WEBSOCKET_STALE_CONNECTION_TIMEOUT` | `300` | Stale connection timeout | + +## Notification Configuration + +| Variable | Default | Description | +|-------------------------------|---------|-------------------------------| +| `NOTIF_THROTTLE_WINDOW_HOURS` | `1` | Throttle window (hours) | +| `NOTIF_THROTTLE_MAX_PER_HOUR` | `5` | Max notifications per hour | +| `NOTIF_PENDING_BATCH_SIZE` | `10` | Pending batch size | +| `NOTIF_OLD_DAYS` | `30` | Notification retention (days) | +| `NOTIF_RETRY_DELAY_MINUTES` | `5` | Retry delay (minutes) | + +## Dead Letter Queue Configuration + +| Variable | Default | Description | +|--------------------------------|----------|--------------------------| +| `DLQ_RETRY_MAX_ATTEMPTS` | `5` | Maximum retry attempts | +| `DLQ_RETRY_BASE_DELAY_SECONDS` | `60.0` | Base retry delay | +| `DLQ_RETRY_MAX_DELAY_SECONDS` | `3600.0` | Maximum retry delay | +| `DLQ_RETENTION_DAYS` | `7` | DLQ message retention | +| `DLQ_WARNING_THRESHOLD` | `100` | Warning threshold count | +| `DLQ_CRITICAL_THRESHOLD` | `1000` | Critical threshold count | + +## Tracing Configuration (OpenTelemetry) + +| Variable | Default | Description | +|-------------------------------|------------------------|--------------------------------| +| `ENABLE_TRACING` | `true` | Enable distributed tracing | +| `TRACING_SERVICE_NAME` | `integr8scode-backend` | Service name for traces | +| `TRACING_SERVICE_VERSION` | `1.0.0` | Service version | +| `TRACING_SAMPLING_RATE` | `0.1` | Sampling rate (0.0 to 1.0) | +| `TRACING_ADAPTIVE_SAMPLING` | `false` | Enable adaptive sampling | +| `JAEGER_AGENT_HOST` | `jaeger` | Jaeger agent host | +| `JAEGER_AGENT_PORT` | `6831` | Jaeger agent UDP port | +| `JAEGER_COLLECTOR_ENDPOINT` | *none* | Jaeger HTTP collector URL | +| `OTEL_EXPORTER_OTLP_ENDPOINT` | *none* | OTLP exporter endpoint | +| `OTEL_SERVICE_NAME` | *none* | OTLP service name override | +| `OTEL_SERVICE_VERSION` | *none* | OTLP version override | +| `OTEL_RESOURCE_ATTRIBUTES` | *none* | Additional resource attributes | + +## Schema Configuration + +| Variable | Default | Description | +|----------------------|--------------------|-------------------------| +| `SCHEMA_BASE_PATH` | `app/schemas_avro` | Base path for schemas | +| `SCHEMA_AVRO_PATH` | `app/schemas_avro` | Avro schema path | +| `SCHEMA_CONFIG_PATH` | *none* | Schema config file path | + +## Development Configuration + +| Variable | Default | Description | 
+|--------------------|---------|-------------------------| +| `TESTING` | `false` | Enable test mode | +| `DEVELOPMENT_MODE` | `false` | Enable development mode | +| `SECURE_COOKIES` | `true` | Require secure cookies | +| `LOG_LEVEL` | `DEBUG` | Logging level | + +## MongoDB Docker Configuration + +These are used by Docker Compose for MongoDB initialization: + +| Variable | Default | Description | +|-----------------------|---------|-----------------------| +| `MONGO_ROOT_USER` | *none* | MongoDB root username | +| `MONGO_ROOT_PASSWORD` | *none* | MongoDB root password | + +## Service Metadata + +| Variable | Default | Description | +|-------------------|------------------------|--------------------| +| `SERVICE_NAME` | `integr8scode-backend` | Service identifier | +| `SERVICE_VERSION` | `1.0.0` | Service version | diff --git a/mkdocs.yml b/mkdocs.yml index c4c622af..45aa85c0 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -125,6 +125,7 @@ nav: - Service Lifecycle: architecture/lifecycle.md - API Reference: reference/api-reference.md + - Environment Variables: reference/environment-variables.md - Components: - SSE: @@ -132,8 +133,14 @@ nav: - Partitioned Router: components/sse/sse-partitioned-architecture.md - Execution Flow: components/sse/execution-sse-flow.md - Workers: + - Overview: components/workers/index.md + - Saga Orchestrator: components/workers/saga_orchestrator.md + - K8s Worker: components/workers/k8s_worker.md - Pod Monitor: components/workers/pod_monitor.md - Result Processor: components/workers/result_processor.md + - Coordinator: components/workers/coordinator.md + - Event Replay: components/workers/event_replay.md + - DLQ Processor: components/workers/dlq_processor.md - Saga: - Resource Allocation: components/saga/resource-allocation.md - Dead Letter Queue: components/dead-letter-queue.md @@ -157,6 +164,7 @@ nav: - Network Isolation: security/policies.md - Frontend: + - Routing: frontend/routing.md - Error Handling: frontend/error-handling.md - Testing: From 03c7cd8e1faca7edd534e96c348cb5f1349b1180 Mon Sep 17 00:00:00 2001 From: HardMax71 Date: Sun, 4 Jan 2026 23:59:36 +0100 Subject: [PATCH 2/2] added more docs --- docs/components/workers/dlq_processor.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/components/workers/dlq_processor.md b/docs/components/workers/dlq_processor.md index 3e7e7497..146bc57f 100644 --- a/docs/components/workers/dlq_processor.md +++ b/docs/components/workers/dlq_processor.md @@ -43,8 +43,11 @@ The DLQ can be monitored via the admin API: - `GET /api/v1/dlq/stats` — DLQ statistics by status, topic, event type - `GET /api/v1/dlq/messages` — List DLQ messages with filtering +- `GET /api/v1/dlq/messages/{event_id}` — Retrieve a specific message by ID +- `GET /api/v1/dlq/topics` — List all topics with DLQ messages - `POST /api/v1/dlq/retry` — Manually retry messages -- `DELETE /api/v1/dlq/messages/{id}` — Discard a message +- `POST /api/v1/dlq/retry-policy` — Configure retry policy for a topic +- `DELETE /api/v1/dlq/messages/{event_id}` — Discard a message ## Key files