diff --git a/backend/api/routes_sync.py b/backend/api/routes_sync.py index e2f37709..40265c80 100644 --- a/backend/api/routes_sync.py +++ b/backend/api/routes_sync.py @@ -49,20 +49,22 @@ def get_hygraph_service() -> HygraphService: def _error_envelope(code: str, message: str, details: Optional[dict] = None) -> Dict[str, Any]: - """ - Builds a standardized error envelope for API responses. - - Parameters: - code (str): Machine-readable error code identifying the error. - message (str): Human-readable error message describing the failure. - details (Optional[dict]): Additional contextual information; defaults to an empty dict when not provided. - - Returns: - Dict[str, Any]: Dictionary with keys `ok` (False) and `error` containing `code`, `message`, and `details`. - """ + """Build a standardized error envelope for API responses.""" + return {"ok": False, "error": {"code": code, "message": message, "details": details or {}}} +def _processed_value(payload: Any) -> int: + if isinstance(payload, dict): + value = payload.get("processed", 0) + else: + value = payload + try: + return int(value or 0) + except Exception: # noqa: BLE001 + return 0 + + @router.post( "/hygraph", status_code=status.HTTP_202_ACCEPTED, @@ -125,10 +127,11 @@ async def hygraph_webhook( return JSONResponse({"ok": True, "dedup": True}, status_code=200) try: - payload = json.loads(raw) if raw else {} + _ = json.loads(raw) if raw else {} except json.JSONDecodeError: raise HTTPException(status_code=400, detail=_error_envelope("BAD_REQUEST", "Invalid JSON payload")) + async def _process(event_id_local: Optional[str], body_sha_local: str) -> None: t0 = time.perf_counter() with _tracer.start_as_current_span("hygraph.pull_all") as pull_span: @@ -168,7 +171,6 @@ async def _process(event_id_local: Optional[str], body_sha_local: str) -> None: ) background.add_task(_process, event_id, body_sha) - # Explicit background attachment ensures Starlette runs it before TestClient returns return JSONResponse({"ok": True, 
"accepted": True}, status_code=202, background=background) @@ -177,12 +179,8 @@ async def hygraph_pull( body: Dict[str, Any] = Body(...), db: Session = Depends(get_db), ) -> Dict[str, Any]: - """ - Admin pull: - - Auth via Bearer token (constant-time compare) - - Accepts "type" or "sync_type" + optional "page_size" - - Validates positive page_size and caps inside service (≤200) - """ + """Admin-triggered Hygraph syncs supporting manual pull types.""" + sync_type = str((body.get("type") or body.get("sync_type") or "")).lower().strip() page_size_raw = body.get("page_size") page_size: Optional[int] = None @@ -238,6 +236,25 @@ async def hygraph_pull( record_sync_duration("all", (time.perf_counter() - start) * 1000) else: raise HTTPException(status_code=400, detail=_error_envelope("BAD_REQUEST", "unsupported type")) + except CircuitOpenError as exc: + circuit_type = sync_type or "all" + hygraph_cb_trips_total.labels(circuit_type).inc() + fallback_payload = exc.fallback + if fallback_payload: + hygraph_cb_fallback_total.labels(circuit_type).inc() + return { + "ok": True, + "breaker_open": True, + "data": fallback_payload, + } + raise HTTPException( + status_code=503, + detail=_error_envelope( + "SERVICE_UNAVAILABLE", + "Hygraph circuit breaker open and no cached data available", + {"type": circuit_type}, + ), + ) except HTTPException: raise except CircuitOpenError as exc: @@ -256,4 +273,4 @@ async def hygraph_pull( logger.exception("hygraph_pull_failure", extra={"type": sync_type, "error": str(e)}) raise HTTPException(status_code=500, detail=_error_envelope("INTERNAL", "sync failed", {"type": sync_type})) - return {"ok": True, "data": counts} \ No newline at end of file + return {"ok": True, "data": counts} diff --git a/docs/backend-development.md b/docs/backend-development.md index 0592502c..87d1ccb2 100644 --- a/docs/backend-development.md +++ b/docs/backend-development.md @@ -173,3 +173,24 @@ FastAPI automatically generates interactive API documentation. 
When the server i - Swagger UI: http://localhost:8000/docs - ReDoc: http://localhost:8000/redoc + +## Hygraph Circuit Breaker Runbook + +The Hygraph service layer now caches payloads in Redis (or an in-memory fallback) and guards outbound GraphQL traffic with a circuit breaker. Use the following checklist when operating the integration: + +### Warming the Cache + +1. Ensure the Redis instance defined by `HYGRAPH_CACHE_URL`/`REDIS_URL` is reachable from the backend container. +2. Temporarily disable the circuit breaker by resetting it so fresh responses flow from Hygraph. +3. Trigger each sync endpoint with a small page size to populate cached datasets: + - `POST /api/sync/hygraph/pull` with `{"type": "materials"}` + - `POST /api/sync/hygraph/pull` with `{"type": "modules"}` + - `POST /api/sync/hygraph/pull` with `{"type": "systems"}` +4. Confirm the Redis keys `hygraph:materials`, `hygraph:modules`, and `hygraph:systems` contain JSON payloads with the expected counts. + +### Exercising Breaker States + +1. **Closed → Open**: Stop outbound connectivity (e.g., block the Hygraph host with `iptables` or override DNS). Send sync requests to `POST /api/sync/hygraph/pull` (not a Git pull request); after `failure_threshold` failed attempts (default 5) the breaker opens and the Prometheus counter `hygraph_cb_trips_total` increments. +2. **Open → Fallback**: While the breaker is open, hit the same endpoint again. The API should return cached results with `"breaker_open": true` in the JSON response and increment `hygraph_cb_fallback_total`. +3. **Half-Open → Closed**: Restore connectivity and wait `recovery_timeout` seconds (default 30). The next request is allowed through; if it succeeds, the breaker closes and live data refreshes the cache. +4. Monitor `/metrics` to verify the counters and ensure `sync_failure_total` remains flat when serving cached responses.