Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 19 additions & 4 deletions backend/api/config.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Configuration settings for the FastAPI application."""

from functools import lru_cache
import os
from typing import Any, Dict

from pydantic import Field, field_validator
Expand Down Expand Up @@ -62,11 +62,26 @@ def _validate_hygraph_webhook_secret(cls, value: str) -> str:
return value.strip()


_SETTINGS_CACHE: Settings | None = None
_LAST_SECRET: str | None = None


def get_settings() -> Settings:
    """Return a cached :class:`Settings` instance.

    The cached instance is rebuilt only when the raw
    ``HYGRAPH_WEBHOOK_SECRET`` environment variable changes, so repeated
    calls avoid re-parsing the environment while still picking up a
    rotated webhook secret.
    """

    global _SETTINGS_CACHE, _LAST_SECRET

    current_secret = os.getenv("HYGRAPH_WEBHOOK_SECRET")
    if _SETTINGS_CACHE is None or _LAST_SECRET != current_secret:
        # First use, or the secret was rotated: build fresh settings.
        _SETTINGS_CACHE = Settings()
        _LAST_SECRET = current_secret

    return _SETTINGS_CACHE


def get_fastapi_settings() -> Dict[str, Any]:
Expand Down
133 changes: 133 additions & 0 deletions backend/api/instrumentation_boot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
"""Utilities to configure OpenTelemetry exporters for the API service."""

from __future__ import annotations

import logging
import os
from typing import Any, Final

_LOGGER = logging.getLogger(__name__)

_TRUE_VALUES: Final[set[str]] = {"1", "true", "t", "yes", "y", "on"}
_BOOTSTRAPPED: bool = False


def _env_flag(name: str, *, default: bool = False) -> bool:
    """Interpret the environment variable *name* as a boolean flag.

    An unset or blank variable yields *default*; any other value is truthy
    exactly when its trimmed, lower-cased form is one of the common
    affirmative spellings ("1", "true", "yes", "on", ...).
    """

    value = os.getenv(name)
    if value is None:
        return default
    normalized = value.strip().lower()
    return normalized in _TRUE_VALUES if normalized else default


def _parse_headers(raw_headers: str) -> dict[str, str]:
    """Convert a comma-separated ``key=value`` OTLP header string into a dict.

    Entries without an ``=`` or with an empty key are silently dropped;
    keys and values are stripped of surrounding whitespace.  Only the first
    ``=`` splits, so values may themselves contain ``=``.
    """

    headers: dict[str, str] = {}
    for entry in raw_headers.split(","):
        name, separator, value = entry.partition("=")
        if not separator:
            continue
        name = name.strip()
        if name:
            headers[name] = value.strip()
    return headers


def _span_exporter_kwargs() -> dict[str, Any]:
    """Build keyword arguments for the OTLP span exporter."""

    exporter_kwargs: dict[str, Any] = {}
    exporter_kwargs["insecure"] = _env_flag(
        "OTEL_EXPORTER_OTLP_INSECURE", default=False
    )

    # The traces-specific variables take precedence over the generic ones.
    endpoint = os.getenv("OTEL_EXPORTER_OTLP_TRACES_ENDPOINT")
    if not endpoint:
        endpoint = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT")
    if endpoint:
        exporter_kwargs["endpoint"] = endpoint

    raw_headers = os.getenv("OTEL_EXPORTER_OTLP_TRACES_HEADERS")
    if not raw_headers:
        raw_headers = os.getenv("OTEL_EXPORTER_OTLP_HEADERS")
    if raw_headers:
        exporter_kwargs["headers"] = _parse_headers(raw_headers)

    return exporter_kwargs

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🧹 Nitpick | 🔵 Trivial

Consider exporter timeouts and TLS options

Exporters can hang on network issues. Provide a bounded timeout and, if applicable, certificate configuration via env to avoid stalls.

Example:

-    kwargs: dict[str, Any] = {
-        "insecure": _env_flag("OTEL_EXPORTER_OTLP_INSECURE", default=False)
-    }
+    kwargs: dict[str, Any] = {
+        "insecure": _env_flag("OTEL_EXPORTER_OTLP_INSECURE", default=False),
+        "timeout": float(os.getenv("OTEL_EXPORTER_OTLP_TIMEOUT", "10")),
+    }

Repeat analogously for metrics kwargs.

Also applies to: 65-85

🤖 Prompt for AI Agents
In backend/api/instrumentation_boot.py around lines 43 to 63 (and similarly
update lines ~65-85 for metrics), the span exporter kwargs lack a bounded
network timeout and TLS/certificate configuration; add reading of environment
variables (e.g. OTEL_EXPORTER_OTLP_TRACES_TIMEOUT or OTEL_EXPORTER_OTLP_TIMEOUT
for spans, and OTEL_EXPORTER_OTLP_METRICS_TIMEOUT for metrics) to parse a
numeric timeout and include it as a "timeout" value in the kwargs, and read
certificate-related env vars (e.g. OTEL_EXPORTER_OTLP_CERTIFICATE or
OTEL_EXPORTER_OTLP_TRACES_CERTIFICATE and their metrics equivalents) to include
the appropriate TLS/certificate config key(s) expected by the OTLP exporter
(pass certificate path or credentials as the exporter expects) so exporters
won’t hang and can be configured for TLS; apply the same pattern to the metrics
kwargs block at the referenced lines.


def _metric_exporter_kwargs() -> dict[str, Any]:
    """Build keyword arguments for the OTLP metric exporter."""

    exporter_kwargs: dict[str, Any] = {}
    exporter_kwargs["insecure"] = _env_flag(
        "OTEL_EXPORTER_OTLP_METRICS_INSECURE", default=False
    )

    # The metrics-specific variables take precedence over the generic ones.
    endpoint = os.getenv("OTEL_EXPORTER_OTLP_METRICS_ENDPOINT")
    if not endpoint:
        endpoint = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT")
    if endpoint:
        exporter_kwargs["endpoint"] = endpoint

    raw_headers = os.getenv("OTEL_EXPORTER_OTLP_METRICS_HEADERS")
    if not raw_headers:
        raw_headers = os.getenv("OTEL_EXPORTER_OTLP_HEADERS")
    if raw_headers:
        exporter_kwargs["headers"] = _parse_headers(raw_headers)

    return exporter_kwargs


def bootstrap() -> None:
    """Initialise OpenTelemetry tracing and metrics exporters (idempotent).

    If the optional OpenTelemetry packages are not installed this is a
    silent no-op; a later call will retry the import.
    """

    global _BOOTSTRAPPED

    if _BOOTSTRAPPED:
        return

    try:
        from opentelemetry import metrics as otel_metrics
        from opentelemetry import trace as otel_trace
        from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import (
            OTLPMetricExporter,
        )
        from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import (
            OTLPSpanExporter,
        )
        from opentelemetry.sdk.metrics import MeterProvider
        from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
        from opentelemetry.sdk.resources import Resource
        from opentelemetry.sdk.trace import TracerProvider
        from opentelemetry.sdk.trace.export import BatchSpanProcessor
    except ImportError as exc:  # pragma: no cover - optional dependency guard
        _LOGGER.debug("OpenTelemetry packages are unavailable: %s", exc)
        return

    service_resource = Resource.create(
        {
            "service.name": os.getenv("OTEL_SERVICE_NAME", "paform-backend"),
            "service.namespace": os.getenv("OTEL_SERVICE_NAMESPACE", "paform"),
        }
    )

    # Tracing: batch spans and ship them over OTLP/gRPC.
    tracer_provider = TracerProvider(resource=service_resource)
    tracer_provider.add_span_processor(
        BatchSpanProcessor(OTLPSpanExporter(**_span_exporter_kwargs()))
    )
    otel_trace.set_tracer_provider(tracer_provider)

    # Metrics: periodically export readings over OTLP/gRPC.
    metric_reader = PeriodicExportingMetricReader(
        OTLPMetricExporter(**_metric_exporter_kwargs())
    )
    otel_metrics.set_meter_provider(
        MeterProvider(resource=service_resource, metric_readers=[metric_reader])
    )

    _BOOTSTRAPPED = True

20 changes: 17 additions & 3 deletions backend/api/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,26 @@
from api.routes_sync import router as sync_router
from api.routes_observability import router as observability_router
from prometheus_client import CONTENT_TYPE_LATEST, generate_latest
from prometheus_fastapi_instrumentator import Instrumentator
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

Guard optional dependency and prevent metric duplication/conflicts

  • Importing Instrumentator unguarded can crash when the package isn’t installed.
  • Using Instrumentator alongside your custom Prometheus middleware/counters risks double-counting and potential metric-name collisions (e.g., http_requests_total).

Recommended changes:

-from prometheus_fastapi_instrumentator import Instrumentator
+try:  # pragma: no cover - optional dependency guard
+    from prometheus_fastapi_instrumentator import Instrumentator
+except ImportError:  # pragma: no cover
+    Instrumentator = None  # type: ignore[assignment]
-# Register Prometheus instrumentation so HTTP metrics are emitted for alerts/dashboards
-Instrumentator().instrument(app)
+# Register Prometheus instrumentation only if available; exclude /metrics
+if Instrumentator is not None:
+    Instrumentator().instrument(app, excluded_handlers=["/metrics"])

Also decide on a single HTTP request metric source (either Instrumentator or your custom middleware + metrics.py) to avoid duplicate series.

Run a quick web check to confirm default metric names from Instrumentator:

Also applies to: 53-53


🌐 Web query:

Does prometheus_fastapi_instrumentator export a metric named http_requests_total by default, and can it conflict with an existing custom Counter of the same name?

💡 Result:

Short answer: Yes — prometheus_fastapi_instrumentator registers a Counter named http_requests_total by default, and that can conflict with an existing metric of the same name (you'll get duplicate/registration errors).

Sources:

  • The package docs state the default Counter http_requests_total is created. [1][2]
  • The instrumentator registers metrics in the global prometheus_client registry, so registering the same metric twice produces duplicate-timeseries/ValueError errors (reports and issues show "Duplicated timeseries" when metrics are registered more than once). [3][4]

If you want, I can show ways to avoid the conflict (use a custom registry/namespace/subsystem or disable the default metric).



---

🏁 Script executed:

```shell
#!/bin/bash
# Check the current state of lines 22 and 53 in backend/api/main.py
# and search for custom metrics or custom Prometheus middleware

echo "=== Lines 20-60 in backend/api/main.py ==="
sed -n '20,60p' backend/api/main.py

echo -e "\n=== Search for http_requests_total in the codebase ==="
rg -n "http_requests_total" --type python

echo -e "\n=== Search for prometheus_client Counter or Gauge in main.py ==="
rg -n "prometheus_client|Counter|Gauge" backend/api/main.py -A 2 -B 2

Length of output: 1984



---

🏁 Script executed:

```shell
#!/bin/bash
# Inspect api/metrics.py to check for custom prometheus metrics

echo "=== Content of api/metrics.py ==="
cat backend/api/metrics.py

echo -e "\n=== Check for prometheus_client usage in api/metrics.py ==="
rg -n "Counter|Gauge|Histogram|Summary" backend/api/metrics.py -A 2

Length of output: 4730



---

🏁 Script executed:

```shell
#!/bin/bash
# Get exact line numbers for Instrumentator usage
rg -n "Instrumentator" backend/api/main.py

Length of output: 99


Guard optional dependency and fix metric duplication conflict

Verification confirms both critical issues:

  1. Unguarded import (line 22): prometheus_fastapi_instrumentator is imported without try-except, risking crash if the package isn't installed. The codebase already uses this pattern for optional dependencies (see line 26-29 with instrumentation_boot).

  2. Metric name collision (line 53): prometheus_fastapi_instrumentator registers http_requests_total by default, which conflicts with your custom http_requests_total Counter defined in api/metrics.py (line 68). Both register to the same global prometheus_client registry, causing a duplicate-timeseries ValueError at runtime.

Required changes:

Guard the import:

-from prometheus_fastapi_instrumentator import Instrumentator
+try:  # pragma: no cover - optional dependency guard
+    from prometheus_fastapi_instrumentator import Instrumentator
+except ImportError:  # pragma: no cover
+    Instrumentator = None  # type: ignore[assignment]

Conditionally register and avoid metric collision:

-Instrumentator().instrument(app)
+if Instrumentator is not None:
+    Instrumentator().instrument(app, excluded_handlers=["/metrics"])

Additionally, remove http_requests_total from api/metrics.py (line 68–72) since you already capture HTTP metrics via observe_http_request() in your custom middleware. This avoids the dual instrumentation.

📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
from prometheus_fastapi_instrumentator import Instrumentator
try: # pragma: no cover - optional dependency guard
from prometheus_fastapi_instrumentator import Instrumentator
except ImportError: # pragma: no cover
Instrumentator = None # type: ignore[assignment]
🤖 Prompt for AI Agents
backend/api/main.py around line 22: the import of
prometheus_fastapi_instrumentator must be guarded with a try/except like the
existing instrumentation_boot pattern (lines ~26-29) so the app doesn't crash
when the optional package is absent; only attempt to instantiate/register
Instrumentator when the import succeeds and configure it to not register its
default http_requests_total metric (or disable that metric) to avoid duplication
with our own metrics; remove the http_requests_total Counter in api/metrics.py
(lines ~68-72) and rely on observe_http_request() in our middleware instead, and
ensure the Instrumentator registration is conditional to prevent
double-registration and runtime ValueError.

from api.metrics import observe_http_request

try: # pragma: no cover - instrumentation is optional in some environments
from api.instrumentation_boot import bootstrap as bootstrap_instrumentation
except ImportError: # pragma: no cover - optional dependency guard
bootstrap_instrumentation = None # type: ignore[assignment]

# Configure logging
logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)

if bootstrap_instrumentation is not None: # pragma: no branch - simple guard
try:
bootstrap_instrumentation()
except Exception: # pragma: no cover - defensive guard
logger.exception("Failed to bootstrap OpenTelemetry instrumentation")

Comment on lines +31 to +36
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🧹 Nitpick | 🔵 Trivial

Bootstrap timing is fine; avoid re-instantiating Settings later

Bootstrapping at import-time with a try/except is acceptable. Note: below, code constructs Settings again via _Settings(); prefer the cached settings variable to avoid double parsing and to keep behavior consistent with get_settings().

Would you like a follow-up diff to switch to the existing settings instance for the SQLite check?

🤖 Prompt for AI Agents
In backend/api/main.py around lines 36 to 41, the bootstrap try/except is fine
but later code re-instantiates Settings via _Settings(), causing double parsing
and potential inconsistency; change that code to reuse the existing cached
settings instance (call get_settings() or reference the module-level settings
variable) for the SQLite check so the same parsed config is used everywhere, and
remove the direct _Settings() construction.

# Initialize settings
settings = get_settings()

Expand Down Expand Up @@ -74,13 +86,15 @@ async def dispatch(self, request: Request, call_next): # type: ignore[override]
start = perf_counter()
try:
response = await call_next(request)
if response is None: # pragma: no cover - FastAPI guarantees a Response
raise RuntimeError("ASGI call_next returned None response")
except Exception:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Record accurate status codes on exceptions

Currently exceptions are always recorded as 500, which misrepresents handled errors (e.g., RequestValidationError returns 422). Map known exceptions to their final status before recording.

Apply:

-        except Exception:
+        except Exception as exc:
             duration = perf_counter() - start
-            observe_http_request(
+            # Derive best-effort status code
+            from fastapi import HTTPException as FastAPIHTTPException  # local import to avoid top-level import churn
+            status = 500
+            if isinstance(exc, RequestValidationError):
+                status = 422
+            elif isinstance(exc, FastAPIHTTPException):
+                status = int(getattr(exc, "status_code", 500))
+            observe_http_request(
                 service="backend",
                 route=_resolve_route_label(request),
                 method=request.method,
-                status_code=500,
+                status_code=status,
                 duration_seconds=duration,
             )
             raise

Also applies to: 97-99

duration = perf_counter() - start
observe_http_request(
service="backend",
route=_resolve_route_label(request),
method=request.method,
status="500",
status_code=500,
duration_seconds=duration,
)
raise
Expand All @@ -90,7 +104,7 @@ async def dispatch(self, request: Request, call_next): # type: ignore[override]
service="backend",
route=_resolve_route_label(request),
method=request.method,
status=str(response.status_code),
status_code=response.status_code,
duration_seconds=duration,
)
return response
Expand Down Expand Up @@ -152,4 +166,4 @@ async def healthcheck() -> Dict[str, str]:
@app.get("/metrics")
async def metrics() -> Response:
# Expose Prometheus metrics including default process/python collectors
return Response(generate_latest(), media_type=CONTENT_TYPE_LATEST)
return Response(generate_latest(), media_type=CONTENT_TYPE_LATEST)
56 changes: 53 additions & 3 deletions backend/api/metrics.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,36 @@
from __future__ import annotations

from typing import Any

from prometheus_client import Counter, Histogram

# Lazily populated OpenTelemetry instruments.  They stay ``None`` when the
# optional ``opentelemetry`` packages are not installed; every recording
# helper below checks for ``None`` before using them.
_otel_meter: Any | None = None
_http_request_counter: Any | None = None
_http_request_duration_histogram: Any | None = None
_web_vitals_lcp_histogram: Any | None = None

try:  # pragma: no cover - optional dependency guard
    from opentelemetry import metrics as otel_metrics
except ImportError:  # pragma: no cover - optional dependency guard
    otel_metrics = None  # type: ignore[assignment]
else:
    # OpenTelemetry is available: create OTel instruments mirroring the
    # Prometheus metrics so both pipelines receive the same observations.
    _otel_meter = otel_metrics.get_meter(__name__)
    _http_request_counter = _otel_meter.create_counter(
        "http.server.request.count",
        description="HTTP requests processed by the backend",
        unit="1",
    )
    _http_request_duration_histogram = _otel_meter.create_histogram(
        "http.server.duration",
        description="Duration of HTTP requests handled by the backend",
        unit="s",
    )
    _web_vitals_lcp_histogram = _otel_meter.create_histogram(
        "frontend.web_vitals.lcp",
        description="Largest Contentful Paint reported from the frontend",
        unit="s",
    )

REQUEST_LATENCY_BUCKETS = (
0.005,
0.01,
def observe_http_request(
    service: str,
    route: str,
    method: str,
    status_code: int | str,
    duration_seconds: float,
) -> None:
    """Record a single HTTP request observation.

    The observation is written to the Prometheus counter/histogram and,
    when the OpenTelemetry instruments were created at import time, to
    the matching OTel counter and duration histogram as well.

    :param service: Logical service name used as a metric label.
    :param route: Resolved route template (not the raw path).
    :param method: HTTP method of the request.
    :param status_code: Response status; accepted as ``int`` or ``str``.
    :param duration_seconds: Wall-clock request duration in seconds.
    """

    status_label = str(status_code)
    http_requests_total.labels(
        service=service, route=route, method=method, status=status_label
    ).inc()
    http_request_duration_seconds.labels(
        service=service, route=route, method=method, status=status_label
    ).observe(duration_seconds)

    if _http_request_counter is not None or _http_request_duration_histogram is not None:
        attributes: dict[str, str | int] = {
            "service.name": service,
            "http.route": route,
            "http.method": method,
        }
        try:
            attributes["http.status_code"] = int(status_code)
        except (TypeError, ValueError):
            # Skip the status code attribute if it cannot be coerced to an int.
            pass

        if _http_request_counter is not None:
            _http_request_counter.add(1, attributes)
        if _http_request_duration_histogram is not None:
            _http_request_duration_histogram.record(duration_seconds, attributes)


def observe_lcp(*, app: str, seconds: float) -> None:
    """Record a Largest Contentful Paint measurement in seconds."""

    web_vitals_lcp.labels(app=app).observe(seconds)
    otel_histogram = _web_vitals_lcp_histogram
    if otel_histogram is not None:
        otel_histogram.record(seconds, {"app": app})