Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 19 additions & 4 deletions backend/api/config.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Configuration settings for the FastAPI application."""

from functools import lru_cache
import os
from typing import Any, Dict

from pydantic import Field, field_validator
Expand Down Expand Up @@ -62,11 +62,26 @@ def _validate_hygraph_webhook_secret(cls, value: str) -> str:
return value.strip()


_SETTINGS_CACHE: Settings | None = None
_LAST_SECRET: str | None = None


def get_settings() -> Settings:
    """Return a cached :class:`Settings` instance.

    The cached instance is rebuilt only when the raw
    ``HYGRAPH_WEBHOOK_SECRET`` environment variable changes, so repeated
    calls avoid re-parsing the environment while still picking up a
    rotated webhook secret.
    """

    global _SETTINGS_CACHE, _LAST_SECRET

    current_secret = os.getenv("HYGRAPH_WEBHOOK_SECRET")
    if _SETTINGS_CACHE is None or _LAST_SECRET != current_secret:
        # First use, or the secret was rotated: build fresh settings.
        _SETTINGS_CACHE = Settings()
        _LAST_SECRET = current_secret

    return _SETTINGS_CACHE


def get_fastapi_settings() -> Dict[str, Any]:
Expand Down
133 changes: 133 additions & 0 deletions backend/api/instrumentation_boot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
"""Utilities to configure OpenTelemetry exporters for the API service."""

from __future__ import annotations

import logging
import os
from typing import Any, Final

_LOGGER = logging.getLogger(__name__)

_TRUE_VALUES: Final[set[str]] = {"1", "true", "t", "yes", "y", "on"}
_BOOTSTRAPPED: bool = False


def _env_flag(name: str, *, default: bool = False) -> bool:
    """Interpret the environment variable *name* as a boolean flag.

    An unset or blank variable yields *default*; any other value is truthy
    exactly when its trimmed, lower-cased form is one of the common
    affirmative spellings ("1", "true", "yes", "on", ...).
    """

    value = os.getenv(name)
    if value is None:
        return default
    normalized = value.strip().lower()
    return normalized in _TRUE_VALUES if normalized else default


def _parse_headers(raw_headers: str) -> dict[str, str]:
    """Convert a comma-separated ``key=value`` OTLP header string into a dict.

    Entries without an ``=`` or with an empty key are silently dropped;
    keys and values are stripped of surrounding whitespace.  Only the first
    ``=`` splits, so values may themselves contain ``=``.
    """

    headers: dict[str, str] = {}
    for entry in raw_headers.split(","):
        name, separator, value = entry.partition("=")
        if not separator:
            continue
        name = name.strip()
        if name:
            headers[name] = value.strip()
    return headers


def _span_exporter_kwargs() -> dict[str, Any]:
    """Build keyword arguments for the OTLP span exporter."""

    exporter_kwargs: dict[str, Any] = {}
    exporter_kwargs["insecure"] = _env_flag(
        "OTEL_EXPORTER_OTLP_INSECURE", default=False
    )

    # The traces-specific variables take precedence over the generic ones.
    endpoint = os.getenv("OTEL_EXPORTER_OTLP_TRACES_ENDPOINT")
    if not endpoint:
        endpoint = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT")
    if endpoint:
        exporter_kwargs["endpoint"] = endpoint

    raw_headers = os.getenv("OTEL_EXPORTER_OTLP_TRACES_HEADERS")
    if not raw_headers:
        raw_headers = os.getenv("OTEL_EXPORTER_OTLP_HEADERS")
    if raw_headers:
        exporter_kwargs["headers"] = _parse_headers(raw_headers)

    return exporter_kwargs

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🧹 Nitpick | 🔵 Trivial

Consider exporter timeouts and TLS options

Exporters can hang on network issues. Provide a bounded timeout and, if applicable, certificate configuration via env to avoid stalls.

Example:

-    kwargs: dict[str, Any] = {
-        "insecure": _env_flag("OTEL_EXPORTER_OTLP_INSECURE", default=False)
-    }
+    kwargs: dict[str, Any] = {
+        "insecure": _env_flag("OTEL_EXPORTER_OTLP_INSECURE", default=False),
+        "timeout": float(os.getenv("OTEL_EXPORTER_OTLP_TIMEOUT", "10")),
+    }

Repeat analogously for metrics kwargs.

Also applies to: 65-85

🤖 Prompt for AI Agents
In backend/api/instrumentation_boot.py around lines 43 to 63 (and similarly
update lines ~65-85 for metrics), the span exporter kwargs lack a bounded
network timeout and TLS/certificate configuration; add reading of environment
variables (e.g. OTEL_EXPORTER_OTLP_TRACES_TIMEOUT or OTEL_EXPORTER_OTLP_TIMEOUT
for spans, and OTEL_EXPORTER_OTLP_METRICS_TIMEOUT for metrics) to parse a
numeric timeout and include it as a "timeout" value in the kwargs, and read
certificate-related env vars (e.g. OTEL_EXPORTER_OTLP_CERTIFICATE or
OTEL_EXPORTER_OTLP_TRACES_CERTIFICATE and their metrics equivalents) to include
the appropriate TLS/certificate config key(s) expected by the OTLP exporter
(pass certificate path or credentials as the exporter expects) so exporters
won’t hang and can be configured for TLS; apply the same pattern to the metrics
kwargs block at the referenced lines.


def _metric_exporter_kwargs() -> dict[str, Any]:
    """Build keyword arguments for the OTLP metric exporter."""

    exporter_kwargs: dict[str, Any] = {}
    exporter_kwargs["insecure"] = _env_flag(
        "OTEL_EXPORTER_OTLP_METRICS_INSECURE", default=False
    )

    # The metrics-specific variables take precedence over the generic ones.
    endpoint = os.getenv("OTEL_EXPORTER_OTLP_METRICS_ENDPOINT")
    if not endpoint:
        endpoint = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT")
    if endpoint:
        exporter_kwargs["endpoint"] = endpoint

    raw_headers = os.getenv("OTEL_EXPORTER_OTLP_METRICS_HEADERS")
    if not raw_headers:
        raw_headers = os.getenv("OTEL_EXPORTER_OTLP_HEADERS")
    if raw_headers:
        exporter_kwargs["headers"] = _parse_headers(raw_headers)

    return exporter_kwargs


def bootstrap() -> None:
    """Initialise OpenTelemetry tracing and metrics exporters (idempotent).

    If the optional OpenTelemetry packages are not installed this is a
    silent no-op; a later call will retry the import.
    """

    global _BOOTSTRAPPED

    if _BOOTSTRAPPED:
        return

    try:
        from opentelemetry import metrics as otel_metrics
        from opentelemetry import trace as otel_trace
        from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import (
            OTLPMetricExporter,
        )
        from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import (
            OTLPSpanExporter,
        )
        from opentelemetry.sdk.metrics import MeterProvider
        from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
        from opentelemetry.sdk.resources import Resource
        from opentelemetry.sdk.trace import TracerProvider
        from opentelemetry.sdk.trace.export import BatchSpanProcessor
    except ImportError as exc:  # pragma: no cover - optional dependency guard
        _LOGGER.debug("OpenTelemetry packages are unavailable: %s", exc)
        return

    service_resource = Resource.create(
        {
            "service.name": os.getenv("OTEL_SERVICE_NAME", "paform-backend"),
            "service.namespace": os.getenv("OTEL_SERVICE_NAMESPACE", "paform"),
        }
    )

    # Tracing: batch spans and ship them over OTLP/gRPC.
    tracer_provider = TracerProvider(resource=service_resource)
    tracer_provider.add_span_processor(
        BatchSpanProcessor(OTLPSpanExporter(**_span_exporter_kwargs()))
    )
    otel_trace.set_tracer_provider(tracer_provider)

    # Metrics: periodically export readings over OTLP/gRPC.
    metric_reader = PeriodicExportingMetricReader(
        OTLPMetricExporter(**_metric_exporter_kwargs())
    )
    otel_metrics.set_meter_provider(
        MeterProvider(resource=service_resource, metric_readers=[metric_reader])
    )

    _BOOTSTRAPPED = True

20 changes: 17 additions & 3 deletions backend/api/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,26 @@
from api.routes_sync import router as sync_router
from api.routes_observability import router as observability_router
from prometheus_client import CONTENT_TYPE_LATEST, generate_latest
from prometheus_fastapi_instrumentator import Instrumentator
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

Guard optional dependency and prevent metric duplication/conflicts

  • Importing Instrumentator unguarded can crash when the package isn’t installed.
  • Using Instrumentator alongside your custom Prometheus middleware/counters risks double-counting and potential metric-name collisions (e.g., http_requests_total).

Recommended changes:

-from prometheus_fastapi_instrumentator import Instrumentator
+try:  # pragma: no cover - optional dependency guard
+    from prometheus_fastapi_instrumentator import Instrumentator
+except ImportError:  # pragma: no cover
+    Instrumentator = None  # type: ignore[assignment]
-# Register Prometheus instrumentation so HTTP metrics are emitted for alerts/dashboards
-Instrumentator().instrument(app)
+# Register Prometheus instrumentation only if available; exclude /metrics
+if Instrumentator is not None:
+    Instrumentator().instrument(app, excluded_handlers=["/metrics"])

Also decide on a single HTTP request metric source (either Instrumentator or your custom middleware + metrics.py) to avoid duplicate series.

Run a quick web check to confirm default metric names from Instrumentator:

Also applies to: 53-53


🌐 Web query:

Does prometheus_fastapi_instrumentator export a metric named http_requests_total by default, and can it conflict with an existing custom Counter of the same name?

💡 Result:

Short answer: Yes — prometheus_fastapi_instrumentator registers a Counter named http_requests_total by default, and that can conflict with an existing metric of the same name (you'll get duplicate/registration errors).

Sources:

  • The package docs state the default Counter http_requests_total is created. [1][2]
  • The instrumentator registers metrics in the global prometheus_client registry, so registering the same metric twice produces duplicate-timeseries/ValueError errors (reports and issues show "Duplicated timeseries" when metrics are registered more than once). [3][4]

If you want, I can show ways to avoid the conflict (use a custom registry/namespace/subsystem or disable the default metric).



---

🏁 Script executed:

```shell
#!/bin/bash
# Check the current state of lines 22 and 53 in backend/api/main.py
# and search for custom metrics or custom Prometheus middleware

echo "=== Lines 20-60 in backend/api/main.py ==="
sed -n '20,60p' backend/api/main.py

echo -e "\n=== Search for http_requests_total in the codebase ==="
rg -n "http_requests_total" --type python

echo -e "\n=== Search for prometheus_client Counter or Gauge in main.py ==="
rg -n "prometheus_client|Counter|Gauge" backend/api/main.py -A 2 -B 2

Length of output: 1984



---

🏁 Script executed:

```shell
#!/bin/bash
# Inspect api/metrics.py to check for custom prometheus metrics

echo "=== Content of api/metrics.py ==="
cat backend/api/metrics.py

echo -e "\n=== Check for prometheus_client usage in api/metrics.py ==="
rg -n "Counter|Gauge|Histogram|Summary" backend/api/metrics.py -A 2

Length of output: 4730



---

🏁 Script executed:

```shell
#!/bin/bash
# Get exact line numbers for Instrumentator usage
rg -n "Instrumentator" backend/api/main.py

Length of output: 99


Guard optional dependency and fix metric duplication conflict

Verification confirms both critical issues:

  1. Unguarded import (line 22): prometheus_fastapi_instrumentator is imported without try-except, risking crash if the package isn't installed. The codebase already uses this pattern for optional dependencies (see line 26-29 with instrumentation_boot).

  2. Metric name collision (line 53): prometheus_fastapi_instrumentator registers http_requests_total by default, which conflicts with your custom http_requests_total Counter defined in api/metrics.py (line 68). Both register to the same global prometheus_client registry, causing a duplicate-timeseries ValueError at runtime.

Required changes:

Guard the import:

-from prometheus_fastapi_instrumentator import Instrumentator
+try:  # pragma: no cover - optional dependency guard
+    from prometheus_fastapi_instrumentator import Instrumentator
+except ImportError:  # pragma: no cover
+    Instrumentator = None  # type: ignore[assignment]

Conditionally register and avoid metric collision:

-Instrumentator().instrument(app)
+if Instrumentator is not None:
+    Instrumentator().instrument(app, excluded_handlers=["/metrics"])

Additionally, remove http_requests_total from api/metrics.py (line 68–72) since you already capture HTTP metrics via observe_http_request() in your custom middleware. This avoids the dual instrumentation.

📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
from prometheus_fastapi_instrumentator import Instrumentator
try: # pragma: no cover - optional dependency guard
from prometheus_fastapi_instrumentator import Instrumentator
except ImportError: # pragma: no cover
Instrumentator = None # type: ignore[assignment]
🤖 Prompt for AI Agents
backend/api/main.py around line 22: the import of
prometheus_fastapi_instrumentator must be guarded with a try/except like the
existing instrumentation_boot pattern (lines ~26-29) so the app doesn't crash
when the optional package is absent; only attempt to instantiate/register
Instrumentator when the import succeeds and configure it to not register its
default http_requests_total metric (or disable that metric) to avoid duplication
with our own metrics; remove the http_requests_total Counter in api/metrics.py
(lines ~68-72) and rely on observe_http_request() in our middleware instead, and
ensure the Instrumentator registration is conditional to prevent
double-registration and runtime ValueError.

from api.metrics import observe_http_request

try: # pragma: no cover - instrumentation is optional in some environments
from api.instrumentation_boot import bootstrap as bootstrap_instrumentation
except ImportError: # pragma: no cover - optional dependency guard
bootstrap_instrumentation = None # type: ignore[assignment]

# Configure logging
logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)

if bootstrap_instrumentation is not None: # pragma: no branch - simple guard
try:
bootstrap_instrumentation()
except Exception: # pragma: no cover - defensive guard
logger.exception("Failed to bootstrap OpenTelemetry instrumentation")

Comment on lines +31 to +36
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🧹 Nitpick | 🔵 Trivial

Bootstrap timing is fine; avoid re-instantiating Settings later

Bootstrapping at import-time with a try/except is acceptable. Note: below, code constructs Settings again via _Settings(); prefer the cached settings variable to avoid double parsing and to keep behavior consistent with get_settings().

Would you like a follow-up diff to switch to the existing settings instance for the SQLite check?

🤖 Prompt for AI Agents
In backend/api/main.py around lines 36 to 41, the bootstrap try/except is fine
but later code re-instantiates Settings via _Settings(), causing double parsing
and potential inconsistency; change that code to reuse the existing cached
settings instance (call get_settings() or reference the module-level settings
variable) for the SQLite check so the same parsed config is used everywhere, and
remove the direct _Settings() construction.

# Initialize settings
settings = get_settings()

Expand Down Expand Up @@ -74,13 +86,15 @@ async def dispatch(self, request: Request, call_next): # type: ignore[override]
start = perf_counter()
try:
response = await call_next(request)
if response is None: # pragma: no cover - FastAPI guarantees a Response
raise RuntimeError("ASGI call_next returned None response")
except Exception:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Record accurate status codes on exceptions

Currently exceptions are always recorded as 500, which misrepresents handled errors (e.g., RequestValidationError returns 422). Map known exceptions to their final status before recording.

Apply:

-        except Exception:
+        except Exception as exc:
             duration = perf_counter() - start
-            observe_http_request(
+            # Derive best-effort status code
+            from fastapi import HTTPException as FastAPIHTTPException  # local import to avoid top-level import churn
+            status = 500
+            if isinstance(exc, RequestValidationError):
+                status = 422
+            elif isinstance(exc, FastAPIHTTPException):
+                status = int(getattr(exc, "status_code", 500))
+            observe_http_request(
                 service="backend",
                 route=_resolve_route_label(request),
                 method=request.method,
-                status_code=500,
+                status_code=status,
                 duration_seconds=duration,
             )
             raise

Also applies to: 97-99

duration = perf_counter() - start
observe_http_request(
service="backend",
route=_resolve_route_label(request),
method=request.method,
status="500",
status_code=500,
duration_seconds=duration,
)
raise
Expand All @@ -90,7 +104,7 @@ async def dispatch(self, request: Request, call_next): # type: ignore[override]
service="backend",
route=_resolve_route_label(request),
method=request.method,
status=str(response.status_code),
status_code=response.status_code,
duration_seconds=duration,
)
return response
Expand Down Expand Up @@ -152,4 +166,4 @@ async def healthcheck() -> Dict[str, str]:
@app.get("/metrics")
async def metrics() -> Response:
# Expose Prometheus metrics including default process/python collectors
return Response(generate_latest(), media_type=CONTENT_TYPE_LATEST)
return Response(generate_latest(), media_type=CONTENT_TYPE_LATEST)
56 changes: 53 additions & 3 deletions backend/api/metrics.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,36 @@
from __future__ import annotations

from typing import Any

from prometheus_client import Counter, Histogram

# Lazily populated OpenTelemetry instruments.  They stay ``None`` when the
# optional ``opentelemetry`` packages are not installed; every recording
# helper below checks for ``None`` before using them.
_otel_meter: Any | None = None
_http_request_counter: Any | None = None
_http_request_duration_histogram: Any | None = None
_web_vitals_lcp_histogram: Any | None = None

try:  # pragma: no cover - optional dependency guard
    from opentelemetry import metrics as otel_metrics
except ImportError:  # pragma: no cover - optional dependency guard
    otel_metrics = None  # type: ignore[assignment]
else:
    # OpenTelemetry is available: create OTel instruments mirroring the
    # Prometheus metrics so both pipelines receive the same observations.
    _otel_meter = otel_metrics.get_meter(__name__)
    _http_request_counter = _otel_meter.create_counter(
        "http.server.request.count",
        description="HTTP requests processed by the backend",
        unit="1",
    )
    _http_request_duration_histogram = _otel_meter.create_histogram(
        "http.server.duration",
        description="Duration of HTTP requests handled by the backend",
        unit="s",
    )
    _web_vitals_lcp_histogram = _otel_meter.create_histogram(
        "frontend.web_vitals.lcp",
        description="Largest Contentful Paint reported from the frontend",
        unit="s",
    )

REQUEST_LATENCY_BUCKETS = (
0.005,
0.01,
def observe_http_request(
    service: str,
    route: str,
    method: str,
    status_code: int | str,
    duration_seconds: float,
) -> None:
    """Record a single HTTP request observation.

    The observation is written to the Prometheus counter/histogram and,
    when the OpenTelemetry instruments were created at import time, to
    the matching OTel counter and duration histogram as well.

    :param service: Logical service name used as a metric label.
    :param route: Resolved route template (not the raw path).
    :param method: HTTP method of the request.
    :param status_code: Response status; accepted as ``int`` or ``str``.
    :param duration_seconds: Wall-clock request duration in seconds.
    """

    status_label = str(status_code)
    http_requests_total.labels(
        service=service, route=route, method=method, status=status_label
    ).inc()
    http_request_duration_seconds.labels(
        service=service, route=route, method=method, status=status_label
    ).observe(duration_seconds)

    if _http_request_counter is not None or _http_request_duration_histogram is not None:
        attributes: dict[str, str | int] = {
            "service.name": service,
            "http.route": route,
            "http.method": method,
        }
        try:
            attributes["http.status_code"] = int(status_code)
        except (TypeError, ValueError):
            # Skip the status code attribute if it cannot be coerced to an int.
            pass

        if _http_request_counter is not None:
            _http_request_counter.add(1, attributes)
        if _http_request_duration_histogram is not None:
            _http_request_duration_histogram.record(duration_seconds, attributes)


def observe_lcp(*, app: str, seconds: float) -> None:
    """Record a Largest Contentful Paint measurement in seconds."""

    web_vitals_lcp.labels(app=app).observe(seconds)
    otel_histogram = _web_vitals_lcp_histogram
    if otel_histogram is not None:
        otel_histogram.record(seconds, {"app": app})