Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 45 additions & 0 deletions backend/api/main.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
"""Main module for the FastAPI application."""

import logging
from time import perf_counter
from typing import Dict

from fastapi import FastAPI, Response, Request
from fastapi.exceptions import RequestValidationError
from fastapi.responses import JSONResponse
from starlette.middleware.cors import CORSMiddleware
from starlette.middleware.base import BaseHTTPMiddleware

from api.config import get_settings
from api.routes import router as api_router
Expand All @@ -15,7 +17,9 @@
from api.routes_quote import router as quote_router
from api.routes_cnc import router as cnc_router
from api.routes_sync import router as sync_router
from api.routes_observability import router as observability_router
from prometheus_client import CONTENT_TYPE_LATEST, generate_latest
from api.metrics import observe_http_request

# Configure logging
logging.basicConfig(
Expand Down Expand Up @@ -51,13 +55,54 @@ async def _ensure_sqlite_tables() -> None:
allow_headers=["*"],
)


def _resolve_route_label(request: Request) -> str:
route = request.scope.get("route")
if route and hasattr(route, "path") and route.path:
return route.path
return request.url.path or "unknown"


class PrometheusInstrumentationMiddleware(BaseHTTPMiddleware):
    """Record request-count and latency metrics for every handled request."""

    async def dispatch(self, request: Request, call_next):  # type: ignore[override]
        # Skip the scrape endpoint itself so Prometheus polling does not
        # inflate the very metrics it is collecting.
        if request.url.path == "/metrics":
            return await call_next(request)

        start = perf_counter()
        try:
            response = await call_next(request)
        except Exception as exc:
            # Use the exception's own status code when it carries one (e.g.
            # HTTPException 404/422); otherwise fall back to 500. This keeps
            # expected client errors from being miscounted as server errors.
            duration = perf_counter() - start
            observe_http_request(
                service="backend",
                route=_resolve_route_label(request),
                method=request.method,
                status=str(getattr(exc, "status_code", 500)),
                duration_seconds=duration,
            )
            raise

        duration = perf_counter() - start
        observe_http_request(
            service="backend",
            route=_resolve_route_label(request),
            method=request.method,
            status=str(response.status_code),
            duration_seconds=duration,
        )
        return response
Comment on lines +66 to +93
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Record the real status code for HTTPException responses.

call_next raises HTTPException for anticipated errors (e.g. 404, 422), but our broad except Exception block records every one of them as status 500. That skews the metrics for client errors and validation failures. Handle HTTPException separately (use its status_code) before catching generic exceptions.

Apply this diff to record accurate status codes:

-from starlette.middleware.base import BaseHTTPMiddleware
+from starlette.middleware.base import BaseHTTPMiddleware
+from fastapi import HTTPException
@@
-        try:
-            response = await call_next(request)
-        except Exception:
+        try:
+            response = await call_next(request)
+        except HTTPException as exc:
+            duration = perf_counter() - start
+            observe_http_request(
+                service="backend",
+                route=_resolve_route_label(request),
+                method=request.method,
+                status=str(exc.status_code),
+                duration_seconds=duration,
+            )
+            raise
+        except Exception:
             duration = perf_counter() - start
             observe_http_request(
                 service="backend",
                 route=_resolve_route_label(request),
                 method=request.method,
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
class PrometheusInstrumentationMiddleware(BaseHTTPMiddleware):
async def dispatch(self, request: Request, call_next): # type: ignore[override]
if request.url.path == "/metrics":
return await call_next(request)
start = perf_counter()
try:
response = await call_next(request)
except Exception:
duration = perf_counter() - start
observe_http_request(
service="backend",
route=_resolve_route_label(request),
method=request.method,
status="500",
duration_seconds=duration,
)
raise
duration = perf_counter() - start
observe_http_request(
service="backend",
route=_resolve_route_label(request),
method=request.method,
status=str(response.status_code),
duration_seconds=duration,
)
return response
from starlette.middleware.base import BaseHTTPMiddleware
from fastapi import HTTPException
class PrometheusInstrumentationMiddleware(BaseHTTPMiddleware):
async def dispatch(self, request: Request, call_next): # type: ignore[override]
if request.url.path == "/metrics":
return await call_next(request)
start = perf_counter()
try:
response = await call_next(request)
except HTTPException as exc:
duration = perf_counter() - start
observe_http_request(
service="backend",
route=_resolve_route_label(request),
method=request.method,
status=str(exc.status_code),
duration_seconds=duration,
)
raise
except Exception:
duration = perf_counter() - start
observe_http_request(
service="backend",
route=_resolve_route_label(request),
method=request.method,
status="500",
duration_seconds=duration,
)
raise
duration = perf_counter() - start
observe_http_request(
service="backend",
route=_resolve_route_label(request),
method=request.method,
status=str(response.status_code),
duration_seconds=duration,
)
return response
🤖 Prompt for AI Agents
In backend/api/main.py around lines 66 to 93, the middleware’s broad except
block treats all raised HTTPException instances as 500s; update the error
handling to catch fastapi.exceptions.HTTPException (or
starlette.exceptions.HTTPException) separately, call observe_http_request using
the exception’s status_code, then re-raise it, and keep a separate generic
except Exception block that records status "500" and re-raises; ensure the
/metrics bypass remains unchanged and that duration is measured before recording
in both exception branches.



app.add_middleware(PrometheusInstrumentationMiddleware)

# Include API router
app.include_router(api_router)
app.include_router(materials_router)
app.include_router(modules_router)
app.include_router(quote_router)
app.include_router(cnc_router)
app.include_router(sync_router)
app.include_router(observability_router)


@app.exception_handler(RequestValidationError)
Expand Down
64 changes: 62 additions & 2 deletions backend/api/metrics.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,20 @@
from __future__ import annotations
from prometheus_client import Counter
from prometheus_client import Counter, Histogram

REQUEST_LATENCY_BUCKETS = (
0.005,
0.01,
0.025,
0.05,
0.075,
0.1,
0.25,
0.5,
0.75,
1.0,
2.5,
5.0,
)

# Hygraph sync counters
sync_success_total = Counter(
Expand All @@ -18,4 +33,49 @@
"sync_records_upserted_total",
"Records upserted during Hygraph sync",
labelnames=("type",),
)
)

http_requests_total = Counter(
"http_requests_total",
"Total HTTP requests processed by the backend",
labelnames=("service", "route", "method", "status"),
)

http_request_duration_seconds = Histogram(
"http_request_duration_seconds",
"Latency of HTTP requests handled by the backend",
labelnames=("service", "route", "method", "status"),
buckets=REQUEST_LATENCY_BUCKETS,
)

web_vitals_lcp = Histogram(
"web_vitals_lcp",
"Largest Contentful Paint reported from the frontend (seconds)",
labelnames=("app",),
buckets=(1.5, 2, 2.5, 3, 3.5, 4, 5, 6, 8, 10),
)


def observe_http_request(
    *,
    service: str,
    route: str,
    method: str,
    status: str,
    duration_seconds: float,
) -> None:
    """Record a single HTTP request observation.

    Increments the request counter and adds the request's latency to the
    duration histogram, both under the same label set.
    """
    labels = {
        "service": service,
        "route": route,
        "method": method,
        "status": status,
    }
    http_requests_total.labels(**labels).inc()
    http_request_duration_seconds.labels(**labels).observe(duration_seconds)


def observe_lcp(*, app: str, seconds: float) -> None:
    """Add one Largest Contentful Paint sample (in seconds) for *app*."""
    histogram = web_vitals_lcp.labels(app=app)
    histogram.observe(seconds)

35 changes: 35 additions & 0 deletions backend/api/routes_observability.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
from __future__ import annotations

from typing import Literal

from fastapi import APIRouter, HTTPException, status
from pydantic import BaseModel, Field

from api.metrics import observe_lcp

router = APIRouter(prefix="/observability", tags=["observability"])


class WebVitalPayload(BaseModel):
    """Request body for a single Web Vital measurement posted by the frontend."""

    # Restricted to the metric names emitted by the web-vitals library.
    name: Literal["LCP", "FCP", "CLS", "FID", "INP", "TTFB"] = Field(
        ..., description="Name of the reported Web Vital metric"
    )
    # Raw value as reported by web-vitals (timing metrics are in milliseconds).
    value: float = Field(..., description="Value of the metric as reported by web-vitals")
    # Label used to partition metrics when several apps report to this backend.
    app: str = Field("frontend", description="Logical application label for the metric")
    # Optional correlation id; currently accepted but not recorded anywhere.
    id: str | None = Field(None, description="Unique identifier assigned by the web-vitals reporter")


@router.post("/web-vitals", status_code=status.HTTP_202_ACCEPTED)
async def ingest_web_vitals(payload: WebVitalPayload) -> dict[str, bool]:
    """Ingest Web Vital measurements emitted by the frontend.

    Only LCP is recorded; any other metric name is rejected with HTTP 400.
    """
    if payload.name == "LCP":
        # web-vitals reports timings in milliseconds; Prometheus uses seconds.
        observe_lcp(app=payload.app, seconds=payload.value / 1000.0)
        return {"ok": True}

    raise HTTPException(
        status_code=status.HTTP_400_BAD_REQUEST,
        detail="Only LCP metrics are supported at this time",
    )
53 changes: 53 additions & 0 deletions backend/tests/test_observability_metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
from __future__ import annotations

from fastapi.testclient import TestClient
from prometheus_client import REGISTRY

from api.main import app

client = TestClient(app)


def _get_sample(name: str, labels: dict[str, str]) -> float:
    """Read a sample from the default registry, returning 0.0 when absent."""
    sampled = REGISTRY.get_sample_value(name, labels)
    if sampled is None:
        return 0.0
    return float(sampled)


def test_lcp_ingest_records_histogram() -> None:
    """Posting an LCP sample bumps the frontend LCP histogram count by one."""
    labels = {"app": "frontend"}
    baseline = _get_sample("web_vitals_lcp_count", labels)

    result = client.post(
        "/observability/web-vitals",
        json={"name": "LCP", "value": 2400, "app": "frontend"},
    )

    assert result.status_code == 202
    assert _get_sample("web_vitals_lcp_count", labels) == baseline + 1


def test_lcp_ingest_rejects_unsupported_metrics() -> None:
    """Non-LCP metrics are rejected with HTTP 400 and an explanatory detail."""
    result = client.post(
        "/observability/web-vitals",
        json={"name": "CLS", "value": 0.04},
    )

    assert result.status_code == 400
    assert result.json()["detail"] == "Only LCP metrics are supported at this time"


def test_http_metrics_recorded_for_requests() -> None:
    """A successful request increments http_requests_total for its labels."""
    labels = {
        "service": "backend",
        "route": "/healthcheck",
        "method": "GET",
        "status": "200",
    }
    baseline = _get_sample("http_requests_total", labels)

    response = client.get("/healthcheck")

    assert response.status_code == 200
    assert _get_sample("http_requests_total", labels) == baseline + 1
12 changes: 12 additions & 0 deletions dashboards/dashboards.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
apiVersion: 1
providers:
- name: Observability Dashboards
orgId: 1
folder: Platform Observability
type: file
disableDeletion: false
allowUiUpdates: true
updateIntervalSeconds: 30
options:
path: ops/grafana/provisioning/dashboards
foldersFromFilesStructure: false
7 changes: 7 additions & 0 deletions frontend/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

14 changes: 6 additions & 8 deletions frontend/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,16 @@
"@chakra-ui/react": "^2.8.2",
"@emotion/react": "^11.13.3",
"@emotion/styled": "^11.13.0",
"@react-three/drei": "^9.120.5",
"@react-three/fiber": "^8.17.10",
"framer-motion": "^11.5.4",
"next": "^14.2.28",
"react": "^18",
"react-dom": "^18",
"react-icons": "^5.3.0",
"zustand": "^4.5.0",
"three": "^0.171.0",
"@react-three/fiber": "^8.17.10",
"@react-three/drei": "^9.120.5"
"web-vitals": "^4.2.4",
"zustand": "^4.5.0"
},
"devDependencies": {
"@playwright/test": "^1.56.0",
Expand All @@ -49,12 +50,9 @@
"postcss": "^8",
"tailwindcss": "^3.3.0",
"ts-jest": "^29.2.5",
"ts-node": "^10.9.2",
"typescript": "^5",
"gltfpack": "0.25.0",
"meshoptimizer": "0.25.0",
"vitest": "^1.6.0",
"@playwright/test": "^1.56.0",
"ts-node": "^10.9.2"
"vitest": "^1.6.0"
},
"jest": {
"setupFilesAfterEnv": [
Expand Down
55 changes: 55 additions & 0 deletions frontend/src/app/reportWebVitals.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import type { Metric } from 'next/app';

const DEFAULT_ENDPOINT = '/observability/web-vitals';
const APP_LABEL = process.env.NEXT_PUBLIC_WEB_VITALS_APP ?? 'frontend';

/**
 * Build the absolute URL the Web Vitals reporter should POST to.
 *
 * Host, port, protocol and path can each be overridden via NEXT_PUBLIC_*
 * env vars; otherwise they default to the current page's location.
 * During SSR (no window) only the relative default endpoint is returned.
 */
function buildEndpoint(): string {
  if (typeof window === 'undefined') {
    return DEFAULT_ENDPOINT;
  }

  const rawProtocol = process.env.NEXT_PUBLIC_BACKEND_PROTOCOL ?? window.location.protocol ?? 'http:';
  // Env values like "https" lack the trailing colon and would otherwise
  // produce invalid URLs such as "https//host/..."; normalize before use.
  const protocol = rawProtocol.endsWith(':') ? rawProtocol : `${rawProtocol}:`;
  const host = process.env.NEXT_PUBLIC_BACKEND_HOST ?? window.location.hostname;
  const port = process.env.NEXT_PUBLIC_BACKEND_PORT ?? '';
  const endpoint = process.env.NEXT_PUBLIC_WEB_VITALS_ENDPOINT ?? DEFAULT_ENDPOINT;

  const portSegment = port ? `:${port}` : '';
  return `${protocol}//${host}${portSegment}${endpoint}`;
}
Comment on lines +11 to +18
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Normalize protocol when building the reporting URL.

If NEXT_PUBLIC_BACKEND_PROTOCOL is set to common values like "https", the computed endpoint becomes https//… (missing the colon) because we directly interpolate the raw string. That breaks both sendBeacon and the fetch fallback. Please normalize the protocol (ensure it ends with :) or build the URL via new URL(...) so env-provided values work.

Apply this diff to normalize the protocol before interpolation:

-  const protocol = process.env.NEXT_PUBLIC_BACKEND_PROTOCOL ?? window.location.protocol ?? 'http:';
+  const protocol = process.env.NEXT_PUBLIC_BACKEND_PROTOCOL ?? window.location.protocol ?? 'http:';
+  const normalizedProtocol = protocol.endsWith(':') ? protocol : `${protocol}:`;
@@
-  return `${protocol}//${host}${portSegment}${endpoint}`;
+  return `${normalizedProtocol}//${host}${portSegment}${endpoint}`;
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
const protocol = process.env.NEXT_PUBLIC_BACKEND_PROTOCOL ?? window.location.protocol ?? 'http:';
const host = process.env.NEXT_PUBLIC_BACKEND_HOST ?? window.location.hostname;
const port = process.env.NEXT_PUBLIC_BACKEND_PORT ?? '';
const endpoint = process.env.NEXT_PUBLIC_WEB_VITALS_ENDPOINT ?? DEFAULT_ENDPOINT;
const portSegment = port ? `:${port}` : '';
return `${protocol}//${host}${portSegment}${endpoint}`;
}
const protocol = process.env.NEXT_PUBLIC_BACKEND_PROTOCOL ?? window.location.protocol ?? 'http:';
const normalizedProtocol = protocol.endsWith(':') ? protocol : `${protocol}:`;
const host = process.env.NEXT_PUBLIC_BACKEND_HOST ?? window.location.hostname;
const port = process.env.NEXT_PUBLIC_BACKEND_PORT ?? '';
const endpoint = process.env.NEXT_PUBLIC_WEB_VITALS_ENDPOINT ?? DEFAULT_ENDPOINT;
const portSegment = port ? `:${port}` : '';
return `${normalizedProtocol}//${host}${portSegment}${endpoint}`;
}
🤖 Prompt for AI Agents
In frontend/src/app/reportWebVitals.ts around lines 11 to 18, the protocol value
from NEXT_PUBLIC_BACKEND_PROTOCOL may be missing the trailing colon (e.g.
"https") which produces URLs like "https//..."; normalize the protocol before
building the URL by trimming whitespace, checking if it is non-empty and
endsWith(':') and if not appending ':' (or alternately construct the full URL
with new URL by combining host/port/endpoint), then use that normalized protocol
when interpolating so the final URL is valid for sendBeacon/fetch.


/**
 * Fire-and-forget delivery of a serialized metric payload.
 *
 * Prefers navigator.sendBeacon (survives page unload); falls back to a
 * keepalive fetch when the beacon API is unavailable or refuses the payload.
 */
function sendMetric(endpoint: string, body: string) {
  const beaconAvailable = typeof navigator !== 'undefined' && 'sendBeacon' in navigator;
  if (beaconAvailable) {
    const payload = new Blob([body], { type: 'application/json' });
    if (navigator.sendBeacon(endpoint, payload)) {
      return;
    }
  }

  fetch(endpoint, {
    method: 'POST',
    body,
    keepalive: true,
    headers: { 'Content-Type': 'application/json' },
  }).catch(() => {
    // Swallow network errors; metrics reporting should be fire-and-forget
  });
}

/**
 * Next.js Web Vitals hook: forward LCP measurements to the backend.
 *
 * Non-LCP metrics are dropped client-side, mirroring the backend, which
 * currently accepts only LCP.
 */
export function reportWebVitals(metric: Metric) {
  if (metric.name !== 'LCP') {
    return;
  }

  const payload = {
    name: metric.name,
    value: metric.value,
    app: APP_LABEL,
    id: metric.id,
  };

  sendMetric(buildEndpoint(), JSON.stringify(payload));
}
Loading
Loading