Skip to content

Commit bc14c28

Browse files
authored
Add OpenTelemetry instrumentation for API (#82)
* Add OpenTelemetry instrumentation for API * Normalize http.route in telemetry middleware
1 parent 2e404f3 commit bc14c28

File tree

8 files changed

+534
-101
lines changed

8 files changed

+534
-101
lines changed

backend/api/config.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""Configuration settings for the FastAPI application."""
22

3-
from functools import lru_cache
4-
from typing import Any, Dict
3+
import os
4+
from typing import Any, Dict, Optional
55

66
from pydantic import Field, field_validator
77
from pydantic_settings import BaseSettings, SettingsConfigDict
@@ -62,11 +62,20 @@ def _validate_hygraph_webhook_secret(cls, value: str) -> str:
6262
return value.strip()
6363

6464

65-
@lru_cache()
65+
_SETTINGS_CACHE: Optional[Settings] = None
66+
_LAST_SECRET: Optional[str] = None
67+
68+
6669
def get_settings() -> Settings:
67-
"""Return a cached instance of :class:`Settings`."""
70+
"""Return a cached instance of :class:`Settings`, refreshing on secret changes."""
6871

69-
return Settings()
72+
global _SETTINGS_CACHE, _LAST_SECRET
73+
current_secret = os.getenv("HYGRAPH_WEBHOOK_SECRET")
74+
if _SETTINGS_CACHE is None or _LAST_SECRET != current_secret:
75+
settings = Settings()
76+
_SETTINGS_CACHE = settings
77+
_LAST_SECRET = settings.hygraph_webhook_secret
78+
return _SETTINGS_CACHE
7079

7180

7281
def get_fastapi_settings() -> Dict[str, Any]:
Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
from __future__ import annotations
2+
3+
import json
4+
import logging
5+
import os
6+
from typing import Any, Dict
7+
8+
from opentelemetry import metrics, trace
9+
from opentelemetry.propagate import set_global_textmap
10+
from opentelemetry.sdk.metrics import MeterProvider
11+
from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter
12+
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
13+
from opentelemetry.sdk.resources import Resource
14+
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
15+
from opentelemetry.sdk.trace import TracerProvider
16+
from opentelemetry.sdk.trace.export import BatchSpanProcessor
17+
from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator
18+
19+
from api.config import get_settings
20+
21+
_OTEL_BOOTSTRAPPED = False
22+
23+
24+
class JsonLogFormatter(logging.Formatter):
25+
"""Formatter that outputs JSON with tracing metadata."""
26+
27+
def __init__(self, service_fields: Dict[str, str]) -> None:
28+
super().__init__()
29+
self._service_fields = service_fields
30+
31+
def format(self, record: logging.LogRecord) -> str:
32+
payload: Dict[str, Any] = {
33+
"level": record.levelname,
34+
"logger": record.name,
35+
"message": record.getMessage(),
36+
"service": self._service_fields.get("service.name"),
37+
"environment": self._service_fields.get("deployment.environment"),
38+
}
39+
if record.exc_info:
40+
payload["exception"] = self.formatException(record.exc_info)
41+
for key, value in self._service_fields.items():
42+
payload.setdefault(key.replace(".", "_"), value)
43+
44+
span = trace.get_current_span()
45+
ctx = span.get_span_context()
46+
if ctx.trace_id:
47+
payload["trace_id"] = format(ctx.trace_id, "032x")
48+
if ctx.span_id:
49+
payload["span_id"] = format(ctx.span_id, "016x")
50+
51+
for attr in ("event_id", "dedup", "counts", "elapsed_ms", "type", "error"):
52+
if hasattr(record, attr):
53+
payload[attr] = getattr(record, attr)
54+
55+
return json.dumps(payload, default=str)
56+
57+
58+
def _configure_logging(service_fields: Dict[str, str]) -> None:
59+
root = logging.getLogger()
60+
log_level = os.getenv("LOG_LEVEL", "INFO").upper()
61+
root.setLevel(log_level)
62+
63+
formatter = JsonLogFormatter(service_fields)
64+
if not root.handlers:
65+
handler = logging.StreamHandler()
66+
handler.setFormatter(formatter)
67+
root.addHandler(handler)
68+
else:
69+
for handler in root.handlers:
70+
handler.setFormatter(formatter)
71+
handler.setLevel(log_level)
72+
73+
74+
def setup_instrumentation() -> None:
75+
global _OTEL_BOOTSTRAPPED
76+
if _OTEL_BOOTSTRAPPED:
77+
return
78+
79+
settings = get_settings()
80+
service_name = os.getenv("OTEL_SERVICE_NAME", "paform-backend-api")
81+
resource = Resource.create(
82+
{
83+
"service.name": service_name,
84+
"service.namespace": "paform",
85+
"deployment.environment": settings.environment,
86+
}
87+
)
88+
89+
set_global_textmap(TraceContextTextMapPropagator())
90+
91+
tracer_provider = TracerProvider(resource=resource)
92+
span_exporter = OTLPSpanExporter(endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "otel-collector:4317"), insecure=True)
93+
tracer_provider.add_span_processor(BatchSpanProcessor(span_exporter))
94+
trace.set_tracer_provider(tracer_provider)
95+
96+
metric_exporter = OTLPMetricExporter(
97+
endpoint=os.getenv("OTEL_EXPORTER_OTLP_METRICS_ENDPOINT", "otel-collector:4317"),
98+
insecure=True,
99+
)
100+
reader = PeriodicExportingMetricReader(metric_exporter)
101+
meter_provider = MeterProvider(resource=resource, metric_readers=[reader])
102+
metrics.set_meter_provider(meter_provider)
103+
104+
service_fields: Dict[str, str] = {
105+
"service.name": service_name,
106+
"deployment.environment": settings.environment,
107+
}
108+
_configure_logging(service_fields)
109+
110+
_OTEL_BOOTSTRAPPED = True
111+
112+
113+
setup_instrumentation()

backend/api/main.py

Lines changed: 52 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,18 @@
44
from time import perf_counter
55
from typing import Dict
66

7-
from fastapi import FastAPI, Response, Request
7+
from fastapi import FastAPI, Request, Response
88
from fastapi.exceptions import RequestValidationError
99
from fastapi.responses import JSONResponse
10+
from opentelemetry import trace
11+
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
12+
from opentelemetry.trace import Status, StatusCode
1013
from starlette.middleware.cors import CORSMiddleware
1114
from starlette.middleware.base import BaseHTTPMiddleware
1215

1316
from api.config import get_settings
17+
from api.instrumentation_boot import setup_instrumentation
18+
from api.metrics import record_http_request
1419
from api.routes import router as api_router
1520
from api.routes_materials import router as materials_router
1621
from api.routes_modules import router as modules_router
@@ -21,11 +26,7 @@
2126
from prometheus_client import CONTENT_TYPE_LATEST, generate_latest
2227
from api.metrics import observe_http_request
2328

24-
# Configure logging
25-
logging.basicConfig(
26-
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
27-
)
28-
logger = logging.getLogger(__name__)
29+
setup_instrumentation()
2930

3031
# Initialize settings
3132
settings = get_settings()
@@ -107,6 +108,45 @@ async def dispatch(self, request: Request, call_next): # type: ignore[override]
107108
app.include_router(sync_router)
108109
app.include_router(observability_router)
109110

111+
_http_tracer = trace.get_tracer("paform.api.http")
112+
113+
114+
@app.middleware("http")
115+
async def telemetry_middleware(request: Request, call_next):
116+
method = request.method.upper()
117+
path_template = request.url.path
118+
start = time.perf_counter()
119+
120+
current_span = trace.get_current_span()
121+
if current_span.get_span_context().is_valid:
122+
span_context = nullcontext(current_span)
123+
else:
124+
span_context = _http_tracer.start_as_current_span(f"HTTP {method}")
125+
126+
status_code = 500
127+
response: Optional[Response] = None
128+
with span_context as span:
129+
span.set_attribute("http.method", method)
130+
span.set_attribute("deployment.environment", settings.environment)
131+
try:
132+
response = await call_next(request)
133+
status_code = response.status_code
134+
span.set_attribute("http.status_code", status_code)
135+
if status_code >= 500:
136+
span.set_status(Status(StatusCode.ERROR))
137+
except Exception as exc: # noqa: BLE001
138+
span.record_exception(exc)
139+
span.set_status(Status(StatusCode.ERROR))
140+
raise
141+
finally:
142+
matched_route = request.scope.get("route")
143+
path_template = getattr(matched_route, "path", path_template)
144+
span.set_attribute("http.route", path_template)
145+
duration_ms = (time.perf_counter() - start) * 1000
146+
record_http_request(method, path_template, status_code, duration_ms)
147+
148+
return response if response is not None else Response(status_code=status_code)
149+
110150

111151
@app.exception_handler(RequestValidationError)
112152
async def handle_validation_error(request: Request, exc: RequestValidationError) -> JSONResponse:
@@ -150,6 +190,9 @@ async def healthcheck() -> Dict[str, str]:
150190

151191

152192
@app.get("/metrics")
153-
async def metrics() -> Response:
154-
# Expose Prometheus metrics including default process/python collectors
155-
return Response(generate_latest(), media_type=CONTENT_TYPE_LATEST)
193+
async def metrics() -> Dict[str, str]:
194+
"""Compatibility endpoint for legacy Prometheus scrapes."""
195+
196+
return {
197+
"detail": "Metrics are exported via OpenTelemetry OTLP; no local payload is available.",
198+
}

backend/api/metrics.py

Lines changed: 34 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -16,17 +16,42 @@
1616
5.0,
1717
)
1818

19-
# Hygraph sync counters
20-
sync_success_total = Counter(
21-
"sync_success_total",
22-
"Successful Hygraph sync operations",
23-
labelnames=("type",),
19+
from typing import Dict
20+
21+
from opentelemetry import metrics
22+
23+
_meter = metrics.get_meter("paform.api.metrics")
24+
25+
_sync_success_counter = _meter.create_counter(
26+
name="sync_success_total",
27+
description="Successful Hygraph sync operations",
2428
)
2529

26-
sync_failure_total = Counter(
27-
"sync_failure_total",
28-
"Failed Hygraph sync operations",
29-
labelnames=("type",),
30+
_sync_failure_counter = _meter.create_counter(
31+
name="sync_failure_total",
32+
description="Failed Hygraph sync operations",
33+
)
34+
35+
_sync_records_counter = _meter.create_counter(
36+
name="sync_records_upserted_total",
37+
description="Records upserted during Hygraph sync",
38+
)
39+
40+
_sync_duration_histogram = _meter.create_histogram(
41+
name="sync_duration_ms",
42+
unit="ms",
43+
description="Duration of Hygraph sync operations in milliseconds",
44+
)
45+
46+
_http_request_counter = _meter.create_counter(
47+
name="http_server_requests_total",
48+
description="Total number of HTTP requests handled",
49+
)
50+
51+
_http_request_duration_histogram = _meter.create_histogram(
52+
name="http_server_request_duration_ms",
53+
unit="ms",
54+
description="HTTP server request duration in milliseconds",
3055
)
3156

3257
sync_records_upserted_total = Counter(

0 commit comments

Comments
 (0)