Skip to content

Commit a9d8f3d

Browse files
authored
Merge pull request #38 from zachyzissou/codex/metrics-endpoint-observability
[codex] add /metrics endpoint for operational alerting
2 parents 992622d + c144df4 commit a9d8f3d

File tree

3 files changed

+104
-0
lines changed

3 files changed

+104
-0
lines changed

README.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,11 @@ docker run -d --name toonami-downlink -p 7004:7004 -v ./data:/data toonami-downl
9393
curl http://localhost:7004/health
9494
```
9595

96+
```bash
97+
# Scrape Prometheus-style metrics
98+
curl http://localhost:7004/metrics
99+
```
100+
96101
Example `/health` response:

```text
97102
{"status":"ok","version":"1.0.0","artifacts":"m3u xml"}
98103
```
@@ -139,6 +144,7 @@ npm run lint:js
139144
## Observability
140145

141146
- Health and status endpoints are the primary runtime checks.
147+
- `/metrics` exposes Prometheus-style gauges for last-update age, guide staleness, scheduler consecutive failures, and whether the configured `CRON_SCHEDULE` is supported.
142148
- Runtime logs expose refresh cycles and endpoint generation outcomes.
143149
- `TROUBLESHOOTING.md` and `AUDIT.md` contain operational notes and history.
144150
- CI publishes test and security artifacts where configured.

app/server.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1104,6 +1104,49 @@ async def status():
11041104
}
11051105

11061106

1107+
@app.get("/metrics")
async def metrics():
    """Serve runtime gauges in Prometheus text exposition format.

    Four gauges are emitted, each preceded by its ``# HELP`` and ``# TYPE``
    lines:

    * ``downlink_last_update_age_seconds`` -- seconds elapsed since the
      ``last_update`` timestamp stored in the persisted state, clamped to a
      minimum of 0; ``-1`` when that value is not a string or the parser
      yields nothing usable.
    * ``downlink_guide_stale`` -- 1 when the freshness snapshot's
      ``is_stale`` entry is truthy, otherwise 0.
    * ``downlink_scheduler_consecutive_failures`` -- the state's
      ``consecutive_failures`` coerced to ``int``; 0 when it is missing,
      falsy, or cannot be converted.
    * ``downlink_cron_supported`` -- 1 when the cron support check reports
      the effective ``CRON_SCHEDULE`` as supported, otherwise 0.

    Returns:
        A ``Response`` whose body is the newline-terminated metrics text,
        served as ``text/plain; version=0.0.4; charset=utf-8``.
    """
    now = datetime.now(UTC)
    state = read_state()
    freshness = _guide_freshness_snapshot(now)
    # The environment variable takes precedence over the module-level default.
    schedule = os.environ.get("CRON_SCHEDULE", CRON_SCHEDULE)
    cron_ok, _ = _cron_support_status(schedule)

    # Only string timestamps are handed to the parser; anything else is
    # treated as "no known update".
    raw_stamp = state.get("last_update")
    parsed_stamp = None
    if isinstance(raw_stamp, str):
        parsed_stamp = _parse_iso_datetime(raw_stamp)

    if parsed_stamp:
        # Clamp so a timestamp slightly in the future never reports a
        # negative age.
        age_seconds = max(0.0, (now - parsed_stamp).total_seconds())
    else:
        age_seconds = -1.0

    try:
        failure_count = int(state.get("consecutive_failures", 0) or 0)
    except (TypeError, ValueError):
        # An unconvertible persisted value is reported as zero failures.
        failure_count = 0

    stale_flag = 1 if freshness["is_stale"] else 0
    cron_flag = 1 if cron_ok else 0

    exposition_lines = (
        "# HELP downlink_last_update_age_seconds Age in seconds since last successful update (-1 if unknown).",
        "# TYPE downlink_last_update_age_seconds gauge",
        f"downlink_last_update_age_seconds {age_seconds:.3f}",
        "# HELP downlink_guide_stale 1 when guide data is considered stale, 0 otherwise.",
        "# TYPE downlink_guide_stale gauge",
        f"downlink_guide_stale {stale_flag}",
        "# HELP downlink_scheduler_consecutive_failures Current scheduler consecutive failure count.",
        "# TYPE downlink_scheduler_consecutive_failures gauge",
        f"downlink_scheduler_consecutive_failures {failure_count}",
        "# HELP downlink_cron_supported 1 when configured CRON_SCHEDULE is supported, 0 otherwise.",
        "# TYPE downlink_cron_supported gauge",
        f"downlink_cron_supported {cron_flag}",
    )
    # The body ends with a single trailing newline after the last sample.
    payload = "\n".join(exposition_lines) + "\n"
    return Response(
        content=payload,
        media_type="text/plain; version=0.0.4; charset=utf-8",
    )
1148+
1149+
11071150
@app.get("/m3u")
11081151
async def get_m3u():
11091152
"""Get the M3U playlist file without stream codes."""

test_integration.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -291,6 +291,60 @@ async def noop_scheduler_loop():
291291
os.environ["CRON_SCHEDULE"] = previous_cron
292292

293293

294+
def test_metrics_endpoint_prometheus_gauges():
    """Check that /metrics reports gauges matching freshly seeded state.

    The test pins ``CRON_SCHEDULE``, swaps the scheduler loop for a no-op
    coroutine, writes fresh M3U/XML artifacts and a state record with two
    consecutive failures, then parses the endpoint's sample lines and checks
    each gauge. The environment variable and the scheduler loop are restored
    afterwards.
    """
    from app import server

    saved_cron = os.environ.get("CRON_SCHEDULE")
    saved_loop = server.scheduler_loop

    async def idle_scheduler_loop():
        return None

    try:
        os.environ["CRON_SCHEDULE"] = "0 3 * * *"
        server.scheduler_loop = idle_scheduler_loop

        # Seed both artifacts and stamp them with the current time.
        server.M3U_PATH.parent.mkdir(parents=True, exist_ok=True)
        server.M3U_PATH.write_text("#EXTM3U\n#EXTINF:-1,Toonami\nhttp://example.com/stream\n")
        server.XML_PATH.write_text("<tv></tv>\n")
        stamp = time.time()
        for artifact in (server.M3U_PATH, server.XML_PATH):
            os.utime(artifact, (stamp, stamp))

        seeded_state = {
            "last_update": datetime.now(UTC).isoformat(),
            "consecutive_failures": 2,
        }
        server.write_state(seeded_state)

        with TestClient(app) as client:
            response = client.get("/metrics")
            assert response.status_code == 200
            assert "text/plain" in response.headers.get("content-type", "")

            # Collect "<name> <value>" sample lines, skipping blanks and
            # the "# HELP" / "# TYPE" comment lines.
            samples = {}
            for raw_line in response.text.splitlines():
                if raw_line and not raw_line.startswith("#"):
                    metric, reading = raw_line.split(" ", 1)
                    samples[metric] = float(reading.strip())

            assert "downlink_last_update_age_seconds" in samples
            assert 0.0 <= samples["downlink_last_update_age_seconds"] < 600.0
            assert samples["downlink_guide_stale"] == 0.0
            assert samples["downlink_scheduler_consecutive_failures"] == 2.0
            assert samples["downlink_cron_supported"] == 1.0
    finally:
        server.scheduler_loop = saved_loop
        if saved_cron is not None:
            os.environ["CRON_SCHEDULE"] = saved_cron
        else:
            os.environ.pop("CRON_SCHEDULE", None)
    print("✅ Metrics endpoint exposes expected Prometheus gauges")
346+
347+
294348
def test_record_generation_failure_updates_state():
295349
"""Failure recorder should persist normalized error metadata."""
296350
from app import server
@@ -420,6 +474,7 @@ def main():
420474
test_lan_refresh_host_detection()
421475
test_cron_next_respects_dom_mon_dow()
422476
test_status_reports_cron_and_failure_diagnostics()
477+
test_metrics_endpoint_prometheus_gauges()
423478
test_record_generation_failure_updates_state()
424479
test_health_reports_scheduler_failure_state()
425480
test_health_reports_stale_freshness()

0 commit comments

Comments
 (0)