Skip to content

Commit c32f5e7

Browse files
Surface community config health via public API (#221)
* Surface community config health via public API Downgrade noisy API key warning from error to debug level in discover_assistants() so it stops spamming CLI users. Extract compute_community_health() helper in health.py and add config_health to GET /{community_id}/metrics/public and status field to GET /{community_id}/ config response. Warnings clearly communicate that using the shared platform key is for demonstration only and not sustainable. Closes #220 * Address PR review: fix error handling and security - Move compute_community_health() call inside try/except in get_communities_health() to restore error containment - Add try/except around health computation in config and public metrics endpoints so failures don't crash them - Sanitize warnings for public endpoint: strip env var names to prevent information disclosure - Change logger.debug to logger.warning in discover_assistants so operators see missing key warnings at startup - Move inline imports to top-level (no circular import risk) - Remove redundant "error" key from fallback dict * Fix CI: initialize temp metrics DB in health status tests The public metrics tests need a metrics DB to avoid 503. Use the same tmp_path + patch pattern as test_metrics_public.
1 parent 5dd8751 commit c32f5e7

File tree

5 files changed

+272
-57
lines changed

5 files changed

+272
-57
lines changed

src/api/routers/community.py

Lines changed: 44 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525

2626
from src.agents.base import DEFAULT_MAX_CONVERSATION_TOKENS
2727
from src.api.config import get_settings
28+
from src.api.routers.health import compute_community_health
2829
from src.api.security import AuthScope, RequireAuth, RequireScopedAuth
2930
from src.assistants import registry
3031
from src.assistants.community import CommunityAssistant
@@ -188,6 +189,7 @@ class CommunityConfigResponse(BaseModel):
188189
widget: WidgetConfigResponse = Field(
189190
..., description="Widget display configuration (title, placeholder, etc.)"
190191
)
192+
status: str = Field(..., description="Health status: healthy, degraded, or error")
191193

192194

193195
# ---------------------------------------------------------------------------
@@ -1121,13 +1123,22 @@ async def get_community_config() -> CommunityConfigResponse:
11211123
info.community_config.widget if info.community_config else None
11221124
) or WidgetConfig()
11231125

1126+
# Compute lightweight health status for public display
1127+
health_status = "error"
1128+
if info.community_config:
1129+
try:
1130+
health_status = compute_community_health(info.community_config)["status"]
1131+
except Exception:
1132+
logger.exception("Failed to compute health for community %s", info.id)
1133+
11241134
return CommunityConfigResponse(
11251135
id=info.id,
11261136
name=info.name,
11271137
description=info.description,
11281138
default_model=default_model,
11291139
default_model_provider=default_provider,
11301140
widget=WidgetConfigResponse(**widget_cfg.resolve(info.name)),
1141+
status=health_status,
11311142
)
11321143

11331144
# -----------------------------------------------------------------------
@@ -1224,19 +1235,50 @@ async def community_quality_summary(auth: RequireScopedAuth) -> dict[str, Any]:
12241235
async def community_metrics_public() -> dict[str, Any]:
12251236
"""Get public metrics summary for this community.
12261237
1227-
Returns request counts, error rate, and top tools.
1238+
Returns request counts, error rate, top tools, and config health.
12281239
No tokens, costs, or model information exposed.
12291240
"""
12301241
try:
12311242
with metrics_connection() as conn:
1232-
return get_public_community_summary(community_id, conn)
1243+
result = get_public_community_summary(community_id, conn)
12331244
except sqlite3.Error:
12341245
logger.exception("Failed to query public metrics for community %s", community_id)
12351246
raise HTTPException(
12361247
status_code=503,
12371248
detail="Metrics database is temporarily unavailable.",
12381249
)
12391250

1251+
# Add config health alongside usage metrics
1252+
fallback_health: dict[str, Any] = {
1253+
"status": "error",
1254+
"api_key": "missing",
1255+
"documents": 0,
1256+
"warnings": ["Community configuration not found"],
1257+
}
1258+
if info.community_config:
1259+
try:
1260+
health = compute_community_health(info.community_config)
1261+
# Sanitize warnings for public endpoint: strip env var names
1262+
public_warnings = [w for w in health["warnings"] if "Environment variable" not in w]
1263+
if health["api_key"] == "missing" and not public_warnings:
1264+
public_warnings = [
1265+
"API key not configured; using shared platform key. "
1266+
"This is for demonstration only and is not sustainable."
1267+
]
1268+
result["config_health"] = {
1269+
"status": health["status"],
1270+
"api_key": health["api_key"],
1271+
"documents": health["documents"],
1272+
"warnings": public_warnings,
1273+
}
1274+
except Exception:
1275+
logger.exception("Failed to compute health for community %s", community_id)
1276+
result["config_health"] = fallback_health
1277+
else:
1278+
result["config_health"] = fallback_health
1279+
1280+
return result
1281+
12401282
@router.get("/metrics/public/usage")
12411283
async def community_usage_public(
12421284
period: str = Query(

src/api/routers/health.py

Lines changed: 53 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,60 @@
88

99
from src.api.security import RequireAuth
1010
from src.assistants import registry
11+
from src.core.config.community import CommunityConfig
1112

1213
router = APIRouter(prefix="/health", tags=["health"])
1314
logger = logging.getLogger(__name__)
1415

1516

17+
def compute_community_health(config: CommunityConfig) -> dict[str, Any]:
18+
"""Compute health status for a single community config.
19+
20+
Returns:
21+
Dict with status, api_key, cors_origins, documents, sync_age_hours, warnings.
22+
"""
23+
warnings: list[str] = []
24+
25+
# API key status
26+
api_key_env_var = config.openrouter_api_key_env_var
27+
if api_key_env_var:
28+
if os.getenv(api_key_env_var):
29+
api_key_status = "configured"
30+
else:
31+
api_key_status = "missing"
32+
warnings.append(
33+
f"Environment variable '{api_key_env_var}' not set; "
34+
"using shared OSA platform key. This is for demonstration only "
35+
"and is not sustainable. Requires a dedicated API key and active maintainer."
36+
)
37+
else:
38+
api_key_status = "using_platform"
39+
40+
# Documentation sources
41+
doc_count = len(config.documentation) if config.documentation else 0
42+
if doc_count == 0:
43+
warnings.append("No documentation sources configured")
44+
45+
# CORS origins
46+
cors_count = len(config.cors_origins) if config.cors_origins else 0
47+
48+
# Determine overall status
49+
status = "healthy"
50+
if doc_count == 0 or api_key_status == "missing":
51+
status = "error"
52+
elif api_key_status == "using_platform":
53+
status = "degraded"
54+
55+
return {
56+
"status": status,
57+
"api_key": api_key_status,
58+
"cors_origins": cors_count,
59+
"documents": doc_count,
60+
"sync_age_hours": None, # TODO: Add sync tracking in future iteration
61+
"warnings": warnings,
62+
}
63+
64+
1665
@router.get("/communities")
1766
def get_communities_health(_auth: RequireAuth) -> dict[str, Any]:
1867
"""Get health status for all communities.
@@ -23,6 +72,7 @@ def get_communities_health(_auth: RequireAuth) -> dict[str, Any]:
2372
- cors_origins: number of CORS origins configured
2473
- documents: number of documentation sources
2574
- sync_age_hours: hours since last sync (if applicable)
75+
- warnings: list of configuration issues
2676
2777
Returns:
2878
Dictionary mapping community IDs to their health status.
@@ -60,8 +110,10 @@ def get_communities_health(_auth: RequireAuth) -> dict[str, Any]:
60110
"cors_origins": 0,
61111
"documents": 0,
62112
"sync_age_hours": None,
113+
"warnings": ["Community configuration not found"],
63114
}
64115
continue
116+
communities_health[community_id] = compute_community_health(config)
65117
except (AttributeError, KeyError, TypeError) as e:
66118
logger.error(
67119
"Failed to process community health for %s: %s",
@@ -81,51 +133,12 @@ def get_communities_health(_auth: RequireAuth) -> dict[str, Any]:
81133
)
82134
communities_health[fallback_id] = {
83135
"status": "error",
84-
"error": f"Failed to process: {type(e).__name__}",
85136
"api_key": "unknown",
86137
"cors_origins": 0,
87138
"documents": 0,
88139
"sync_age_hours": None,
140+
"warnings": [f"Failed to process: {type(e).__name__}"],
89141
}
90142
continue
91143

92-
# Determine API key status
93-
api_key_env_var = getattr(config, "openrouter_api_key_env_var", None)
94-
if api_key_env_var:
95-
# Check if env var is actually set
96-
api_key_status = "configured" if os.getenv(api_key_env_var) else "missing"
97-
else:
98-
api_key_status = "using_platform"
99-
100-
# Count documentation sources
101-
documentation = getattr(config, "documentation", None)
102-
doc_count = len(documentation) if documentation else 0
103-
104-
# Count CORS origins
105-
cors_origins = getattr(config, "cors_origins", None)
106-
cors_count = len(cors_origins) if cors_origins else 0
107-
108-
# Sync age is not tracked yet, set to None
109-
# TODO: Add sync tracking in future iteration
110-
sync_age_hours = None
111-
112-
# Determine overall status
113-
# - error: critical issues (no docs, missing configured API key)
114-
# - degraded: warnings (using platform key)
115-
# - healthy: all good
116-
status = "healthy"
117-
118-
if doc_count == 0 or api_key_status == "missing":
119-
status = "error"
120-
elif api_key_status == "using_platform":
121-
status = "degraded"
122-
123-
communities_health[community_id] = {
124-
"status": status,
125-
"api_key": api_key_status,
126-
"cors_origins": cors_count,
127-
"documents": doc_count,
128-
"sync_age_hours": sync_age_hours,
129-
}
130-
131144
return communities_health

src/assistants/__init__.py

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -79,21 +79,15 @@ def discover_assistants() -> list[str]:
7979
discovered.append(config.id)
8080
logger.info("Discovered assistant: %s from %s", config.id, config_path)
8181

82-
# Validate API key env var is set if configured
82+
# Warn once at startup; detailed health surfaced via API endpoints.
83+
# See: https://github.com/OpenScience-Collective/osa/issues/220
8384
if config.openrouter_api_key_env_var and not os.getenv(
8485
config.openrouter_api_key_env_var
8586
):
86-
logger.error(
87-
"Community '%s' configured to use env var '%s' but it is not set. "
88-
"This community will fall back to the platform API key, which may incur unexpected costs. "
89-
"Set the environment variable or remove 'openrouter_api_key_env_var' from config.yaml",
87+
logger.warning(
88+
"Community '%s': env var '%s' not set, will use platform key",
9089
config.id,
9190
config.openrouter_api_key_env_var,
92-
extra={
93-
"community_id": config.id,
94-
"env_var": config.openrouter_api_key_env_var,
95-
"env_var_missing": True,
96-
},
9791
)
9892
except KeyboardInterrupt:
9993
# Never catch user interruption

tests/test_api/test_community_router.py

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,11 @@
55
- Dynamic endpoint registration
66
- Session isolation between communities
77
- Backward compatibility with HED endpoints
8+
- Public health status in config and metrics endpoints
89
"""
910

11+
import os
12+
1013
import pytest
1114
from fastapi import FastAPI
1215
from fastapi.testclient import TestClient
@@ -346,3 +349,92 @@ def test_session_delete_endpoint_exists(self) -> None:
346349
else:
347350
# Auth required
348351
assert response.status_code in (401, 403)
352+
353+
354+
class TestCommunityConfigHealthStatus:
355+
"""Tests for health status in community config and public metrics."""
356+
357+
@pytest.fixture
358+
def client(self, tmp_path) -> TestClient:
359+
"""Create a test client with auth disabled and metrics DB initialized."""
360+
os.environ["REQUIRE_API_AUTH"] = "false"
361+
from src.api.config import get_settings
362+
363+
get_settings.cache_clear()
364+
365+
# Initialize a temp metrics DB so /metrics/public doesn't 503
366+
from unittest.mock import patch
367+
368+
from src.metrics.db import init_metrics_db
369+
370+
db_path = tmp_path / "metrics.db"
371+
init_metrics_db(db_path)
372+
373+
from src.api.main import app
374+
375+
with patch("src.metrics.db.get_metrics_db_path", return_value=db_path):
376+
yield TestClient(app)
377+
378+
def test_config_response_includes_status(self, client: TestClient) -> None:
379+
"""GET /{community_id}/ should include a status field."""
380+
response = client.get("/hed/")
381+
assert response.status_code == 200
382+
383+
data = response.json()
384+
assert "status" in data
385+
assert data["status"] in ["healthy", "degraded", "error"]
386+
387+
def test_config_status_does_not_leak_details(self, client: TestClient) -> None:
388+
"""Public config should not expose api_key details or warnings."""
389+
response = client.get("/hed/")
390+
data = response.json()
391+
392+
assert "warnings" not in data
393+
assert "api_key" not in data
394+
assert "config_health" not in data
395+
396+
def test_public_metrics_includes_config_health(self, client: TestClient) -> None:
397+
"""GET /{community_id}/metrics/public should include config_health."""
398+
response = client.get("/hed/metrics/public")
399+
assert response.status_code == 200
400+
401+
data = response.json()
402+
assert "config_health" in data
403+
404+
health = data["config_health"]
405+
assert "status" in health
406+
assert health["status"] in ["healthy", "degraded", "error"]
407+
assert "api_key" in health
408+
assert health["api_key"] in ["configured", "using_platform", "missing"]
409+
assert "documents" in health
410+
assert isinstance(health["documents"], int)
411+
assert "warnings" in health
412+
assert isinstance(health["warnings"], list)
413+
414+
def test_public_metrics_config_health_has_warnings_for_missing_key(
415+
self, client: TestClient
416+
) -> None:
417+
"""config_health should include warnings when API key env var is not set."""
418+
from src.assistants import registry
419+
420+
# Find a community with openrouter_api_key_env_var
421+
for assistant in registry.list_all():
422+
config = assistant.community_config
423+
if config and config.openrouter_api_key_env_var:
424+
env_var = config.openrouter_api_key_env_var
425+
original = os.environ.pop(env_var, None)
426+
try:
427+
response = client.get(f"/{assistant.id}/metrics/public")
428+
assert response.status_code == 200
429+
health = response.json()["config_health"]
430+
assert health["api_key"] == "missing"
431+
assert len(health["warnings"]) > 0
432+
assert any("not sustainable" in w for w in health["warnings"])
433+
# Env var names must not leak to public endpoint
434+
assert not any(env_var in w for w in health["warnings"])
435+
finally:
436+
if original is not None:
437+
os.environ[env_var] = original
438+
return
439+
440+
pytest.skip("No community with openrouter_api_key_env_var configured")

0 commit comments

Comments
 (0)