Skip to content

Commit b41a149

Browse files
committed
web done
1 parent c2ebb95 commit b41a149

35 files changed

+589
-1447
lines changed

health_monitor.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,12 @@
11
#!/usr/bin/env python3
2-
"""Cluster Health Monitor - Real-time GPU cluster monitoring."""
2+
"""Cluster Health Monitor - Real-time GPU cluster monitoring.
3+
4+
Maintenance:
5+
- Purpose: CLI entrypoint and small web/server launcher for the project.
6+
- Debug: run `python health_monitor.py web` to start the server; check
7+
`config.yaml` for configuration. If debugging collectors, import and
8+
instantiate `monitor.collectors` classes directly.
9+
"""
310

411
import asyncio
512
import sys

monitor/__init__.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,13 @@
1-
"""Monitor package."""
1+
"""Top-level package for Cluster Health Monitor.
2+
3+
Expose high-level package metadata (currently just ``__version__``) so callers
4+
can import from ``monitor`` conveniently. Keep this file minimal to avoid
5+
heavy import costs at package import time.
6+
"""
7+
8+
__all__ = [
9+
'__version__',
10+
]
11+
12+
# Package version — keep in sync with distribution metadata
13+
__version__ = '1.2.0'

monitor/alerting/__init__.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
1-
"""Monitor alerting package."""
1+
"""Alerting helpers for monitor.
2+
3+
Re-export the alert engine for convenient imports.
4+
"""
25

36
from .rules import AlertEngine
47

monitor/alerting/rules.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,11 @@
1-
"""Alert rules engine for evaluating metrics against thresholds."""
1+
"""Alert rules engine for evaluating metrics against thresholds.
2+
3+
Maintenance:
4+
- Purpose: encapsulate alert logic for GPU and system metrics.
5+
- Debug: if alerts are not firing, verify metric keys (e.g., 'gpus', 'temperature')
6+
and the configuration passed to the engine. Alerts are returned as a list
7+
of dictionaries and not persisted by this module.
8+
"""
29

310
from datetime import datetime
411
from typing import Dict, Any, List

monitor/api/__init__.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,18 @@
1-
"""Monitor API package."""
1+
"""API package for the monitor web dashboard.
2+
3+
Re-export the primary application factory and common module-level
4+
constants so callers can do::
5+
6+
from monitor.api import create_app
7+
8+
Note: this module imports from ``.server`` eagerly, so importing
9+
``monitor.api`` also loads the web framework; keep any extra code here small.
10+
"""
11+
12+
from .server import create_app, TEMPLATE_DIR, STATIC_DIR
13+
14+
__all__ = [
15+
'create_app',
16+
'TEMPLATE_DIR',
17+
'STATIC_DIR',
18+
]

monitor/api/server.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,12 @@
1-
"""FastAPI server for REST API and web dashboard."""
1+
"""FastAPI server for REST API and web dashboard.
2+
3+
Maintenance:
4+
- Purpose: defines the web endpoints and WebSocket handlers used by the
5+
dashboard UI and simulation features.
6+
- Debug: enable request logging and inspect `/api/*` endpoints; WebSocket
7+
simulation frames are sent from the benchmark runner. If the server fails to
8+
start, check dependency imports (FastAPI, uvicorn) and configuration paths.
9+
"""
210

311
from datetime import datetime, timedelta
412
from pathlib import Path

0 commit comments

Comments
 (0)