Skip to content

Commit f7d8713

Browse files
committed
Add job detail API, resource history, and UI tests
1 parent c05856e commit f7d8713

File tree

12 files changed

+1512
-16
lines changed

12 files changed

+1512
-16
lines changed

src/nimbus/control_plane/app.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1663,6 +1663,18 @@ async def recent_jobs(
16631663
)
16641664
return [JobRecord.model_validate(row) for row in rows]
16651665

1666+
@app.get("/api/jobs/{job_id}", response_model=JobRecord)
async def job_detail(
    job_id: int,
    _: str = Depends(verify_agent_token),
    session: AsyncSession = Depends(get_session),
) -> JobRecord:
    """Return the stored record for a single job.

    Args:
        job_id: Identifier taken from the URL path.
        _: Result of the ``verify_agent_token`` dependency; unused beyond
            requiring that dependency to resolve.
        session: Database session supplied by ``get_session``.

    Returns:
        The job row validated into a ``JobRecord``.

    Raises:
        HTTPException: With status 404 when ``db.get_job`` returns a falsy
            value for ``job_id``.
    """
    REQUEST_COUNTER.inc()
    record = await db.get_job(session, job_id)
    if record:
        return JobRecord.model_validate(record)
    raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Job not found")
1677+
16661678
@app.get("/api/jobs/metadata/summary")
16671679
async def job_metadata_summary(
16681680
key: str,

src/nimbus/host_agent/agent.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import asyncio
66
import time
77
from datetime import UTC, datetime, timezone, timedelta
8+
import json
89
from contextlib import asynccontextmanager
910
from pathlib import Path
1011
from typing import AsyncIterator, Dict, Optional
@@ -626,6 +627,12 @@ async def _collect_job_metrics_metadata(
626627
metadata["executor.name"] = executor_name
627628
if warm_instance_used is not None:
628629
metadata["executor.warm_instance"] = "true" if warm_instance_used else "false"
630+
history = self._resource_tracker.get_usage_history(job_id)
631+
if history:
632+
try:
633+
metadata["resource.timeline"] = json.dumps(history)
634+
except (TypeError, ValueError): # pragma: no cover - defensive
635+
LOGGER.debug("Failed to serialize resource timeline", job_id=job_id)
629636
return metadata
630637

631638
async def _submit_status(

src/nimbus/runners/resource_manager.py

Lines changed: 27 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,9 @@
55
import asyncio
66
import os
77
from dataclasses import dataclass
8+
from datetime import datetime, timezone
89
from pathlib import Path
9-
from typing import Optional, Dict
10+
from typing import Optional, Dict, List
1011

1112
import structlog
1213

@@ -211,18 +212,19 @@ async def cleanup_job_cgroup(self, job_id: int) -> None:
211212
job_id=job_id,
212213
error=str(exc))
213214

214-
async def update_metrics(self, job_id: int, executor_name: str) -> Optional[ResourceUsage]:
    """Publish a job's current resource usage to the Prometheus gauges.

    Args:
        job_id: Identifier of the job to sample; its string form is the
            first gauge label.
        executor_name: Executor name, used as the second gauge label.

    Returns:
        The usage sample that was published, or ``None`` when
        ``get_job_usage`` produced nothing for the job (in which case no
        gauge is touched).
    """
    sample = await self.get_job_usage(job_id)
    if sample:
        gauge_labels = [str(job_id), executor_name]
        self._cpu_usage_gauge.set(sample.cpu_seconds, labels=gauge_labels)
        self._memory_usage_gauge.set(sample.memory_bytes, labels=gauge_labels)

        # Counters need to track deltas, but for simplicity we'll just use current values
        # In production, we'd track previous values and report deltas
        return sample
    return None
226228

227229

228230
class ResourceTracker:
@@ -232,6 +234,7 @@ def __init__(self) -> None:
232234
self._cgroup_manager = CGroupManager()
233235
self._tracking_tasks: Dict[int, asyncio.Task] = {}
234236
self._running = False
237+
self._usage_history: Dict[int, List[dict[str, float | str]]] = {}
235238

236239
async def start(self) -> None:
237240
"""Start the resource tracking system."""
@@ -293,9 +296,10 @@ async def stop_job_tracking(self, job_id: int) -> None:
293296
await task
294297
except asyncio.CancelledError:
295298
pass
296-
299+
297300
# Clean up cgroup
298301
await self._cgroup_manager.cleanup_job_cgroup(job_id)
302+
self._usage_history.pop(job_id, None)
299303

300304
async def add_process(self, job_id: int, pid: int) -> None:
301305
"""Add a process to job tracking."""
@@ -304,15 +308,29 @@ async def add_process(self, job_id: int, pid: int) -> None:
304308
async def get_usage(self, job_id: int) -> Optional[ResourceUsage]:
305309
"""Get resource usage for a job."""
306310
return await self._cgroup_manager.get_job_usage(job_id)
311+
312+
def get_usage_history(self, job_id: int) -> List[dict[str, float | str]]:
    """Return an independent snapshot of the usage samples kept for a job.

    Args:
        job_id: Identifier of the job whose samples are requested.

    Returns:
        A new list containing a copy of every sample currently stored for
        ``job_id``, in stored order, or an empty list when nothing is
        stored for that job.
    """
    # Copy each sample as well as the outer list: the sample dicts are the
    # tracker's own state, and handing them out directly would let a caller
    # rewrite recorded history by mutating the returned entries.
    return [dict(sample) for sample in self._usage_history.get(job_id, ())]
307314

308315
async def _track_job_metrics(self, job_id: int, executor_name: str) -> None:
    """Poll a job's usage on a fixed interval and keep a bounded timeline.

    While ``self._running`` is true, each pass asks the cgroup manager to
    update the job's metrics and, when a sample comes back, appends a
    timestamped entry to ``self._usage_history[job_id]``. Only the most
    recent 120 entries are kept. Any ``Exception`` raised during a pass is
    logged as a warning and the loop carries on.

    Args:
        job_id: Identifier of the job being tracked.
        executor_name: Executor name forwarded to ``update_metrics``.
    """
    while self._running:
        try:
            sample = await self._cgroup_manager.update_metrics(job_id, executor_name)
            if sample:
                timeline = self._usage_history.setdefault(job_id, [])
                point = {
                    "ts": datetime.now(timezone.utc).isoformat(),
                    "cpu_seconds": sample.cpu_seconds,
                    "memory_bytes": sample.memory_bytes,
                }
                timeline.append(point)
                # Cap the timeline: drop the oldest entry once it exceeds 120.
                if len(timeline) > 120:
                    del timeline[0]
        except Exception as exc:
            LOGGER.warning(
                "Metrics update failed",
                job_id=job_id,
                error=str(exc),
            )

        await asyncio.sleep(5)  # Update every 5 seconds

tests/test_resource_manager.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -397,7 +397,7 @@ async def test_resource_tracker_metrics_tracking():
397397

398398
mock_usage = ResourceUsage(cpu_seconds=1.0, memory_bytes=2048)
399399

400-
with patch.object(tracker._cgroup_manager, 'update_metrics') as mock_update, \
400+
with patch.object(tracker._cgroup_manager, 'update_metrics', return_value=mock_usage) as mock_update, \
401401
patch('asyncio.sleep') as mock_sleep:
402402

403403
# Mock sleep to raise exception after first iteration to exit loop
@@ -410,6 +410,10 @@ async def test_resource_tracker_metrics_tracking():
410410

411411
# Should be called at least once before cancellation
412412
mock_update.assert_called_with(123, "test")
413+
history = tracker.get_usage_history(123)
414+
assert history
415+
assert history[0]["cpu_seconds"] == mock_usage.cpu_seconds
416+
assert history[0]["memory_bytes"] == mock_usage.memory_bytes
413417

414418

415419
@pytest.mark.asyncio

0 commit comments

Comments
 (0)