Commit a31dbf9

feat(viewer): add benchmark tab with WAA integration WIP state
1 parent fc003b0 commit a31dbf9

File tree

2 files changed (+997, -0 lines)


openadapt_ml/cloud/local.py

Lines changed: 189 additions & 0 deletions
@@ -71,6 +71,101 @@ def _regenerate_viewer_if_possible(output_dir: Path) -> bool:
     return False


+def _is_mock_benchmark(benchmark_dir: Path) -> bool:
+    """Check if a benchmark run is mock/test data (not real evaluation).
+
+    Returns True if the benchmark is mock data that should be filtered out.
+    """
+    # Check summary.json for model_id
+    summary_path = benchmark_dir / "summary.json"
+    if summary_path.exists():
+        try:
+            with open(summary_path) as f:
+                summary = json.load(f)
+            model_id = summary.get("model_id", "").lower()
+            # Filter out mock/test/random agent runs
+            if any(term in model_id for term in ["random", "mock", "test"]):
+                return True
+        except Exception:
+            pass
+
+    # Check metadata.json for model_id
+    metadata_path = benchmark_dir / "metadata.json"
+    if metadata_path.exists():
+        try:
+            with open(metadata_path) as f:
+                metadata = json.load(f)
+            model_id = metadata.get("model_id", "").lower()
+            if any(term in model_id for term in ["random", "mock", "test"]):
+                return True
+        except Exception:
+            pass
+
+    # Check benchmark name for "mock"
+    if "mock" in benchmark_dir.name.lower():
+        return True
+
+    return False
+
+
+def _regenerate_benchmark_viewer_if_available(output_dir: Path) -> bool:
+    """Regenerate benchmark.html from latest real benchmark results.
+
+    Looks for the most recent non-mock benchmark run in benchmark_results/ directory
+    and generates a benchmark viewer in the output directory. If no real benchmark
+    data exists, generates an empty state viewer with guidance.
+
+    Returns True if benchmark viewer was regenerated, False otherwise.
+    """
+    from openadapt_ml.training.benchmark_viewer import (
+        generate_benchmark_viewer,
+        generate_empty_benchmark_viewer,
+    )
+
+    benchmark_results_dir = Path("benchmark_results")
+
+    # Find real (non-mock) benchmark runs
+    real_benchmarks = []
+    if benchmark_results_dir.exists():
+        for d in benchmark_results_dir.iterdir():
+            if d.is_dir() and (d / "summary.json").exists():
+                if not _is_mock_benchmark(d):
+                    real_benchmarks.append(d)
+
+    benchmark_html_path = output_dir / "benchmark.html"
+
+    if not real_benchmarks:
+        # No real benchmark data - generate empty state viewer
+        try:
+            generate_empty_benchmark_viewer(benchmark_html_path)
+            print(" Generated benchmark viewer: No real evaluation data yet")
+            return True
+        except Exception as e:
+            print(f" Could not generate empty benchmark viewer: {e}")
+            return False
+
+    # Sort by modification time to get the latest real benchmark
+    latest_benchmark = max(real_benchmarks, key=lambda d: d.stat().st_mtime)
+
+    try:
+        # Generate benchmark.html in the output directory
+        generate_benchmark_viewer(latest_benchmark, benchmark_html_path)
+
+        # Copy tasks folder for screenshots
+        tasks_src = latest_benchmark / "tasks"
+        tasks_dst = output_dir / "tasks"
+        if tasks_src.exists():
+            if tasks_dst.exists():
+                shutil.rmtree(tasks_dst)
+            shutil.copytree(tasks_src, tasks_dst)
+
+        print(f" Regenerated benchmark viewer from: {latest_benchmark.name}")
+        return True
+    except Exception as e:
+        print(f" Could not regenerate benchmark viewer: {e}")
+        return False
+
+
 def detect_device() -> str:
     """Detect available compute device."""
     try:
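
For illustration, a minimal sketch of how the mock-filtering helper above behaves. The import path follows the file shown in this diff (openadapt_ml/cloud/local.py); the temporary directory layout and model_id values are invented for the example and are not part of the commit.

    # Hedged example: exercise _is_mock_benchmark against two fabricated runs.
    import json
    import tempfile
    from pathlib import Path

    from openadapt_ml.cloud.local import _is_mock_benchmark

    with tempfile.TemporaryDirectory() as tmp:
        root = Path(tmp)

        # A run whose summary.json names a "random" agent is treated as mock data.
        mock_run = root / "waa_run_001"
        mock_run.mkdir()
        (mock_run / "summary.json").write_text(json.dumps({"model_id": "random-agent"}))

        # A run with a provider-style model_id passes the filter.
        real_run = root / "waa_run_002"
        real_run.mkdir()
        (real_run / "summary.json").write_text(json.dumps({"model_id": "anthropic-api"}))

        print(_is_mock_benchmark(mock_run))   # True  (filtered out)
        print(_is_mock_benchmark(real_run))   # False (kept as a real run)
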
@@ -327,6 +422,9 @@ def cmd_serve(args: argparse.Namespace) -> int:
     except Exception as e:
         print(f"Warning: Could not regenerate: {e}")

+    # Also regenerate benchmark viewer from latest benchmark results
+    _regenerate_benchmark_viewer_if_available(serve_dir)
+
     start_page = "dashboard.html"

     # Serve from the specified directory
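
The same regeneration can also be invoked outside cmd_serve, for example to refresh benchmark.html without restarting the server. A hedged sketch, assuming the project root as the working directory (so the relative benchmark_results/ lookup inside the helper resolves); "viewer_out" is an invented output directory name, not a path from this commit.

    import sys
    from pathlib import Path

    from openadapt_ml.cloud.local import _regenerate_benchmark_viewer_if_available

    # Assumed example output directory for the generated benchmark.html.
    serve_dir = Path("viewer_out")
    serve_dir.mkdir(exist_ok=True)

    if _regenerate_benchmark_viewer_if_available(serve_dir):
        print(f"Benchmark viewer written to {serve_dir / 'benchmark.html'}")
    else:
        sys.exit("Benchmark viewer could not be generated")
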
@@ -353,9 +451,100 @@ def do_POST(self):
                 self.end_headers()
                 self.wfile.write(b'{"status": "stop_signal_created"}')
                 print(f"\n⏹ Stop signal created: {stop_file}")
+            elif self.path == '/api/run-benchmark':
+                # Parse request body for provider
+                content_length = int(self.headers.get('Content-Length', 0))
+                body = self.rfile.read(content_length).decode('utf-8') if content_length else '{}'
+                try:
+                    params = json.loads(body)
+                except json.JSONDecodeError:
+                    params = {}
+
+                provider = params.get('provider', 'anthropic')
+                tasks = params.get('tasks', 5)
+
+                self.send_response(200)
+                self.send_header('Content-Type', 'application/json')
+                self.send_header('Access-Control-Allow-Origin', '*')
+                self.end_headers()
+                self.wfile.write(json.dumps({"status": "started", "provider": provider, "tasks": tasks}).encode())
+
+                # Run benchmark in background thread with progress logging
+                def run_benchmark():
+                    import subprocess
+                    from dotenv import load_dotenv
+
+                    # Load .env file for API keys
+                    project_root = Path(__file__).parent.parent.parent
+                    load_dotenv(project_root / ".env")
+
+                    # Create progress log file (in cwd which is serve_dir)
+                    progress_file = Path("benchmark_progress.json")
+
+                    print(f"\n🚀 Starting {provider} benchmark evaluation ({tasks} tasks)...")
+
+                    # Write initial progress
+                    progress_file.write_text(json.dumps({
+                        "status": "running",
+                        "provider": provider,
+                        "tasks_total": tasks,
+                        "tasks_complete": 0,
+                        "message": f"Starting {provider} evaluation..."
+                    }))
+
+                    # Copy environment with loaded vars
+                    env = os.environ.copy()
+
+                    result = subprocess.run(
+                        ["uv", "run", "python", "-m", "openadapt_ml.benchmarks.cli", "run-api",
+                         "--provider", provider, "--tasks", str(tasks),
+                         "--model-id", f"{provider}-api"],
+                        capture_output=True, text=True, cwd=str(project_root), env=env
+                    )
+
+                    print(f"\n📋 Benchmark output:\n{result.stdout}")
+                    if result.stderr:
+                        print(f"Stderr: {result.stderr}")
+
+                    if result.returncode == 0:
+                        print(f"✅ Benchmark complete. Regenerating viewer...")
+                        progress_file.write_text(json.dumps({
+                            "status": "complete",
+                            "provider": provider,
+                            "message": "Evaluation complete! Refreshing results..."
+                        }))
+                        # Regenerate benchmark viewer
+                        _regenerate_benchmark_viewer_if_available(serve_dir)
+                    else:
+                        print(f"❌ Benchmark failed: {result.stderr}")
+                        progress_file.write_text(json.dumps({
+                            "status": "error",
+                            "provider": provider,
+                            "message": f"Evaluation failed: {result.stderr[:200]}"
+                        }))
+
+                threading.Thread(target=run_benchmark, daemon=True).start()
             else:
                 self.send_error(404, "Not found")

+        def do_GET(self):
+            if self.path.startswith('/api/benchmark-progress'):
+                # Return benchmark progress
+                progress_file = Path("benchmark_progress.json")  # Relative to serve_dir (cwd)
+                if progress_file.exists():
+                    progress = progress_file.read_text()
+                else:
+                    progress = json.dumps({"status": "idle"})
+
+                self.send_response(200)
+                self.send_header('Content-Type', 'application/json')
+                self.send_header('Access-Control-Allow-Origin', '*')
+                self.end_headers()
+                self.wfile.write(progress.encode())
+            else:
+                # Default file serving
+                super().do_GET()
+
         def do_OPTIONS(self):
             # Handle CORS preflight
             self.send_response(200)
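
A hedged client-side sketch of the two endpoints added above: POST /api/run-benchmark starts a run and GET /api/benchmark-progress reports the contents of benchmark_progress.json. The base URL (localhost:8000) is an assumption, since the serve port is not part of this diff; the request and response shapes follow do_POST and do_GET as written.

    # Hedged example client for the endpoints added in this commit.
    import json
    import time
    import urllib.request

    BASE = "http://localhost:8000"  # assumed host/port; not specified in this diff

    # Start an evaluation: body keys match params.get('provider') / params.get('tasks').
    req = urllib.request.Request(
        f"{BASE}/api/run-benchmark",
        data=json.dumps({"provider": "anthropic", "tasks": 5}).encode(),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(req) as resp:
        print(json.load(resp))  # {"status": "started", "provider": "anthropic", "tasks": 5}

    # Poll progress until the background thread reports completion or an error.
    while True:
        with urllib.request.urlopen(f"{BASE}/api/benchmark-progress") as resp:
            progress = json.load(resp)
        print(progress.get("status"), progress.get("message", ""))
        if progress.get("status") in ("complete", "error"):
            break
        time.sleep(5)
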
