@@ -71,6 +71,101 @@ def _regenerate_viewer_if_possible(output_dir: Path) -> bool:
     return False
 
 
+def _is_mock_benchmark(benchmark_dir: Path) -> bool:
+    """Check if a benchmark run is mock/test data (not real evaluation).
+
+    Returns True if the benchmark is mock data that should be filtered out.
+    """
+    # Check summary.json for model_id
+    summary_path = benchmark_dir / "summary.json"
+    if summary_path.exists():
+        try:
+            with open(summary_path) as f:
+                summary = json.load(f)
+            model_id = summary.get("model_id", "").lower()
+            # Filter out mock/test/random agent runs
+            if any(term in model_id for term in ["random", "mock", "test"]):
+                return True
+        except Exception:
+            pass
+
+    # Check metadata.json for model_id
+    metadata_path = benchmark_dir / "metadata.json"
+    if metadata_path.exists():
+        try:
+            with open(metadata_path) as f:
+                metadata = json.load(f)
+            model_id = metadata.get("model_id", "").lower()
+            if any(term in model_id for term in ["random", "mock", "test"]):
+                return True
+        except Exception:
+            pass
+
+    # Check benchmark name for "mock"
+    if "mock" in benchmark_dir.name.lower():
+        return True
+
+    return False
+
+
+def _regenerate_benchmark_viewer_if_available(output_dir: Path) -> bool:
+    """Regenerate benchmark.html from the latest real benchmark results.
+
+    Looks for the most recent non-mock benchmark run in the benchmark_results/
+    directory and generates a benchmark viewer in the output directory. If no
+    real benchmark data exists, generates an empty-state viewer with guidance.
+
+    Returns True if the benchmark viewer was regenerated, False otherwise.
+    """
+    from openadapt_ml.training.benchmark_viewer import (
+        generate_benchmark_viewer,
+        generate_empty_benchmark_viewer,
+    )
+
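+    # Note: a relative path, resolved against the process's current working directory.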
+    benchmark_results_dir = Path("benchmark_results")
+
+    # Find real (non-mock) benchmark runs
+    real_benchmarks = []
+    if benchmark_results_dir.exists():
+        for d in benchmark_results_dir.iterdir():
+            if d.is_dir() and (d / "summary.json").exists():
+                if not _is_mock_benchmark(d):
+                    real_benchmarks.append(d)
+
+    benchmark_html_path = output_dir / "benchmark.html"
+
+    if not real_benchmarks:
+        # No real benchmark data - generate empty state viewer
+        try:
+            generate_empty_benchmark_viewer(benchmark_html_path)
+            print("  Generated benchmark viewer: No real evaluation data yet")
+            return True
+        except Exception as e:
+            print(f"  Could not generate empty benchmark viewer: {e}")
+            return False
+
+    # Sort by modification time to get the latest real benchmark
+    latest_benchmark = max(real_benchmarks, key=lambda d: d.stat().st_mtime)
+
+    try:
+        # Generate benchmark.html in the output directory
+        generate_benchmark_viewer(latest_benchmark, benchmark_html_path)
+
+        # Copy tasks folder for screenshots
+        tasks_src = latest_benchmark / "tasks"
+        tasks_dst = output_dir / "tasks"
+        if tasks_src.exists():
+            if tasks_dst.exists():
+                shutil.rmtree(tasks_dst)
+            shutil.copytree(tasks_src, tasks_dst)
+
+        print(f"  Regenerated benchmark viewer from: {latest_benchmark.name}")
+        return True
+    except Exception as e:
+        print(f"  Could not regenerate benchmark viewer: {e}")
+        return False
+
+
 def detect_device() -> str:
     """Detect available compute device."""
     try:
@@ -327,6 +422,9 @@ def cmd_serve(args: argparse.Namespace) -> int:
     except Exception as e:
         print(f"Warning: Could not regenerate: {e}")
 
+    # Also regenerate benchmark viewer from latest benchmark results
+    _regenerate_benchmark_viewer_if_available(serve_dir)
+
     start_page = "dashboard.html"
 
     # Serve from the specified directory
@@ -353,9 +451,100 @@ def do_POST(self):
                 self.end_headers()
                 self.wfile.write(b'{"status": "stop_signal_created"}')
                 print(f"\n⏹ Stop signal created: {stop_file}")
+            elif self.path == '/api/run-benchmark':
+                # Parse request body for provider
+                content_length = int(self.headers.get('Content-Length', 0))
+                body = self.rfile.read(content_length).decode('utf-8') if content_length else '{}'
+                try:
+                    params = json.loads(body)
+                except json.JSONDecodeError:
+                    params = {}
+
+                provider = params.get('provider', 'anthropic')
+                tasks = params.get('tasks', 5)
+
+                self.send_response(200)
+                self.send_header('Content-Type', 'application/json')
+                self.send_header('Access-Control-Allow-Origin', '*')
+                self.end_headers()
+                self.wfile.write(json.dumps({"status": "started", "provider": provider, "tasks": tasks}).encode())
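+                # Respond immediately; the evaluation runs in the background thread
+                # below, and clients poll /api/benchmark-progress (which reads
+                # benchmark_progress.json) for status updates.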
+
+                # Run benchmark in background thread with progress logging
+                def run_benchmark():
+                    import subprocess
+                    from dotenv import load_dotenv
+
+                    # Load .env file for API keys
+                    project_root = Path(__file__).parent.parent.parent
+                    load_dotenv(project_root / ".env")
+
+                    # Create progress log file (in cwd which is serve_dir)
+                    progress_file = Path("benchmark_progress.json")
+
+                    print(f"\n🚀 Starting {provider} benchmark evaluation ({tasks} tasks)...")
+
+                    # Write initial progress
+                    progress_file.write_text(json.dumps({
+                        "status": "running",
+                        "provider": provider,
+                        "tasks_total": tasks,
+                        "tasks_complete": 0,
+                        "message": f"Starting {provider} evaluation..."
+                    }))
+
+                    # Copy environment with loaded vars
+                    env = os.environ.copy()
+
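+                    # Run the benchmark CLI via `uv run`; subprocess.run blocks this
+                    # background thread until the evaluation finishes, capturing
+                    # stdout/stderr for the logs below.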
+                    result = subprocess.run(
+                        ["uv", "run", "python", "-m", "openadapt_ml.benchmarks.cli", "run-api",
+                         "--provider", provider, "--tasks", str(tasks),
+                         "--model-id", f"{provider}-api"],
+                        capture_output=True, text=True, cwd=str(project_root), env=env
+                    )
+
+                    print(f"\n📋 Benchmark output:\n{result.stdout}")
+                    if result.stderr:
+                        print(f"Stderr: {result.stderr}")
+
+                    if result.returncode == 0:
+                        print(f"✅ Benchmark complete. Regenerating viewer...")
+                        progress_file.write_text(json.dumps({
+                            "status": "complete",
+                            "provider": provider,
+                            "message": "Evaluation complete! Refreshing results..."
+                        }))
+                        # Regenerate benchmark viewer
+                        _regenerate_benchmark_viewer_if_available(serve_dir)
+                    else:
+                        print(f"❌ Benchmark failed: {result.stderr}")
+                        progress_file.write_text(json.dumps({
+                            "status": "error",
+                            "provider": provider,
+                            "message": f"Evaluation failed: {result.stderr[:200]}"
+                        }))
+
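+                # daemon=True means the thread will not keep the server alive; an
+                # in-flight evaluation is abandoned if the server process exits.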
+                threading.Thread(target=run_benchmark, daemon=True).start()
             else:
                 self.send_error(404, "Not found")
 
+        def do_GET(self):
+            if self.path.startswith('/api/benchmark-progress'):
+                # Return benchmark progress
+                progress_file = Path("benchmark_progress.json")  # Relative to serve_dir (cwd)
+                if progress_file.exists():
+                    progress = progress_file.read_text()
+                else:
+                    progress = json.dumps({"status": "idle"})
+
+                self.send_response(200)
+                self.send_header('Content-Type', 'application/json')
+                self.send_header('Access-Control-Allow-Origin', '*')
+                self.end_headers()
+                self.wfile.write(progress.encode())
+            else:
+                # Default file serving
+                super().do_GET()
+
         def do_OPTIONS(self):
             # Handle CORS preflight
             self.send_response(200)