|
2 | 2 | import json |
3 | 3 | import tempfile |
4 | 4 | import uuid |
| 5 | +from collections import defaultdict |
5 | 6 | from contextlib import asynccontextmanager |
6 | 7 | from dataclasses import asdict |
7 | 8 | from datetime import datetime |
@@ -115,6 +116,50 @@ def reset(self): |
115 | 116 | # ============================================================================= |
116 | 117 |
|
117 | 118 |
|
| 119 | +def parse_output_directory(output_path: str) -> Dict[str, Dict[str, Any]]: |
| 120 | + """ |
| 121 | + Parse output directory: output_path/model_name/YYYYMMDD_HHMMSS_results.json |
| 122 | +
|
| 123 | + Returns: |
| 124 | + {model_name: {"results": path, "samples": [paths]}} |
| 125 | + """ |
| 126 | + output_dir = Path(output_path) |
| 127 | + if not output_dir.exists(): |
| 128 | + return {} |
| 129 | + |
| 130 | + result = {} |
| 131 | + |
| 132 | + for model_dir in output_dir.iterdir(): |
| 133 | + if not model_dir.is_dir(): |
| 134 | + continue |
| 135 | + |
| 136 | + model_name = model_dir.name |
| 137 | + |
| 138 | + # Group files by timestamp |
| 139 | + timestamps = defaultdict(lambda: {"results": None, "samples": []}) |
| 140 | + |
| 141 | + for file in model_dir.glob("*_results.json"): |
| 142 | + timestamp = file.name.split("_results.json")[0] |
| 143 | + timestamps[timestamp]["results"] = str(file) |
| 144 | + |
| 145 | + for file in model_dir.glob("*_samples_*.jsonl"): |
| 146 | + # Extract timestamp (everything before first _samples_) |
| 147 | + timestamp = file.name.split("_samples_")[0] |
| 148 | + timestamps[timestamp]["samples"].append(str(file)) |
| 149 | + |
| 150 | + if not timestamps: |
| 151 | + continue |
| 152 | + |
| 153 | + # Use latest timestamp |
| 154 | + sorted_ts = sorted(timestamps.keys(), reverse=True) |
| 155 | + if len(sorted_ts) > 1: |
| 156 | + print(f"[WARNING] Multiple timestamps for '{model_name}': {sorted_ts}. Using latest.") |
| 157 | + |
| 158 | + result[model_name] = timestamps[sorted_ts[0]] |
| 159 | + |
| 160 | + return result |
| 161 | + |
| 162 | + |
118 | 163 | async def run_evaluation_subprocess(config: dict) -> dict: |
119 | 164 | """ |
120 | 165 | Run evaluation in a subprocess using accelerate/torchrun. |
@@ -198,6 +243,9 @@ async def run_evaluation_subprocess(config: dict) -> dict: |
198 | 243 | if proc.returncode != 0: |
199 | 244 | raise RuntimeError(f"Evaluation failed with return code {proc.returncode}") |
200 | 245 |
|
| 246 | + # Parse and return results from output directory |
| 247 | + return parse_output_directory(output_path) |
| 248 | + |
201 | 249 |
|
202 | 250 | async def job_worker(): |
203 | 251 | """ |
|
0 commit comments