Skip to content

Commit 830a68d

Browse files
committed
Return the results and the samples from the eval side
1 parent f1d0f0e commit 830a68d

File tree

1 file changed

+48
-0
lines changed

1 file changed

+48
-0
lines changed

lmms_eval/entrypoints/http_server.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import json
33
import tempfile
44
import uuid
5+
from collections import defaultdict
56
from contextlib import asynccontextmanager
67
from dataclasses import asdict
78
from datetime import datetime
@@ -115,6 +116,50 @@ def reset(self):
115116
# =============================================================================
116117

117118

119+
def parse_output_directory(output_path: str) -> Dict[str, Dict[str, Any]]:
120+
"""
121+
Parse output directory: output_path/model_name/YYYYMMDD_HHMMSS_results.json
122+
123+
Returns:
124+
{model_name: {"results": path, "samples": [paths]}}
125+
"""
126+
output_dir = Path(output_path)
127+
if not output_dir.exists():
128+
return {}
129+
130+
result = {}
131+
132+
for model_dir in output_dir.iterdir():
133+
if not model_dir.is_dir():
134+
continue
135+
136+
model_name = model_dir.name
137+
138+
# Group files by timestamp
139+
timestamps = defaultdict(lambda: {"results": None, "samples": []})
140+
141+
for file in model_dir.glob("*_results.json"):
142+
timestamp = file.name.split("_results.json")[0]
143+
timestamps[timestamp]["results"] = str(file)
144+
145+
for file in model_dir.glob("*_samples_*.jsonl"):
146+
# Extract timestamp (everything before first _samples_)
147+
timestamp = file.name.split("_samples_")[0]
148+
timestamps[timestamp]["samples"].append(str(file))
149+
150+
if not timestamps:
151+
continue
152+
153+
# Use latest timestamp
154+
sorted_ts = sorted(timestamps.keys(), reverse=True)
155+
if len(sorted_ts) > 1:
156+
print(f"[WARNING] Multiple timestamps for '{model_name}': {sorted_ts}. Using latest.")
157+
158+
result[model_name] = timestamps[sorted_ts[0]]
159+
160+
return result
161+
162+
118163
async def run_evaluation_subprocess(config: dict) -> dict:
119164
"""
120165
Run evaluation in a subprocess using accelerate/torchrun.
@@ -198,6 +243,9 @@ async def run_evaluation_subprocess(config: dict) -> dict:
198243
if proc.returncode != 0:
199244
raise RuntimeError(f"Evaluation failed with return code {proc.returncode}")
200245

246+
# Parse and return results from output directory
247+
return parse_output_directory(output_path)
248+
201249

202250
async def job_worker():
203251
"""

0 commit comments

Comments
 (0)