Skip to content

Commit 124c8f6

Browse files
Flush JSON log writes to disk and evaluate each task immediately after it completes
1 parent: 4a60103 · commit: 124c8f6

File tree

4 files changed

+13
-6
lines changed

4 files changed

+13
-6
lines changed

experiments/eval/systems/magentic_one_system.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -188,6 +188,7 @@ async def _runner() -> Tuple[str, List[str]]:
188188
# Convert list of logevent objects to list of dicts
189189
messages_json = [msg.model_dump() for msg in messages_so_far]
190190
await f.write(json.dumps(messages_json, indent=2))
191+
await f.flush() # Flush to disk immediately
191192
# how the final answer is formatted: "Final Answer: FINAL ANSWER: Actual final answer"
192193

193194
# get last message with source MagenticOneOrchestrator, might not be the last message
@@ -215,8 +216,8 @@ def get_usage(model_client: ChatCompletionClient) -> Dict[str, int]:
215216
usage_json = {
216217
"client": get_usage(model_client),
217218
}
218-
with open(f"{output_dir}/model_tokens_usage.json", "w") as f:
219-
json.dump(usage_json, f)
219+
async with aiofiles.open(f"{output_dir}/model_tokens_usage.json", "w") as f:
220+
await f.write(json.dumps(usage_json, indent=2))
220221

221222
# Step 5: Prepare the screenshots
222223
screenshots_paths = []

experiments/eval/systems/magentic_ui_sim_user_system.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -410,6 +410,7 @@ def get_model_client(
410410
# Convert list of logevent objects to list of dicts
411411
messages_json = [msg.model_dump() for msg in messages_so_far]
412412
await f.write(json.dumps(messages_json, indent=2))
413+
await f.flush() # Flush to disk immediately
413414
# how the final answer is formatted: "Final Answer: FINAL ANSWER: Actual final answer"
414415

415416
if message_str.startswith("Final Answer:"):
@@ -447,8 +448,8 @@ def get_usage(model_client: ChatCompletionClient) -> Dict[str, int]:
447448
if key != "user_proxy"
448449
),
449450
}
450-
with open(f"{output_dir}/model_tokens_usage.json", "w") as f:
451-
json.dump(usage_json, f)
451+
async with aiofiles.open(f"{output_dir}/model_tokens_usage.json", "w") as f:
452+
await f.write(json.dumps(usage_json, indent=2))
452453

453454
await team.close()
454455
# Step 5: Prepare the screenshots

experiments/eval/systems/magentic_ui_system.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -265,6 +265,7 @@ async def _runner() -> Tuple[str, List[str]]:
265265
# Convert list of logevent objects to list of dicts
266266
messages_json = [msg.model_dump() for msg in messages_so_far]
267267
await f.write(json.dumps(messages_json, indent=2))
268+
await f.flush() # Flush to disk immediately
268269
# how the final answer is formatted: "Final Answer: FINAL ANSWER: Actual final answer"
269270

270271
if message_str.startswith("Final Answer:"):
@@ -301,8 +302,8 @@ def get_usage(model_client: ChatCompletionClient) -> Dict[str, int]:
301302
if key != "user_proxy"
302303
),
303304
}
304-
with open(f"{output_dir}/model_tokens_usage.json", "w") as f:
305-
json.dump(usage_json, f)
305+
async with aiofiles.open(f"{output_dir}/model_tokens_usage.json", "w") as f:
306+
await f.write(json.dumps(usage_json, indent=2))
306307

307308
await team.close()
308309
# Step 5: Prepare the screenshots

src/magentic_ui/eval/core.py

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -207,6 +207,10 @@ def _run_single_task(
207207
)
208208

209209
logger.info(f"Completed task for task_id={task_id}")
210+
211+
# Evaluate immediately after task completion
212+
_evaluate_single_task(task_id, system, output_dir, benchmark, redo_eval=False)
213+
210214
return task_id, answer, end_time - start_time
211215
except Exception:
212216
# Log the error with traceback

0 commit comments

Comments
 (0)