Flush JSON message logs immediately after each write and evaluate each task right after it completes

husseinmozannar · husseinmozannar · commit 124c8f6034ad · 2025-12-05T16:00:24.000-05:00
diff --git a/experiments/eval/systems/magentic_one_system.py b/experiments/eval/systems/magentic_one_system.py
@@ -188,6 +188,7 @@ async def _runner() -> Tuple[str, List[str]]:
                     # Convert list of logevent objects to list of dicts
                     messages_json = [msg.model_dump() for msg in messages_so_far]
                     await f.write(json.dumps(messages_json, indent=2))
+                    await f.flush()  # Flush to disk immediately
                 # how the final answer is formatted:  "Final Answer: FINAL ANSWER: Actual final answer"
 
             # get last message with source MagenticOneOrchestrator, might not be the last message
@@ -215,8 +216,8 @@ def get_usage(model_client: ChatCompletionClient) -> Dict[str, int]:
             usage_json = {
                 "client": get_usage(model_client),
             }
-            with open(f"{output_dir}/model_tokens_usage.json", "w") as f:
-                json.dump(usage_json, f)
+            async with aiofiles.open(f"{output_dir}/model_tokens_usage.json", "w") as f:
+                await f.write(json.dumps(usage_json, indent=2))
 
             # Step 5: Prepare the screenshots
             screenshots_paths = []
diff --git a/experiments/eval/systems/magentic_ui_sim_user_system.py b/experiments/eval/systems/magentic_ui_sim_user_system.py
@@ -410,6 +410,7 @@ def get_model_client(
                     # Convert list of logevent objects to list of dicts
                     messages_json = [msg.model_dump() for msg in messages_so_far]
                     await f.write(json.dumps(messages_json, indent=2))
+                    await f.flush()  # Flush to disk immediately
                 # how the final answer is formatted:  "Final Answer: FINAL ANSWER: Actual final answer"
 
                 if message_str.startswith("Final Answer:"):
@@ -447,8 +448,8 @@ def get_usage(model_client: ChatCompletionClient) -> Dict[str, int]:
                     if key != "user_proxy"
                 ),
             }
-            with open(f"{output_dir}/model_tokens_usage.json", "w") as f:
-                json.dump(usage_json, f)
+            async with aiofiles.open(f"{output_dir}/model_tokens_usage.json", "w") as f:
+                await f.write(json.dumps(usage_json, indent=2))
 
             await team.close()
             # Step 5: Prepare the screenshots
diff --git a/experiments/eval/systems/magentic_ui_system.py b/experiments/eval/systems/magentic_ui_system.py
@@ -265,6 +265,7 @@ async def _runner() -> Tuple[str, List[str]]:
                     # Convert list of logevent objects to list of dicts
                     messages_json = [msg.model_dump() for msg in messages_so_far]
                     await f.write(json.dumps(messages_json, indent=2))
+                    await f.flush()  # Flush to disk immediately
                 # how the final answer is formatted:  "Final Answer: FINAL ANSWER: Actual final answer"
 
                 if message_str.startswith("Final Answer:"):
@@ -301,8 +302,8 @@ def get_usage(model_client: ChatCompletionClient) -> Dict[str, int]:
                     if key != "user_proxy"
                 ),
             }
-            with open(f"{output_dir}/model_tokens_usage.json", "w") as f:
-                json.dump(usage_json, f)
+            async with aiofiles.open(f"{output_dir}/model_tokens_usage.json", "w") as f:
+                await f.write(json.dumps(usage_json, indent=2))
 
             await team.close()
             # Step 5: Prepare the screenshots
diff --git a/src/magentic_ui/eval/core.py b/src/magentic_ui/eval/core.py
@@ -207,6 +207,10 @@ def _run_single_task(
             )
 
         logger.info(f"Completed task for task_id={task_id}")
+
+        # Evaluate immediately after task completion
+        _evaluate_single_task(task_id, system, output_dir, benchmark, redo_eval=False)
+
         return task_id, answer, end_time - start_time
     except Exception:
         # Log the error with traceback