Skip to content

Commit 0dfe4c3

Browse files
committed
add work agent from zxxz
1 parent a206951 commit 0dfe4c3

File tree

10 files changed

+244
-36
lines changed

10 files changed

+244
-36
lines changed

apps/run-agent/scripts/claude-sonnet-3.7/run_evaluate_multiple_runs_gaia-validation.sh

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,14 @@
55
# SPDX-License-Identifier: Apache-2.0
66

77
NUM_RUNS=3
8+
MAX_CONCURRENT=20
89
BENCHMARK_NAME="gaia-validation"
910
LLM_PROVIDER="claude_openrouter"
1011
LLM_MODEL="anthropic/claude-3.7-sonnet"
1112
AGENT_SET="miroflow"
13+
ADD_MESSAGE_ID="true" # Set to true to add random message ID to all messages sent to LLM
14+
MAX_TURNS=-1
15+
TEMPERATURE=0.3
1216

1317
RESULTS_DIR="logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}"
1418

@@ -30,11 +34,15 @@ for i in $(seq 1 $NUM_RUNS); do
3034
llm=claude_openrouter \
3135
llm.provider=$LLM_PROVIDER \
3236
llm.model_name=$LLM_MODEL \
37+
llm.temperature=$TEMPERATURE \
3338
llm.async_client=true \
3439
benchmark.execution.max_tasks=null \
35-
benchmark.execution.max_concurrent=5 \
40+
benchmark.execution.max_concurrent=$MAX_CONCURRENT \
3641
benchmark.execution.pass_at_k=1 \
3742
agent=$AGENT_SET \
43+
agent.add_message_id=$ADD_MESSAGE_ID \
44+
agent.main_agent.max_turns=$MAX_TURNS \
45+
agent.sub_agents.agent-worker.max_turns=$MAX_TURNS \
3846
output_dir="$RESULTS_DIR/$RUN_ID" \
3947
> "$RESULTS_DIR/${RUN_ID}_output.log" 2>&1
4048

data/gaia-val/standardized_data.jsonl

Lines changed: 165 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"task_id": "c61d22de-5f6c-4958-a7f6-5e9707bd3466", "task_question": "A paper about AI regulation that was originally submitted to arXiv.org in June 2022 shows a figure with three axes, where each axis has a label word at both ends. Which of these words is used to describe a type of society in a Physics and Society article submitted to arXiv.org on August 11, 2016?", "ground_truth": "egalitarian", "file_path": null, "metadata": {"Level": "2", "Annotator Metadata": {"Steps": "1. Go to arxiv.org and navigate to the Advanced Search page.\n2. Enter \"AI regulation\" in the search box and select \"All fields\" from the dropdown.\n3. Enter 2022-06-01 and 2022-07-01 into the date inputs, select \"Submission date (original)\", and submit the search.\n4. Go through the search results to find the article that has a figure with three axes and labels on each end of the axes, titled \"Fairness in Agreement With European Values: An Interdisciplinary Perspective on AI Regulation\".\n5. Note the six words used as labels: deontological, egalitarian, localized, standardized, utilitarian, and consequential.\n6. Go back to arxiv.org\n7. Find \"Physics and Society\" and go to the page for the \"Physics and Society\" category.\n8. Note that the tag for this category is \"physics.soc-ph\".\n9. Go to the Advanced Search page.\n10. Enter \"physics.soc-ph\" in the search box and select \"All fields\" from the dropdown.\n11. Enter 2016-08-11 and 2016-08-12 into the date inputs, select \"Submission date (original)\", and submit the search.\n12. Search for instances of the six words in the results to find the paper titled \"Phase transition from egalitarian to hierarchical societies driven by competition between cognitive and social constraints\", indicating that \"egalitarian\" is the correct answer.", "Number of steps": "12", "How long did this take?": "8 minutes", "Tools": "1. Web browser\n2. Image recognition tools (to identify and parse a figure with three axes)", "Number of tools": "2"}}}

libs/miroflow-tool/src/miroflow/tool/mcp_servers/reasoning_mcp_server.py

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,24 +8,32 @@
88
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")
99
ANTHROPIC_BASE_URL = os.environ.get("ANTHROPIC_BASE_URL", "https://api.anthropic.com")
1010
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY", "")
11-
OPENROUTER_BASE_URL = os.environ.get(
12-
"OPENROUTER_BASE_URL", "https://openrouter.ai/api/v1"
13-
)
11+
OPENROUTER_BASE_URL = os.environ.get("OPENROUTER_BASE_URL", "https://openrouter.ai/api/v1")
1412

1513
# Initialize FastMCP server
1614
mcp = FastMCP("reasoning-mcp-server")
1715

1816

1917
@mcp.tool()
2018
async def reasoning(question: str) -> str:
21-
"""You can use this tool use solve hard math problem, puzzle, riddle and IQ test question that requries a lot of chain of thought efforts.
22-
DO NOT use this tool for simple and obvious question.
19+
"""This tool is for pure text-based reasoning, analysis, and logical thinking. It integrates collected information, organizes final logic, and provides planning insights.
20+
21+
IMPORTANT: This tool cannot access the internet, read files, program, or process multimodal content. It only performs pure text reasoning.
22+
23+
Use this tool for:
24+
- Integrating and synthesizing collected information
25+
- Analyzing patterns and relationships in data
26+
- Logical reasoning and problem-solving
27+
- Planning and strategy development
28+
- Complex math problems, puzzles, riddles, and IQ tests
29+
30+
DO NOT use this tool for simple and obvious questions.
2331
2432
Args:
25-
question: The complex question or problem requiring step-by-step reasoning. Should include all relevant information needed to solve the problem..
33+
question: The complex question or problem requiring step-by-step reasoning. Should include all relevant information needed to solve the problem.
2634
2735
Returns:
28-
The answer to the question.
36+
The reasoned answer to the question.
2937
"""
3038

3139
messages_for_llm = [

libs/miroflow/src/miroflow/prebuilt/config/agent/_default.yaml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@
44
main_agent:
55
tools:
66
- tool-code
7-
- tool-vqa
8-
- tool-transcribe
7+
- tool-image-video
8+
- tool-audio
99
- tool-reasoning
1010
- tool-markitdown
1111
# tool_blacklist:
@@ -16,7 +16,7 @@ sub_agents:
1616
agent-browsing:
1717
tools:
1818
- tool-serper-search
19-
- tool-vqa
19+
- tool-image-video
2020
- tool-markitdown
2121
- tool-code
2222
max_turns: 20
@@ -32,7 +32,7 @@ sub_agents:
3232
# max_turns: 20
3333

3434
tool_config:
35-
tool-vqa:
35+
tool-image-video:
3636
enable_claude_vision: "true"
3737
enable_openai_vision: "true"
3838

libs/miroflow/src/miroflow/prebuilt/config/agent/miroflow.yaml

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,21 +7,20 @@ defaults:
77

88
main_agent:
99
tools:
10-
- tool-vqa
11-
- tool-reading
12-
- tool-code
1310
- tool-reasoning
14-
- tool-transcribe
1511
max_turns: 20 # Maximum number of turns for main agent execution
12+
max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn
1613
# Trival reproduce of OWL:
1714
sub_agents:
18-
agent-browsing:
15+
agent-worker:
1916
tools:
2017
- tool-searching
21-
- tool-vqa
18+
- tool-image-video
19+
- tool-audio
2220
- tool-reading
2321
- tool-code
2422
max_turns: 20
23+
max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn
2524

2625
o3_hint: true
2726
o3_final_answer: true

libs/miroflow/src/miroflow/prebuilt/config/benchmark/_default.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ name: "default"
44

55
data:
66
metadata_file: "standardized_data.jsonl"
7+
# metadata_file: "standardized_data_single_sample.jsonl"
78
field_mapping:
89
task_id_field: "task_id"
910
task_question_field: "task_question"

libs/miroflow/src/miroflow/prebuilt/orchestrator.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1320,7 +1320,7 @@ async def run_main_agent(
13201320
final_answer_text = f"{final_answer_text}\n\nO3 Extracted Answer:\n{o3_extracted_answer}"
13211321

13221322
except Exception as e:
1323-
logger.warning(
1323+
logger.error(
13241324
f"O3 final answer extraction failed after retries: {str(e)}"
13251325
)
13261326
# Continue using original final_answer_text

libs/miroflow/src/miroflow/utils/prompt_utils.py

Lines changed: 32 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,24 @@ def generate_agent_specific_system_prompt(agent_type: str = ""):
172172
173173
You are a task-solving agent that uses tools step-by-step to answer the user's question. Your goal is to provide complete, accurate and well-reasoned answers using additional tools.
174174
175+
## Subtask Delegation Strategy
176+
177+
For each clearly defined single subtask, delegate it to worker agents using the `execute_subtask` tool from the `agent-worker` server. **Important: Only make ONE execute_subtask call per response.**
178+
179+
**CRITICAL: Always treat worker agent responses as unreliable and incomplete sources.** Worker agents may:
180+
- Report "not found" when information actually exists elsewhere
181+
- Return partial information while believing it's complete
182+
- Be overconfident or produce hallucinations
183+
184+
Therefore, you must always verify and validate worker responses by:
185+
- Cross-referencing information from multiple independent sources
186+
- Trying alternative search strategies and reformulating subtasks with different approaches
187+
- Considering that information might exist in different formats or locations
188+
- Applying critical evaluation to assess credibility and completeness
189+
- Never accepting "not found" or worker conclusions as final without additional verification
190+
191+
## Final Answer Preparation
192+
175193
Before presenting your answer, and **unless** the user asks to "Summarize the above" (in which case no tools are used), **always** use the `reasoning` tool from the `tool-reasoning` server to step-by-step analyze solving process as follows:
176194
- Use the reasoning tool to carefully analyze:
177195
- What the question is truly asking.
@@ -184,11 +202,18 @@ def generate_agent_specific_system_prompt(agent_type: str = ""):
184202
185203
"""
186204

187-
elif agent_type == "agent-browsing":
205+
elif agent_type == "agent-worker":
188206
system_prompt = """# Agent Specific Objective
189207
190-
You are an agent that performs the task of searching and browsing the web for specific information and generating the desired answer. Your task is to retrieve reliable, factual, and verifiable information that fills in knowledge gaps.
191-
Do not infer, speculate, summarize broadly, or attempt to fill in missing parts yourself. Only return factual content.
208+
You are an agent that performs various subtasks to collect information and execute specific actions. Your task is to complete well-defined, single-scope objectives efficiently and accurately.
209+
Do not infer, speculate, or attempt to fill in missing parts yourself. Only return factual content and execute actions as specified.
210+
211+
## File Path Handling
212+
When subtasks mention file paths, these are local system file paths (not sandbox paths). You can:
213+
- Use tools to directly access these files from the local system
214+
- Upload files to the sandbox environment (remember to create a new sandbox for each task, this sandbox only exists for the current task) for processing if needed
215+
- Choose the most appropriate approach based on the specific task requirements
216+
- If the final response requires returning a file, download it to the local system first and then return the local path, the sandbox path is not allowed
192217
193218
Critically assess the reliability of all information:
194219
- If the credibility of a source is uncertain, clearly flag it.
@@ -200,6 +225,7 @@ def generate_agent_specific_system_prompt(agent_type: str = ""):
200225
- Never assume or guess — if an exact answer cannot be found, say so clearly.
201226
- Prefer quoting or excerpting **original source text** rather than interpreting or rewriting it, and provide the URL if available.
202227
- If more context is needed, return a clarification request and do not proceed with tool use.
228+
- Focus on completing the specific subtask assigned to you, not broader reasoning.
203229
"""
204230
elif agent_type == "agent-coding":
205231
system_prompt = """# Agent Specific Objective
@@ -268,7 +294,7 @@ def generate_agent_summarize_prompt(
268294
"Focus on factual, specific, and well-organized information."
269295
)
270296
)
271-
elif agent_type == "agent-browsing":
297+
elif agent_type == "agent-worker":
272298
summarize_prompt = (
273299
(
274300
"This is a direct instruction to you (the assistant), not the result of a tool call.\n\n"
@@ -283,8 +309,8 @@ def generate_agent_summarize_prompt(
283309
"You must NOT initiate any further tool use. This is your final opportunity to report "
284310
"*all* of the information gathered during the session.\n\n"
285311
"The original task is repeated here for reference:\n\n"
286-
f"---\n{task_description}\n---\n\n"
287-
"Summarize the above search and browsing history. Output the FINAL RESPONSE and detailed supporting information of the task given to you.\n\n"
312+
f'---\n{task_description}\n---\n\n'
313+
"Summarize the above subtask execution history. Output the FINAL RESPONSE and detailed supporting information of the task given to you.\n\n"
288314
"If you found any useful facts, data, quotes, or answers directly relevant to the original task, include them clearly and completely.\n"
289315
"If you reached a conclusion or answer, include it as part of the response.\n"
290316
"If the task could not be fully answered, do NOT make up any content. Instead, return all partially relevant findings, "

libs/miroflow/src/miroflow/utils/tool_utils.py

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,8 @@ def create_mcp_server_parameters(
1717
cfg: DictConfig, agent_cfg: DictConfig, logs_dir: str | None = None
1818
):
1919
"""Define and return MCP server configuration list"""
20-
ENABLE_CLAUDE_VISION = cfg.agent.tool_config["tool-vqa"]["enable_claude_vision"]
21-
ENABLE_OPENAI_VISION = cfg.agent.tool_config["tool-vqa"]["enable_openai_vision"]
20+
ENABLE_CLAUDE_VISION = cfg.agent.tool_config["tool-image-video"]["enable_claude_vision"]
21+
ENABLE_OPENAI_VISION = cfg.agent.tool_config["tool-image-video"]["enable_openai_vision"]
2222

2323
configs = []
2424
if agent_cfg.get("tools", None) is not None and "tool-code" in agent_cfg["tools"]:
@@ -40,10 +40,10 @@ def create_mcp_server_parameters(
4040
}
4141
)
4242

43-
if agent_cfg.get("tools", None) is not None and "tool-vqa" in agent_cfg["tools"]:
43+
if agent_cfg.get("tools", None) is not None and "tool-image-video" in agent_cfg["tools"]:
4444
configs.append(
4545
{
46-
"name": "tool-vqa",
46+
"name": "tool-image-video",
4747
"params": StdioServerParameters(
4848
command=sys.executable,
4949
args=["-m", "miroflow.tool.mcp_servers.vision_mcp_server"],
@@ -62,11 +62,11 @@ def create_mcp_server_parameters(
6262

6363
if (
6464
agent_cfg.get("tools", None) is not None
65-
and "tool-transcribe" in agent_cfg["tools"]
65+
and "tool-audio" in agent_cfg["tools"]
6666
):
6767
configs.append(
6868
{
69-
"name": "tool-transcribe",
69+
"name": "tool-audio",
7070
"params": StdioServerParameters(
7171
command=sys.executable,
7272
args=["-m", "miroflow.tool.mcp_servers.audio_mcp_server"],
@@ -202,21 +202,21 @@ def expose_sub_agents_as_tools(sub_agents_cfg: DictConfig):
202202
"""Expose sub-agents as tools"""
203203
sub_agents_server_params = []
204204
for sub_agent in sub_agents_cfg.keys():
205-
if "agent-browsing" in sub_agent: # type: ignore
205+
if "agent-worker" in sub_agent: # type: ignore
206206
sub_agents_server_params.append(
207207
dict(
208-
name="agent-browsing",
208+
name="agent-worker",
209209
tools=[
210210
dict(
211-
name="search_and_browse",
212-
description="This tool is an agent that performs the subtask of searching and browsing the web for specific missing information and generating the desired answer. The subtask should be clearly defined, include relevant background, and focus on factual gaps. It does not perform vague or speculative subtasks. \nArgs: \n\tsubtask: the subtask to be performed. \nReturns: \n\tthe result of the subtask. ",
211+
name="execute_subtask",
212+
description="This tool is an agent that performs various subtasks to collect information and execute specific actions. It can access the internet, read files, program, and process multimodal content, but is not specialized in complex reasoning or logical thinking. The tool returns processed summary reports rather than raw information - it analyzes, synthesizes, and presents findings in a structured format. The subtask should be clearly defined, include relevant background, and focus on a single, well-scoped objective. It does not perform vague or speculative subtasks. \nArgs: \n\tsubtask: the subtask to be performed. \nReturns: \n\tthe processed summary report of the subtask. ",
213213
schema={
214214
"type": "object",
215215
"properties": {
216216
"subtask": {"title": "Subtask", "type": "string"}
217217
},
218218
"required": ["subtask"],
219-
"title": "search_and_browseArguments",
219+
"title": "execute_subtaskArguments",
220220
},
221221
)
222222
],

0 commit comments

Comments (0)