Skip to content

Commit 0dfe4c3

Browse files
committed
add work agent from zxxz
1 parent a206951 commit 0dfe4c3

File tree

10 files changed

+244
-36
lines changed

10 files changed

+244
-36
lines changed

apps/run-agent/scripts/claude-sonnet-3.7/run_evaluate_multiple_runs_gaia-validation.sh

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,14 @@
55
# SPDX-License-Identifier: Apache-2.0
66

77
NUM_RUNS=3
8+
MAX_CONCURRENT=20
89
BENCHMARK_NAME="gaia-validation"
910
LLM_PROVIDER="claude_openrouter"
1011
LLM_MODEL="anthropic/claude-3.7-sonnet"
1112
AGENT_SET="miroflow"
13+
ADD_MESSAGE_ID="true" # Set to true to add random message ID to all messages sent to LLM
14+
MAX_TURNS=-1
15+
TEMPERATURE=0.3
1216

1317
RESULTS_DIR="logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}"
1418

@@ -30,11 +34,15 @@ for i in $(seq 1 $NUM_RUNS); do
3034
llm=claude_openrouter \
3135
llm.provider=$LLM_PROVIDER \
3236
llm.model_name=$LLM_MODEL \
37+
llm.temperature=$TEMPERATURE \
3338
llm.async_client=true \
3439
benchmark.execution.max_tasks=null \
35-
benchmark.execution.max_concurrent=5 \
40+
benchmark.execution.max_concurrent=$MAX_CONCURRENT \
3641
benchmark.execution.pass_at_k=1 \
3742
agent=$AGENT_SET \
43+
agent.add_message_id=$ADD_MESSAGE_ID \
44+
agent.main_agent.max_turns=$MAX_TURNS \
45+
agent.sub_agents.agent-worker.max_turns=$MAX_TURNS \
3846
output_dir="$RESULTS_DIR/$RUN_ID" \
3947
> "$RESULTS_DIR/${RUN_ID}_output.log" 2>&1
4048

data/gaia-val/standardized_data.jsonl

Lines changed: 165 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"task_id": "c61d22de-5f6c-4958-a7f6-5e9707bd3466", "task_question": "A paper about AI regulation that was originally submitted to arXiv.org in June 2022 shows a figure with three axes, where each axis has a label word at both ends. Which of these words is used to describe a type of society in a Physics and Society article submitted to arXiv.org on August 11, 2016?", "ground_truth": "egalitarian", "file_path": null, "metadata": {"Level": "2", "Annotator Metadata": {"Steps": "1. Go to arxiv.org and navigate to the Advanced Search page.\n2. Enter \"AI regulation\" in the search box and select \"All fields\" from the dropdown.\n3. Enter 2022-06-01 and 2022-07-01 into the date inputs, select \"Submission date (original)\", and submit the search.\n4. Go through the search results to find the article that has a figure with three axes and labels on each end of the axes, titled \"Fairness in Agreement With European Values: An Interdisciplinary Perspective on AI Regulation\".\n5. Note the six words used as labels: deontological, egalitarian, localized, standardized, utilitarian, and consequential.\n6. Go back to arxiv.org\n7. Find \"Physics and Society\" and go to the page for the \"Physics and Society\" category.\n8. Note that the tag for this category is \"physics.soc-ph\".\n9. Go to the Advanced Search page.\n10. Enter \"physics.soc-ph\" in the search box and select \"All fields\" from the dropdown.\n11. Enter 2016-08-11 and 2016-08-12 into the date inputs, select \"Submission date (original)\", and submit the search.\n12. Search for instances of the six words in the results to find the paper titled \"Phase transition from egalitarian to hierarchical societies driven by competition between cognitive and social constraints\", indicating that \"egalitarian\" is the correct answer.", "Number of steps": "12", "How long did this take?": "8 minutes", "Tools": "1. Web browser\n2. Image recognition tools (to identify and parse a figure with three axes)", "Number of tools": "2"}}}

libs/miroflow-tool/src/miroflow/tool/mcp_servers/reasoning_mcp_server.py

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,24 +8,32 @@
88
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")
99
ANTHROPIC_BASE_URL = os.environ.get("ANTHROPIC_BASE_URL", "https://api.anthropic.com")
1010
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY", "")
11-
OPENROUTER_BASE_URL = os.environ.get(
12-
"OPENROUTER_BASE_URL", "https://openrouter.ai/api/v1"
13-
)
11+
OPENROUTER_BASE_URL = os.environ.get("OPENROUTER_BASE_URL", "https://openrouter.ai/api/v1")
1412

1513
# Initialize FastMCP server
1614
mcp = FastMCP("reasoning-mcp-server")
1715

1816

1917
@mcp.tool()
2018
async def reasoning(question: str) -> str:
21-
"""You can use this tool use solve hard math problem, puzzle, riddle and IQ test question that requries a lot of chain of thought efforts.
22-
DO NOT use this tool for simple and obvious question.
19+
"""This tool is for pure text-based reasoning, analysis, and logical thinking. It integrates collected information, organizes final logic, and provides planning insights.
20+
21+
IMPORTANT: This tool cannot access the internet, read files, program, or process multimodal content. It only performs pure text reasoning.
22+
23+
Use this tool for:
24+
- Integrating and synthesizing collected information
25+
- Analyzing patterns and relationships in data
26+
- Logical reasoning and problem-solving
27+
- Planning and strategy development
28+
- Complex math problems, puzzles, riddles, and IQ tests
29+
30+
DO NOT use this tool for simple and obvious questions.
2331
2432
Args:
25-
question: The complex question or problem requiring step-by-step reasoning. Should include all relevant information needed to solve the problem..
33+
question: The complex question or problem requiring step-by-step reasoning. Should include all relevant information needed to solve the problem.
2634
2735
Returns:
28-
The answer to the question.
36+
The reasoned answer to the question.
2937
"""
3038

3139
messages_for_llm = [

libs/miroflow/src/miroflow/prebuilt/config/agent/_default.yaml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@
44
main_agent:
55
tools:
66
- tool-code
7-
- tool-vqa
8-
- tool-transcribe
7+
- tool-image-video
8+
- tool-audio
99
- tool-reasoning
1010
- tool-markitdown
1111
# tool_blacklist:
@@ -16,7 +16,7 @@ sub_agents:
1616
agent-browsing:
1717
tools:
1818
- tool-serper-search
19-
- tool-vqa
19+
- tool-image-video
2020
- tool-markitdown
2121
- tool-code
2222
max_turns: 20
@@ -32,7 +32,7 @@ sub_agents:
3232
# max_turns: 20
3333

3434
tool_config:
35-
tool-vqa:
35+
tool-image-video:
3636
enable_claude_vision: "true"
3737
enable_openai_vision: "true"
3838

libs/miroflow/src/miroflow/prebuilt/config/agent/miroflow.yaml

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,21 +7,20 @@ defaults:
77

88
main_agent:
99
tools:
10-
- tool-vqa
11-
- tool-reading
12-
- tool-code
1310
- tool-reasoning
14-
- tool-transcribe
1511
max_turns: 20 # Maximum number of turns for main agent execution
12+
max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn
1613
# Trival reproduce of OWL:
1714
sub_agents:
18-
agent-browsing:
15+
agent-worker:
1916
tools:
2017
- tool-searching
21-
- tool-vqa
18+
- tool-image-video
19+
- tool-audio
2220
- tool-reading
2321
- tool-code
2422
max_turns: 20
23+
max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn
2524

2625
o3_hint: true
2726
o3_final_answer: true

libs/miroflow/src/miroflow/prebuilt/config/benchmark/_default.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ name: "default"
44

55
data:
66
metadata_file: "standardized_data.jsonl"
7+
# metadata_file: "standardized_data_single_sample.jsonl"
78
field_mapping:
89
task_id_field: "task_id"
910
task_question_field: "task_question"

libs/miroflow/src/miroflow/prebuilt/orchestrator.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1320,7 +1320,7 @@ async def run_main_agent(
13201320
final_answer_text = f"{final_answer_text}\n\nO3 Extracted Answer:\n{o3_extracted_answer}"
13211321

13221322
except Exception as e:
1323-
logger.warning(
1323+
logger.error(
13241324
f"O3 final answer extraction failed after retries: {str(e)}"
13251325
)
13261326
# Continue using original final_answer_text

libs/miroflow/src/miroflow/utils/prompt_utils.py

Lines changed: 32 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,24 @@ def generate_agent_specific_system_prompt(agent_type: str = ""):
172172
173173
You are a task-solving agent that uses tools step-by-step to answer the user's question. Your goal is to provide complete, accurate and well-reasoned answers using additional tools.
174174
175+
## Subtask Delegation Strategy
176+
177+
For each clearly defined single subtask, delegate it to worker agents using the `execute_subtask` tool from the `agent-worker` server. **Important: Only make ONE execute_subtask call per response.**
178+
179+
**CRITICAL: Always treat worker agent responses as unreliable and incomplete sources.** Worker agents may:
180+
- Report "not found" when information actually exists elsewhere
181+
- Return partial information while believing it's complete
182+
- Be overconfident or produce hallucinations
183+
184+
Therefore, you must always verify and validate worker responses by:
185+
- Cross-referencing information from multiple independent sources
186+
- Trying alternative search strategies and reformulating subtasks with different approaches
187+
- Considering that information might exist in different formats or locations
188+
- Applying critical evaluation to assess credibility and completeness
189+
- Never accepting "not found" or worker conclusions as final without additional verification
190+
191+
## Final Answer Preparation
192+
175193
Before presenting your answer, and **unless** the user asks to "Summarize the above" (in which case no tools are used), **always** use the `reasoning` tool from the `tool-reasoning` server to step-by-step analyze solving process as follows:
176194
- Use the reasoning tool to carefully analyze:
177195
- What the question is truly asking.
@@ -184,11 +202,18 @@ def generate_agent_specific_system_prompt(agent_type: str = ""):
184202
185203
"""
186204

187-
elif agent_type == "agent-browsing":
205+
elif agent_type == "agent-worker":
188206
system_prompt = """# Agent Specific Objective
189207
190-
You are an agent that performs the task of searching and browsing the web for specific information and generating the desired answer. Your task is to retrieve reliable, factual, and verifiable information that fills in knowledge gaps.
191-
Do not infer, speculate, summarize broadly, or attempt to fill in missing parts yourself. Only return factual content.
208+
You are an agent that performs various subtasks to collect information and execute specific actions. Your task is to complete well-defined, single-scope objectives efficiently and accurately.
209+
Do not infer, speculate, or attempt to fill in missing parts yourself. Only return factual content and execute actions as specified.
210+
211+
## File Path Handling
212+
When subtasks mention file paths, these are local system file paths (not sandbox paths). You can:
213+
- Use tools to directly access these files from the local system
214+
- Upload files to the sandbox environment (remember to create a new sandbox for each task, this sandbox only exists for the current task) for processing if needed
215+
- Choose the most appropriate approach based on the specific task requirements
216+
- If the final response requires returning a file, download it to the local system first and then return the local path, the sandbox path is not allowed
192217
193218
Critically assess the reliability of all information:
194219
- If the credibility of a source is uncertain, clearly flag it.
@@ -200,6 +225,7 @@ def generate_agent_specific_system_prompt(agent_type: str = ""):
200225
- Never assume or guess — if an exact answer cannot be found, say so clearly.
201226
- Prefer quoting or excerpting **original source text** rather than interpreting or rewriting it, and provide the URL if available.
202227
- If more context is needed, return a clarification request and do not proceed with tool use.
228+
- Focus on completing the specific subtask assigned to you, not broader reasoning.
203229
"""
204230
elif agent_type == "agent-coding":
205231
system_prompt = """# Agent Specific Objective
@@ -268,7 +294,7 @@ def generate_agent_summarize_prompt(
268294
"Focus on factual, specific, and well-organized information."
269295
)
270296
)
271-
elif agent_type == "agent-browsing":
297+
elif agent_type == "agent-worker":
272298
summarize_prompt = (
273299
(
274300
"This is a direct instruction to you (the assistant), not the result of a tool call.\n\n"
@@ -283,8 +309,8 @@ def generate_agent_summarize_prompt(
283309
"You must NOT initiate any further tool use. This is your final opportunity to report "
284310
"*all* of the information gathered during the session.\n\n"
285311
"The original task is repeated here for reference:\n\n"
286-
f"---\n{task_description}\n---\n\n"
287-
"Summarize the above search and browsing history. Output the FINAL RESPONSE and detailed supporting information of the task given to you.\n\n"
312+
f'---\n{task_description}\n---\n\n'
313+
"Summarize the above subtask execution history. Output the FINAL RESPONSE and detailed supporting information of the task given to you.\n\n"
288314
"If you found any useful facts, data, quotes, or answers directly relevant to the original task, include them clearly and completely.\n"
289315
"If you reached a conclusion or answer, include it as part of the response.\n"
290316
"If the task could not be fully answered, do NOT make up any content. Instead, return all partially relevant findings, "

libs/miroflow/src/miroflow/utils/tool_utils.py

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,8 @@ def create_mcp_server_parameters(
1717
cfg: DictConfig, agent_cfg: DictConfig, logs_dir: str | None = None
1818
):
1919
"""Define and return MCP server configuration list"""
20-
ENABLE_CLAUDE_VISION = cfg.agent.tool_config["tool-vqa"]["enable_claude_vision"]
21-
ENABLE_OPENAI_VISION = cfg.agent.tool_config["tool-vqa"]["enable_openai_vision"]
20+
ENABLE_CLAUDE_VISION = cfg.agent.tool_config["tool-image-video"]["enable_claude_vision"]
21+
ENABLE_OPENAI_VISION = cfg.agent.tool_config["tool-image-video"]["enable_openai_vision"]
2222

2323
configs = []
2424
if agent_cfg.get("tools", None) is not None and "tool-code" in agent_cfg["tools"]:
@@ -40,10 +40,10 @@ def create_mcp_server_parameters(
4040
}
4141
)
4242

43-
if agent_cfg.get("tools", None) is not None and "tool-vqa" in agent_cfg["tools"]:
43+
if agent_cfg.get("tools", None) is not None and "tool-image-video" in agent_cfg["tools"]:
4444
configs.append(
4545
{
46-
"name": "tool-vqa",
46+
"name": "tool-image-video",
4747
"params": StdioServerParameters(
4848
command=sys.executable,
4949
args=["-m", "miroflow.tool.mcp_servers.vision_mcp_server"],
@@ -62,11 +62,11 @@ def create_mcp_server_parameters(
6262

6363
if (
6464
agent_cfg.get("tools", None) is not None
65-
and "tool-transcribe" in agent_cfg["tools"]
65+
and "tool-audio" in agent_cfg["tools"]
6666
):
6767
configs.append(
6868
{
69-
"name": "tool-transcribe",
69+
"name": "tool-audio",
7070
"params": StdioServerParameters(
7171
command=sys.executable,
7272
args=["-m", "miroflow.tool.mcp_servers.audio_mcp_server"],
@@ -202,21 +202,21 @@ def expose_sub_agents_as_tools(sub_agents_cfg: DictConfig):
202202
"""Expose sub-agents as tools"""
203203
sub_agents_server_params = []
204204
for sub_agent in sub_agents_cfg.keys():
205-
if "agent-browsing" in sub_agent: # type: ignore
205+
if "agent-worker" in sub_agent: # type: ignore
206206
sub_agents_server_params.append(
207207
dict(
208-
name="agent-browsing",
208+
name="agent-worker",
209209
tools=[
210210
dict(
211-
name="search_and_browse",
212-
description="This tool is an agent that performs the subtask of searching and browsing the web for specific missing information and generating the desired answer. The subtask should be clearly defined, include relevant background, and focus on factual gaps. It does not perform vague or speculative subtasks. \nArgs: \n\tsubtask: the subtask to be performed. \nReturns: \n\tthe result of the subtask. ",
211+
name="execute_subtask",
212+
description="This tool is an agent that performs various subtasks to collect information and execute specific actions. It can access the internet, read files, program, and process multimodal content, but is not specialized in complex reasoning or logical thinking. The tool returns processed summary reports rather than raw information - it analyzes, synthesizes, and presents findings in a structured format. The subtask should be clearly defined, include relevant background, and focus on a single, well-scoped objective. It does not perform vague or speculative subtasks. \nArgs: \n\tsubtask: the subtask to be performed. \nReturns: \n\tthe processed summary report of the subtask. ",
213213
schema={
214214
"type": "object",
215215
"properties": {
216216
"subtask": {"title": "Subtask", "type": "string"}
217217
},
218218
"required": ["subtask"],
219-
"title": "search_and_browseArguments",
219+
"title": "execute_subtaskArguments",
220220
},
221221
)
222222
],

0 commit comments

Comments (0)