Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ cp .env.template .env
# Edit .env and add your OPENROUTER_API_KEY

# 3. Run your first agent
uv run main.py trace --config_file_name=agent_quickstart_1 --task="What is the first country listed in the XLSX file that have names starting with Co?" --task_file_name="data/FSI-2023-DOWNLOAD.xlsx"
uv run main.py trace --config_file_name=agent_quickstart_reading --task="What is the first country listed in the XLSX file that have names starting with Co?" --task_file_name="data/FSI-2023-DOWNLOAD.xlsx"
```

🎉 **Expected Output:** Your agent should return **\boxed{Congo Democratic Republic}** 😊
Expand Down
2 changes: 1 addition & 1 deletion README_ja.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ cp .env.template .env
# .env を編集して OPENROUTER_API_KEY を追加

# 3. 最初のエージェントを実行
uv run main.py trace --config_file_name=agent_quickstart_1 --task="What is the first country listed in the XLSX file that have names starting with Co?" --task_file_name="data/FSI-2023-DOWNLOAD.xlsx"
uv run main.py trace --config_file_name=agent_quickstart_reading --task="What is the first country listed in the XLSX file that have names starting with Co?" --task_file_name="data/FSI-2023-DOWNLOAD.xlsx"
```

🎉 **想定出力**: エージェントは **\boxed{Congo Democratic Republic}** を返すはずです 😊
Expand Down
2 changes: 1 addition & 1 deletion README_zh.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ cp .env.template .env
# 编辑 .env 并添加您的 OPENROUTER_API_KEY

# 3. 运行您的第一个智能体
uv run main.py trace --config_file_name=agent_quickstart_1 --task="What is the first country listed in the XLSX file that have names starting with Co?" --task_file_name="data/FSI-2023-DOWNLOAD.xlsx"
uv run main.py trace --config_file_name=agent_quickstart_reading --task="What is the first country listed in the XLSX file that have names starting with Co?" --task_file_name="data/FSI-2023-DOWNLOAD.xlsx"
```

🎉 **预期输出**: 您的智能体应该返回 **\boxed{Congo Democratic Republic}** 😊
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ main_agent:
keep_tool_result: -1
oai_tool_thinking: false

tool_config: []
tool_config:
- tool-reading

max_turns: -1 # Maximum number of turns for main agent execution
max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn
Expand All @@ -40,30 +41,7 @@ main_agent:
chinese_context: "${oc.env:CHINESE_CONTEXT,false}"


sub_agents:
agent-worker:
prompt_class: SubAgentWorkerPrompt
llm:
provider_class: "ClaudeOpenRouterClient"
model_name: "anthropic/claude-3.7-sonnet"
async_client: true
temperature: 0.3
top_p: 0.95
min_p: 0.0
top_k: -1
max_tokens: 32000
openrouter_api_key: "${oc.env:OPENROUTER_API_KEY,???}"
openrouter_base_url: "${oc.env:OPENROUTER_BASE_URL,https://openrouter.ai/api/v1}"
openrouter_provider: "anthropic"
disable_cache_control: false
keep_tool_result: -1
oai_tool_thinking: false

tool_config:
- tool-reading

max_turns: -1 # Maximum number of turns for main agent execution
max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn
sub_agents: null


# Can define some top-level or default parameters here
Expand Down
50 changes: 50 additions & 0 deletions config/agent_quickstart_search.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Quickstart agent configuration: a single main agent (no sub-agents) whose
# only configured tool is `tool-searching-serper`.
# NOTE(review): nesting below was reconstructed after indentation was lost;
# confirm it against the sibling quickstart configs before relying on it.
defaults:
  - benchmark: gaia-validation
  - override hydra/job_logging: none
  - _self_  # Allow defining variables at the top of this file


main_agent:
  prompt_class: MainAgentPromptBoxedAnswer
  # LLM client settings. Values of the form ${oc.env:VAR,default} are read
  # from the environment, using the text after the comma when VAR is unset.
  llm:
    provider_class: "ClaudeOpenRouterClient"
    model_name: "anthropic/claude-3.7-sonnet"
    async_client: true
    temperature: 0.3
    top_p: 0.95
    min_p: 0.0
    top_k: -1
    max_tokens: 32000
    openrouter_api_key: "${oc.env:OPENROUTER_API_KEY,???}"
    openrouter_base_url: "${oc.env:OPENROUTER_BASE_URL,https://openrouter.ai/api/v1}"
    openrouter_provider: "anthropic"
    disable_cache_control: false
    # Same key also appears at the main_agent level below; the two live in
    # different mappings, so they are not duplicates.
    keep_tool_result: -1
    oai_tool_thinking: false

  # Tools made available to the main agent.
  tool_config:
    - tool-searching-serper

  max_turns: -1  # Maximum number of turns for main agent execution
  max_tool_calls_per_turn: 10  # Maximum number of tool calls per turn

  input_process:
    hint_generation: false
    hint_llm_base_url: "${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}"
  output_process:
    final_answer_extraction: false
    final_answer_llm_base_url: "${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}"

  openai_api_key: "${oc.env:OPENAI_API_KEY,???}"  # used for hint generation and final answer extraction
  add_message_id: true
  keep_tool_result: -1
  chinese_context: "${oc.env:CHINESE_CONTEXT,false}"


# No sub-agents are defined for this quickstart.
sub_agents: null


# Can define some top-level or default parameters here
output_dir: logs/
data_dir: "${oc.env:DATA_DIR,data}"  # Points to where data is stored

Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,8 @@ main_agent:
oai_tool_thinking: false

tool_config:
- tool-reasoning-os
- tool-searching
- tool-image-video-os
- tool-reading
- tool-code
- tool-audio-os
- tool-searching

max_turns: -1 # Maximum number of turns for main agent execution
max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
name: "tool-serper-search"
name: "tool-searching-serper"
tool_command: "npx"
args:
- "-y"
- "serper-search-scrape-mcp-server"
env:
# Search API key - this value will be loaded from the .env file at runtime
SERPER_API_KEY: "${oc.env:SERPER_API_KEY}"
SERPER_API_KEY: "${oc.env:SERPER_API_KEY}"
4 changes: 2 additions & 2 deletions docs/mkdocs/docs/contribute_benchmarks.md
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ Start with a small subset to verify everything works correctly:

```bash title="Test Benchmark Integration"
uv run main.py common-benchmark \
--config_file_name=agent_quickstart_1 \
--config_file_name=agent_quickstart_reading \
benchmark=your-benchmark \
benchmark.execution.max_tasks=3 \
output_dir="logs/test-your-benchmark/$(date +"%Y%m%d_%H%M")"
Expand All @@ -160,7 +160,7 @@ Once testing passes, run the complete benchmark:

```bash title="Run Full Benchmark"
uv run main.py common-benchmark \
--config_file_name=agent_quickstart_1 \
--config_file_name=agent_quickstart_reading \
benchmark=your-benchmark \
output_dir="logs/your-benchmark/$(date +"%Y%m%d_%H%M")"
```
Expand Down
20 changes: 10 additions & 10 deletions docs/mkdocs/docs/futurex.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,10 +72,10 @@ OPENAI_BASE_URL="https://api.openai.com/v1"
### Step 3: Run the Evaluation

!!! example "Evaluation Execution"
Execute the following command to run evaluation on the Futurex-Online dataset. This uses the basic `agent_quickstart_1` configuration for quick start purposes.
Execute the following command to run evaluation on the Futurex-Online dataset. This uses the basic `agent_quickstart_reading` configuration for quick start purposes.

```bash title="Run Futurex-Online Evaluation"
uv run main.py common-benchmark --config_file_name=agent_quickstart_1 benchmark=futurex output_dir="logs/futurex/$(date +"%Y%m%d_%H%M")"
uv run main.py common-benchmark --config_file_name=agent_quickstart_reading benchmark=futurex output_dir="logs/futurex/$(date +"%Y%m%d_%H%M")"
```

!!! tip "Progress Monitoring and Resume"
Expand All @@ -88,7 +88,7 @@ uv run main.py common-benchmark --config_file_name=agent_quickstart_1 benchmark=
If you need to resume an interrupted evaluation, specify the same output directory to continue from where you left off.

```bash title="Resume Evaluation, e.g."
uv run main.py common-benchmark --config_file_name=agent_quickstart_1 benchmark=futurex output_dir="logs/futurex/20250918_1010"
uv run main.py common-benchmark --config_file_name=agent_quickstart_reading benchmark=futurex output_dir="logs/futurex/20250918_1010"
```

### Step 4: Extract Results
Expand Down Expand Up @@ -184,13 +184,13 @@ Check the generated files for voting analysis:

```bash title="Check Voting Results"
# View submission file with voting results
cat logs/futurex/agent_quickstart_1_*/futurex_submission.jsonl
cat logs/futurex/agent_quickstart_reading_*/futurex_submission.jsonl

# Check individual run results
ls logs/futurex/agent_quickstart_1_*/run_*/
ls logs/futurex/agent_quickstart_reading_*/run_*/

# Check progress and voting statistics
uv run python utils/progress_check/check_futurex_progress.py logs/futurex/agent_quickstart_1_*
uv run python utils/progress_check/check_futurex_progress.py logs/futurex/agent_quickstart_reading_*
```

### Manual Voting Aggregation
Expand All @@ -199,13 +199,13 @@ You can also manually run the voting aggregation:

```bash title="Manual Voting Aggregation"
# Aggregate multiple runs with majority voting
uv run python utils/extract_futurex_results.py logs/futurex/agent_quickstart_1_* --aggregate
uv run python utils/extract_futurex_results.py logs/futurex/agent_quickstart_reading_* --aggregate

# Force single run mode (if needed)
uv run python utils/extract_futurex_results.py logs/futurex/agent_quickstart_1_*/run_1 --single
uv run python utils/extract_futurex_results.py logs/futurex/agent_quickstart_reading_*/run_1 --single

# Specify custom output file
uv run python utils/extract_futurex_results.py logs/futurex/agent_quickstart_1_* -o my_voted_predictions.jsonl
uv run python utils/extract_futurex_results.py logs/futurex/agent_quickstart_reading_* -o my_voted_predictions.jsonl
```

### Voting Output Format
Expand Down Expand Up @@ -249,7 +249,7 @@ For example, `"vote_counts": {"No": 2}` means 2 out of 2 runs predicted "No", in
After running multiple evaluations, you'll find the following structure:

```
logs/futurex/agent_quickstart_1_YYYYMMDD_HHMM/
logs/futurex/agent_quickstart_reading_YYYYMMDD_HHMM/
├── futurex_submission.jsonl # Final voted predictions
├── run_1/ # First run results
│ ├── benchmark_results.jsonl # Individual task results
Expand Down
Loading