Skip to content

Commit 156112e

Browse files
feat(doc): add tool-vqa and tool-reasoning docs
2 parents 9ee78ab + d041bbc commit 156112e

19 files changed

+1478
-6
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,7 @@ marimo/_lsp/
208208
__marimo__/
209209

210210
logs/
211+
tmp/
211212

212213
data/*
213214
!data/README.md

config/agent_gaia-test.yaml

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
defaults:
2+
- benchmark: gaia-test
3+
- override hydra/job_logging: none
4+
- _self_ # Allow defining variables at the top of this file
5+
6+
7+
main_agent:
8+
prompt_class: MainAgentPrompt_GAIA
9+
llm:
10+
provider_class: "ClaudeOpenRouterClient"
11+
model_name: "anthropic/claude-3.7-sonnet"
12+
async_client: true
13+
temperature: 0.3
14+
top_p: 0.95
15+
min_p: 0.0
16+
top_k: -1
17+
max_tokens: 32000
18+
openrouter_api_key: "${oc.env:OPENROUTER_API_KEY,???}"
19+
openrouter_base_url: "${oc.env:OPENROUTER_BASE_URL,https://openrouter.ai/api/v1}"
20+
openrouter_provider: "anthropic"
21+
disable_cache_control: false
22+
keep_tool_result: -1
23+
oai_tool_thinking: false
24+
25+
tool_config:
26+
- tool-reasoning
27+
28+
max_turns: -1 # Maximum number of turns for main agent execution
29+
max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn
30+
31+
input_process:
32+
o3_hint: true
33+
output_process:
34+
o3_final_answer: true
35+
36+
openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for o3 hints and final answer extraction
37+
add_message_id: true
38+
keep_tool_result: -1
39+
chinese_context: "${oc.env:CHINESE_CONTEXT,false}"
40+
41+
42+
sub_agents:
43+
agent-worker:
44+
prompt_class: SubAgentWorkerPrompt
45+
llm:
46+
provider_class: "ClaudeOpenRouterClient"
47+
model_name: "anthropic/claude-3.7-sonnet"
48+
async_client: true
49+
temperature: 0.3
50+
top_p: 0.95
51+
min_p: 0.0
52+
top_k: -1
53+
max_tokens: 32000
54+
openrouter_api_key: "${oc.env:OPENROUTER_API_KEY,???}"
55+
openrouter_base_url: "${oc.env:OPENROUTER_BASE_URL,https://openrouter.ai/api/v1}"
56+
openrouter_provider: "anthropic"
57+
disable_cache_control: false
58+
keep_tool_result: -1
59+
oai_tool_thinking: false
60+
61+
tool_config:
62+
- tool-searching
63+
- tool-image-video
64+
- tool-reading
65+
- tool-code
66+
- tool-audio
67+
68+
max_turns: -1 # Maximum number of turns for sub-agent execution
69+
max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn
70+
71+
72+
# Can define some top-level or default parameters here
73+
output_dir: logs/
74+
data_dir: "${oc.env:DATA_DIR,data}" # Points to where data is stored
75+

config/benchmark/futurex.yaml

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# config/benchmark/futurex.yaml
2+
defaults:
3+
- default
4+
- _self_
5+
6+
name: "futurex"
7+
8+
data:
9+
data_dir: "${data_dir}/futurex" # Path to your dataset
10+
metadata_file: "standardized_data.jsonl" # Metadata filename
11+
whitelist: [] # Optional: List of specific task_ids to run
12+
13+
execution:
14+
max_tasks: null # null = no limit, or specify a number
15+
max_concurrent: 5 # Number of parallel tasks
16+
pass_at_k: 1 # Number of attempts per task
17+
18+
# Set to skip evaluation since we don't have ground truth
19+
openai_api_key: "skip_evaluation"
20+

config/benchmark/gaia-test.yaml

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# config/benchmark/gaia-test.yaml
2+
defaults:
3+
- default
4+
- _self_
5+
6+
name: "gaia-test"
7+
8+
data:
9+
data_dir: "${data_dir}/gaia-test"
10+
11+
execution:
12+
max_tasks: null # null means no limit
13+
max_concurrent: 10
14+
pass_at_k: 1
15+
16+
openai_api_key: "${oc.env:OPENAI_API_KEY,???}"

docs/mkdocs/docs/download_datasets.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ uv run main.py prepare-benchmark get browsecomp-test
7979
uv run main.py prepare-benchmark get browsecomp-zh-test
8080
uv run main.py prepare-benchmark get hle
8181
uv run main.py prepare-benchmark get xbench-ds
82+
uv run main.py prepare-benchmark get futurex
8283
```
8384

8485
### What This Script Does
@@ -94,6 +95,7 @@ uv run main.py prepare-benchmark get xbench-ds
9495
- `browsecomp-zh-test` - Chinese BrowseComp test set
9596
- `hle` - HLE dataset
9697
- `xbench-ds` - xbench-DeepSearch dataset
98+
- `futurex` - Futurex-Online dataset
9799

98100
### Customizing Dataset Selection
99101

0 commit comments

Comments
 (0)