Skip to content

Commit 64b43bf

Browse files
authored
Merge pull request #28 from MiroMindAI/binwang_dev
feat(benchmark): add new configs and run scripts for mirothinker
2 parents 4ac1dd9 + 535be9c commit 64b43bf

File tree

126 files changed

+6889
-654
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

126 files changed

+6889
-654
lines changed

config/agent_gaia-validation-text-only_mirothinker_single_agent_new_tools.yaml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -11,8 +11,8 @@ main_agent:
1111
_base_: config/llm/base_mirothinker.yaml
1212
prompt: config/prompts/prompt_main_agent.yaml
1313
tools:
14-
- config/tool/tool-search-and-scrape-webpage.yaml
15-
- config/tool/tool-jina-scrape-llm-summary.yaml
14+
- config/tool/tool-serper-search.yaml
15+
- config/tool/tool-jina-scrape.yaml
1616
- config/tool/tool-code.yaml
1717
input_processor:
1818
- ${input-message-generator}

config/agent_web_demo.yaml

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -21,9 +21,9 @@ main_agent:
2121
prompt: config/prompts/prompt_main_agent.yaml
2222

2323
tools:
24-
- config/tool/tool-python.yaml
25-
- config/tool/tool-search-and-scrape-webpage.yaml
26-
- config/tool/tool-jina-scrape-llm-summary.yaml
24+
- config/tool/tool-code-sandbox.yaml
25+
- config/tool/tool-serper-search.yaml
26+
- config/tool/tool-jina-scrape.yaml
2727
- config/tool/tool-reading.yaml
2828
#- config/tool/tool-code.yaml
2929
#- config/tool/tool-image-video.yaml

config/benchmark/browsecomp-en.yaml

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,10 @@ execution:
1414
max_tasks: null # null = no limit, or specify a number
1515
max_concurrent: 5 # Number of parallel tasks
1616
pass_at_k: 1 # Number of attempts per task
17+
max_retry: 5
18+
exceed_max_turn_summary: true
1719

1820
# OpenAI API key for evaluation (required for browsecomp since it has ground truth)
1921
openai_api_key: "${oc.env:OPENAI_API_KEY,???}"
22+
openai_base_url: "${oc.env:OPENAI_BASE_URL,https://api.openai.com/v1}"
2023

config/benchmark/finsearchcomp.yaml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -17,3 +17,4 @@ execution:
1717

1818
# OpenAI API key for evaluation (required for finsearchcomp since it has ground truth)
1919
openai_api_key: "${oc.env:OPENAI_API_KEY,???}"
20+
openai_base_url: "${oc.env:OPENAI_BASE_URL,https://api.openai.com/v1}"

config/benchmark/frames-test.yaml

Lines changed: 22 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,22 @@
1+
# config/benchmark/frames-test.yaml
2+
defaults:
3+
- default
4+
- _self_
5+
6+
name: "frames-test"
7+
8+
data:
9+
data_dir: "${data_dir}/frames-test" # Path to frames-test dataset
10+
metadata_file: "standardized_data.jsonl" # Metadata filename
11+
whitelist: [] # Optional: List of specific task_ids to run
12+
13+
execution:
14+
max_tasks: null # null = no limit, or specify a number
15+
max_concurrent: 5 # Number of parallel tasks
16+
pass_at_k: 1 # Number of attempts per task
17+
max_retry: 5
18+
exceed_max_turn_summary: true
19+
20+
# OpenAI API key for evaluation (required for frames-test since it has ground truth)
21+
openai_api_key: "${oc.env:OPENAI_API_KEY,???}"
22+
openai_base_url: "${oc.env:OPENAI_BASE_URL,https://api.openai.com/v1}"

config/benchmark/futurex.yaml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -17,4 +17,5 @@ execution:
1717

1818
# Set to skip evaluation since we don't have ground truth
1919
openai_api_key: "skip_evaluation"
20+
openai_base_url: "${oc.env:OPENAI_BASE_URL,https://api.openai.com/v1}"
2021

config/benchmark/gaia-test.yaml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -14,3 +14,4 @@ execution:
1414
pass_at_k: 1
1515

1616
openai_api_key: "${oc.env:OPENAI_API_KEY,???}"
17+
openai_base_url: "${oc.env:OPENAI_BASE_URL,https://api.openai.com/v1}"

config/benchmark/gaia-validation.yaml

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -12,5 +12,8 @@ execution:
1212
max_tasks: -1 # null means no limit; -1 is presumably treated the same way — TODO confirm
1313
max_concurrent: 15
1414
pass_at_k: 1
15+
max_retry: 5
16+
exceed_max_turn_summary: true
1517

1618
openai_api_key: "${oc.env:OPENAI_API_KEY,???}"
19+
openai_base_url: "${oc.env:OPENAI_BASE_URL,https://api.openai.com/v1}"

config/benchmark/hle-text-only.yaml

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,10 @@ execution:
1414
max_tasks: null # null = no limit, or specify a number
1515
max_concurrent: 10 # Number of parallel tasks
1616
pass_at_k: 1 # Number of attempts per task
17+
max_retry: 5
18+
exceed_max_turn_summary: true
1719

1820
# OpenAI API key for evaluation (required for hle-text-only since it has ground truth)
1921
openai_api_key: "${oc.env:OPENAI_API_KEY,???}"
22+
openai_base_url: "${oc.env:OPENAI_BASE_URL,https://api.openai.com/v1}"
2023

config/benchmark/hle.yaml

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,10 @@ execution:
1414
max_tasks: null # null = no limit, or specify a number
1515
max_concurrent: 10 # Number of parallel tasks
1616
pass_at_k: 1 # Number of attempts per task
17+
max_retry: 5
18+
exceed_max_turn_summary: true
1719

1820
# OpenAI API key for evaluation (required for hle since it has ground truth)
1921
openai_api_key: "${oc.env:OPENAI_API_KEY,???}"
22+
openai_base_url: "${oc.env:OPENAI_BASE_URL,https://api.openai.com/v1}"
2023

0 commit comments

Comments
 (0)