Skip to content

Commit c0aebff

Browse files
authored
feat(gaia-val-text): add gaia-val-text for mirothinker model (#74)
add test for gaia-validation-text-only for mirothinker
1 parent 9ccab07 commit c0aebff

File tree

2 files changed

+144
-0
lines changed

2 files changed

+144
-0
lines changed
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
defaults:
2+
- benchmark: gaia-validation-text-only
3+
- override hydra/job_logging: none
4+
- _self_ # Allow defining variables at the top of this file
5+
6+
7+
main_agent:
8+
prompt_class: MainAgentPrompt_GAIA
9+
llm:
10+
provider_class: "MiroThinkerSGLangClient"
11+
model_name: "MODEL_NAME"
12+
async_client: true
13+
temperature: 0.3
14+
top_p: 0.95
15+
min_p: 0.0
16+
top_k: -1
17+
max_tokens: 4096
18+
oai_mirothinker_api_key: "${oc.env:OAI_MIROTHINKER_API_KEY,dummy_key}"
19+
oai_mirothinker_base_url: "${oc.env:OAI_MIROTHINKER_BASE_URL,http://localhost:61005/v1}"
20+
keep_tool_result: -1
21+
oai_tool_thinking: false
22+
23+
tool_config:
24+
- tool-reasoning
25+
26+
max_turns: 50 # Maximum number of turns for main agent execution
27+
max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn
28+
29+
input_process:
30+
hint_generation: false
31+
hint_llm_base_url: "${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}"
32+
33+
output_process:
34+
final_answer_extraction: true
35+
final_answer_llm_base_url: "${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}"
36+
37+
openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for hint generation and final answer extraction
38+
add_message_id: true
39+
keep_tool_result: -1
40+
chinese_context: "${oc.env:CHINESE_CONTEXT,false}"
41+
42+
43+
sub_agents:
44+
agent-worker:
45+
prompt_class: SubAgentWorkerPrompt
46+
llm:
47+
provider_class: "MiroThinkerSGLangClient"
48+
model_name: "anthropic/claude-3.7-sonnet"
49+
async_client: true
50+
temperature: 0.3
51+
top_p: 1.0
52+
min_p: 0.0
53+
top_k: -1
54+
max_tokens: 4096
55+
oai_mirothinker_api_key: "${oc.env:OAI_MIROTHINKER_API_KEY,dummy_key}"
56+
oai_mirothinker_base_url: "${oc.env:OAI_MIROTHINKER_BASE_URL,http://localhost:61005/v1}"
57+
keep_tool_result: -1
58+
oai_tool_thinking: false
59+
60+
tool_config:
61+
- tool-searching
62+
- tool-image-video
63+
- tool-reading
64+
- tool-code
65+
- tool-audio
66+
67+
max_turns: 50 # Maximum number of turns for main agent execution
68+
max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn
69+
70+
71+
# Can define some top-level or default parameters here
72+
output_dir: logs/
73+
data_dir: "${oc.env:DATA_DIR,data}" # Points to where data is stored
74+
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
#!/bin/bash

# SPDX-FileCopyrightText: 2025 MiromindAI
#
# SPDX-License-Identifier: Apache-2.0

# Launch NUM_RUNS parallel evaluations of the GAIA validation (text-only)
# benchmark with the MiroThinker agent set, then average the scores.
# Per-run artifacts go to $RESULTS_DIR/run_<i>/ and run_<i>_output.log.
# Env overrides honored: RESULTS_DIR, BENCHMARK_NAME.

# Configuration parameters
NUM_RUNS=3
AGENT_SET="agent_gaia-validation-text-only_mirothinker"
MAX_CONCURRENT=15
# BUGFIX: BENCHMARK_NAME was interpolated below but never set anywhere,
# producing paths like "logs//<agent_set>_<ts>". Default it to the
# benchmark this script targets (overridable from the environment).
BENCHMARK_NAME=${BENCHMARK_NAME:-"gaia-validation-text-only"}

# Results directory is timestamped so repeated invocations don't collide.
TIMESTAMP=$(date +%Y%m%d_%H%M)
RESULTS_DIR=${RESULTS_DIR:-"logs/${BENCHMARK_NAME}/${AGENT_SET}_${TIMESTAMP}"}

echo "Starting $NUM_RUNS runs of the evaluation..."
echo "Results will be saved in: $RESULTS_DIR"

# Create results directory; abort early if we cannot write there.
mkdir -p "$RESULTS_DIR" || { echo "Cannot create $RESULTS_DIR" >&2; exit 1; }

for i in $(seq 1 "$NUM_RUNS"); do
    echo "=========================================="
    echo "Launching experiment $i/$NUM_RUNS"
    echo "=========================================="

    RUN_ID="run_$i"

    (
        # Test the command's exit status directly rather than inspecting $?
        # on the following line (fragile if a statement is ever inserted).
        if uv run main.py common-benchmark \
            --config_file_name="$AGENT_SET" \
            benchmark.execution.max_concurrent="$MAX_CONCURRENT" \
            output_dir="$RESULTS_DIR/$RUN_ID" \
            hydra.run.dir="$RESULTS_DIR/$RUN_ID" \
            > "$RESULTS_DIR/${RUN_ID}_output.log" 2>&1; then
            echo "Run $i completed successfully"
            RESULT_FILE=$(find "$RESULTS_DIR/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1)
            if [ -f "$RESULT_FILE" ]; then
                echo "Results saved to $RESULT_FILE"
            else
                echo "Warning: Result file not found for run $i"
            fi
        else
            echo "Run $i failed!"
        fi
    ) &

    # Stagger launches slightly so the runs don't all initialize at once.
    sleep 2
done

echo "All $NUM_RUNS runs have been launched in parallel"
echo "Waiting for all runs to complete..."

# Barrier: block until every backgrounded run subshell has exited.
wait

echo "=========================================="
echo "All $NUM_RUNS runs completed!"
echo "=========================================="

echo "Calculating average scores..."
uv run main.py avg-score "$RESULTS_DIR"

echo "=========================================="
echo "Multiple runs evaluation completed!"
echo "Check results in: $RESULTS_DIR"
echo "Check individual run logs: $RESULTS_DIR/run_*_output.log"
echo "=========================================="

0 commit comments

Comments
 (0)