Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 74 additions & 0 deletions config/agent_gaia-validation-text-only_mirothinker.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# Hydra defaults list: the configs composed into this one, applied in order.
defaults:
# Select the `gaia-validation-text-only` option of the `benchmark` config group.
- benchmark: gaia-validation-text-only
# Replace Hydra's job_logging config with `none`
# (presumably so Hydra leaves logging setup to the application — confirm).
- override hydra/job_logging: none
# `_self_` is listed last, so values defined in this file are merged after
# (and take precedence over) the entries above.
- _self_ # Allow defining variables at the top of this file


# Main agent: prompt class, LLM backend, tools, turn limits and the
# pre-/post-processing steps applied around its run.
main_agent:
  prompt_class: MainAgentPrompt_GAIA
  llm:
    provider_class: "MiroThinkerSGLangClient"
    # NOTE(review): "MODEL_NAME" looks like a placeholder to be replaced with
    # the name of the served model — confirm before running.
    model_name: "MODEL_NAME"
    async_client: true
    # Sampling parameters passed to the LLM client.
    temperature: 0.3
    top_p: 0.95
    min_p: 0.0
    top_k: -1  # presumably -1 disables top-k filtering — confirm
    max_tokens: 4096
    # Endpoint credentials: read from the environment, with the value after
    # the comma used as the fallback when the variable is unset.
    oai_mirothinker_api_key: "${oc.env:OAI_MIROTHINKER_API_KEY,dummy_key}"
    oai_mirothinker_base_url: "${oc.env:OAI_MIROTHINKER_BASE_URL,http://localhost:61005/v1}"
    keep_tool_result: -1  # presumably -1 keeps every tool result — confirm
    oai_tool_thinking: false

  # Tools exposed to the main agent.
  tool_config:
    - tool-reasoning

  max_turns: 50  # Maximum number of turns for main agent execution
  max_tool_calls_per_turn: 10  # Maximum number of tool calls per turn

  input_process:
    hint_generation: false
    hint_llm_base_url: "${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}"

  output_process:
    final_answer_extraction: true
    final_answer_llm_base_url: "${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}"

  # Used for hint generation and final answer extraction.
  # NOTE(review): the `???` fallback is OmegaConf's missing-value marker;
  # presumably this makes the key mandatory when OPENAI_API_KEY is unset —
  # verify against the OmegaConf version in use.
  openai_api_key: "${oc.env:OPENAI_API_KEY,???}"
  add_message_id: true
  # Agent-level setting, distinct from `llm.keep_tool_result` above.
  keep_tool_result: -1
  chinese_context: "${oc.env:CHINESE_CONTEXT,false}"


# Sub-agents available to the main agent, keyed by agent name.
sub_agents:
  agent-worker:
    prompt_class: SubAgentWorkerPrompt
    llm:
      provider_class: "MiroThinkerSGLangClient"
      # NOTE(review): the main agent uses the placeholder "MODEL_NAME" with
      # this same provider class, while this is a Claude model identifier —
      # confirm this is the model actually served at the endpoint below.
      model_name: "anthropic/claude-3.7-sonnet"
      async_client: true
      # Sampling parameters passed to the LLM client.
      temperature: 0.3
      # NOTE(review): differs from the main agent's top_p (0.95) — confirm
      # the difference is intentional.
      top_p: 1.0
      min_p: 0.0
      top_k: -1  # presumably -1 disables top-k filtering — confirm
      max_tokens: 4096
      # Endpoint credentials: read from the environment, with the value after
      # the comma used as the fallback when the variable is unset.
      oai_mirothinker_api_key: "${oc.env:OAI_MIROTHINKER_API_KEY,dummy_key}"
      oai_mirothinker_base_url: "${oc.env:OAI_MIROTHINKER_BASE_URL,http://localhost:61005/v1}"
      keep_tool_result: -1  # presumably -1 keeps every tool result — confirm
      oai_tool_thinking: false

    # Tools exposed to this sub-agent.
    tool_config:
      - tool-searching
      - tool-image-video
      - tool-reading
      - tool-code
      - tool-audio

    max_turns: 50  # Maximum number of turns for sub-agent execution
    max_tool_calls_per_turn: 10  # Maximum number of tool calls per turn


# Top-level (default) parameters, defined outside of any agent section.
# Presumably the directory where run outputs are written — confirm against the consumer.
output_dir: logs/
# Taken from the DATA_DIR environment variable; falls back to "data" when unset.
data_dir: "${oc.env:DATA_DIR,data}" # Points to where data is stored

Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
#!/bin/bash

# SPDX-FileCopyrightText: 2025 MiromindAI
#
# SPDX-License-Identifier: Apache-2.0

# Launches NUM_RUNS evaluation runs of one agent configuration in parallel,
# waits for all of them, then computes the average score over the runs.
#
# Environment overrides:
#   RESULTS_DIR     directory that receives every run's output and logs
#   BENCHMARK_NAME  path component of the default RESULTS_DIR

# Configuration parameters
NUM_RUNS=3
AGENT_SET="agent_gaia-validation-text-only_mirothinker"
MAX_CONCURRENT=15
# Previously never assigned, which collapsed the default results path to
# "logs//<agent>_<timestamp>". Defaults to the benchmark this agent set targets.
BENCHMARK_NAME=${BENCHMARK_NAME:-"gaia-validation-text-only"}

# Set results directory with timestamp
TIMESTAMP=$(date +%Y%m%d_%H%M)
RESULTS_DIR=${RESULTS_DIR:-"logs/${BENCHMARK_NAME}/${AGENT_SET}_${TIMESTAMP}"}

echo "Starting $NUM_RUNS runs of the evaluation..."
echo "Results will be saved in: $RESULTS_DIR"

# Create results directory
mkdir -p "$RESULTS_DIR"

# PIDs of the background runs, so each exit status can be collected later.
RUN_PIDS=()

for ((i = 1; i <= NUM_RUNS; i++)); do
    echo "=========================================="
    echo "Launching experiment $i/$NUM_RUNS"
    echo "=========================================="

    RUN_ID="run_$i"

    # Each run executes in its own background subshell; its stdout and stderr
    # go to a per-run log file. All expansions are quoted so that a
    # RESULTS_DIR containing spaces is passed as a single argument.
    (
        if uv run main.py common-benchmark \
            --config_file_name="$AGENT_SET" \
            benchmark.execution.max_concurrent="$MAX_CONCURRENT" \
            output_dir="$RESULTS_DIR/$RUN_ID" \
            hydra.run.dir="$RESULTS_DIR/$RUN_ID" \
            > "$RESULTS_DIR/${RUN_ID}_output.log" 2>&1; then
            echo "Run $i completed successfully"
            RESULT_FILE=$(find "$RESULTS_DIR/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -n 1)
            if [ -f "$RESULT_FILE" ]; then
                echo "Results saved to $RESULT_FILE"
            else
                echo "Warning: Result file not found for run $i"
            fi
        else
            echo "Run $i failed!"
            # Propagate the failure so the parent can count it.
            exit 1
        fi
    ) &
    RUN_PIDS+=("$!")

    # Stagger the launches slightly.
    sleep 2
done

echo "All $NUM_RUNS runs have been launched in parallel"
echo "Waiting for all runs to complete..."

# Wait on each run individually: a bare `wait` discards the exit statuses.
FAILED_RUNS=0
for pid in "${RUN_PIDS[@]}"; do
    if ! wait "$pid"; then
        FAILED_RUNS=$((FAILED_RUNS + 1))
    fi
done

echo "=========================================="
echo "All $NUM_RUNS runs completed!"
echo "=========================================="

if [ "$FAILED_RUNS" -gt 0 ]; then
    echo "Warning: $FAILED_RUNS of $NUM_RUNS runs failed; see the run logs for details"
fi

echo "Calculating average scores..."
uv run main.py avg-score "$RESULTS_DIR"

echo "=========================================="
echo "Multiple runs evaluation completed!"
echo "Check results in: $RESULTS_DIR"
echo "Check individual run logs: $RESULTS_DIR/run_*_output.log"
echo "=========================================="