Skip to content

Commit c0aebff

Browse files
authored
feat(gaia-val-text): add gaia-val-text for mirothinker model (#74)
add test for gaia-validation-text-only for mirothinker
1 parent 9ccab07 commit c0aebff

File tree

2 files changed

+144
-0
lines changed

2 files changed

+144
-0
lines changed
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
defaults:
2+
- benchmark: gaia-validation-text-only
3+
- override hydra/job_logging: none
4+
- _self_ # Allow defining variables at the top of this file
5+
6+
7+
main_agent:
8+
prompt_class: MainAgentPrompt_GAIA
9+
llm:
10+
provider_class: "MiroThinkerSGLangClient"
11+
model_name: "MODEL_NAME"
12+
async_client: true
13+
temperature: 0.3
14+
top_p: 0.95
15+
min_p: 0.0
16+
top_k: -1
17+
max_tokens: 4096
18+
oai_mirothinker_api_key: "${oc.env:OAI_MIROTHINKER_API_KEY,dummy_key}"
19+
oai_mirothinker_base_url: "${oc.env:OAI_MIROTHINKER_BASE_URL,http://localhost:61005/v1}"
20+
keep_tool_result: -1
21+
oai_tool_thinking: false
22+
23+
tool_config:
24+
- tool-reasoning
25+
26+
max_turns: 50 # Maximum number of turns for main agent execution
27+
max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn
28+
29+
input_process:
30+
hint_generation: false
31+
hint_llm_base_url: "${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}"
32+
33+
output_process:
34+
final_answer_extraction: true
35+
final_answer_llm_base_url: "${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}"
36+
37+
openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for hint generation and final answer extraction
38+
add_message_id: true
39+
keep_tool_result: -1
40+
chinese_context: "${oc.env:CHINESE_CONTEXT,false}"
41+
42+
43+
sub_agents:
44+
agent-worker:
45+
prompt_class: SubAgentWorkerPrompt
46+
llm:
47+
provider_class: "MiroThinkerSGLangClient"
48+
model_name: "anthropic/claude-3.7-sonnet"
49+
async_client: true
50+
temperature: 0.3
51+
top_p: 1.0
52+
min_p: 0.0
53+
top_k: -1
54+
max_tokens: 4096
55+
oai_mirothinker_api_key: "${oc.env:OAI_MIROTHINKER_API_KEY,dummy_key}"
56+
oai_mirothinker_base_url: "${oc.env:OAI_MIROTHINKER_BASE_URL,http://localhost:61005/v1}"
57+
keep_tool_result: -1
58+
oai_tool_thinking: false
59+
60+
tool_config:
61+
- tool-searching
62+
- tool-image-video
63+
- tool-reading
64+
- tool-code
65+
- tool-audio
66+
67+
max_turns: 50 # Maximum number of turns for main agent execution
68+
max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn
69+
70+
71+
# Can define some top-level or default parameters here
72+
output_dir: logs/
73+
data_dir: "${oc.env:DATA_DIR,data}" # Points to where data is stored
74+
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
#!/bin/bash

# SPDX-FileCopyrightText: 2025 MiromindAI
#
# SPDX-License-Identifier: Apache-2.0

# Launch NUM_RUNS parallel evaluations of the GAIA validation (text-only)
# benchmark with the MiroThinker agent set, then average the scores.
# Per-run artifacts go to $RESULTS_DIR/run_<i>/ and run_<i>_output.log.
# Env overrides honored: RESULTS_DIR, BENCHMARK_NAME.

# Configuration parameters
NUM_RUNS=3
AGENT_SET="agent_gaia-validation-text-only_mirothinker"
MAX_CONCURRENT=15
# BUGFIX: BENCHMARK_NAME was interpolated below but never set anywhere,
# producing paths like "logs//<agent_set>_<ts>". Default it to the
# benchmark this script targets (overridable from the environment).
BENCHMARK_NAME=${BENCHMARK_NAME:-"gaia-validation-text-only"}

# Results directory is timestamped so repeated invocations don't collide.
TIMESTAMP=$(date +%Y%m%d_%H%M)
RESULTS_DIR=${RESULTS_DIR:-"logs/${BENCHMARK_NAME}/${AGENT_SET}_${TIMESTAMP}"}

echo "Starting $NUM_RUNS runs of the evaluation..."
echo "Results will be saved in: $RESULTS_DIR"

# Create results directory; abort early if we cannot write there.
mkdir -p "$RESULTS_DIR" || { echo "Cannot create $RESULTS_DIR" >&2; exit 1; }

for i in $(seq 1 "$NUM_RUNS"); do
    echo "=========================================="
    echo "Launching experiment $i/$NUM_RUNS"
    echo "=========================================="

    RUN_ID="run_$i"

    (
        # Test the command's exit status directly rather than inspecting $?
        # on the following line (fragile if a statement is ever inserted).
        if uv run main.py common-benchmark \
            --config_file_name="$AGENT_SET" \
            benchmark.execution.max_concurrent="$MAX_CONCURRENT" \
            output_dir="$RESULTS_DIR/$RUN_ID" \
            hydra.run.dir="$RESULTS_DIR/$RUN_ID" \
            > "$RESULTS_DIR/${RUN_ID}_output.log" 2>&1; then
            echo "Run $i completed successfully"
            RESULT_FILE=$(find "$RESULTS_DIR/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1)
            if [ -f "$RESULT_FILE" ]; then
                echo "Results saved to $RESULT_FILE"
            else
                echo "Warning: Result file not found for run $i"
            fi
        else
            echo "Run $i failed!"
        fi
    ) &

    # Stagger launches slightly so the runs don't all initialize at once.
    sleep 2
done

echo "All $NUM_RUNS runs have been launched in parallel"
echo "Waiting for all runs to complete..."

# Barrier: block until every backgrounded run subshell has exited.
wait

echo "=========================================="
echo "All $NUM_RUNS runs completed!"
echo "=========================================="

echo "Calculating average scores..."
uv run main.py avg-score "$RESULTS_DIR"

echo "=========================================="
echo "Multiple runs evaluation completed!"
echo "Check results in: $RESULTS_DIR"
echo "Check individual run logs: $RESULTS_DIR/run_*_output.log"
echo "=========================================="

0 commit comments

Comments
 (0)