#!/bin/bash
# Source: MiroFlow/scripts/archived/run_evaluate_multiple_runs_gaia-test.sh
# (MiroMindAI/MiroFlow @ 6cf0ed10d0c8b054058bf5e6349948b11d552ff9)

# SPDX-FileCopyrightText: 2025 MiromindAI
#
# SPDX-License-Identifier: Apache-2.0

# Run configuration (dual-model agent setup).

# What to evaluate: forwarded to main.py as benchmark=... and agent=...
BENCHMARK_NAME='gaia-test'
AGENT_SET='claude03_claude_dual'

# How much to run: number of evaluation runs started by this script, and the
# value forwarded as benchmark.execution.max_concurrent for each run.
NUM_RUNS=3
MAX_CONCURRENT=20

# Agent behavior overrides.
# MAX_TURNS is forwarded as max_turns for both the main agent and the
# agent-worker sub-agent. NOTE(review): -1 presumably means "no limit" - confirm.
MAX_TURNS=-1
# Forwarded as agent.add_message_id; when "true", a random message ID is added
# to all messages sent to the LLM.
ADD_MESSAGE_ID='true'

#######################################
# Decide whether a benchmark should run with the Chinese context enabled.
# Arguments: $1 - benchmark name
# Returns:   0 if the name contains "xbench" or "-zh", 1 otherwise
#######################################
is_chinese_benchmark() {
    # Substring match (not exact equality) so that every xbench / *-zh
    # benchmark variant is covered, as the rule below describes.
    [[ "$1" == *xbench* || "$1" == *-zh* ]]
}

# Automatically enable the Chinese context when BENCHMARK_NAME contains
# "xbench" or "-zh". CHINESE_CONTEXT is exported so child processes inherit it.
if is_chinese_benchmark "${BENCHMARK_NAME:-}"; then
    export CHINESE_CONTEXT="true"
    echo "检测到中文相关基准测试，已启用中文上下文：CHINESE_CONTEXT=true"
fi

# export REMOVE_SNIPPETS="true"
# export REMOVE_KNOWLEDGE_GRAPH="true"
# export REMOVE_ANSWER_BOX="true"

# Log level, exported so that child processes inherit it.
LOGGER_LEVEL="INFO"
export LOGGER_LEVEL

# Per-benchmark, per-agent directory that receives every run's output and log.
RESULTS_DIR="logs/${BENCHMARK_NAME}/${AGENT_SET}"

# Announce the plan, then make sure the results directory exists.
printf '%s\n' \
    "Starting $NUM_RUNS runs of the evaluation..." \
    "Results will be saved in: $RESULTS_DIR"

mkdir -p "$RESULTS_DIR"

# Launch NUM_RUNS evaluation runs in parallel, wait for all of them, then
# aggregate their scores with `avg-score`.
#
# Each run executes in a background subshell. Its stdout/stderr are written to
# "$RESULTS_DIR/run_<i>_output.log" and its output / Hydra run directory is
# "$RESULTS_DIR/run_<i>". A subshell exits non-zero when its run fails so the
# parent can count failures. The script exits 1 if any run or the averaging
# step failed, and 0 otherwise.
run_pids=()

# Arithmetic loop instead of `seq`: no external process is spawned and
# NUM_RUNS=0 cleanly results in zero iterations.
for ((i = 1; i <= NUM_RUNS; i++)); do
    echo "=========================================="
    echo "Launching experiment $i/$NUM_RUNS"
    echo "=========================================="

    RUN_ID="run_$i"

    (
        # Every override is quoted so a value containing whitespace or glob
        # characters is still passed to main.py as a single argument.
        if uv run main.py common-benchmark \
            "benchmark=$BENCHMARK_NAME" \
            "agent=$AGENT_SET" \
            "agent.add_message_id=$ADD_MESSAGE_ID" \
            "agent.main_agent.max_turns=$MAX_TURNS" \
            "agent.sub_agents.agent-worker.max_turns=$MAX_TURNS" \
            benchmark.execution.max_tasks=null \
            "benchmark.execution.max_concurrent=$MAX_CONCURRENT" \
            benchmark.execution.pass_at_k=1 \
            "output_dir=$RESULTS_DIR/$RUN_ID" \
            "hydra.run.dir=$RESULTS_DIR/$RUN_ID" \
            > "$RESULTS_DIR/${RUN_ID}_output.log" 2>&1; then
            echo "Run $i completed successfully"
            # Report the first file under the run directory whose name ends
            # in "accuracy.txt", if there is one.
            RESULT_FILE=$(find "$RESULTS_DIR/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1)
            if [[ -f "$RESULT_FILE" ]]; then
                echo "Results saved to $RESULT_FILE"
            else
                echo "Warning: Result file not found for run $i"
            fi
        else
            echo "Run $i failed!"
            # Propagate the failure to the parent's `wait`.
            exit 1
        fi
    ) &
    run_pids+=("$!")

    # Stagger the launches by two seconds.
    sleep 2
done

echo "All $NUM_RUNS runs have been launched in parallel"
echo "Waiting for all runs to complete..."

# Wait on each PID individually: a bare `wait` would discard the per-run
# exit statuses.
failed_runs=0
for pid in "${run_pids[@]}"; do
    if ! wait "$pid"; then
        failed_runs=$((failed_runs + 1))
    fi
done

echo "=========================================="
echo "All $NUM_RUNS runs completed!"
echo "=========================================="
if ((failed_runs > 0)); then
    echo "Warning: $failed_runs of $NUM_RUNS runs failed; see $RESULTS_DIR/run_*_output.log" >&2
fi

echo "Calculating average scores..."
avg_status=0
uv run main.py avg-score "$RESULTS_DIR" || avg_status=$?
if ((avg_status != 0)); then
    echo "Error: avg-score exited with status $avg_status" >&2
fi

echo "=========================================="
echo "Multiple runs evaluation completed!"
echo "Check results in: $RESULTS_DIR"
echo "Check individual run logs: $RESULTS_DIR/run_*_output.log"
echo "=========================================="

# Surface failures to the caller instead of always exiting 0.
if ((failed_runs > 0 || avg_status != 0)); then
    exit 1
fi