#!/bin/bash

# SPDX-FileCopyrightText: 2025 MiromindAI
#
# SPDX-License-Identifier: Apache-2.0

# Configuration parameters
NUM_RUNS=3
AGENT_SET="agent_gaia-validation-text-only_mirothinker"
MAX_CONCURRENT=15
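
# Example invocation (the script name here is hypothetical; BENCHMARK_NAME and
# RESULTS_DIR are read from the environment, see below):
#   BENCHMARK_NAME=gaia-validation ./run_multiple_evals.sh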

# Set results directory with timestamp
TIMESTAMP=$(date +%Y%m%d_%H%M)
# BENCHMARK_NAME is never assigned in this script; without it the default path
# collapses to "logs//...". It must come from the caller's environment, so fail fast.
BENCHMARK_NAME=${BENCHMARK_NAME:?must be set in the environment}
RESULTS_DIR=${RESULTS_DIR:-"logs/${BENCHMARK_NAME}/${AGENT_SET}_${TIMESTAMP}"}
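# With the defaults above, RESULTS_DIR expands to e.g.
# logs/<BENCHMARK_NAME>/agent_gaia-validation-text-only_mirothinker_20250713_0930
# (the timestamp shown is illustrative).
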
echo "Starting $NUM_RUNS runs of the evaluation..."
echo "Results will be saved in: $RESULTS_DIR"

# Create results directory
mkdir -p "$RESULTS_DIR"

for i in $(seq 1 "$NUM_RUNS"); do
    echo "=========================================="
    echo "Launching experiment $i/$NUM_RUNS"
    echo "=========================================="

    RUN_ID="run_$i"

    # Launch each run in a background subshell; all of its output is captured
    # in a per-run log file so parallel runs do not interleave on the console.
    (
        if uv run main.py common-benchmark \
            --config_file_name="$AGENT_SET" \
            benchmark.execution.max_concurrent="$MAX_CONCURRENT" \
            output_dir="$RESULTS_DIR/$RUN_ID" \
            hydra.run.dir="$RESULTS_DIR/$RUN_ID" \
            > "$RESULTS_DIR/${RUN_ID}_output.log" 2>&1; then
            echo "Run $i completed successfully"
            RESULT_FILE=$(find "$RESULTS_DIR/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1)
            if [ -f "$RESULT_FILE" ]; then
                echo "Results saved to $RESULT_FILE"
            else
                echo "Warning: Result file not found for run $i"
            fi
        else
            echo "Run $i failed!"
        fi
    ) &

    # Small stagger between launches so the runs do not all start at once
    sleep 2
done

echo "All $NUM_RUNS runs have been launched in parallel"
echo "Waiting for all runs to complete..."

wait
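# Note: a bare "wait" only blocks; it does not report which run failed. If
# per-run exit codes matter, one illustrative sketch is to collect PIDs in the
# loop with PIDS+=("$!") right after ") &", then:
#   for pid in "${PIDS[@]}"; do wait "$pid" || echo "PID $pid exited non-zero"; done
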
echo "=========================================="
echo "All $NUM_RUNS runs completed!"
echo "=========================================="

echo "Calculating average scores..."
uv run main.py avg-score "$RESULTS_DIR"
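# avg-score presumably scans the run_*/ subdirectories under RESULTS_DIR and
# averages their per-run accuracy files; see the project's main.py for details.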

echo "=========================================="
echo "Multiple runs evaluation completed!"
echo "Check results in: $RESULTS_DIR"
echo "Check individual run logs: $RESULTS_DIR/run_*_output.log"
echo "=========================================="