|
| 1 | +#!/bin/bash |
| 2 | + |
| 3 | +# SPDX-FileCopyrightText: 2025 MiromindAI |
| 4 | +# |
| 5 | +# SPDX-License-Identifier: Apache-2.0 |
| 6 | + |
# Configuration parameters.
# Every value may be overridden from the environment (as RESULTS_DIR already
# could be); the defaults are the values that used to be hard-coded here.
NUM_RUNS=${NUM_RUNS:-3}
BENCHMARK_NAME=${BENCHMARK_NAME:-"browsecomp-en-200"}
AGENT_SET=${AGENT_SET:-"fangda_agent_browsecomp-en-200_mirothinker_single_agent_rollback_new_tools_toolblacklist"}
MAX_CONCURRENT=${MAX_CONCURRENT:-50}

# Results directory: suffixed with a minute-resolution timestamp unless the
# caller supplies RESULTS_DIR explicitly.
TIMESTAMP=$(date +%Y%m%d_%H%M)
RESULTS_DIR=${RESULTS_DIR:-"logs/${BENCHMARK_NAME}/${AGENT_SET}_${TIMESTAMP}"}

# Array to track child PIDs
declare -a CHILD_PIDS=()
| 19 | + |
# Send signal $1 to target $2: as a process group when $2 leads one,
# otherwise to the single process with that PID.
_cleanup_signal() {
  kill -s "$1" -- "-$2" 2>/dev/null || kill -s "$1" "$2" 2>/dev/null
}

# Succeed while process $1, or any member of a process group led by $1,
# can still be signalled.
_cleanup_alive() {
  kill -0 "$1" 2>/dev/null || kill -0 -- "-$1" 2>/dev/null
}

#######################################
# Signal handler: stop every launched run, then exit with status 130.
#
# Globals:
#   CHILD_PIDS (read) - PIDs recorded for the background launchers.
# Outputs:
#   Progress messages on stdout.
# Returns:
#   Does not return; exits the script with status 130.
#
# A recorded PID is not necessarily a process-group leader: when job control
# is enabled inside a launcher subshell instead of in the script itself, only
# the command started by that subshell gets a group of its own. Signalling
# just "-$pid" then reaches nothing. To cope with either layout, each
# launcher's direct children are collected as well, and every target is
# signalled as a group first, falling back to the plain PID.
#######################################
cleanup() {
  local pid child target attempt
  local -a targets=()
  local -a survivors=()

  echo ""
  echo "Received interrupt signal, terminating all processes..."

  # Snapshot all targets before signalling anything: once a launcher dies its
  # children are re-parented and can no longer be found through it.
  for pid in "${CHILD_PIDS[@]}"; do
    kill -0 "$pid" 2>/dev/null || continue
    targets+=("$pid")
    if command -v pgrep >/dev/null 2>&1; then
      while IFS= read -r child; do
        if [[ -n "$child" ]]; then
          targets+=("$child")
        fi
      done < <(pgrep -P "$pid" 2>/dev/null)
    fi
  done

  for target in "${targets[@]}"; do
    echo "Killing process group $target"
    _cleanup_signal TERM "$target"
  done

  # Allow up to ~2 seconds for graceful shutdown, but stop waiting as soon as
  # nothing is left to wait for.
  for ((attempt = 0; attempt < 20; attempt++)); do
    survivors=()
    for target in "${targets[@]}"; do
      if _cleanup_alive "$target"; then
        survivors+=("$target")
      fi
    done
    if ((${#survivors[@]} == 0)); then
      break
    fi
    sleep 0.1
  done

  # Force kill any remaining processes
  for target in "${survivors[@]}"; do
    if _cleanup_alive "$target"; then
      echo "Force killing process group $target"
      _cleanup_signal KILL "$target"
    fi
  done

  echo "All processes terminated."
  exit 130
}
| 41 | + |
# Route Ctrl+C and termination requests through cleanup so the background
# runs are stopped along with this script.
trap cleanup SIGINT SIGTERM

echo "Starting $NUM_RUNS runs of the evaluation..."
echo "Results will be saved in: $RESULTS_DIR"

# Create results directory; every run redirects its log into it, so there is
# no point continuing if it cannot be created.
if ! mkdir -p -- "$RESULTS_DIR"; then
  echo "Error: cannot create results directory: $RESULTS_DIR" >&2
  exit 1
fi

for ((i = 1; i <= NUM_RUNS; i++)); do
  echo "=========================================="
  echo "Launching experiment $i/$NUM_RUNS"
  echo "=========================================="

  RUN_ID="run_$i"

  # Each run executes in a background subshell so all runs proceed in parallel.
  (
    # Enabling job control here places the command started below in a process
    # group of its own.
    # NOTE(review): that group is led by the command's PID, not by the
    # subshell PID recorded in CHILD_PIDS - confirm the interrupt handler
    # accounts for this.
    set -m
    uv run tests/test_benchmark.py \
      --config-path "config/${AGENT_SET}.yaml" \
      "benchmark.execution.max_concurrent=${MAX_CONCURRENT}" \
      "output_dir=${RESULTS_DIR}/${RUN_ID}" \
      > "${RESULTS_DIR}/${RUN_ID}_output.log" 2>&1

    EXIT_CODE=$?
    if [[ "$EXIT_CODE" -eq 0 ]]; then
      echo "Run $i completed successfully"
      RESULT_FILE=$(find "${RESULTS_DIR}/${RUN_ID}" -name "*accuracy.txt" 2>/dev/null | head -1)
      if [[ -f "$RESULT_FILE" ]]; then
        echo "Results saved to $RESULT_FILE"
      else
        echo "Warning: Result file not found for run $i"
      fi
    else
      # Check if we have JSON result files (task completed but evaluator had issues)
      JSON_COUNT=$(find "${RESULTS_DIR}/${RUN_ID}" -name "task_*.json" 2>/dev/null | wc -l)
      if [[ "$JSON_COUNT" -gt 0 ]]; then
        echo "Run $i finished with exit code $EXIT_CODE but generated $JSON_COUNT task logs"
      else
        echo "Run $i failed with exit code $EXIT_CODE"
      fi
    fi
  ) &

  # Record the PID of the background subshell just launched
  CHILD_PIDS+=("$!")

  # Stagger the launches by two seconds
  sleep 2
done

echo "All $NUM_RUNS runs have been launched in parallel"
echo "Child PIDs: ${CHILD_PIDS[*]}"
echo "Waiting for all runs to complete..."
echo "Press Ctrl+C to terminate all processes"

# Block until every background run has finished
wait

echo "=========================================="
echo "All $NUM_RUNS runs completed!"
echo "=========================================="

echo "Calculating average scores..."
# Pass the directory as an argument rather than splicing it into the Python
# source: RESULTS_DIR can come from the environment, and a quote or backslash
# in it would otherwise break (or inject into) the generated code.
uv run python -c 'import sys; from src.utils.calculate_average_score import main; main(sys.argv[1])' "$RESULTS_DIR"

echo "=========================================="
echo "Multiple runs evaluation completed!"
echo "Check results in: $RESULTS_DIR"
echo "Check individual run logs: $RESULTS_DIR/run_*_output.log"
echo "=========================================="
0 commit comments