Skip to content

Commit 2e1be61

Browse files
committed
feat(benchmark): add browsecomp-en-200 and browsecomp-zh configs
1 parent 0c27c0f commit 2e1be61

6 files changed

+336
-2
lines changed
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# config/benchmark/browsecomp-en.yaml
# Hydra benchmark config for the English browsecomp-200 dataset.
defaults:
  - default
  - _self_

name: "browsecomp-en-200"

data:
  data_dir: "${data_dir}/browsecomp-200" # Path to browsecomp-200 (English) dataset
  metadata_file: "standardized_data.jsonl" # Metadata filename
  whitelist: [] # Optional: List of specific task_ids to run

execution:
  max_tasks: null # null = no limit, or specify a number
  max_concurrent: 5 # Number of parallel tasks
  pass_at_k: 1 # Number of attempts per task

# OpenAI API key for evaluation (required for browsecomp since it has ground truth)
openai_api_key: "${oc.env:OPENAI_API_KEY,???}"
openai_base_url: "${oc.env:OPENAI_BASE_URL,https://api.openai.com/v1}"
21+

config/benchmark/browsecomp-zh.yaml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,9 @@ execution:
1414
max_tasks: null # null = no limit, or specify a number
1515
max_concurrent: 5 # Number of parallel tasks
1616
pass_at_k: 1 # Number of attempts per task
17+
max_retry: 5
18+
exceed_max_turn_summary: true
1719

1820
# OpenAI API key for evaluation (required for browsecomp-zh since it has ground truth)
1921
openai_api_key: "${oc.env:OPENAI_API_KEY,???}"
20-
21-
22+
openai_base_url: "${oc.env:OPENAI_BASE_URL,https://api.openai.com/v1}"
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
# Hydra agent config: single MiroThinker agent with rollback, run against
# the browsecomp-en-200 benchmark.
defaults:
  - benchmark: browsecomp-en-200
  - override hydra/job_logging: none
  - _self_

entrypoint: main_agent
main_agent:
  name: main_agent
  type: IterativeAgentWithToolAndRollback
  max_consecutive_rollbacks: 3
  max_turns: 400
  llm:
    _base_: config/llm/base_mirothinker.yaml
  prompt: config/prompts/fangda_prompt_main_agent_0128.yaml
  tools:
    - config/tool/tool-search-and-scrape-webpage.yaml
    - config/tool/tool-jina-scrape-llm-summary.yaml
    - config/tool/tool-python.yaml
  # Disable individual tools within the listed tool servers.
  tool_blacklist:
    - server: "tool-search-and-scrape-webpage"
      tool: "sogou_search"
    - server: "tool-python"
      tool: "download_file_from_sandbox_to_local"
  input_processor:
    - ${input-message-generator}
  output_processor:
    - ${output-summary}
    - ${output-final-answer-extraction}
    - ${output-exceed-max-turn-summary}

input-message-generator:
  type: InputMessageGenerator
output-summary:
  type: SummaryGenerator
output-final-answer-extraction:
  type: RegexBoxedExtractor
output-exceed-max-turn-summary:
  type: ExceedMaxTurnSummaryGenerator
  prompt: config/prompts/fangda_prompt_main_agent_0128.yaml
  llm:
    _base_: config/llm/base_mirothinker.yaml

output_dir: logs/
data_dir: "${oc.env:DATA_DIR,data}"
45+
46+
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
# Hydra agent config: single MiroThinker agent with rollback, run against
# the browsecomp-zh benchmark. Identical to the -en-200 variant except for
# the benchmark default.
defaults:
  - benchmark: browsecomp-zh
  - override hydra/job_logging: none
  - _self_

entrypoint: main_agent
main_agent:
  name: main_agent
  type: IterativeAgentWithToolAndRollback
  max_consecutive_rollbacks: 3
  max_turns: 400
  llm:
    _base_: config/llm/base_mirothinker.yaml
  prompt: config/prompts/fangda_prompt_main_agent_0128.yaml
  tools:
    - config/tool/tool-search-and-scrape-webpage.yaml
    - config/tool/tool-jina-scrape-llm-summary.yaml
    - config/tool/tool-python.yaml
  # Disable individual tools within the listed tool servers.
  tool_blacklist:
    - server: "tool-search-and-scrape-webpage"
      tool: "sogou_search"
    - server: "tool-python"
      tool: "download_file_from_sandbox_to_local"
  input_processor:
    - ${input-message-generator}
  output_processor:
    - ${output-summary}
    - ${output-final-answer-extraction}
    - ${output-exceed-max-turn-summary}

input-message-generator:
  type: InputMessageGenerator
output-summary:
  type: SummaryGenerator
output-final-answer-extraction:
  type: RegexBoxedExtractor
output-exceed-max-turn-summary:
  type: ExceedMaxTurnSummaryGenerator
  prompt: config/prompts/fangda_prompt_main_agent_0128.yaml
  llm:
    _base_: config/llm/base_mirothinker.yaml

output_dir: logs/
data_dir: "${oc.env:DATA_DIR,data}"
45+
46+
Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
#!/bin/bash

# SPDX-FileCopyrightText: 2025 MiromindAI
#
# SPDX-License-Identifier: Apache-2.0

# Launch NUM_RUNS parallel evaluation runs of the browsecomp-en-200 benchmark
# and average the scores across runs once all of them finish.

# Configuration parameters
NUM_RUNS=3
BENCHMARK_NAME="browsecomp-en-200"
AGENT_SET="fangda_agent_browsecomp-en-200_mirothinker_single_agent_rollback_new_tools_toolblacklist"
MAX_CONCURRENT=50

# Set results directory with timestamp (overridable via the RESULTS_DIR env var)
TIMESTAMP=$(date +%Y%m%d_%H%M)
RESULTS_DIR=${RESULTS_DIR:-"logs/${BENCHMARK_NAME}/${AGENT_SET}_${TIMESTAMP}"}

# Array to track child PIDs
declare -a CHILD_PIDS=()

# On SIGINT/SIGTERM: TERM every tracked process group, give them a grace
# period, then KILL any survivors; exit 130 (conventional SIGINT status).
cleanup() {
  echo ""
  echo "Received interrupt signal, terminating all processes..."
  for pid in "${CHILD_PIDS[@]}"; do
    if kill -0 "$pid" 2>/dev/null; then
      echo "Killing process group $pid"
      # NOTE(review): this assumes $pid leads a process group; with
      # '( set -m; ... ) &' the inner command's pgrp can differ from $! —
      # confirm signals actually reach the uv child on this platform.
      kill -TERM -"$pid" 2>/dev/null
    fi
  done
  # Wait a moment for graceful shutdown
  sleep 2
  # Force kill any remaining processes
  for pid in "${CHILD_PIDS[@]}"; do
    if kill -0 "$pid" 2>/dev/null; then
      echo "Force killing process group $pid"
      kill -KILL -"$pid" 2>/dev/null
    fi
  done
  echo "All processes terminated."
  exit 130
}

trap cleanup SIGINT SIGTERM

echo "Starting $NUM_RUNS runs of the evaluation..."
echo "Results will be saved in: $RESULTS_DIR"

# Create results directory
mkdir -p "$RESULTS_DIR"

for ((i = 1; i <= NUM_RUNS; i++)); do
  echo "=========================================="
  echo "Launching experiment $i/$NUM_RUNS"
  echo "=========================================="

  RUN_ID="run_$i"

  # Start process in new process group (set -m enables job control in the subshell)
  (
    set -m
    uv run tests/test_benchmark.py \
      --config-path "config/${AGENT_SET}.yaml" \
      benchmark.execution.max_concurrent="$MAX_CONCURRENT" \
      output_dir="$RESULTS_DIR/$RUN_ID" \
      > "$RESULTS_DIR/${RUN_ID}_output.log" 2>&1

    EXIT_CODE=$?
    if [ "$EXIT_CODE" -eq 0 ]; then
      echo "Run $i completed successfully"
      RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1)
      if [ -f "$RESULT_FILE" ]; then
        echo "Results saved to $RESULT_FILE"
      else
        echo "Warning: Result file not found for run $i"
      fi
    else
      # Check if we have JSON result files (task completed but evaluator had issues)
      JSON_COUNT=$(find "${RESULTS_DIR}/$RUN_ID" -name "task_*.json" 2>/dev/null | wc -l)
      if [ "$JSON_COUNT" -gt 0 ]; then
        echo "Run $i finished with exit code $EXIT_CODE but generated $JSON_COUNT task logs"
      else
        echo "Run $i failed with exit code $EXIT_CODE"
      fi
    fi
  ) &

  # Get the PID and store it
  CHILD_PIDS+=($!)

  # Stagger launches slightly so runs do not race on startup
  sleep 2
done

echo "All $NUM_RUNS runs have been launched in parallel"
echo "Child PIDs: ${CHILD_PIDS[*]}"
echo "Waiting for all runs to complete..."
echo "Press Ctrl+C to terminate all processes"

wait

echo "=========================================="
echo "All $NUM_RUNS runs completed!"
echo "=========================================="

echo "Calculating average scores..."
uv run python -c "from src.utils.calculate_average_score import main; main('$RESULTS_DIR')"

echo "=========================================="
echo "Multiple runs evaluation completed!"
echo "Check results in: $RESULTS_DIR"
echo "Check individual run logs: $RESULTS_DIR/run_*_output.log"
echo "=========================================="
Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
#!/bin/bash

# SPDX-FileCopyrightText: 2025 MiromindAI
#
# SPDX-License-Identifier: Apache-2.0

# Launch NUM_RUNS parallel evaluation runs of the browsecomp-zh benchmark
# and average the scores across runs once all of them finish.

# Configuration parameters
NUM_RUNS=3
BENCHMARK_NAME="browsecomp-zh"
AGENT_SET="fangda_agent_browsecomp-zh_mirothinker_single_agent_rollback_new_tools_toolblacklist"
MAX_CONCURRENT=50

# Set results directory with timestamp (overridable via the RESULTS_DIR env var)
TIMESTAMP=$(date +%Y%m%d_%H%M)
RESULTS_DIR=${RESULTS_DIR:-"logs/${BENCHMARK_NAME}/${AGENT_SET}_${TIMESTAMP}"}

# Array to track child PIDs
declare -a CHILD_PIDS=()

# On SIGINT/SIGTERM: TERM every tracked process group, give them a grace
# period, then KILL any survivors; exit 130 (conventional SIGINT status).
cleanup() {
  echo ""
  echo "Received interrupt signal, terminating all processes..."
  for pid in "${CHILD_PIDS[@]}"; do
    if kill -0 "$pid" 2>/dev/null; then
      echo "Killing process group $pid"
      # NOTE(review): this assumes $pid leads a process group; with
      # '( set -m; ... ) &' the inner command's pgrp can differ from $! —
      # confirm signals actually reach the uv child on this platform.
      kill -TERM -"$pid" 2>/dev/null
    fi
  done
  # Wait a moment for graceful shutdown
  sleep 2
  # Force kill any remaining processes
  for pid in "${CHILD_PIDS[@]}"; do
    if kill -0 "$pid" 2>/dev/null; then
      echo "Force killing process group $pid"
      kill -KILL -"$pid" 2>/dev/null
    fi
  done
  echo "All processes terminated."
  exit 130
}

trap cleanup SIGINT SIGTERM

echo "Starting $NUM_RUNS runs of the evaluation..."
echo "Results will be saved in: $RESULTS_DIR"

# Create results directory
mkdir -p "$RESULTS_DIR"

for ((i = 1; i <= NUM_RUNS; i++)); do
  echo "=========================================="
  echo "Launching experiment $i/$NUM_RUNS"
  echo "=========================================="

  RUN_ID="run_$i"

  # Start process in new process group (set -m enables job control in the subshell)
  (
    set -m
    uv run tests/test_benchmark.py \
      --config-path "config/${AGENT_SET}.yaml" \
      benchmark.execution.max_concurrent="$MAX_CONCURRENT" \
      output_dir="$RESULTS_DIR/$RUN_ID" \
      > "$RESULTS_DIR/${RUN_ID}_output.log" 2>&1

    EXIT_CODE=$?
    if [ "$EXIT_CODE" -eq 0 ]; then
      echo "Run $i completed successfully"
      RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1)
      if [ -f "$RESULT_FILE" ]; then
        echo "Results saved to $RESULT_FILE"
      else
        echo "Warning: Result file not found for run $i"
      fi
    else
      # Check if we have JSON result files (task completed but evaluator had issues)
      JSON_COUNT=$(find "${RESULTS_DIR}/$RUN_ID" -name "task_*.json" 2>/dev/null | wc -l)
      if [ "$JSON_COUNT" -gt 0 ]; then
        echo "Run $i finished with exit code $EXIT_CODE but generated $JSON_COUNT task logs"
      else
        echo "Run $i failed with exit code $EXIT_CODE"
      fi
    fi
  ) &

  # Get the PID and store it
  CHILD_PIDS+=($!)

  # Stagger launches slightly so runs do not race on startup
  sleep 2
done

echo "All $NUM_RUNS runs have been launched in parallel"
echo "Child PIDs: ${CHILD_PIDS[*]}"
echo "Waiting for all runs to complete..."
echo "Press Ctrl+C to terminate all processes"

wait

echo "=========================================="
echo "All $NUM_RUNS runs completed!"
echo "=========================================="

echo "Calculating average scores..."
uv run python -c "from src.utils.calculate_average_score import main; main('$RESULTS_DIR')"

echo "=========================================="
echo "Multiple runs evaluation completed!"
echo "Check results in: $RESULTS_DIR"
echo "Check individual run logs: $RESULTS_DIR/run_*_output.log"
echo "=========================================="

0 commit comments

Comments
 (0)