Skip to content

Commit 2e1be61

Browse files
committed
feat(benchmark): add browsecomp-en-200 and browsecomp-zh configs
1 parent 0c27c0f commit 2e1be61

6 files changed

+336
-2
lines changed
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# config/benchmark/browsecomp-en.yaml
# Hydra benchmark config for the English browsecomp-200 dataset.
defaults:
  - default
  - _self_

name: "browsecomp-en-200"

data:
  data_dir: "${data_dir}/browsecomp-200" # Path to browsecomp-200 (English) dataset
  metadata_file: "standardized_data.jsonl" # Metadata filename
  whitelist: [] # Optional: List of specific task_ids to run

execution:
  max_tasks: null # null = no limit, or specify a number
  max_concurrent: 5 # Number of parallel tasks
  pass_at_k: 1 # Number of attempts per task

# OpenAI API key for evaluation (required for browsecomp since it has ground truth)
openai_api_key: "${oc.env:OPENAI_API_KEY,???}"
openai_base_url: "${oc.env:OPENAI_BASE_URL,https://api.openai.com/v1}"
21+

config/benchmark/browsecomp-zh.yaml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,9 @@ execution:
1414
max_tasks: null # null = no limit, or specify a number
1515
max_concurrent: 5 # Number of parallel tasks
1616
pass_at_k: 1 # Number of attempts per task
17+
max_retry: 5
18+
exceed_max_turn_summary: true
1719

1820
# OpenAI API key for evaluation (required for browsecomp-zh since it has ground truth)
1921
openai_api_key: "${oc.env:OPENAI_API_KEY,???}"
20-
21-
22+
openai_base_url: "${oc.env:OPENAI_BASE_URL,https://api.openai.com/v1}"
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
# Hydra agent config: single MiroThinker agent with rollback, run against
# the browsecomp-en-200 benchmark.
defaults:
  - benchmark: browsecomp-en-200
  - override hydra/job_logging: none
  - _self_

entrypoint: main_agent
main_agent:
  name: main_agent
  type: IterativeAgentWithToolAndRollback
  max_consecutive_rollbacks: 3
  max_turns: 400
  llm:
    _base_: config/llm/base_mirothinker.yaml
  prompt: config/prompts/fangda_prompt_main_agent_0128.yaml
  tools:
    - config/tool/tool-search-and-scrape-webpage.yaml
    - config/tool/tool-jina-scrape-llm-summary.yaml
    - config/tool/tool-python.yaml
  # Disable individual tools within the listed tool servers.
  tool_blacklist:
    - server: "tool-search-and-scrape-webpage"
      tool: "sogou_search"
    - server: "tool-python"
      tool: "download_file_from_sandbox_to_local"
  input_processor:
    - ${input-message-generator}
  output_processor:
    - ${output-summary}
    - ${output-final-answer-extraction}
    - ${output-exceed-max-turn-summary}

input-message-generator:
  type: InputMessageGenerator
output-summary:
  type: SummaryGenerator
output-final-answer-extraction:
  type: RegexBoxedExtractor
output-exceed-max-turn-summary:
  type: ExceedMaxTurnSummaryGenerator
  prompt: config/prompts/fangda_prompt_main_agent_0128.yaml
  llm:
    _base_: config/llm/base_mirothinker.yaml

output_dir: logs/
data_dir: "${oc.env:DATA_DIR,data}"
45+
46+
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
# Hydra agent config: single MiroThinker agent with rollback, run against
# the browsecomp-zh benchmark. Identical to the -en-200 variant except for
# the benchmark default.
defaults:
  - benchmark: browsecomp-zh
  - override hydra/job_logging: none
  - _self_

entrypoint: main_agent
main_agent:
  name: main_agent
  type: IterativeAgentWithToolAndRollback
  max_consecutive_rollbacks: 3
  max_turns: 400
  llm:
    _base_: config/llm/base_mirothinker.yaml
  prompt: config/prompts/fangda_prompt_main_agent_0128.yaml
  tools:
    - config/tool/tool-search-and-scrape-webpage.yaml
    - config/tool/tool-jina-scrape-llm-summary.yaml
    - config/tool/tool-python.yaml
  # Disable individual tools within the listed tool servers.
  tool_blacklist:
    - server: "tool-search-and-scrape-webpage"
      tool: "sogou_search"
    - server: "tool-python"
      tool: "download_file_from_sandbox_to_local"
  input_processor:
    - ${input-message-generator}
  output_processor:
    - ${output-summary}
    - ${output-final-answer-extraction}
    - ${output-exceed-max-turn-summary}

input-message-generator:
  type: InputMessageGenerator
output-summary:
  type: SummaryGenerator
output-final-answer-extraction:
  type: RegexBoxedExtractor
output-exceed-max-turn-summary:
  type: ExceedMaxTurnSummaryGenerator
  prompt: config/prompts/fangda_prompt_main_agent_0128.yaml
  llm:
    _base_: config/llm/base_mirothinker.yaml

output_dir: logs/
data_dir: "${oc.env:DATA_DIR,data}"
45+
46+
Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
#!/bin/bash

# SPDX-FileCopyrightText: 2025 MiromindAI
#
# SPDX-License-Identifier: Apache-2.0

# Launch NUM_RUNS parallel evaluation runs of the browsecomp-en-200 benchmark
# and average the scores across runs once all of them finish.

# Configuration parameters
NUM_RUNS=3
BENCHMARK_NAME="browsecomp-en-200"
AGENT_SET="fangda_agent_browsecomp-en-200_mirothinker_single_agent_rollback_new_tools_toolblacklist"
MAX_CONCURRENT=50

# Set results directory with timestamp (overridable via the RESULTS_DIR env var)
TIMESTAMP=$(date +%Y%m%d_%H%M)
RESULTS_DIR=${RESULTS_DIR:-"logs/${BENCHMARK_NAME}/${AGENT_SET}_${TIMESTAMP}"}

# Array to track child PIDs
declare -a CHILD_PIDS=()

# On SIGINT/SIGTERM: TERM every tracked process group, give them a grace
# period, then KILL any survivors; exit 130 (conventional SIGINT status).
cleanup() {
  echo ""
  echo "Received interrupt signal, terminating all processes..."
  for pid in "${CHILD_PIDS[@]}"; do
    if kill -0 "$pid" 2>/dev/null; then
      echo "Killing process group $pid"
      # NOTE(review): this assumes $pid leads a process group; with
      # '( set -m; ... ) &' the inner command's pgrp can differ from $! —
      # confirm signals actually reach the uv child on this platform.
      kill -TERM -"$pid" 2>/dev/null
    fi
  done
  # Wait a moment for graceful shutdown
  sleep 2
  # Force kill any remaining processes
  for pid in "${CHILD_PIDS[@]}"; do
    if kill -0 "$pid" 2>/dev/null; then
      echo "Force killing process group $pid"
      kill -KILL -"$pid" 2>/dev/null
    fi
  done
  echo "All processes terminated."
  exit 130
}

trap cleanup SIGINT SIGTERM

echo "Starting $NUM_RUNS runs of the evaluation..."
echo "Results will be saved in: $RESULTS_DIR"

# Create results directory
mkdir -p "$RESULTS_DIR"

for ((i = 1; i <= NUM_RUNS; i++)); do
  echo "=========================================="
  echo "Launching experiment $i/$NUM_RUNS"
  echo "=========================================="

  RUN_ID="run_$i"

  # Start process in new process group (set -m enables job control in the subshell)
  (
    set -m
    uv run tests/test_benchmark.py \
      --config-path "config/${AGENT_SET}.yaml" \
      benchmark.execution.max_concurrent="$MAX_CONCURRENT" \
      output_dir="$RESULTS_DIR/$RUN_ID" \
      > "$RESULTS_DIR/${RUN_ID}_output.log" 2>&1

    EXIT_CODE=$?
    if [ "$EXIT_CODE" -eq 0 ]; then
      echo "Run $i completed successfully"
      RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1)
      if [ -f "$RESULT_FILE" ]; then
        echo "Results saved to $RESULT_FILE"
      else
        echo "Warning: Result file not found for run $i"
      fi
    else
      # Check if we have JSON result files (task completed but evaluator had issues)
      JSON_COUNT=$(find "${RESULTS_DIR}/$RUN_ID" -name "task_*.json" 2>/dev/null | wc -l)
      if [ "$JSON_COUNT" -gt 0 ]; then
        echo "Run $i finished with exit code $EXIT_CODE but generated $JSON_COUNT task logs"
      else
        echo "Run $i failed with exit code $EXIT_CODE"
      fi
    fi
  ) &

  # Get the PID and store it
  CHILD_PIDS+=($!)

  # Stagger launches slightly so runs do not race on startup
  sleep 2
done

echo "All $NUM_RUNS runs have been launched in parallel"
echo "Child PIDs: ${CHILD_PIDS[*]}"
echo "Waiting for all runs to complete..."
echo "Press Ctrl+C to terminate all processes"

wait

echo "=========================================="
echo "All $NUM_RUNS runs completed!"
echo "=========================================="

echo "Calculating average scores..."
uv run python -c "from src.utils.calculate_average_score import main; main('$RESULTS_DIR')"

echo "=========================================="
echo "Multiple runs evaluation completed!"
echo "Check results in: $RESULTS_DIR"
echo "Check individual run logs: $RESULTS_DIR/run_*_output.log"
echo "=========================================="
Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
#!/bin/bash

# SPDX-FileCopyrightText: 2025 MiromindAI
#
# SPDX-License-Identifier: Apache-2.0

# Launch NUM_RUNS parallel evaluation runs of the browsecomp-zh benchmark
# and average the scores across runs once all of them finish.

# Configuration parameters
NUM_RUNS=3
BENCHMARK_NAME="browsecomp-zh"
AGENT_SET="fangda_agent_browsecomp-zh_mirothinker_single_agent_rollback_new_tools_toolblacklist"
MAX_CONCURRENT=50

# Set results directory with timestamp (overridable via the RESULTS_DIR env var)
TIMESTAMP=$(date +%Y%m%d_%H%M)
RESULTS_DIR=${RESULTS_DIR:-"logs/${BENCHMARK_NAME}/${AGENT_SET}_${TIMESTAMP}"}

# Array to track child PIDs
declare -a CHILD_PIDS=()

# On SIGINT/SIGTERM: TERM every tracked process group, give them a grace
# period, then KILL any survivors; exit 130 (conventional SIGINT status).
cleanup() {
  echo ""
  echo "Received interrupt signal, terminating all processes..."
  for pid in "${CHILD_PIDS[@]}"; do
    if kill -0 "$pid" 2>/dev/null; then
      echo "Killing process group $pid"
      # NOTE(review): this assumes $pid leads a process group; with
      # '( set -m; ... ) &' the inner command's pgrp can differ from $! —
      # confirm signals actually reach the uv child on this platform.
      kill -TERM -"$pid" 2>/dev/null
    fi
  done
  # Wait a moment for graceful shutdown
  sleep 2
  # Force kill any remaining processes
  for pid in "${CHILD_PIDS[@]}"; do
    if kill -0 "$pid" 2>/dev/null; then
      echo "Force killing process group $pid"
      kill -KILL -"$pid" 2>/dev/null
    fi
  done
  echo "All processes terminated."
  exit 130
}

trap cleanup SIGINT SIGTERM

echo "Starting $NUM_RUNS runs of the evaluation..."
echo "Results will be saved in: $RESULTS_DIR"

# Create results directory
mkdir -p "$RESULTS_DIR"

for ((i = 1; i <= NUM_RUNS; i++)); do
  echo "=========================================="
  echo "Launching experiment $i/$NUM_RUNS"
  echo "=========================================="

  RUN_ID="run_$i"

  # Start process in new process group (set -m enables job control in the subshell)
  (
    set -m
    uv run tests/test_benchmark.py \
      --config-path "config/${AGENT_SET}.yaml" \
      benchmark.execution.max_concurrent="$MAX_CONCURRENT" \
      output_dir="$RESULTS_DIR/$RUN_ID" \
      > "$RESULTS_DIR/${RUN_ID}_output.log" 2>&1

    EXIT_CODE=$?
    if [ "$EXIT_CODE" -eq 0 ]; then
      echo "Run $i completed successfully"
      RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1)
      if [ -f "$RESULT_FILE" ]; then
        echo "Results saved to $RESULT_FILE"
      else
        echo "Warning: Result file not found for run $i"
      fi
    else
      # Check if we have JSON result files (task completed but evaluator had issues)
      JSON_COUNT=$(find "${RESULTS_DIR}/$RUN_ID" -name "task_*.json" 2>/dev/null | wc -l)
      if [ "$JSON_COUNT" -gt 0 ]; then
        echo "Run $i finished with exit code $EXIT_CODE but generated $JSON_COUNT task logs"
      else
        echo "Run $i failed with exit code $EXIT_CODE"
      fi
    fi
  ) &

  # Get the PID and store it
  CHILD_PIDS+=($!)

  # Stagger launches slightly so runs do not race on startup
  sleep 2
done

echo "All $NUM_RUNS runs have been launched in parallel"
echo "Child PIDs: ${CHILD_PIDS[*]}"
echo "Waiting for all runs to complete..."
echo "Press Ctrl+C to terminate all processes"

wait

echo "=========================================="
echo "All $NUM_RUNS runs completed!"
echo "=========================================="

echo "Calculating average scores..."
uv run python -c "from src.utils.calculate_average_score import main; main('$RESULTS_DIR')"

echo "=========================================="
echo "Multiple runs evaluation completed!"
echo "Check results in: $RESULTS_DIR"
echo "Check individual run logs: $RESULTS_DIR/run_*_output.log"
echo "=========================================="

0 commit comments

Comments
 (0)