MiroMindAI
diff --git a/config/benchmark/browsecomp-en-200.yaml b/config/benchmark/browsecomp-en-200.yaml
Lines changed: 21 additions & 0 deletions
diff --git a/config/benchmark/browsecomp-zh.yaml b/config/benchmark/browsecomp-zh.yaml
Lines changed: 3 additions & 2 deletions
diff --git a/config/fangda_agent_browsecomp-en-200_mirothinker_single_agent_rollback_new_tools_toolblacklist.yaml b/config/fangda_agent_browsecomp-en-200_mirothinker_single_agent_rollback_new_tools_toolblacklist.yaml
Lines changed: 46 additions & 0 deletions
diff --git a/config/fangda_agent_browsecomp-zh_mirothinker_single_agent_rollback_new_tools_toolblacklist.yaml b/config/fangda_agent_browsecomp-zh_mirothinker_single_agent_rollback_new_tools_toolblacklist.yaml
Lines changed: 46 additions & 0 deletions
diff --git a/config/llm/base_mirothinker.yaml b/config/llm/base_mirothinker.yaml
Lines changed: 1 addition & 0 deletions
diff --git a/config/prompts/fangda_prompt_main_agent.yaml b/config/prompts/fangda_prompt_main_agent.yaml
Lines changed: 0 additions & 28 deletions
diff --git a/scripts/binwang_test.sh b/scripts/binwang_test.sh
Lines changed: 1 addition & 1 deletion
diff --git a/scripts/fangda_run_evaluate_multiple_runs_mirothinker_browsecomp-en-200.sh b/scripts/fangda_run_evaluate_multiple_runs_mirothinker_browsecomp-en-200.sh
Lines changed: 110 additions & 0 deletions
@@ -0,0 +1,21 @@
+# config/benchmark/browsecomp-en-200.yaml
+defaults:
+  - default
+  - _self_
+
+name: "browsecomp-en-200"
+
+data:
+  data_dir: "${data_dir}/browsecomp-200"  # Path to browsecomp-200 (English) dataset
+  metadata_file: "standardized_data.jsonl"  # Metadata filename
+  whitelist: []  # Optional: List of specific task_ids to run
+
+execution:
+  max_tasks: null      # null = no limit, or specify a number
+  max_concurrent: 5    # Number of parallel tasks
+  pass_at_k: 1         # Number of attempts per task
+
+# OpenAI API key (required for browsecomp since it has ground truth) and base URL (defaults to https://api.openai.com/v1) for evaluation
+openai_api_key: "${oc.env:OPENAI_API_KEY,???}"
+openai_base_url: "${oc.env:OPENAI_BASE_URL,https://api.openai.com/v1}"
+
@@ -14,8 +14,9 @@ execution:
   max_tasks: null      # null = no limit, or specify a number
   max_concurrent: 5    # Number of parallel tasks
   pass_at_k: 1         # Number of attempts per task
+  max_retry: 5
+  exceed_max_turn_summary: true
 
 # OpenAI API key for evaluation (required for browsecomp-zh since it has ground truth)
 openai_api_key: "${oc.env:OPENAI_API_KEY,???}"
-
-
+openai_base_url: "${oc.env:OPENAI_BASE_URL,https://api.openai.com/v1}"
@@ -0,0 +1,46 @@
+defaults:
+  - benchmark: browsecomp-en-200
+  - override hydra/job_logging: none
+  - _self_
+
+entrypoint: main_agent
+main_agent:
+  name: main_agent
+  type: IterativeAgentWithToolAndRollback
+  max_consecutive_rollbacks: 3
+  max_turns: 400
+  llm:
+    _base_: config/llm/base_mirothinker.yaml
+  prompt: config/prompts/fangda_prompt_main_agent_0128.yaml
+  tools:
+    - config/tool/tool-search-and-scrape-webpage.yaml
+    - config/tool/tool-jina-scrape-llm-summary.yaml
+    - config/tool/tool-python.yaml
+  tool_blacklist:
+    - server: "tool-search-and-scrape-webpage"
+      tool: "sogou_search"
+    - server: "tool-python"
+      tool: "download_file_from_sandbox_to_local"
+  input_processor:
+    - ${input-message-generator}
+  output_processor:
+    - ${output-summary}
+    - ${output-final-answer-extraction}
+    - ${output-exceed-max-turn-summary}
+
+input-message-generator:
+  type: InputMessageGenerator
+output-summary:
+  type: SummaryGenerator
+output-final-answer-extraction:
+  type: RegexBoxedExtractor
+output-exceed-max-turn-summary:
+  type: ExceedMaxTurnSummaryGenerator
+  prompt: config/prompts/fangda_prompt_main_agent_0128.yaml
+  llm:
+    _base_: config/llm/base_mirothinker.yaml
+
+output_dir: logs/
+data_dir: "${oc.env:DATA_DIR,data}"
+
+
@@ -0,0 +1,46 @@
+defaults:
+  - benchmark: browsecomp-zh
+  - override hydra/job_logging: none
+  - _self_
+
+entrypoint: main_agent
+main_agent:
+  name: main_agent
+  type: IterativeAgentWithToolAndRollback
+  max_consecutive_rollbacks: 3
+  max_turns: 400
+  llm:
+    _base_: config/llm/base_mirothinker.yaml
+  prompt: config/prompts/fangda_prompt_main_agent_0128.yaml
+  tools:
+    - config/tool/tool-search-and-scrape-webpage.yaml
+    - config/tool/tool-jina-scrape-llm-summary.yaml
+    - config/tool/tool-python.yaml
+  tool_blacklist:
+    - server: "tool-search-and-scrape-webpage"
+      tool: "sogou_search"
+    - server: "tool-python"
+      tool: "download_file_from_sandbox_to_local"
+  input_processor:
+    - ${input-message-generator}
+  output_processor:
+    - ${output-summary}
+    - ${output-final-answer-extraction}
+    - ${output-exceed-max-turn-summary}
+
+input-message-generator:
+  type: InputMessageGenerator
+output-summary:
+  type: SummaryGenerator
+output-final-answer-extraction:
+  type: RegexBoxedExtractor
+output-exceed-max-turn-summary:
+  type: ExceedMaxTurnSummaryGenerator
+  prompt: config/prompts/fangda_prompt_main_agent_0128.yaml
+  llm:
+    _base_: config/llm/base_mirothinker.yaml
+
+output_dir: logs/
+data_dir: "${oc.env:DATA_DIR,data}"
+
+
@@ -18,6 +18,7 @@ repetition_penalty: 1.05
 
 disable_cache_control: false
 keep_tool_result: 5
+strip_think_from_history: false
 
 use_tool_calls: false
 oai_tool_thinking: false
@@ -307,14 +307,6 @@ template:
     components:
       - basic_exceed_max_turn_summary_prompt
 
-    required_context:
-      - task_description
-      - summary
-
-    optional_context:
-      - final_boxed_answer
-      - error_message
-
     basic_exceed_max_turn_summary_prompt: |
       The task was not completed successfully. Do NOT call any tools. Provide a structured summary:
 
@@ -326,26 +318,6 @@ template:
       What happened: [describe the approach taken and why a final answer was not reached]
       Useful findings: [list any facts, intermediate results, or conclusions discovered that should be reused]
 
-      Original Task:
-      ---
-      {{ task_description }}
-      ---
-
-      {% if summary is defined and summary %}
-      Attempt Summary:
-      ---
-      {{ summary }}
-      ---
-      {% endif %}
-
-      {% if final_boxed_answer is defined and final_boxed_answer %}
-      Final Answer Attempted: {{ final_boxed_answer }}
-      {% endif %}
-
-      {% if error_message is defined and error_message %}
-      Error Encountered: {{ error_message }}
-      {% endif %}
-
   exceed_max_turn_summary_header:
     components:
       - header_text
 
@@ -57,7 +57,7 @@ for i in $(seq 1 $NUM_RUNS); do
     # Start process in new process group (set -m creates new pgrp)
     (
         set -m
-        uv run test_benchmark.py \
+        uv run tests/test_benchmark.py \
             --config-path config/${AGENT_SET}.yaml \
             benchmark.execution.max_concurrent=$MAX_CONCURRENT \
             output_dir="$RESULTS_DIR/$RUN_ID" \
 
@@ -0,0 +1,110 @@
+#!/bin/bash
+
+# SPDX-FileCopyrightText: 2025 MiromindAI
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# Configuration parameters
+NUM_RUNS=3
+BENCHMARK_NAME="browsecomp-en-200"
+AGENT_SET="fangda_agent_browsecomp-en-200_mirothinker_single_agent_rollback_new_tools_toolblacklist"
+MAX_CONCURRENT=50
+
+# Set results directory with timestamp
+TIMESTAMP=$(date +%Y%m%d_%H%M)
+RESULTS_DIR=${RESULTS_DIR:-"logs/${BENCHMARK_NAME}/${AGENT_SET}_${TIMESTAMP}"}
+
+# PIDs of the background subshells launched in the loop below; cleanup() iterates over them
+declare -a CHILD_PIDS=()
+
+cleanup() {
+    echo ""
+    echo "Received interrupt signal, terminating all processes..."
+    for pid in "${CHILD_PIDS[@]}"; do
+        if kill -0 "$pid" 2>/dev/null; then
+            echo "Killing process group $pid"
+            kill -TERM -"$pid" 2>/dev/null
+        fi
+    done
+    # Wait a moment for graceful shutdown
+    sleep 2
+    # Force kill any remaining processes
+    for pid in "${CHILD_PIDS[@]}"; do
+        if kill -0 "$pid" 2>/dev/null; then
+            echo "Force killing process group $pid"
+            kill -KILL -"$pid" 2>/dev/null
+        fi
+    done
+    echo "All processes terminated."
+    exit 130
+}
+
+trap cleanup SIGINT SIGTERM
+
+echo "Starting $NUM_RUNS runs of the evaluation..."
+echo "Results will be saved in: $RESULTS_DIR"
+
+# Create results directory
+mkdir -p "$RESULTS_DIR"
+
+for i in $(seq 1 $NUM_RUNS); do
+    echo "=========================================="
+    echo "Launching experiment $i/$NUM_RUNS"
+    echo "=========================================="
+    
+    RUN_ID="run_$i"
+    
+    # Start process in new process group (set -m creates new pgrp)
+    (
+        set -m
+        uv run tests/test_benchmark.py \
+            --config-path config/${AGENT_SET}.yaml \
+            benchmark.execution.max_concurrent=$MAX_CONCURRENT \
+            output_dir="$RESULTS_DIR/$RUN_ID" \
+            > "$RESULTS_DIR/${RUN_ID}_output.log" 2>&1
+        
+        EXIT_CODE=$?
+        if [ $EXIT_CODE -eq 0 ]; then
+            echo "Run $i completed successfully"
+            RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1)
+            if [ -f "$RESULT_FILE" ]; then
+                echo "Results saved to $RESULT_FILE"
+            else
+                echo "Warning: Result file not found for run $i"
+            fi
+        else
+            # Check if we have JSON result files (task completed but evaluator had issues)
+            JSON_COUNT=$(find "${RESULTS_DIR}/$RUN_ID" -name "task_*.json" 2>/dev/null | wc -l)
+            if [ "$JSON_COUNT" -gt 0 ]; then
+                echo "Run $i finished with exit code $EXIT_CODE but generated $JSON_COUNT task logs"
+            else
+                echo "Run $i failed with exit code $EXIT_CODE"
+            fi
+        fi
+    ) &
+    
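+    # Store the background subshell's PID ($!). NOTE(review): cleanup() signals -PID as a process group; confirm $! is a group leader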
+    CHILD_PIDS+=($!)
+    
+    sleep 2
+done
+
+echo "All $NUM_RUNS runs have been launched in parallel"
+echo "Child PIDs: ${CHILD_PIDS[*]}"
+echo "Waiting for all runs to complete..."
+echo "Press Ctrl+C to terminate all processes"
+
+wait
+
+echo "=========================================="
+echo "All $NUM_RUNS runs completed!"
+echo "=========================================="
+
+echo "Calculating average scores..."
+uv run python -c "from src.utils.calculate_average_score import main; main('$RESULTS_DIR')"
+
+echo "=========================================="
+echo "Multiple runs evaluation completed!"
+echo "Check results in: $RESULTS_DIR"
+echo "Check individual run logs: $RESULTS_DIR/run_*_output.log"
+echo "=========================================="
Original file line number	Diff line number	Diff line change
`@@ -57,7 +57,7 @@ for i in $(seq 1 $NUM_RUNS); do`
`57`	`57`	`# Start process in new process group (set -m creates new pgrp)`
`58`	`58`	`(`
`59`	`59`	`set -m`
`60`		`- uv run test_benchmark.py \`
	`60`	`+ uv run tests/test_benchmark.py \`
`61`	`61`	`--config-path config/${AGENT_SET}.yaml \`
`62`	`62`	`benchmark.execution.max_concurrent=$MAX_CONCURRENT \`
`63`	`63`	`output_dir="$RESULTS_DIR/$RUN_ID" \`