From 56b235d0069cf72cbf9953ed30dcdebcc236c68e Mon Sep 17 00:00:00 2001
From: JubSteven <1120395085@qq.com>
Date: Thu, 18 Sep 2025 10:35:02 +0800
Subject: [PATCH 01/12] upd: add futurex evaluation support.

---
 .gitignore                                    |   1 +
 config/benchmark/futurex.yaml                 |  20 ++
 utils/extract_futurex_results.py              | 334 ++++++++++++++++++
 utils/prepare_benchmark/gen_futurex.py        |  55 +++
 utils/prepare_benchmark/main.py               |   9 +
 .../progress_check/check_futurex_progress.py  | 218 ++++++++++++
 6 files changed, 637 insertions(+)
 create mode 100644 config/benchmark/futurex.yaml
 create mode 100644 utils/extract_futurex_results.py
 create mode 100644 utils/prepare_benchmark/gen_futurex.py
 create mode 100644 utils/progress_check/check_futurex_progress.py

diff --git a/.gitignore b/.gitignore
index aaf57c1e..8cbc94e8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -208,6 +208,7 @@ marimo/_lsp/
 __marimo__/
 
 logs/
+tmp/
 
 data/*
 !data/README.md
diff --git a/config/benchmark/futurex.yaml b/config/benchmark/futurex.yaml
new file mode 100644
index 00000000..97109882
--- /dev/null
+++ b/config/benchmark/futurex.yaml
@@ -0,0 +1,20 @@
+# config/benchmark/futurex.yaml
+defaults:
+  - default
+  - _self_
+
+name: "futurex"
+
+data:
+  data_dir: "${data_dir}/futurex"  # Path to your dataset
+  metadata_file: "standardized_data.jsonl"  # Metadata filename
+  whitelist: []  # Optional: List of specific task_ids to run
+
+execution:
+  max_tasks: null      # null = no limit, or specify a number
+  max_concurrent: 5    # Number of parallel tasks
+  pass_at_k: 1         # Number of attempts per task
+
+# Set to skip evaluation since we don't have ground truth
+openai_api_key: "skip_evaluation"
+
diff --git a/utils/extract_futurex_results.py b/utils/extract_futurex_results.py
new file mode 100644
index 00000000..415b720c
--- /dev/null
+++ b/utils/extract_futurex_results.py
@@ -0,0 +1,334 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+FutureX Results Extractor and Aggregator
+
+This script extracts predictions from MiroFlow benchmark results and can aggregate
+multiple runs using majority voting to create FutureX submission files.
+
+Features:
+1. Extract predictions from single benchmark results
+2. Aggregate multiple runs with majority voting
+3. Generate FutureX-compatible submission files
+4. Support both single-run and multi-run scenarios
+
+Usage:
+    # Extract from single run
+    python extract_futurex_results.py logs/futurex-online-test
+    
+    # Aggregate multiple runs (if run_* subdirectories exist)
+    python extract_futurex_results.py logs/futurex-online-multi-runs
+    
+    # Specify output file
+    python extract_futurex_results.py logs/futurex-online-test -o my_submission.jsonl
+"""
+
+import argparse
+import json
+import os
+from collections import Counter, defaultdict
+from typing import Dict, List, Tuple
+
+
+def majority_vote(
+    preds: List[str], first_seen_order: Dict[str, int]
+) -> Tuple[str, Dict[str, int]]:
+    """
+    Pick the most frequent prediction among ``preds``.
+
+    Candidates are ranked by, in order:
+      1) vote count (higher is better);
+      2) ``first_seen_order`` index (lower is better; a candidate missing
+         from the mapping ranks after every candidate that is present);
+      3) the prediction string itself, compared lexicographically.
+
+    Args:
+        preds: Non-empty list of predictions collected for one task.
+        first_seen_order: Maps a prediction string to its first-seen index.
+
+    Returns:
+        ``(chosen_prediction, counts_dict)`` where ``counts_dict`` maps each
+        distinct prediction to its number of votes.
+
+    Raises:
+        ValueError: If ``preds`` is empty (``max`` of an empty sequence).
+    """
+    tally = Counter(preds)
+    top_votes = max(tally.values())
+
+    # Among the candidates sharing the top vote count, the smallest
+    # (first-seen index, string) key wins; with a single leader this is a no-op.
+    leaders = (cand for cand, votes in tally.items() if votes == top_votes)
+    winner = min(leaders, key=lambda c: (first_seen_order.get(c, float("inf")), c))
+    return winner, dict(tally)
+
+
+def discover_runs(results_dir: str) -> List[str]:
+    """
+    List the immediate subdirectories of results_dir that hold a
+    'benchmark_results.jsonl' file.
+
+    Any subdirectory name qualifies (a 'run_' prefix is not required); entries
+    are visited in sorted name order so the result is deterministic.
+    """
+    subdirs = (os.path.join(results_dir, entry) for entry in sorted(os.listdir(results_dir)))
+    return [
+        sub
+        for sub in subdirs
+        if os.path.isdir(sub) and os.path.isfile(os.path.join(sub, "benchmark_results.jsonl"))
+    ]
+
+
+def extract_predictions_from_file(file_path: str) -> Dict[str, str]:
+    """
+    Read one benchmark_results.jsonl file and collect its boxed answers.
+
+    Blank lines are ignored and malformed JSON lines are reported and skipped.
+    A record is kept only when it has a truthy "task_id" and a
+    "model_boxed_answer" that is non-empty after str() + strip(). When a
+    task_id occurs more than once, the last accepted record wins.
+
+    Args:
+        file_path: Path to benchmark_results.jsonl file
+
+    Returns:
+        Dictionary mapping task_id to the stripped prediction string
+    """
+    results: Dict[str, str] = {}
+
+    with open(file_path, "r", encoding="utf-8") as handle:
+        for lineno, raw in enumerate(handle, 1):
+            text = raw.strip()
+            if text:
+                try:
+                    record = json.loads(text)
+                except json.JSONDecodeError as e:
+                    print(f"Warning: Skipping malformed JSON at line {lineno} in {file_path}: {e}")
+                    continue
+
+                key = record.get("task_id")
+                answer = record.get("model_boxed_answer")
+                # str() tolerates non-string answers (e.g. numbers) before stripping
+                if key and answer is not None and str(answer).strip():
+                    results[key] = str(answer).strip()
+    return results
+
+
+def aggregate_multiple_runs(results_dir: str) -> Tuple[Dict[str, List[str]], Dict[str, int]]:
+    """
+    Aggregate predictions from all run subdirectories (FileNotFoundError if none).
+
+    Args:
+        results_dir: Directory whose subdirectories contain benchmark_results.jsonl
+
+    Returns:
+        Tuple of (predictions_by_task, first_seen_order)
+    """
+    # Maps task_id -> list of predictions collected across runs
+    preds_by_task: Dict[str, List[str]] = defaultdict(list)
+
+    # First-seen index of each distinct prediction string across all runs and
+    # tasks (one global counter); used for deterministic tie-breaking.
+    first_seen_order: Dict[str, int] = {}
+    next_order_idx = 0
+
+    runs = discover_runs(results_dir)
+    if not runs:
+        raise FileNotFoundError(
+            f"No run directories with 'benchmark_results.jsonl' found under: {results_dir}"
+        )
+
+    total_lines = 0  # every physical line read, blank lines included
+    used_lines = 0  # records that contributed a prediction
+
+    # Read and aggregate predictions
+    for run_dir in runs:
+        fpath = os.path.join(run_dir, "benchmark_results.jsonl")
+        print(f"Reading: {fpath}")
+
+        with open(fpath, "r", encoding="utf-8") as fin:
+            for line_num, line in enumerate(fin, 1):
+                total_lines += 1
+                line = line.strip()
+                if not line:
+                    continue
+                try:
+                    rec = json.loads(line)
+                except json.JSONDecodeError as e:
+                    print(f"Warning: Skipping malformed JSON at line {line_num} in {fpath}: {e}")
+                    continue
+
+                task_id = rec.get("task_id")
+                pred = rec.get("model_boxed_answer")
+
+                # Only accept non-empty strings; coerce to str for safety
+                if task_id and pred is not None and str(pred).strip():
+                    pred_str = str(pred).strip()
+                    preds_by_task[task_id].append(pred_str)
+                    if pred_str not in first_seen_order:
+                        first_seen_order[pred_str] = next_order_idx
+                        next_order_idx += 1
+                    used_lines += 1
+
+    print(f"Collected from {len(runs)} run(s).")
+    print(f"Read {total_lines} line(s), accepted {used_lines} record(s).")
+
+    return preds_by_task, first_seen_order
+
+
+def process_single_run(results_dir: str) -> Dict[str, str]:
+    """
+    Load the predictions of one run stored directly under ``results_dir``.
+
+    Args:
+        results_dir: Directory containing benchmark_results.jsonl
+
+    Returns:
+        Dictionary mapping task_id to prediction
+
+    Raises:
+        FileNotFoundError: If benchmark_results.jsonl is not a file in results_dir
+    """
+    results_path = os.path.join(results_dir, "benchmark_results.jsonl")
+    if os.path.isfile(results_path):
+        print(f"Reading single run: {results_path}")
+        extracted = extract_predictions_from_file(results_path)
+        print(f"Extracted {len(extracted)} predictions from single run.")
+        return extracted
+    raise FileNotFoundError(f"benchmark_results.jsonl not found in: {results_dir}")
+
+
+def write_submission_file(
+    predictions: Dict[str, str],
+    output_file: str,
+    is_aggregated: bool = False,
+    vote_counts: Dict[str, Dict[str, int]] = None
+) -> None:
+    """
+    Serialize predictions as a FutureX submission (one JSON object per line).
+
+    Records are written in sorted task_id order as {"id", "prediction"}; a
+    "vote_counts" entry is added when is_aggregated is true and vote_counts
+    has an entry for that task. A short summary is printed afterwards.
+
+    Args:
+        predictions: Dictionary mapping task_id to prediction
+        output_file: Output file path (overwritten if it exists)
+        is_aggregated: Whether this is from aggregated runs
+        vote_counts: Vote counts for each task (only for aggregated runs)
+    """
+    # Decide once whether vote tallies can be attached at all
+    attach_votes = bool(is_aggregated and vote_counts)
+    written = 0
+
+    with open(output_file, "w", encoding="utf-8") as out:
+        for written, tid in enumerate(sorted(predictions), 1):
+            row = {"id": tid, "prediction": predictions[tid]}
+            if attach_votes and tid in vote_counts:
+                row["vote_counts"] = vote_counts[tid]
+            out.write(json.dumps(row, ensure_ascii=False) + "\n")
+
+    print(f"✅ Submission saved to {output_file}")
+    if not is_aggregated:
+        print(f"Extracted {written} predictions from single run.")
+    else:
+        print(f"Aggregated {written} unique task_id(s) from multiple runs.")
+
+
+def parse_args() -> argparse.Namespace:
+    """
+    Parse sys.argv for the FutureX results extractor.
+
+    Returns a Namespace with results_dir, output, aggregate and single.
+    """
+    cli = argparse.ArgumentParser(
+        description="Extract predictions from MiroFlow benchmark results and create FutureX submission files. "
+        "Supports both single runs and multi-run aggregation with majority voting."
+    )
+    cli.add_argument(
+        "results_dir",
+        help="Path to results dir containing benchmark_results.jsonl or run_*/benchmark_results.jsonl",
+    )
+    cli.add_argument(
+        "-o", "--output", default=None,
+        help="Output JSONL file path (default: <results_dir>/futurex_submission.jsonl)",
+    )
+    # Mode switches; when neither is given the caller auto-detects the mode.
+    mode_flags = {
+        "--aggregate": "Force aggregation mode (look for run_* subdirectories)",
+        "--single": "Force single run mode (look for direct benchmark_results.jsonl)",
+    }
+    for flag, help_text in mode_flags.items():
+        cli.add_argument(flag, action="store_true", help=help_text)
+    return cli.parse_args()
+
+
+def main() -> None:
+    """
+    Command-line entry point: resolve the processing mode, then write the
+    FutureX submission file.
+
+    Mode resolution: --aggregate wins over --single when both are passed;
+    with neither flag, run subdirectories take precedence over a direct
+    benchmark_results.jsonl. FileNotFoundError is raised when the results
+    dir, or the input the selected mode needs, is missing.
+    """
+    args = parse_args()
+
+    results_dir = os.path.abspath(args.results_dir)
+    if not os.path.isdir(results_dir):
+        raise FileNotFoundError(f"Results dir not found: {results_dir}")
+
+    if args.output:
+        output_file = os.path.abspath(args.output)
+    else:
+        output_file = os.path.join(results_dir, "futurex_submission.jsonl")
+
+    # Probe both layouts once; every branch below reuses these two facts.
+    has_runs = bool(discover_runs(results_dir))
+    has_single = os.path.isfile(os.path.join(results_dir, "benchmark_results.jsonl"))
+
+    if args.aggregate:
+        if not has_runs:
+            raise FileNotFoundError(
+                f"No run directories found for aggregation in: {results_dir}"
+            )
+        mode = "aggregate"
+    elif args.single:
+        if not has_single:
+            raise FileNotFoundError(
+                f"benchmark_results.jsonl not found for single run in: {results_dir}"
+            )
+        mode = "single"
+    elif has_runs:
+        # Auto-detect: run subdirectories take precedence
+        if has_single:
+            print("Both single run and multiple runs detected. Using aggregation mode.")
+            print("Use --single to force single run mode.")
+        mode = "aggregate"
+    elif has_single:
+        mode = "single"
+    else:
+        raise FileNotFoundError(
+            f"No benchmark_results.jsonl files found in: {results_dir}"
+        )
+
+    print(f"Processing mode: {mode}")
+
+    if mode == "single":
+        write_submission_file(process_single_run(results_dir), output_file, is_aggregated=False)
+        return
+
+    # Multi-run aggregation: one majority vote per task
+    preds_by_task, first_seen_order = aggregate_multiple_runs(results_dir)
+    voted = {
+        task_id: majority_vote(task_preds, first_seen_order)
+        for task_id, task_preds in preds_by_task.items()
+    }
+    final_predictions = {task_id: choice for task_id, (choice, _) in voted.items()}
+    vote_counts = {task_id: counts for task_id, (_, counts) in voted.items()}
+    write_submission_file(final_predictions, output_file, is_aggregated=True, vote_counts=vote_counts)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/utils/prepare_benchmark/gen_futurex.py b/utils/prepare_benchmark/gen_futurex.py
new file mode 100644
index 00000000..0e2690d6
--- /dev/null
+++ b/utils/prepare_benchmark/gen_futurex.py
@@ -0,0 +1,55 @@
+# SPDX-FileCopyrightText: 2025 MiromindAI
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Generator, MutableMapping
+
+from datasets import load_dataset
+
+from utils.prepare_benchmark.common import Task
+
+
+def gen_futurex(hf_token: str) -> Generator[Task, None, None]:
+    """
+    Generate Futurex-Online dataset tasks in MiroFlow format
+
+    Args:
+        hf_token: Hugging Face token (currently unused: load_dataset below is called without it)
+
+    Yields:
+        Task: One standardized task per sample, across every split of the dataset
+    """
+    # Load the Futurex-Online dataset (NOTE(review): hf_token is not forwarded — confirm the dataset is public)
+    dataset = load_dataset("futurex-ai/Futurex-Online")
+
+    # Process each split in the dataset
+    for split_name, split_data in dataset.items():
+        for idx, sample in enumerate(split_data):
+            # Extract task information; a sample without "id" gets a synthetic split/index id
+            task_id = sample.get("id", f"futurex_{split_name}_{idx}")
+            task_question = sample.get("prompt", "")
+            end_time = sample.get("end_time", "")
+            level = sample.get("level", "")
+
+            # Create metadata dictionary
+            metadata: MutableMapping = {
+                "level": level,
+                "end_time": end_time,
+                "source": "futurex-ai/Futurex-Online",
+                "split": split_name,
+                "original_id": sample.get("id", ""),
+                "dataset_name": "Futurex-Online"
+            }
+
+            # Create standardized Task object
+            task = Task(
+                task_id=task_id,
+                task_question=task_question,
+                ground_truth="",  # Futurex-Online doesn't have ground truth
+                file_path=None,   # No file attachments
+                metadata=metadata,
+            )
+
+            yield task
+
+    return
diff --git a/utils/prepare_benchmark/main.py b/utils/prepare_benchmark/main.py
index 9712255a..2233bfdc 100644
--- a/utils/prepare_benchmark/main.py
+++ b/utils/prepare_benchmark/main.py
@@ -18,6 +18,7 @@
 from utils.prepare_benchmark.gen_hle import gen_hle_test
 from utils.prepare_benchmark.gen_webwalkerqa import gen_webwalkerqa
 from utils.prepare_benchmark.gen_xbench_ds import gen_xbench_ds
+from utils.prepare_benchmark.gen_futurex import gen_futurex
 
 
 @dataclasses.dataclass
@@ -31,6 +32,7 @@ class _Env:
         "browsecomp-zh-test",
         "hle",
         "xbench-ds",
+        "futurex",
     )
     meta_filename = "standardized_data.jsonl"
     data_dir: pathlib.Path
@@ -108,6 +110,13 @@ def gen():
                 for x in gen_xbench_ds(env.hf_token):
                     yield x
 
+            return gen
+        case "futurex":
+
+            def gen():
+                for x in gen_futurex(env.hf_token):
+                    yield x
+
             return gen
         case _:
             raise ValueError("not supported")
diff --git a/utils/progress_check/check_futurex_progress.py b/utils/progress_check/check_futurex_progress.py
new file mode 100644
index 00000000..d8783174
--- /dev/null
+++ b/utils/progress_check/check_futurex_progress.py
@@ -0,0 +1,218 @@
+#!/usr/bin/env python3
+"""
+Futurex-Online Progress Checker
+
+This script analyzes Futurex-Online benchmark results in a log folder to count:
+- Total files processed
+- Files with status "completed" 
+- Files with predictions (final_boxed_answer)
+- Files with errors
+
+Usage:
+    python check_futurex_progress.py [LOG_FOLDER_PATH]
+
+If no path is provided, uses the current directory.
+"""
+
+import json
+import sys
+from pathlib import Path
+from typing import Dict, List, Tuple
+
+
+def analyze_futurex_results(log_folder: str) -> Tuple[
+    Dict[str, int], List[str], List[str], List[str],
+    List[Tuple[str, str]], List[Tuple[str, str]], List[Tuple[str, str]],
+]:
+    """
+    Analyze Futurex-Online benchmark results from JSON log files.
+
+    Only task_*_attempt_*.json files directly inside log_folder are scanned.
+    Missing/null fields are tolerated; an unrecognised status counts as failed.
+
+    Args:
+        log_folder: Path to folder containing task_*_attempt_*.json files
+
+    Returns:
+        (results, completed_files, running_files, failed_files,
+        prediction_files, error_files, parse_error_files); results maps counter
+        name to count, the last three lists hold (file name, detail) pairs.
+
+    Raises:
+        FileNotFoundError: If log_folder does not exist.
+    """
+    log_path = Path(log_folder)
+    if not log_path.exists():
+        raise FileNotFoundError(f"Log folder not found: {log_folder}")
+    # Sorted so the "first 5" examples shown to the user are deterministic
+    json_files = sorted(log_path.glob("task_*_attempt_*.json"))
+
+    results = {
+        "total_files": 0,
+        "completed_status": 0,
+        "running_status": 0,
+        "failed_status": 0,
+        "with_predictions": 0,
+        "without_predictions": 0,
+        "with_errors": 0,
+        "parse_errors": 0,
+    }
+    completed_files: List[str] = []
+    running_files: List[str] = []
+    failed_files: List[str] = []
+    prediction_files: List[Tuple[str, str]] = []
+    error_files: List[Tuple[str, str]] = []
+    parse_error_files: List[Tuple[str, str]] = []
+
+    print(f"Scanning {len(json_files)} files in {log_folder}...")
+    for json_file in json_files:
+        results["total_files"] += 1
+        try:
+            with open(json_file, "r", encoding="utf-8") as f:
+                data = json.load(f)
+            if not isinstance(data, dict):
+                raise ValueError(f"expected a JSON object, got {type(data).__name__}")
+        except (OSError, ValueError) as e:
+            results["parse_errors"] += 1
+            parse_error_files.append((json_file.name, str(e)))
+            print(f"Error parsing {json_file.name}: {e}")
+            continue
+
+        # Fields may be missing or null (e.g. task still running): normalise to str
+        status = str(data.get("status") or "").lower()
+        final_answer = str(data.get("final_boxed_answer") or "")
+        error_msg = str(data.get("error") or "")
+
+        # Count by status; anything unrecognised is treated as failed
+        if status == "completed":
+            results["completed_status"] += 1
+            completed_files.append(json_file.name)
+        elif status == "running":
+            results["running_status"] += 1
+            running_files.append(json_file.name)
+        else:
+            results["failed_status"] += 1
+            known = status in ("failed", "error")
+            failed_files.append(json_file.name if known else f"{json_file.name} (Unknown status: {status})")
+
+        # Count by prediction availability (long answers are truncated for display)
+        if final_answer.strip():
+            results["with_predictions"] += 1
+            preview = final_answer[:100] + "..." if len(final_answer) > 100 else final_answer
+            prediction_files.append((json_file.name, preview))
+        else:
+            results["without_predictions"] += 1
+
+        # Count by error presence
+        if error_msg.strip():
+            results["with_errors"] += 1
+            error_files.append((json_file.name, error_msg))
+
+    return (
+        results, completed_files, running_files, failed_files, prediction_files, error_files, parse_error_files,
+    )
+
+
+def display_results(
+    results: Dict[str, int],
+    completed_files: List[str],
+    running_files: List[str],
+    failed_files: List[str],
+    prediction_files: List[Tuple[str, str]],
+    error_files: List[Tuple[str, str]],
+    parse_error_files: List[Tuple[str, str]],
+) -> None:
+    """
+    Print a human-readable summary of the counters plus example files.
+
+    Percentages are relative to results["total_files"] and print as 0.0% when
+    no files were scanned, so an empty folder cannot raise ZeroDivisionError.
+    """
+    total = results["total_files"]
+    completed = results["completed_status"]
+    running = results["running_status"]
+    failed = results["failed_status"]
+    with_predictions = results["with_predictions"]
+    with_errors = results["with_errors"]
+
+    def pct(count: int) -> float:
+        return count / total * 100 if total else 0.0
+
+    print("\n" + "=" * 60)
+    print("FUTUREX-ONLINE BENCHMARK RESULTS SUMMARY")
+    print("=" * 60)
+    print(f"Total files processed:           {total:3d}")
+    print(f"Files with status 'completed':   {completed:3d} ({pct(completed):.1f}%)")
+    print(f"Files with status 'running':     {running:3d} ({pct(running):.1f}%)")
+    print(f"Files with status 'failed':      {failed:3d} ({pct(failed):.1f}%)")
+    print(f"Files with predictions:          {with_predictions:3d} ({pct(with_predictions):.1f}%)")
+    print(f"Files with errors:               {with_errors:3d} ({pct(with_errors):.1f}%)")
+    print(f"Files with parse errors:         {results['parse_errors']:3d}")
+
+    if completed > 0:
+        prediction_rate = with_predictions / completed * 100
+        print(f"\nPrediction rate (predictions/completed): {prediction_rate:.1f}%")
+
+    print("\n" + "-" * 60)
+    print(f"SUMMARY: {completed} tasks completed, {with_predictions} with predictions")
+    print("-" * 60)
+
+    # Show some example files for verification
+    for label, names in (("completed", completed_files), ("running", running_files)):
+        if names:
+            print(f"\nFirst 5 {label} files:")
+            for i, filename in enumerate(names[:5], 1):
+                print(f"  {i}. {filename}")
+            if len(names) > 5:
+                print(f"  ... and {len(names) - 5} more")
+
+    if prediction_files:
+        print("\nFirst 5 files with predictions:")
+        for i, (filename, prediction) in enumerate(prediction_files[:5], 1):
+            print(f"  {i}. {filename}")
+            print(f"     Prediction: {prediction}")
+        if len(prediction_files) > 5:
+            print(f"  ... and {len(prediction_files) - 5} more")
+
+    if error_files:
+        print("\nFiles with errors:")
+        for filename, error in error_files[:5]:
+            print(f"  - {filename}: {error[:100]}...")
+        if len(error_files) > 5:
+            print(f"  ... and {len(error_files) - 5} more")
+
+    if parse_error_files:
+        print("\nFiles with parse errors:")
+        for filename, error in parse_error_files:
+            print(f"  - {filename}: {error}")
+
+
+def main():
+    """
+    Run the progress analysis for the folder given on the command line.
+
+    Returns 0 on success and 1 when the analysis raised; the error and a
+    usage hint are printed in the latter case.
+    """
+    cli_args = sys.argv[1:]
+    if cli_args:
+        log_folder = cli_args[0]
+        print(f"Using provided folder path: {log_folder}")
+    else:
+        log_folder = "."
+        print(f"No folder path provided, using current directory: {log_folder}")
+
+    try:
+        print(f"Analyzing Futurex-Online benchmark results in: {log_folder}")
+        analysis = analyze_futurex_results(log_folder)
+        display_results(*analysis)
+    except Exception as e:
+        print(f"Error: {e}")
+        print(f"\nUsage: python {sys.argv[0]} [LOG_FOLDER_PATH]")
+        print(f"Example: python {sys.argv[0]} logs/futurex-online-test")
+        return 1
+    return 0
+
+
+if __name__ == "__main__":
+    exit(main())

From 287a7bcc2ecdfd1972edb735e3a05e58ecdd0287 Mon Sep 17 00:00:00 2001
From: JubSteven <1120395085@qq.com>
Date: Thu, 18 Sep 2025 11:57:15 +0800
Subject: [PATCH 02/12] upd: support multiple eval for futurex and add relevant
 doc.

---
 docs/mkdocs/docs/futurex.md                   | 258 ++++++++++++++++++
 docs/mkdocs/mkdocs.yml                        |   1 +
 scripts/run_evaluate_multiple_runs_futurex.sh | 121 ++++++++
 3 files changed, 380 insertions(+)
 create mode 100644 docs/mkdocs/docs/futurex.md
 create mode 100755 scripts/run_evaluate_multiple_runs_futurex.sh

diff --git a/docs/mkdocs/docs/futurex.md b/docs/mkdocs/docs/futurex.md
new file mode 100644
index 00000000..bd022806
--- /dev/null
+++ b/docs/mkdocs/docs/futurex.md
@@ -0,0 +1,258 @@
+# Futurex-Online
+
+MiroFlow's evaluation on the Futurex-Online benchmark demonstrates capabilities in future event prediction tasks.
+
+---
+
+## Dataset Overview
+
+!!! info "Futurex-Online Dataset"
+    The Futurex-Online dataset consists of 61 prediction tasks covering various future events including:
+    - Political events (referendums, elections)
+    - Sports outcomes (football matches)
+    - Legal proceedings
+    - Economic indicators
+
+!!! abstract "Key Dataset Characteristics"
+    - **Total Tasks**: 61
+    - **Task Type**: Future event prediction
+    - **Answer Format**: Boxed answers (\\boxed{Yes/No} or \\boxed{A/B/C})
+    - **Ground Truth**: Not available (prediction tasks)
+    - **Resolution Date**: Around 2025-09-21 (GMT+8)
+
+---
+
+## Reproduction Guide
+
+!!! note "Reproducibility Instructions"
+    This section provides step-by-step instructions to reproduce our Futurex-Online benchmark evaluation. Since this is a prediction dataset without ground truth, we focus on execution traces and response generation.
+
+### Step 1: Prepare the Futurex-Online Dataset
+
+!!! tip "Dataset Setup"
+    Use the integrated prepare-benchmark command to download and process the dataset:
+
+```bash title="Download Futurex-Online Dataset"
+uv run main.py prepare-benchmark get futurex
+```
+
+This will create the standardized dataset at `data/futurex/standardized_data.jsonl`.
+
+### Step 2: Configure API Keys
+
+!!! warning "API Key Configuration"
+    Set up the required API keys for model access and tool functionality. Update the `.env` file to include the following keys:
+
+```env title=".env Configuration"
+# For searching and web scraping
+SERPER_API_KEY="xxx"
+JINA_API_KEY="xxx"
+
+# For Linux sandbox (code execution environment)
+E2B_API_KEY="xxx"
+
+# We use Claude-3.5-Sonnet with OpenRouter backend to initialize the LLM
+OPENROUTER_API_KEY="xxx"
+OPENROUTER_BASE_URL="https://openrouter.ai/api/v1"
+
+# Used for Claude vision understanding
+ANTHROPIC_API_KEY="xxx"
+
+# Used for Gemini vision
+GEMINI_API_KEY="xxx"
+
+# Used for the LLM judge, reasoning, o3 hints, etc.
+OPENAI_API_KEY="xxx"
+OPENAI_BASE_URL="https://api.openai.com/v1"
+```
+
+### Step 3: Run the Evaluation
+
+!!! example "Evaluation Execution"
+    Execute the following command to run evaluation on the Futurex-Online dataset. Currently, the basic `agent_quickstart_1` configuration is used.
+
+```bash title="Run Futurex-Online Evaluation"
+uv run main.py common-benchmark --config_file_name=agent_quickstart_1 benchmark=futurex output_dir="logs/futurex/$(date +"%Y%m%d_%H%M")"
+```
+
+!!! tip "Progress Monitoring and Resume"
+    To check the progress while running:
+    
+    ```bash title="Check Progress"
+    uv run utils/progress_check/check_futurex_progress.py $PATH_TO_LOG
+    ```
+    
+    If you need to resume an interrupted evaluation, specify the same output directory to continue from where you left off.
+
+    ```bash title="Resume Evaluation, e.g."
+    uv run main.py common-benchmark --config_file_name=agent_quickstart_1 benchmark=futurex output_dir="logs/futurex/20250918_1010"
+    ```
+
+### Step 4: Extract Results
+
+!!! example "Result Extraction"
+    After evaluation completion, extract the results using the provided utility:
+
+```bash title="Extract Results"
+uv run utils/extract_futurex_results.py $PATH_TO_LOG
+```
+
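+This will generate `futurex_submission.jsonl` in the results directory (use `-o` to choose another path), with one JSON object per task:
+- `id`: Task identifier
+- `prediction`: Extracted boxed answer
+- `vote_counts`: Per-answer vote tally (only present when multiple runs are aggregated)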
+
+---
+
+## Sample Task Examples
+
+### Political Prediction
+```
+Task: "Will the 2025 Guinea referendum pass? (resolved around 2025-09-21 (GMT+8))"
+Expected Format: \boxed{Yes} or \boxed{No}
+```
+
+### Sports Prediction
+```
+Task: "Brighton vs. Tottenham (resolved around 2025-09-21 (GMT+8))
+A. Brighton win on 2025-09-20
+B. Brighton vs. Tottenham end in a draw  
+C. Tottenham win on 2025-09-20"
+Expected Format: \boxed{A}, \boxed{B}, or \boxed{C}
+```
+
+---
+
+## Multiple Runs and Voting
+
+!!! tip "Improving Prediction Accuracy"
+    For better prediction accuracy, you can run multiple evaluations and use voting mechanisms to aggregate results. This approach helps reduce randomness and improve the reliability of predictions.
+
+### Step 1: Run Multiple Evaluations
+
+Use the multiple runs script to execute several independent evaluations:
+
+```bash title="Run Multiple Evaluations"
+./scripts/run_evaluate_multiple_runs_futurex.sh
+```
+
+This script will:
+- Run 3 independent evaluations by default (configurable with `NUM_RUNS`)
+- Execute all tasks in parallel for efficiency
+- Generate separate result files for each run in `run_1/`, `run_2/`, etc.
+- Create a consolidated `futurex_submission.jsonl` file with voting results
+
+### Step 2: Customize Multiple Runs
+
+You can customize the evaluation parameters:
+
+```bash title="Custom Multiple Runs"
+# Run 5 evaluations with limited tasks for testing
+NUM_RUNS=5 MAX_TASKS=10 ./scripts/run_evaluate_multiple_runs_futurex.sh
+
+# Use different agent configuration
+AGENT_SET=agent_gaia-validation ./scripts/run_evaluate_multiple_runs_futurex.sh
+
+# Adjust concurrency for resource management
+MAX_CONCURRENT=3 ./scripts/run_evaluate_multiple_runs_futurex.sh
+```
+
+### Step 3: Voting and Aggregation
+
+After multiple runs, the system automatically:
+
+1. **Extracts predictions** from all runs using `utils/extract_futurex_results.py`
+2. **Applies majority voting** to aggregate predictions across runs
+3. **Generates submission file** in the format required by FutureX platform
+4. **Provides voting statistics** showing prediction distribution across runs
+
+The voting process works as follows:
+- **Majority Vote**: Most common prediction across all runs wins
+- **Tie-breaking**: If tied, chooses the prediction that appeared earliest across all runs
+- **Vote Counts**: Tracks how many runs predicted each option
+- **Confidence Indicators**: High agreement indicates more reliable predictions
+
+### Step 4: Analyze Voting Results
+
+Check the generated files for voting analysis:
+
+```bash title="Check Voting Results"
+# View submission file with voting results
+cat logs/futurex/agent_quickstart_1_*/futurex_submission.jsonl
+
+# Check individual run results
+ls logs/futurex/agent_quickstart_1_*/run_*/
+
+# Check the progress of an individual run (task logs live inside run_*/)
+uv run python utils/progress_check/check_futurex_progress.py logs/futurex/agent_quickstart_1_*/run_1
+```
+
+### Manual Voting Aggregation
+
+You can also manually run the voting aggregation:
+
+```bash title="Manual Voting Aggregation"
+# Aggregate multiple runs with majority voting
+uv run python utils/extract_futurex_results.py logs/futurex/agent_quickstart_1_* --aggregate
+
+# Force single run mode (if needed)
+uv run python utils/extract_futurex_results.py logs/futurex/agent_quickstart_1_*/run_1 --single
+
+# Specify custom output file
+uv run python utils/extract_futurex_results.py logs/futurex/agent_quickstart_1_* -o my_voted_predictions.jsonl
+```
+
+### Voting Output Format
+
+The voting aggregation generates a submission file with the following format:
+
+```json
+{"id": "687104310a994c0060ef87a9", "prediction": "No", "vote_counts": {"No": 2}}
+{"id": "68a9b46e961bd3003c8f006b", "prediction": "Yes", "vote_counts": {"Yes": 2}}
+```
+
+The output includes:
+- **`id`**: Task identifier
+- **`prediction`**: Final voted prediction (without `\boxed{}` wrapper)
+- **`vote_counts`**: Dictionary showing how many runs predicted each option
+
+For example, `"vote_counts": {"No": 2}` means 2 out of 2 runs predicted "No", indicating high confidence.
+
+---
+
+## Evaluation Notes
+
+!!! warning "No Ground Truth Available"
+    Since Futurex-Online is a prediction dataset, there are no ground truth answers available for evaluation. The focus is on:
+    - Response generation quality
+    - Reasoning process documentation
+    - Prediction confidence and methodology
+
+!!! info "Output Analysis"
+    The evaluation generates detailed execution traces showing:
+    - Research process for each prediction
+    - Information gathering from web sources
+    - Reasoning chains leading to predictions
+    - Final boxed answers in required format
+
+### Directory Structure
+
+After running multiple evaluations, you'll find the following structure:
+
+```
+logs/futurex/agent_quickstart_1_YYYYMMDD_HHMM/
+├── futurex_submission.jsonl          # Final voted predictions
+├── run_1/                            # First run results
+│   ├── benchmark_results.jsonl       # Individual task results
+│   ├── benchmark_results_pass_at_1_accuracy.txt
+│   └── task_*_attempt_1.json        # Detailed execution traces
+├── run_2/                            # Second run results
+│   └── ... (same structure as run_1)
+├── run_1_output.log                  # Run 1 execution log
+└── run_2_output.log                  # Run 2 execution log
+```
+
+---
+
+!!! info "Documentation Info"
+    **Last Updated:** September 2025 · **Doc Contributor:** Team @ MiroMind AI
diff --git a/docs/mkdocs/mkdocs.yml b/docs/mkdocs/mkdocs.yml
index bf17b63b..808282f7 100644
--- a/docs/mkdocs/mkdocs.yml
+++ b/docs/mkdocs/mkdocs.yml
@@ -52,6 +52,7 @@ nav:
     - Benchmarks: 
       - GAIA-Validation: gaia_validation.md
       - GAIA-Test: gaia_test.md
+      - FutureX: futurex.md
     - Add New Benchmarks: contribute_benchmarks.md
 
   - Tools: 
diff --git a/scripts/run_evaluate_multiple_runs_futurex.sh b/scripts/run_evaluate_multiple_runs_futurex.sh
new file mode 100755
index 00000000..c441696d
--- /dev/null
+++ b/scripts/run_evaluate_multiple_runs_futurex.sh
@@ -0,0 +1,121 @@
+#!/bin/bash
+
+# SPDX-FileCopyrightText: 2025 MiromindAI
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# Multiple runs FutureX evaluation script
+# Based on the working command: uv run main.py common-benchmark --config_file_name=agent_quickstart_1 benchmark=futurex output_dir=logs/futurex-test
+
+# Configuration parameters
+NUM_RUNS=${NUM_RUNS:-3}
+MAX_TASKS=${MAX_TASKS:-null}
+MAX_CONCURRENT=${MAX_CONCURRENT:-5}
+BENCHMARK_NAME="futurex"
+AGENT_SET=${AGENT_SET:-"agent_quickstart_1"}
+
+# TODO: Add more settings like message ID and max turns, currently not supported using agent_quickstart_1
+# ADD_MESSAGE_ID=${ADD_MESSAGE_ID:-"false"}
+# MAX_TURNS=${MAX_TURNS:-1}
+
+# Set results directory with timestamp
+TIMESTAMP=$(date +%Y%m%d_%H%M)
+RESULTS_DIR="logs/${BENCHMARK_NAME}/${AGENT_SET}_${TIMESTAMP}"
+
+export LOGGER_LEVEL="INFO"
+
+echo "🚀 Starting $NUM_RUNS runs of FutureX evaluation..."
+echo "📊 Using max_tasks: $MAX_TASKS (set MAX_TASKS=null for full dataset)"
+echo "📊 Using max_concurrent: $MAX_CONCURRENT"
+echo "📁 Results will be saved in: $RESULTS_DIR"
+
+# Create results directory
+mkdir -p "$RESULTS_DIR"
+
+# Launch all parallel tasks
+for i in $(seq 1 $NUM_RUNS); do
+    echo "=========================================="
+    echo "🚀 Launching experiment $i/$NUM_RUNS"
+    echo "📝 Output log: $RESULTS_DIR/run_${i}_output.log"
+    echo "=========================================="
+    
+    # Set specific identifier for this run
+    RUN_ID="run_$i"
+    
+    # Run experiment (background execution)
+    (
+        echo "Starting run $i at $(date)"
+        uv run main.py common-benchmark \
+            --config_file_name=$AGENT_SET \
+            benchmark=$BENCHMARK_NAME \
+            benchmark.execution.max_tasks=$MAX_TASKS \
+            benchmark.execution.max_concurrent=$MAX_CONCURRENT \
+            benchmark.execution.pass_at_k=1 \
+            output_dir=${RESULTS_DIR}/$RUN_ID \
+            hydra.run.dir=${RESULTS_DIR}/$RUN_ID \
+            > "$RESULTS_DIR/${RUN_ID}_output.log" 2>&1
+        
+        # Check if run was successful
+        if [ $? -eq 0 ]; then
+            echo "✅ Run $i completed successfully at $(date)"
+            RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1)
+            if [ -f "$RESULT_FILE" ]; then
+                echo "📊 Results saved to $RESULT_FILE"
+            else
+                echo "⚠️  Warning: Result file not found for run $i"
+            fi
+        else
+            echo "❌ Run $i failed at $(date)!"
+        fi
+    ) &
+    
+    # Small delay between launches
+    sleep 2
+done
+
+echo "🎯 All $NUM_RUNS runs have been launched in parallel"
+echo "⏳ Waiting for all runs to complete..."
+
+# Wait for all background tasks to complete
+wait
+
+echo "=========================================="
+echo "🎉 All $NUM_RUNS runs completed!"
+echo "=========================================="
+
+# Extract predictions and format for FutureX submission
+echo "📤 Extracting predictions and formatting for FutureX submission..."
+uv run python utils/extract_futurex_results.py "$RESULTS_DIR"
+
+# Check status and provide user-friendly message
+if [ $? -eq 0 ]; then
+    echo "✅ Submission file generated: $RESULTS_DIR/futurex_submission.jsonl"
+    echo "📋 You can now upload this file to the FutureX test server."
+else
+    echo "❌ Failed to generate submission file. Please check the logs for details."
+fi
+
+# Show progress summary
+echo "=========================================="
+echo "📊 Progress Summary:"
+echo "=========================================="
+
+echo "=========================================="
+echo "🎯 Multiple runs FutureX evaluation completed!"
+echo "📁 Check results in: $RESULTS_DIR"
+echo "📝 Check individual run logs: $RESULTS_DIR/run_*_output.log"
+echo "📤 Check submission file: $RESULTS_DIR/futurex_submission.jsonl"
+echo "=========================================="
+echo ""
+echo "💡 Usage examples:"
+echo "   # Default: 3 runs with full dataset"
+echo "   ./scripts/run_evaluate_multiple_runs_futurex.sh"
+echo ""
+echo "   # Custom parameters"
+echo "   NUM_RUNS=5 MAX_TASKS=10 MAX_CONCURRENT=3 ./scripts/run_evaluate_multiple_runs_futurex.sh"
+echo ""
+echo "   # Different agent configuration"
+echo "   AGENT_SET=agent_gaia-validation ./scripts/run_evaluate_multiple_runs_futurex.sh"
+echo ""
+echo "   # Limited tasks for testing"
+echo "   MAX_TASKS=5 ./scripts/run_evaluate_multiple_runs_futurex.sh"
\ No newline at end of file

From bf43b373484c07291b37eb3bfb1382af1c6af546 Mon Sep 17 00:00:00 2001
From: JubSteven <1120395085@qq.com>
Date: Thu, 18 Sep 2025 14:59:39 +0800
Subject: [PATCH 03/12] upd: fix bugs with doc for futurex.

---
 docs/mkdocs/docs/download_datasets.md |  2 ++
 docs/mkdocs/docs/futurex.md           | 21 +++++++++++++++------
 scripts/run_prepare_benchmark.sh      |  3 ++-
 3 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/docs/mkdocs/docs/download_datasets.md b/docs/mkdocs/docs/download_datasets.md
index 88ef11b7..bd67c2b5 100644
--- a/docs/mkdocs/docs/download_datasets.md
+++ b/docs/mkdocs/docs/download_datasets.md
@@ -79,6 +79,7 @@ uv run main.py prepare-benchmark get browsecomp-test
 uv run main.py prepare-benchmark get browsecomp-zh-test
 uv run main.py prepare-benchmark get hle
 uv run main.py prepare-benchmark get xbench-ds
+uv run main.py prepare-benchmark get futurex
 ```
 
 ### What This Script Does
@@ -94,6 +95,7 @@ uv run main.py prepare-benchmark get xbench-ds
         - `browsecomp-zh-test` - Chinese BrowseComp test set
         - `hle` - HLE dataset
         - `xbench-ds` - xbench-DeepSearch dataset
+        - `futurex` - Futurex-Online dataset
 
 ### Customizing Dataset Selection
 
diff --git a/docs/mkdocs/docs/futurex.md b/docs/mkdocs/docs/futurex.md
index bd022806..ea44b78d 100644
--- a/docs/mkdocs/docs/futurex.md
+++ b/docs/mkdocs/docs/futurex.md
@@ -8,12 +8,15 @@ MiroFlow's evaluation on the Futurex-Online benchmark demonstrates capabilities
 
 !!! info "Futurex-Online Dataset"
     The Futurex-Online dataset consists of 61 prediction tasks covering various future events including:
+
     - Political events (referendums, elections)
     - Sports outcomes (football matches)
     - Legal proceedings
     - Economic indicators
 
+
 !!! abstract "Key Dataset Characteristics"
+
     - **Total Tasks**: 61
     - **Task Type**: Future event prediction
     - **Answer Format**: Boxed answers (\\boxed{Yes/No} or \\boxed{A/B/C})
@@ -22,10 +25,10 @@ MiroFlow's evaluation on the Futurex-Online benchmark demonstrates capabilities
 
 ---
 
-## Reproduction Guide
+## Quick Start Guide
 
-!!! note "Reproducibility Instructions"
-    This section provides step-by-step instructions to reproduce our Futurex-Online benchmark evaluation. Since this is a prediction dataset without ground truth, we focus on execution traces and response generation.
+!!! note "Quick Start Instructions"
+    This section provides step-by-step instructions to run the Futurex-Online benchmark and prepare submission results. Since this is a prediction dataset without ground truth, we focus on execution traces and response generation. **Note**: This is a quick start guide for running the benchmark, not for reproducing exact submitted results.
 
 ### Step 1: Prepare the Futurex-Online Dataset
 
@@ -51,7 +54,7 @@ JINA_API_KEY="xxx"
 # For Linux sandbox (code execution environment)
 E2B_API_KEY="xxx"
 
-# We use Claude-3.5-Sonnet with OpenRouter backend to initialize the LLM
+# We use Claude-3.7-Sonnet with OpenRouter backend to initialize the LLM
 OPENROUTER_API_KEY="xxx"
 OPENROUTER_BASE_URL="https://openrouter.ai/api/v1"
 
@@ -69,7 +72,7 @@ OPENAI_BASE_URL="https://api.openai.com/v1"
 ### Step 3: Run the Evaluation
 
 !!! example "Evaluation Execution"
-    Execute the following command to run evaluation on the Futurex-Online dataset, currently the basic `agent_quickstart_1` is used.
+    Execute the following command to run evaluation on the Futurex-Online dataset. This uses the basic `agent_quickstart_1` configuration for quick start purposes.
 
 ```bash title="Run Futurex-Online Evaluation"
 uv run main.py common-benchmark --config_file_name=agent_quickstart_1 benchmark=futurex output_dir="logs/futurex/$(date +"%Y%m%d_%H%M")"
@@ -98,6 +101,7 @@ uv run utils/extract_futurex_results.py --log_dir logs/futurex/$(date +"%Y%m%d_%
 ```
 
 This will generate:
+
 - `futurex_results.json`: Detailed results for each task
 - `futurex_summary.json`: Summary statistics
 - `futurex_predictions.csv`: Predictions in CSV format
@@ -126,7 +130,7 @@ Expected Format: \boxed{A}, \boxed{B}, or \boxed{C}
 ## Multiple Runs and Voting
 
 !!! tip "Improving Prediction Accuracy"
-    For better prediction accuracy, you can run multiple evaluations and use voting mechanisms to aggregate results. This approach helps reduce randomness and improve the reliability of predictions.
+    For better prediction accuracy, you can run multiple evaluations and use voting mechanisms to aggregate results. This approach helps reduce randomness and improve the reliability of predictions. **Note**: This is a quick start approach; production submissions may use more sophisticated configurations.
 
 ### Step 1: Run Multiple Evaluations
 
@@ -137,6 +141,7 @@ Use the multiple runs script to execute several independent evaluations:
 ```
 
 This script will:
+
 - Run 3 independent evaluations by default (configurable with `NUM_RUNS`)
 - Execute all tasks in parallel for efficiency
 - Generate separate result files for each run in `run_1/`, `run_2/`, etc.
@@ -167,6 +172,7 @@ After multiple runs, the system automatically:
 4. **Provides voting statistics** showing prediction distribution across runs
 
 The voting process works as follows:
+
 - **Majority Vote**: Most common prediction across all runs wins
 - **Tie-breaking**: If tied, chooses the prediction that appeared earliest across all runs
 - **Vote Counts**: Tracks how many runs predicted each option
@@ -212,6 +218,7 @@ The voting aggregation generates a submission file with the following format:
 ```
 
 The output includes:
+
 - **`id`**: Task identifier
 - **`prediction`**: Final voted prediction (without `\boxed{}` wrapper)
 - **`vote_counts`**: Dictionary showing how many runs predicted each option
@@ -224,12 +231,14 @@ For example, `"vote_counts": {"No": 2}` means 2 out of 2 runs predicted "No", in
 
 !!! warning "No Ground Truth Available"
     Since Futurex-Online is a prediction dataset, there are no ground truth answers available for evaluation. The focus is on:
+
     - Response generation quality
     - Reasoning process documentation
     - Prediction confidence and methodology
 
 !!! info "Output Analysis"
     The evaluation generates detailed execution traces showing:
+
     - Research process for each prediction
     - Information gathering from web sources
     - Reasoning chains leading to predictions
diff --git a/scripts/run_prepare_benchmark.sh b/scripts/run_prepare_benchmark.sh
index a00f7a1a..7574ed3e 100644
--- a/scripts/run_prepare_benchmark.sh
+++ b/scripts/run_prepare_benchmark.sh
@@ -20,4 +20,5 @@ uv run main.py prepare-benchmark get webwalkerqa
 uv run main.py prepare-benchmark get browsecomp-test
 uv run main.py prepare-benchmark get browsecomp-zh-test
 uv run main.py prepare-benchmark get hle
-uv run main.py prepare-benchmark get xbench-ds
\ No newline at end of file
+uv run main.py prepare-benchmark get xbench-ds
+uv run main.py prepare-benchmark get futurex
\ No newline at end of file

From d1e16375bfe83f537374e7c3f2fcfea85b0d3aa1 Mon Sep 17 00:00:00 2001
From: JubSteven <1120395085@qq.com>
Date: Thu, 18 Sep 2025 15:12:04 +0800
Subject: [PATCH 04/12] debug: fix wrong calling path.

---
 docs/mkdocs/docs/futurex.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/mkdocs/docs/futurex.md b/docs/mkdocs/docs/futurex.md
index ea44b78d..db20f4df 100644
--- a/docs/mkdocs/docs/futurex.md
+++ b/docs/mkdocs/docs/futurex.md
@@ -97,7 +97,7 @@ uv run main.py common-benchmark --config_file_name=agent_quickstart_1 benchmark=
     After evaluation completion, extract the results using the provided utility:
 
 ```bash title="Extract Results"
-uv run utils/extract_futurex_results.py --log_dir logs/futurex/$(date +"%Y%m%d_%H%M")
+uv run utils/extract_futurex_results.py logs/futurex/$(date +"%Y%m%d_%H%M")
 ```
 
 This will generate:

From eb6f302d7f357c4344b68084ab00271f59f40959 Mon Sep 17 00:00:00 2001
From: JubSteven <1120395085@qq.com>
Date: Wed, 24 Sep 2025 10:03:25 +0800
Subject: [PATCH 05/12] add preparation for finsearchcomp.

---
 scripts/run_prepare_benchmark.sh             |  3 +-
 utils/prepare_benchmark/gen_finsearchcomp.py | 55 ++++++++++++++++++++
 utils/prepare_benchmark/main.py              |  8 +++
 3 files changed, 65 insertions(+), 1 deletion(-)
 create mode 100644 utils/prepare_benchmark/gen_finsearchcomp.py

diff --git a/scripts/run_prepare_benchmark.sh b/scripts/run_prepare_benchmark.sh
index 7574ed3e..837b2e45 100644
--- a/scripts/run_prepare_benchmark.sh
+++ b/scripts/run_prepare_benchmark.sh
@@ -21,4 +21,5 @@ uv run main.py prepare-benchmark get browsecomp-test
 uv run main.py prepare-benchmark get browsecomp-zh-test
 uv run main.py prepare-benchmark get hle
 uv run main.py prepare-benchmark get xbench-ds
-uv run main.py prepare-benchmark get futurex
\ No newline at end of file
+uv run main.py prepare-benchmark get futurex
+uv run main.py prepare-benchmark get finsearchcomp
\ No newline at end of file
diff --git a/utils/prepare_benchmark/gen_finsearchcomp.py b/utils/prepare_benchmark/gen_finsearchcomp.py
new file mode 100644
index 00000000..adc01451
--- /dev/null
+++ b/utils/prepare_benchmark/gen_finsearchcomp.py
@@ -0,0 +1,55 @@
+# SPDX-FileCopyrightText: 2025 MiromindAI
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Generator, MutableMapping
+
+from datasets import load_dataset
+
+from utils.prepare_benchmark.common import Task
+
+def gen_finsearchcomp(hf_token: str) -> Generator[Task, None, None]:
+    """
+    Generate FinSearchComp dataset tasks in MiroFlow format
+    
+    Args:
+        hf_token: Hugging Face token (currently unused; load_dataset is called without it)
+        
+    Yields:
+        Task: Standardized task objects
+    """
+    dataset = load_dataset("ByteSeedXpert/FinSearchComp")
+    
+    for split_name, split_data in dataset.items():
+        for idx, sample in enumerate(split_data):
+            # Extract task information
+            task_id = sample.get("prompt_id", f"finsearchcomp_{split_name}_{idx}")
+            task_question = sample.get("prompt", "")
+            response_reference = sample.get("response_reference", "")
+            judge_prompt_template = sample.get("judge_prompt_template", "")
+            judge_system_prompt = sample.get("judge_system_prompt", "")
+            label = sample.get("label", "")
+            
+            # Create metadata dictionary
+            metadata: MutableMapping = {
+                "judge_prompt_template": judge_prompt_template,
+                "judge_system_prompt": judge_system_prompt,
+                "label": label,
+                "source": "ByteSeedXpert/FinSearchComp",
+                "split": split_name,
+                "original_id": sample.get("prompt_id", ""),
+                "dataset_name": "FinSearchComp"
+            }
+            
+            # Create standardized Task object
+            task = Task(
+                task_id=task_id,
+                task_question=task_question,
+                ground_truth=response_reference,  # Futurex-Online doesn't have ground truth
+                file_path=None,   # No file attachments
+                metadata=metadata,
+            )
+            
+            yield task
+    return
+    
\ No newline at end of file
diff --git a/utils/prepare_benchmark/main.py b/utils/prepare_benchmark/main.py
index 2233bfdc..12db9cf4 100644
--- a/utils/prepare_benchmark/main.py
+++ b/utils/prepare_benchmark/main.py
@@ -19,6 +19,7 @@
 from utils.prepare_benchmark.gen_webwalkerqa import gen_webwalkerqa
 from utils.prepare_benchmark.gen_xbench_ds import gen_xbench_ds
 from utils.prepare_benchmark.gen_futurex import gen_futurex
+from utils.prepare_benchmark.gen_finsearchcomp import gen_finsearchcomp
 
 
 @dataclasses.dataclass
@@ -33,6 +34,7 @@ class _Env:
         "hle",
         "xbench-ds",
         "futurex",
+        "finsearchcomp",
     )
     meta_filename = "standardized_data.jsonl"
     data_dir: pathlib.Path
@@ -117,6 +119,12 @@ def gen():
                 for x in gen_futurex(env.hf_token):
                     yield x
 
+            return gen
+        case "finsearchcomp":
+            def gen():
+                for x in gen_finsearchcomp(env.hf_token):
+                    yield x
+
             return gen
         case _:
             raise ValueError("not supported")

From 4dabaee7bb86470cbca39264229fae629876e9a0 Mon Sep 17 00:00:00 2001
From: JubSteven <1120395085@qq.com>
Date: Wed, 24 Sep 2025 11:26:14 +0800
Subject: [PATCH 06/12] add a preliminary version of finsearchcomp benchmark.

---
 common_benchmark.py                          |  2 +
 config/agent_finsearchcomp.yaml              | 74 +++++++++++++++
 config/benchmark/finsearchcomp.yaml          | 19 ++++
 src/core/pipeline.py                         |  7 +-
 utils/eval_answer_from_log.py                |  3 +
 utils/eval_utils.py                          | 96 ++++++++++++++++++++
 utils/prepare_benchmark/gen_finsearchcomp.py | 29 ++++--
 utils/util_llm_parallel_thinking.py          |  2 +-
 utils/util_llm_simple_voting.py              |  2 +-
 9 files changed, 222 insertions(+), 12 deletions(-)
 create mode 100644 config/agent_finsearchcomp.yaml
 create mode 100644 config/benchmark/finsearchcomp.yaml

diff --git a/common_benchmark.py b/common_benchmark.py
index 8c93e356..da268d27 100644
--- a/common_benchmark.py
+++ b/common_benchmark.py
@@ -210,6 +210,7 @@ async def run_single_task(self, task: BenchmarkTask) -> BenchmarkResult:
                             sub_agent_tool_managers=self.sub_agent_tool_managers,
                             output_formatter=self.output_formatter,
                             ground_truth=task.ground_truth,
+                            metadata=task.metadata,
                             log_path=self.output_dir
                             / f"task_{task.task_id}_attempt_{attempt}.json",
                         )
@@ -242,6 +243,7 @@ async def run_single_task(self, task: BenchmarkTask) -> BenchmarkResult:
                             question=task.task_question,
                             target=task.ground_truth,
                             predicted_answer=attempt_result["model_boxed_answer"],
+                            metadata=task.metadata,
                         )
                         attempt_result["judge_result"] = evaluation_result
                         attempt_result["is_correct"] = evaluation_result == "CORRECT"
diff --git a/config/agent_finsearchcomp.yaml b/config/agent_finsearchcomp.yaml
new file mode 100644
index 00000000..37f42721
--- /dev/null
+++ b/config/agent_finsearchcomp.yaml
@@ -0,0 +1,74 @@
+defaults:
+  - benchmark: finsearchcomp
+  - override hydra/job_logging: none
+  - _self_  # Allow defining variables at the top of this file
+
+
+main_agent:
+  prompt_class: MainAgentPrompt_GAIA
+  llm: 
+    provider_class: "ClaudeOpenRouterClient"
+    model_name: "anthropic/claude-3.7-sonnet"
+    async_client: true
+    temperature: 0.3
+    top_p: 0.95
+    min_p: 0.0
+    top_k: -1
+    max_tokens: 32000
+    openrouter_api_key: "${oc.env:OPENROUTER_API_KEY,???}"
+    openrouter_base_url: "${oc.env:OPENROUTER_BASE_URL,https://openrouter.ai/api/v1}"
+    openrouter_provider: "anthropic"
+    disable_cache_control: false
+    keep_tool_result: -1
+    oai_tool_thinking: false
+  
+  tool_config:
+    - tool-reasoning
+
+  max_turns: -1  # Maximum number of turns for main agent execution
+  max_tool_calls_per_turn: 10  # Maximum number of tool calls per turn
+  
+  input_process:
+    o3_hint: true
+  output_process:
+    o3_final_answer: true
+
+  openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for o3 hints and final answer extraction
+  add_message_id: true
+  keep_tool_result: -1
+  chinese_context: "${oc.env:CHINESE_CONTEXT,false}"
+
+
+sub_agents:
+  agent-worker:
+    prompt_class: SubAgentWorkerPrompt
+    llm: 
+      provider_class: "ClaudeOpenRouterClient"
+      model_name: "anthropic/claude-3.7-sonnet"
+      async_client: true
+      temperature: 0.3
+      top_p: 0.95
+      min_p: 0.0
+      top_k: -1
+      max_tokens: 32000
+      openrouter_api_key: "${oc.env:OPENROUTER_API_KEY,???}"
+      openrouter_base_url: "${oc.env:OPENROUTER_BASE_URL,https://openrouter.ai/api/v1}"
+      openrouter_provider: "anthropic"
+      disable_cache_control: false
+      keep_tool_result: -1
+      oai_tool_thinking: false
+    
+    tool_config:
+      - tool-searching
+      - tool-image-video
+      - tool-reading
+      - tool-code
+      - tool-audio
+
+    max_turns: -1  # Maximum number of turns for sub-agent execution
+    max_tool_calls_per_turn: 10  # Maximum number of tool calls per turn
+
+
+# Can define some top-level or default parameters here
+output_dir: logs/
+data_dir: "${oc.env:DATA_DIR,data}"  # Points to where data is stored
diff --git a/config/benchmark/finsearchcomp.yaml b/config/benchmark/finsearchcomp.yaml
new file mode 100644
index 00000000..0aaa8211
--- /dev/null
+++ b/config/benchmark/finsearchcomp.yaml
@@ -0,0 +1,19 @@
+# config/benchmark/finsearchcomp.yaml
+defaults:
+  - default
+  - _self_
+
+name: "finsearchcomp"
+
+data:
+  data_dir: "${data_dir}/finsearchcomp"  # Path to finsearchcomp dataset
+  metadata_file: "standardized_data.jsonl"  # Metadata filename
+  whitelist: []  # Optional: List of specific task_ids to run
+
+execution:
+  max_tasks: null      # null = no limit, or specify a number
+  max_concurrent: 5    # Number of parallel tasks
+  pass_at_k: 1         # Number of attempts per task
+
+# OpenAI API key for evaluation (required for finsearchcomp since it has ground truth)
+openai_api_key: "${oc.env:OPENAI_API_KEY,???}"
diff --git a/src/core/pipeline.py b/src/core/pipeline.py
index ea5b62c7..664ae47a 100644
--- a/src/core/pipeline.py
+++ b/src/core/pipeline.py
@@ -31,6 +31,7 @@ async def execute_task_pipeline(
     output_formatter: OutputFormatter,
     log_path: pathlib.Path,
     ground_truth: str | None = None,
+    metadata: dict | None = None,
 ) -> tuple[str, str, pathlib.Path]:
     """
     Executes the full pipeline for a single task.
@@ -61,7 +62,11 @@ async def execute_task_pipeline(
         task_id=task_id,
         task_file_name=task_file_name,
         ground_truth=ground_truth,
-        input={"task_description": task_description, "task_file_name": task_file_name},
+        input={
+            "task_description": task_description, 
+            "task_file_name": task_file_name,
+            "metadata": metadata or {}
+        },
     )
 
     main_agent_llm_client = None
diff --git a/utils/eval_answer_from_log.py b/utils/eval_answer_from_log.py
index 88af7f0a..838da352 100644
--- a/utils/eval_answer_from_log.py
+++ b/utils/eval_answer_from_log.py
@@ -33,6 +33,8 @@ async def main(input_dir: str, benchmark_name: str):
             question = data.get("task_question", "")
             ground_truth = data.get("ground_truth", "")
             predicted_answer = data.get("final_boxed_answer", "")
+            metadata = data.get("input", {}).get("metadata", {})
+            
             # If already has judge result, skip
             # if "judge_result" in data and data["judge_result"] in ("CORRECT", "INCORRECT"):
             #     print(f"Log {log_file} already has judge result: {data['judge_result']}")
@@ -44,6 +46,7 @@ async def main(input_dir: str, benchmark_name: str):
                 question=question,
                 target=ground_truth,
                 predicted_answer=predicted_answer,
+                metadata=metadata,  # Now metadata is available from log files
             )
             print(f"{os.path.basename(log_file)}: {result}")
             # Optionally, update the log file with the result
diff --git a/utils/eval_utils.py b/utils/eval_utils.py
index 4529e56e..fd838763 100644
--- a/utils/eval_utils.py
+++ b/utils/eval_utils.py
@@ -374,18 +374,114 @@ def is_float(element: Any) -> bool:
         # return "NOT_ATTEMPTED"
 
 
+@retry(wait=wait_exponential(multiplier=5), stop=stop_after_attempt(5))
+async def verify_answer_llm_finsearchcomp(
+    openai_client: AsyncOpenAI, 
+    question: str, 
+    target: str, 
+    predicted_answer: str,
+    judge_prompt_template: str,
+    judge_system_prompt: str,
+    metadata: dict = None
+) -> str:
+    """
+    Use FinSearchComp-style LLM judge with dynamic prompts to verify if the predicted answer is correct.
+    
+    Args:
+        openai_client: OpenAI client for LLM calls
+        question: The question being answered
+        target: The correct/target answer (primary ground truth)
+        predicted_answer: The model's predicted answer
+        judge_prompt_template: The judge prompt template from metadata
+        judge_system_prompt: The judge system prompt from metadata
+        metadata: Additional metadata containing response_reference and ground_truth_finance
+        
+    Returns:
+        String indicating the evaluation result: "CORRECT", "INCORRECT", or "NOT_ATTEMPTED"
+    """
+    # Read both ground-truth variants from metadata; both are passed to the template below
+    response_reference = metadata.get("response_reference", "") if metadata else ""
+    ground_truth_finance = metadata.get("ground_truth_finance", "") if metadata else ""
+    
+    # Format the judge prompt template with the actual values
+    formatted_prompt = judge_prompt_template.format(
+        prompt=question,
+        response_reference=response_reference,
+        ground_truth=ground_truth_finance,
+        response=predicted_answer
+    )
+    
+    # Create messages with system prompt and user prompt
+    messages = [
+        {"role": "system", "content": judge_system_prompt},
+        {"role": "user", "content": formatted_prompt}
+    ]
+    
+    try:
+        # NOTE: no explicit LLM model is specified here, so we use gpt-4o-mini for consistency
+        response = await openai_client.chat.completions.create(
+            model="gpt-4o-mini",
+            messages=messages,
+            max_completion_tokens=2048,
+            temperature=0.0  # Deterministic evaluation
+        )
+        
+        content = response.choices[0].message.content
+        
+        # Print FinSearchComp judge reasoning
+        print(f"FinSearchComp LLM Judge Response: {content}")
+        
+        # Parse the response to determine if it's correct
+        # Look for common patterns in the response
+        content_lower = content.lower()
+        
+        # Check for JSON format responses
+        if "answer_score" in content_lower:
+            if '"answer_score": 1' in content or '"answer_score":1' in content:
+                return "CORRECT"
+            elif '"answer_score": 0' in content or '"answer_score":0' in content:
+                return "INCORRECT"
+        
+        # Check for score format responses
+        if "score" in content_lower:
+            if '"score": 1' in content or '"score":1' in content:
+                return "CORRECT"
+            elif '"score": 0' in content or '"score":0' in content:
+                return "INCORRECT"
+        
+        # If we can't parse the response, return NOT_ATTEMPTED
+        print(f"Warning: Could not parse FinSearchComp judge response: {content}")
+        return "NOT_ATTEMPTED"
+        
+    except Exception as e:
+        print(f"FinSearchComp LLM evaluation failed: {e}")
+        return "NOT_ATTEMPTED"
+
+
 async def verify_answer_for_datasets(
     openai_client: AsyncOpenAI,
     benchmark_name: str,
     question: str,
     target: str,
     predicted_answer: str,
+    metadata: dict = None,
 ) -> str:
     """
     Verify the answer for a given dataset.
     """
 
     try:
+        # Handle finsearchcomp with dynamic judge prompts
+        if "finsearchcomp" in benchmark_name and metadata:
+            judge_prompt_template = metadata.get("judge_prompt_template", "")
+            judge_system_prompt = metadata.get("judge_system_prompt", "")
+            
+            if judge_prompt_template and judge_system_prompt:
+                return await verify_answer_llm_finsearchcomp(
+                    openai_client, question, target, predicted_answer,
+                    judge_prompt_template, judge_system_prompt, metadata
+                )
+        
         # for all questions, do gaia scorer first, if not return CORRECT, then do others
         gaia_scorer_answer = await verify_answer_gaia(target, predicted_answer)
 
diff --git a/utils/prepare_benchmark/gen_finsearchcomp.py b/utils/prepare_benchmark/gen_finsearchcomp.py
index adc01451..f56d675a 100644
--- a/utils/prepare_benchmark/gen_finsearchcomp.py
+++ b/utils/prepare_benchmark/gen_finsearchcomp.py
@@ -26,26 +26,37 @@ def gen_finsearchcomp(hf_token: str) -> Generator[Task, None, None]:
             task_id = sample.get("prompt_id", f"finsearchcomp_{split_name}_{idx}")
             task_question = sample.get("prompt", "")
             response_reference = sample.get("response_reference", "")
-            judge_prompt_template = sample.get("judge_prompt_template", "")
-            judge_system_prompt = sample.get("judge_system_prompt", "")
-            label = sample.get("label", "")
+            ground_truth_finance = sample.get("ground_truth", "")
             
-            # Create metadata dictionary
+            # Create metadata dictionary with all original fields
             metadata: MutableMapping = {
-                "judge_prompt_template": judge_prompt_template,
-                "judge_system_prompt": judge_system_prompt,
-                "label": label,
                 "source": "ByteSeedXpert/FinSearchComp",
                 "split": split_name,
                 "original_id": sample.get("prompt_id", ""),
-                "dataset_name": "FinSearchComp"
+                "dataset_name": "FinSearchComp",
+                "response_reference": response_reference,
+                "ground_truth_finance": ground_truth_finance,
             }
             
+            # Add all other fields from sample to metadata (including judge prompts)
+            for key, value in sample.items():
+                if key not in ["prompt_id", "prompt", "response_reference", "ground_truth"]:
+                    metadata[key] = value
+            
+            # Determine the primary ground truth for evaluation
+            # Priority: response_reference > ground_truth_finance
+            if response_reference:
+                ground_truth_task = response_reference
+            elif ground_truth_finance:
+                ground_truth_task = ground_truth_finance
+            else:
+                ground_truth_task = ""  # Fallback to empty string
+            
             # Create standardized Task object
             task = Task(
                 task_id=task_id,
                 task_question=task_question,
-                ground_truth=response_reference,  # Futurex-Online doesn't have ground truth
+                ground_truth=ground_truth_task,
                 file_path=None,   # No file attachments
                 metadata=metadata,
             )
diff --git a/utils/util_llm_parallel_thinking.py b/utils/util_llm_parallel_thinking.py
index 7b5ede5c..7c2d446a 100644
--- a/utils/util_llm_parallel_thinking.py
+++ b/utils/util_llm_parallel_thinking.py
@@ -393,7 +393,7 @@ async def process_single_task(
     )
 
     result = await verify_answer_for_datasets(
-        client, BENCHMARK_NAME, "", data[0]["ground_truth"], selected_solution
+        client, BENCHMARK_NAME, "", data[0]["ground_truth"], selected_solution, {}
     )
 
     task_result = {
diff --git a/utils/util_llm_simple_voting.py b/utils/util_llm_simple_voting.py
index 7b6cee2f..aefa377f 100644
--- a/utils/util_llm_simple_voting.py
+++ b/utils/util_llm_simple_voting.py
@@ -299,7 +299,7 @@ async def process_single_task(
     selected_solution = response["final_answer"]
     reasoning = response["reasoning"]
     result = await verify_answer_for_datasets(
-        BENCHMARK_NAME, "", data[0]["ground_truth"], selected_solution
+        None, BENCHMARK_NAME, "", data[0]["ground_truth"], selected_solution, {}
     )
 
     task_result = {

From c086e414338fad5e1734c4b9763d19c8084d155e Mon Sep 17 00:00:00 2001
From: JubSteven <1120395085@qq.com>
Date: Wed, 24 Sep 2025 13:11:27 +0800
Subject: [PATCH 07/12] clean redundant code in merging.

---
 docs/mkdocs/mkdocs.yml           | 1 -
 scripts/run_prepare_benchmark.sh | 2 --
 2 files changed, 3 deletions(-)

diff --git a/docs/mkdocs/mkdocs.yml b/docs/mkdocs/mkdocs.yml
index ccdcd1b5..f508e144 100644
--- a/docs/mkdocs/mkdocs.yml
+++ b/docs/mkdocs/mkdocs.yml
@@ -53,7 +53,6 @@ nav:
       - GAIA-Validation-Text-Only: gaia_validation_text_only.md
       - GAIA-Test: gaia_test.md
       - FutureX: futurex.md
-      - FutureX: futurex.md
     - Download Datasets: download_datasets.md
     - Add New Benchmarks: contribute_benchmarks.md
 
diff --git a/scripts/run_prepare_benchmark.sh b/scripts/run_prepare_benchmark.sh
index 55c0a92f..837b2e45 100644
--- a/scripts/run_prepare_benchmark.sh
+++ b/scripts/run_prepare_benchmark.sh
@@ -21,7 +21,5 @@ uv run main.py prepare-benchmark get browsecomp-test
 uv run main.py prepare-benchmark get browsecomp-zh-test
 uv run main.py prepare-benchmark get hle
 uv run main.py prepare-benchmark get xbench-ds
-uv run main.py prepare-benchmark get futurex
-
 uv run main.py prepare-benchmark get futurex
 uv run main.py prepare-benchmark get finsearchcomp
\ No newline at end of file

From d6a871591e749a5cfff11fe5077e934b79082065 Mon Sep 17 00:00:00 2001
From: JubSteven <1120395085@qq.com>
Date: Thu, 25 Sep 2025 09:21:52 +0800
Subject: [PATCH 08/12] upd: modify yaml to use Mirothinker as the main agent,
 add check progress file to exclude T1.

---
 config/agent_finsearchcomp.yaml               |  33 +--
 .../check_finsearchcomp_progress.py           | 249 ++++++++++++++++++
 2 files changed, 263 insertions(+), 19 deletions(-)
 create mode 100755 utils/progress_check/check_finsearchcomp_progress.py

diff --git a/config/agent_finsearchcomp.yaml b/config/agent_finsearchcomp.yaml
index 37f42721..836588f3 100644
--- a/config/agent_finsearchcomp.yaml
+++ b/config/agent_finsearchcomp.yaml
@@ -7,25 +7,23 @@ defaults:
 main_agent:
   prompt_class: MainAgentPrompt_GAIA
   llm: 
-    provider_class: "ClaudeOpenRouterClient"
-    model_name: "anthropic/claude-3.7-sonnet"
+    provider_class: "MiroThinkerSGLangClient"
+    model_name: "MODEL_NAME"
     async_client: true
-    temperature: 0.3
+    temperature: 0.6
     top_p: 0.95
     min_p: 0.0
     top_k: -1
-    max_tokens: 32000
-    openrouter_api_key: "${oc.env:OPENROUTER_API_KEY,???}"
-    openrouter_base_url: "${oc.env:OPENROUTER_BASE_URL,https://openrouter.ai/api/v1}"
-    openrouter_provider: "anthropic"
-    disable_cache_control: false
+    max_tokens: 8192
+    oai_mirothinker_api_key: "${oc.env:OAI_MIROTHINKER_API_KEY,dummy_key}"
+    oai_mirothinker_base_url: "${oc.env:OAI_MIROTHINKER_BASE_URL,http://localhost:61005/v1}"
     keep_tool_result: -1
     oai_tool_thinking: false
   
   tool_config:
     - tool-reasoning
 
-  max_turns: -1  # Maximum number of turns for main agent execution
+  max_turns: 20  # Maximum number of turns for main agent execution
   max_tool_calls_per_turn: 10  # Maximum number of tool calls per turn
   
   input_process:
@@ -43,18 +41,16 @@ sub_agents:
   agent-worker:
     prompt_class: SubAgentWorkerPrompt
     llm: 
-      provider_class: "ClaudeOpenRouterClient"
-      model_name: "anthropic/claude-3.7-sonnet"
+      provider_class: "MiroThinkerSGLangClient"
+      model_name: "MODEL_NAME"
       async_client: true
-      temperature: 0.3
+      temperature: 0.6
       top_p: 0.95
       min_p: 0.0
       top_k: -1
-      max_tokens: 32000
-      openrouter_api_key: "${oc.env:OPENROUTER_API_KEY,???}"
-      openrouter_base_url: "${oc.env:OPENROUTER_BASE_URL,https://openrouter.ai/api/v1}"
-      openrouter_provider: "anthropic"
-      disable_cache_control: false
+      max_tokens: 8192
+      oai_mirothinker_api_key: "${oc.env:OAI_MIROTHINKER_API_KEY,dummy_key}"
+      oai_mirothinker_base_url: "${oc.env:OAI_MIROTHINKER_BASE_URL,http://localhost:61005/v1}"
       keep_tool_result: -1
       oai_tool_thinking: false
     
@@ -65,10 +61,9 @@ sub_agents:
       - tool-code
       - tool-audio
 
-    max_turns: -1  # Maximum number of turns for main agent execution
+    max_turns: 20  # Maximum number of turns for sub-agent execution
     max_tool_calls_per_turn: 10  # Maximum number of tool calls per turn
 
-
 # Can define some top-level or default parameters here
 output_dir: logs/
 data_dir: "${oc.env:DATA_DIR,data}"  # Points to where data is stored
diff --git a/utils/progress_check/check_finsearchcomp_progress.py b/utils/progress_check/check_finsearchcomp_progress.py
new file mode 100755
index 00000000..d6084ddb
--- /dev/null
+++ b/utils/progress_check/check_finsearchcomp_progress.py
@@ -0,0 +1,249 @@
+#!/usr/bin/env python3
+"""
+FinSearchComp Progress Checker
+
+This script analyzes FinSearchComp benchmark results in a log folder to count:
+- Total files processed
+- Files with status "completed"
+- Files with status "completed" AND judge_result "CORRECT" (excluding T1 tasks)
+- Breakdown by task type (T1, T2, T3)
+
+Note: T1 (Time-Sensitive Data Fetching) tasks are excluded from correctness evaluation
+because their ground truth is outdated, but they are still counted as completed.
+
+Usage:
+    python check_finsearchcomp_progress.py [LOG_FOLDER_PATH]
+
+If no path is provided, uses the current directory.
+"""
+
+import json
+import re
+import sys
+from pathlib import Path
+from typing import Dict, List, Tuple
+
+
+def extract_task_type(task_id: str) -> str:
+    """
+    Extract the task type from the "(T<n>)" prefix at the start of task_id.
+    
+    Args:
+        task_id: Task ID string like "(T1)Time_Sensitive_Data_Fetching_006"
+        
+    Returns:
+        "T<n>" for whatever digits follow "(T" (e.g. "T1", "T2", "T3"), or "Unknown" if the prefix is absent
+    """
+    match = re.match(r'^\(T(\d+)\)', task_id)
+    if match:
+        return f"T{match.group(1)}"
+    return "Unknown"
+
+
+def analyze_finsearchcomp_results(log_folder: str) -> Dict[str, any]:
+    """
+    Analyze FinSearchComp benchmark results from JSON log files.
+
+    Args:
+        log_folder: Folder containing task_*_attempt_*.json files (FileNotFoundError is raised if it is missing)
+
+    Returns:
+        Tuple of (counts dict, correct file names, (file, judge_result) pairs judged incorrect, (file, error) pairs)
+    """
+    log_path = Path(log_folder)
+
+    if not log_path.exists():
+        raise FileNotFoundError(f"Log folder not found: {log_folder}")
+
+    # Find all task JSON files
+    json_files = list(log_path.glob("task_*_attempt_*.json"))
+
+    results = {
+        "total_files": 0,
+        "completed_status": 0,
+        "completed_and_correct": 0,
+        "completed_and_incorrect": 0,
+        "other_status": 0,
+        "parse_errors": 0,
+        "task_type_breakdown": {
+            "T1": {"total": 0, "completed": 0, "correct": 0, "incorrect": 0},
+            "T2": {"total": 0, "completed": 0, "correct": 0, "incorrect": 0},
+            "T3": {"total": 0, "completed": 0, "correct": 0, "incorrect": 0},
+            "Unknown": {"total": 0, "completed": 0, "correct": 0, "incorrect": 0}
+        }
+    }
+
+    completed_correct_files = []
+    completed_incorrect_files = []
+    parse_error_files = []
+
+    print(f"Scanning {len(json_files)} files in {log_folder}...")
+
+    for json_file in json_files:
+        results["total_files"] += 1
+
+        try:
+            with open(json_file, "r", encoding="utf-8") as f:
+                data = json.load(f)
+
+            task_id = data.get("task_id", "")
+            task_type = extract_task_type(task_id)
+            status = data.get("status", "").lower()
+            judge_result = data.get("judge_result", "").upper()
+
+            # Update task type breakdown
+            results["task_type_breakdown"][task_type]["total"] += 1
+
+            if status == "completed":
+                results["completed_status"] += 1
+                results["task_type_breakdown"][task_type]["completed"] += 1
+
+                # For T1 tasks, exclude from correctness evaluation but count as completed
+                if task_type == "T1":
+                    # T1 tasks are considered "completed" but not evaluated for correctness
+                    # due to outdated ground truth
+                    pass
+                else:
+                    # For T2 and T3 tasks, evaluate correctness
+                    if judge_result == "CORRECT":
+                        results["completed_and_correct"] += 1
+                        results["task_type_breakdown"][task_type]["correct"] += 1
+                        completed_correct_files.append(json_file.name)
+                    else:
+                        results["completed_and_incorrect"] += 1
+                        results["task_type_breakdown"][task_type]["incorrect"] += 1
+                        completed_incorrect_files.append((json_file.name, judge_result))
+            else:
+                results["other_status"] += 1
+
+        # A null "status"/"judge_result"/"task_id" or a non-object JSON document raises
+        # AttributeError/TypeError above; record the file instead of aborting the whole scan.
+        except (OSError, ValueError, KeyError, AttributeError, TypeError) as e:
+            results["parse_errors"] += 1
+            parse_error_files.append((json_file.name, str(e)))
+            print(f"Error parsing {json_file.name}: {e}")
+
+    return (
+        results, completed_correct_files,
+        completed_incorrect_files, parse_error_files,
+    )
+
+
+def display_results(
+    results: Dict[str, any],
+    correct_files: List[str],
+    incorrect_files: List[Tuple[str, str]],
+    error_files: List[Tuple[str, str]],
+) -> None:
+    """Print the overall summary, the per-task-type breakdown and sample file lists.
+
+    Overall percentages are relative to the number of scanned files; when no
+    file was scanned they print as 0.0% instead of raising ZeroDivisionError.
+    """
+    print("\n" + "=" * 70)
+    print("FINSEARCHCOMP BENCHMARK RESULTS SUMMARY")
+    print("=" * 70)
+
+    total = results["total_files"]
+    completed = results["completed_status"]
+    correct = results["completed_and_correct"]
+    incorrect = results["completed_and_incorrect"]
+    denom = total or 1  # empty or mistyped log folder: avoid dividing by zero below
+
+    print(f"Total files processed:           {total:3d}")
+    print(f"Files with status 'completed':   {completed:3d} ({completed/denom*100:.1f}%)")
+    print(f"Files completed AND correct:     {correct:3d} ({correct/denom*100:.1f}%)")
+    print(f"Files completed but incorrect:   {incorrect:3d} ({incorrect/denom*100:.1f}%)")
+    print(f"Files with other status:         {results['other_status']:3d}")
+    print(f"Files with parse errors:         {results['parse_errors']:3d}")
+
+    # Calculate accuracy excluding T1 tasks
+    t2_t3_completed = (
+        results["task_type_breakdown"]["T2"]["completed"] + 
+        results["task_type_breakdown"]["T3"]["completed"]
+    )
+    t2_t3_correct = (
+        results["task_type_breakdown"]["T2"]["correct"] + 
+        results["task_type_breakdown"]["T3"]["correct"]
+    )
+    
+    if t2_t3_completed > 0:
+        accuracy = t2_t3_correct / t2_t3_completed * 100
+        print(f"\nAccuracy rate (T2+T3 correct/completed): {accuracy:.1f}%")
+        print("  (T1 tasks excluded due to outdated ground truth)")
+
+    # Task type breakdown
+    print("\n" + "-" * 70)
+    print("TASK TYPE BREAKDOWN")
+    print("-" * 70)
+    
+    for task_type in ["T1", "T2", "T3", "Unknown"]:
+        breakdown = results["task_type_breakdown"][task_type]
+        if breakdown["total"] > 0:
+            completion_rate = breakdown["completed"] / breakdown["total"] * 100
+            if task_type == "T1":
+                print(f"{task_type} (Time-Sensitive Data Fetching):")
+                print(f"  Total: {breakdown['total']:3d}, Completed: {breakdown['completed']:3d} ({completion_rate:.1f}%)")
+                print("  Note: Excluded from correctness evaluation (outdated ground truth)")
+            else:
+                accuracy_rate = breakdown["correct"] / breakdown["completed"] * 100 if breakdown["completed"] > 0 else 0
+                print(f"{task_type} ({'Simple Historical Lookup' if task_type == 'T2' else 'Complex Historical Investigation'}):")
+                print(f"  Total: {breakdown['total']:3d}, Completed: {breakdown['completed']:3d} ({completion_rate:.1f}%)")
+                print(f"  Correct: {breakdown['correct']:3d}, Incorrect: {breakdown['incorrect']:3d}")
+                print(f"  Accuracy: {accuracy_rate:.1f}%")
+
+    print("\n" + "-" * 70)
+    print(f"SUMMARY: {completed} tasks completed, {correct} T2+T3 tasks correct")
+    print(f"         (T1 tasks: {results['task_type_breakdown']['T1']['completed']} completed, excluded from evaluation)")
+    print("-" * 70)
+
+    # Show some example files for verification
+    if correct_files:
+        print("\nFirst 5 correct files (T2+T3 only):")
+        for i, filename in enumerate(correct_files[:5], 1):
+            print(f"  {i}. {filename}")
+        if len(correct_files) > 5:
+            print(f"  ... and {len(correct_files) - 5} more")
+
+    if incorrect_files:
+        print("\nFirst 5 incorrect files (T2+T3 only):")
+        for i, (filename, judge_result) in enumerate(incorrect_files[:5], 1):
+            print(f"  {i}. {filename} -> {judge_result}")
+        if len(incorrect_files) > 5:
+            print(f"  ... and {len(incorrect_files) - 5} more")
+
+    if error_files:
+        print("\nFiles with parse errors:")
+        for filename, error in error_files:
+            print(f"  - {filename}: {error}")
+
+
+def main():
+    """Analyze the folder given as argv[1] (default: "."); return 0 on success, 1 on any error."""
+
+    # Check if folder path was provided as command line argument
+    if len(sys.argv) > 1:
+        log_folder = sys.argv[1]
+        print(f"Using provided folder path: {log_folder}")
+    else:
+        log_folder = "."
+        print(f"No folder path provided, using current directory: {log_folder}")
+
+    try:
+        print(f"Analyzing FinSearchComp benchmark results in: {log_folder}")
+        results, correct_files, incorrect_files, error_files = analyze_finsearchcomp_results(
+            log_folder
+        )
+        display_results(results, correct_files, incorrect_files, error_files)
+
+    except Exception as e:
+        print(f"Error: {e}")
+        print(f"\nUsage: python {sys.argv[0]} [LOG_FOLDER_PATH]")
+        print(f"Example: python {sys.argv[0]} logs/finsearchcomp/agent_finsearchcomp_20250924_1555")
+        return 1
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())  # sys.exit rather than the site-provided exit(), which is absent under `python -S`

From e7163d33e3904a546f8f43d3d746026f1bcb2ecb Mon Sep 17 00:00:00 2001
From: JubSteven <1120395085@qq.com>
Date: Thu, 25 Sep 2025 09:49:35 +0800
Subject: [PATCH 09/12] upd: check_progress function for finsearchcomp now
 considers Global and Greater China regions separately.

---
 .../check_finsearchcomp_progress.py           | 67 +++++++++++++++++++
 1 file changed, 67 insertions(+)

diff --git a/utils/progress_check/check_finsearchcomp_progress.py b/utils/progress_check/check_finsearchcomp_progress.py
index d6084ddb..a5b26595 100755
--- a/utils/progress_check/check_finsearchcomp_progress.py
+++ b/utils/progress_check/check_finsearchcomp_progress.py
@@ -40,6 +40,27 @@ def extract_task_type(task_id: str) -> str:
     return "Unknown"
 
 
+def extract_region_from_label(label: str) -> str:
+    """
+    Extract region from the label field.
+    
+    Args:
+        label: Label string like "Complex_Historical_Investigation(Global)" or "Financial_Analysis(Greater_China)"
+        
+    Returns:
+        Region string ("Global", "Greater_China", or "Unknown")
+    """
+    if not label:
+        return "Unknown"
+    
+    if "(Global)" in label:
+        return "Global"
+    elif "(Greater_China)" in label:
+        return "Greater_China"
+    else:
+        return "Unknown"
+
+
 def analyze_finsearchcomp_results(log_folder: str) -> Dict[str, any]:
     """
     Analyze FinSearchComp benchmark results from JSON log files.
@@ -70,6 +91,16 @@ def analyze_finsearchcomp_results(log_folder: str) -> Dict[str, any]:
             "T2": {"total": 0, "completed": 0, "correct": 0, "incorrect": 0},
             "T3": {"total": 0, "completed": 0, "correct": 0, "incorrect": 0},
             "Unknown": {"total": 0, "completed": 0, "correct": 0, "incorrect": 0}
+        },
+        "regional_breakdown": {
+            "Global": {
+                "T2": {"total": 0, "completed": 0, "correct": 0, "incorrect": 0},
+                "T3": {"total": 0, "completed": 0, "correct": 0, "incorrect": 0}
+            },
+            "Greater_China": {
+                "T2": {"total": 0, "completed": 0, "correct": 0, "incorrect": 0},
+                "T3": {"total": 0, "completed": 0, "correct": 0, "incorrect": 0}
+            }
         }
     }
 
@@ -90,13 +121,25 @@ def analyze_finsearchcomp_results(log_folder: str) -> Dict[str, any]:
             task_type = extract_task_type(task_id)
             status = data.get("status", "").lower()
             judge_result = data.get("judge_result", "").upper()
+            
+            # Extract region from label
+            label = data.get("input", {}).get("metadata", {}).get("label", "")
+            region = extract_region_from_label(label)
 
             # Update task type breakdown
             results["task_type_breakdown"][task_type]["total"] += 1
+            
+            # Update regional breakdown for T2 and T3 tasks
+            if task_type in ["T2", "T3"] and region in results["regional_breakdown"]:
+                results["regional_breakdown"][region][task_type]["total"] += 1
 
             if status == "completed":
                 results["completed_status"] += 1
                 results["task_type_breakdown"][task_type]["completed"] += 1
+                
+                # Update regional breakdown for completed T2 and T3 tasks
+                if task_type in ["T2", "T3"] and region in results["regional_breakdown"]:
+                    results["regional_breakdown"][region][task_type]["completed"] += 1
 
                 # For T1 tasks, exclude from correctness evaluation but count as completed
                 if task_type == "T1":
@@ -108,10 +151,16 @@ def analyze_finsearchcomp_results(log_folder: str) -> Dict[str, any]:
                     if judge_result == "CORRECT":
                         results["completed_and_correct"] += 1
                         results["task_type_breakdown"][task_type]["correct"] += 1
+                        # Update regional breakdown for correct T2 and T3 tasks
+                        if task_type in ["T2", "T3"] and region in results["regional_breakdown"]:
+                            results["regional_breakdown"][region][task_type]["correct"] += 1
                         completed_correct_files.append(json_file.name)
                     else:
                         results["completed_and_incorrect"] += 1
                         results["task_type_breakdown"][task_type]["incorrect"] += 1
+                        # Update regional breakdown for incorrect T2 and T3 tasks
+                        if task_type in ["T2", "T3"] and region in results["regional_breakdown"]:
+                            results["regional_breakdown"][region][task_type]["incorrect"] += 1
                         completed_incorrect_files.append((json_file.name, judge_result))
             else:
                 results["other_status"] += 1
@@ -192,6 +241,24 @@ def display_results(
                 print(f"  Correct: {breakdown['correct']:3d}, Incorrect: {breakdown['incorrect']:3d}")
                 print(f"  Accuracy: {accuracy_rate:.1f}%")
 
+    # Regional breakdown for T2 and T3
+    print("\n" + "-" * 70)
+    print("REGIONAL BREAKDOWN (T2 & T3 TASKS)")
+    print("-" * 70)
+    
+    for region in ["Global", "Greater_China"]:
+        print(f"\n{region} Region:")
+        for task_type in ["T2", "T3"]:
+            breakdown = results["regional_breakdown"][region][task_type]
+            if breakdown["total"] > 0:
+                completion_rate = breakdown["completed"] / breakdown["total"] * 100
+                accuracy_rate = breakdown["correct"] / breakdown["completed"] * 100 if breakdown["completed"] > 0 else 0
+                task_name = "Simple Historical Lookup" if task_type == "T2" else "Complex Historical Investigation"
+                print(f"  {task_type} ({task_name}):")
+                print(f"    Total: {breakdown['total']:3d}, Completed: {breakdown['completed']:3d} ({completion_rate:.1f}%)")
+                print(f"    Correct: {breakdown['correct']:3d}, Incorrect: {breakdown['incorrect']:3d}")
+                print(f"    Accuracy: {accuracy_rate:.1f}%")
+
     print("\n" + "-" * 70)
     print(f"SUMMARY: {completed} tasks completed, {correct} T2+T3 tasks correct")
     print(f"         (T1 tasks: {results['task_type_breakdown']['T1']['completed']} completed, excluded from evaluation)")

From 256ba2c2b3a382eec1c4156ebf362b3c7fd98915 Mon Sep 17 00:00:00 2001
From: JubSteven <1120395085@qq.com>
Date: Thu, 25 Sep 2025 10:20:05 +0800
Subject: [PATCH 10/12] upd: add docs and shell script for multiple runs.

---
 docs/mkdocs/docs/finsearchcomp.md             | 178 ++++++++++++++++++
 docs/mkdocs/mkdocs.yml                        |   1 +
 ...un_evaluate_multiple_runs_finsearchcomp.sh | 104 ++++++++++
 3 files changed, 283 insertions(+)
 create mode 100644 docs/mkdocs/docs/finsearchcomp.md
 create mode 100755 scripts/run_evaluate_multiple_runs_finsearchcomp.sh

diff --git a/docs/mkdocs/docs/finsearchcomp.md b/docs/mkdocs/docs/finsearchcomp.md
new file mode 100644
index 00000000..19eb6f3e
--- /dev/null
+++ b/docs/mkdocs/docs/finsearchcomp.md
@@ -0,0 +1,178 @@
+# FinSearchComp
+
+MiroFlow's evaluation on the FinSearchComp benchmark demonstrates capabilities in financial information search and analysis tasks, showcasing advanced reasoning abilities in complex financial research scenarios.
+
+More details: [FinSearchComp Dataset](https://huggingface.co/datasets/ByteSeedXpert/FinSearchComp)
+
+---
+
+## Dataset Overview
+
+!!! info "FinSearchComp Dataset"
+    The FinSearchComp dataset consists of financial search and analysis tasks that require comprehensive research capabilities including:
+
+    - Financial data retrieval and analysis
+    - Market research and company analysis
+    - Investment decision support
+    - Financial news and report interpretation
+    - Time-sensitive financial information gathering
+
+!!! abstract "Key Dataset Characteristics"
+
+    - **Total Tasks**: 635 (across T1, T2, T3 categories)
+    - **Task Types**: 
+        - **T1**: Time-Sensitive Data Fetching
+        - **T2**: Simple Historical Lookup
+        - **T3**: Complex Historical Investigation
+    - **Answer Format**: Detailed financial analysis and research reports
+    - **Ground Truth**: Available for T2 and T3 tasks, changes dynamically for T1 tasks
+    - **Evaluation**: Judge-based evaluation with correctness assessment
+
+---
+
+## Quick Start Guide
+
+!!! note "Quick Start Instructions"
+    This section provides step-by-step instructions to run the FinSearchComp benchmark and prepare submission results. **Note**: This is a quick start guide for running the benchmark, not for reproducing exact submitted results.
+
+### Step 1: Prepare the FinSearchComp Dataset
+
+!!! tip "Dataset Setup"
+    Use the integrated prepare-benchmark command to download and process the dataset:
+
+```bash title="Download FinSearchComp Dataset"
+uv run main.py prepare-benchmark get finsearchcomp
+```
+
+This will create the standardized dataset at `data/finsearchcomp/standardized_data.jsonl`.
+
+### Step 2: Configure API Keys
+
+!!! warning "API Key Configuration"
+    Set up the required API keys for model access and tool functionality. Update the `.env` file to include the following keys:
+
+```env title=".env Configuration"
+# For searching and web scraping
+SERPER_API_KEY="xxx"
+JINA_API_KEY="xxx"
+
+# For Linux sandbox (code execution environment)
+E2B_API_KEY="xxx"
+
+# We use MiroThinker model for financial analysis
+OAI_MIROTHINKER_API_KEY="xxx"
+OAI_MIROTHINKER_BASE_URL="http://localhost:61005/v1"
+
+# Used for o3 hints and final answer extraction
+OPENAI_API_KEY="xxx"
+OPENAI_BASE_URL="https://api.openai.com/v1"
+
+# Used for Claude vision understanding
+ANTHROPIC_API_KEY="xxx"
+
+# Used for Gemini vision
+GEMINI_API_KEY="xxx"
+```
+
+### Step 3: Run the Evaluation
+
+!!! example "Evaluation Execution"
+    Execute the following command to run evaluation on the FinSearchComp dataset:
+
+```bash title="Run FinSearchComp Evaluation"
+uv run main.py common-benchmark --config_file_name=agent_finsearchcomp benchmark=finsearchcomp output_dir="logs/finsearchcomp/$(date +"%Y%m%d_%H%M")"
+```
+
+!!! tip "Progress Monitoring and Resume"
+    To check the progress while running:
+    
+    ```bash title="Check Progress"
+    uv run utils/progress_check/check_finsearchcomp_progress.py $PATH_TO_LOG
+    ```
+    
+    If you need to resume an interrupted evaluation, specify the same output directory to continue from where you left off.
+
+    ```bash title="Resume Evaluation, e.g."
+    uv run main.py common-benchmark --config_file_name=agent_finsearchcomp benchmark=finsearchcomp output_dir=${PATH_TO_LOG}
+    ```
+
+### Step 4: Extract Results
+
+!!! example "Result Extraction"
+    After evaluation completion, the results are automatically generated in the output directory:
+
+- `benchmark_results.jsonl`: Detailed results for each task
+- `benchmark_results_pass_at_1_accuracy.txt`: Summary accuracy statistics
+- `task_*_attempt_1.json`: Individual task execution traces
+
+---
+
+## Evaluation Notes
+
+!!! warning "Task Type Considerations"
+    The FinSearchComp dataset includes different task types with varying evaluation criteria:
+
+    - **T1 Tasks**: Time-Sensitive Data Fetching tasks are excluded from correctness evaluation due to outdated ground truth, but completion is still tracked
+    - **T2 Tasks**: Simple Historical Lookup tasks are evaluated for correctness and quality
+    - **T3 Tasks**: Complex Historical Investigation tasks require comprehensive research and analysis
+
+!!! info "Output Analysis"
+    The evaluation generates detailed execution traces showing:
+
+    - Research process for each financial task
+    - Information gathering from multiple sources
+    - Financial calculations and analysis
+    - Comprehensive reports with insights and recommendations
+
+### Directory Structure
+
+After running evaluations, you'll find the following structure:
+
+```
+logs/finsearchcomp/agent_finsearchcomp_YYYYMMDD_HHMM/
+├── benchmark_results.jsonl              # Task results summary
+├── benchmark_results_pass_at_1_accuracy.txt  # Accuracy statistics
+├── task_(T1)Time_Sensitive_Data_Fetching_*.json  # T1 task traces
+├── task_(T2)Simple_Historical_Lookup_*.json   # T2 task traces
+├── task_(T3)Complex_Historical_Investigation_*.json  # T3 task traces
+└── output.log                           # Execution log
+```
+
+### Task Categories Breakdown
+
+The progress checker provides detailed statistics:
+
+- **Total Tasks**: Complete count across all categories
+- **Completed Tasks**: Successfully finished tasks
+- **Correct Tasks**: Tasks with judge_result "CORRECT" (T2 and T3 only)
+- **Category Breakdown**: Separate counts for T1, T2, and T3 tasks
+- **Accuracy Metrics**: Pass@1 accuracy for evaluable tasks
+
+---
+
+## Usage Examples
+
+### Single Run Evaluation
+```bash title="Basic Evaluation"
+uv run main.py common-benchmark --config_file_name=agent_finsearchcomp benchmark=finsearchcomp output_dir="logs/finsearchcomp/$(date +"%Y%m%d_%H%M")"
+```
+
+### Limited Task Testing
+```bash title="Test with Limited Tasks"
+uv run main.py common-benchmark --config_file_name=agent_finsearchcomp benchmark=finsearchcomp benchmark.execution.max_tasks=5 output_dir="logs/finsearchcomp/$(date +"%Y%m%d_%H%M")"
+```
+
+### Custom Agent Configuration
+```bash title="Different Agent Setup"
+uv run main.py common-benchmark --config_file_name=agent_gaia-validation benchmark=finsearchcomp output_dir="logs/finsearchcomp/$(date +"%Y%m%d_%H%M")"
+```
+
+### Multiple Runs for Reliability
+```bash title="Multiple Runs"
+NUM_RUNS=5 MAX_TASKS=null ./scripts/run_evaluate_multiple_runs_finsearchcomp.sh
+```
+
+---
+
+!!! info "Documentation Info"
+    **Last Updated:** September 2025 · **Doc Contributor:** Team @ MiroMind AI
diff --git a/docs/mkdocs/mkdocs.yml b/docs/mkdocs/mkdocs.yml
index 70f69051..1a488830 100644
--- a/docs/mkdocs/mkdocs.yml
+++ b/docs/mkdocs/mkdocs.yml
@@ -54,6 +54,7 @@ nav:
       - GAIA-Test: gaia_test.md
       - FutureX: futurex.md
       - xBench-DeepSearch: xbench_ds.md
+      - FinSearchComp: finsearchcomp.md
     - Download Datasets: download_datasets.md
     - Add New Benchmarks: contribute_benchmarks.md
 
diff --git a/scripts/run_evaluate_multiple_runs_finsearchcomp.sh b/scripts/run_evaluate_multiple_runs_finsearchcomp.sh
new file mode 100755
index 00000000..e7c90fe3
--- /dev/null
+++ b/scripts/run_evaluate_multiple_runs_finsearchcomp.sh
@@ -0,0 +1,104 @@
+#!/bin/bash
+
+# SPDX-FileCopyrightText: 2025 MiromindAI
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# Multiple runs FinSearchComp evaluation script
+# Based on the working command: uv run main.py common-benchmark --config_file_name=agent_finsearchcomp benchmark=finsearchcomp output_dir=logs/finsearchcomp/$(date +"%Y%m%d_%H%M")
+
+# Configuration parameters
+NUM_RUNS=${NUM_RUNS:-3}
+MAX_TASKS=${MAX_TASKS:-1}
+MAX_CONCURRENT=${MAX_CONCURRENT:-5}
+BENCHMARK_NAME="finsearchcomp"
+AGENT_SET=${AGENT_SET:-"agent_finsearchcomp"}
+
+# Set results directory with timestamp
+TIMESTAMP=$(date +%Y%m%d_%H%M)
+RESULTS_DIR="logs/${BENCHMARK_NAME}/${AGENT_SET}_${TIMESTAMP}"
+
+export LOGGER_LEVEL="INFO"
+
+echo "🚀 Starting $NUM_RUNS runs of FinSearchComp evaluation..."
+echo "📊 Using max_tasks: $MAX_TASKS (set MAX_TASKS=null for full dataset)"
+echo "📊 Using max_concurrent: $MAX_CONCURRENT"
+echo "📁 Results will be saved in: $RESULTS_DIR"
+
+# Create results directory
+mkdir -p "$RESULTS_DIR"
+
+# Launch all parallel tasks
+for i in $(seq 1 $NUM_RUNS); do
+    echo "=========================================="
+    echo "🚀 Launching experiment $i/$NUM_RUNS"
+    echo "📝 Output log: $RESULTS_DIR/run_${i}_output.log"
+    echo "=========================================="
+    
+    # Set specific identifier for this run
+    RUN_ID="run_$i"
+    
+    # Run experiment (background execution)
+    (
+        echo "Starting run $i at $(date)"
+        uv run main.py common-benchmark \
+            --config_file_name=$AGENT_SET \
+            benchmark=$BENCHMARK_NAME \
+            benchmark.execution.max_tasks=$MAX_TASKS \
+            benchmark.execution.max_concurrent=$MAX_CONCURRENT \
+            benchmark.execution.pass_at_k=1 \
+            output_dir=${RESULTS_DIR}/$RUN_ID \
+            hydra.run.dir=${RESULTS_DIR}/$RUN_ID \
+            > "$RESULTS_DIR/${RUN_ID}_output.log" 2>&1
+        
+        # Check if run was successful
+        if [ $? -eq 0 ]; then
+            echo "✅ Run $i completed successfully at $(date)"
+            RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1)
+            if [ -f "$RESULT_FILE" ]; then
+                echo "📊 Results saved to $RESULT_FILE"
+            else
+                echo "⚠️  Warning: Result file not found for run $i"
+            fi
+        else
+            echo "❌ Run $i failed at $(date)!"
+        fi
+    ) &
+    
+    # Small delay between launches
+    sleep 2
+done
+
+echo "🎯 All $NUM_RUNS runs have been launched in parallel"
+echo "⏳ Waiting for all runs to complete..."
+
+# Wait for all background tasks to complete
+wait
+
+echo "=========================================="
+echo "🎉 All $NUM_RUNS runs completed!"
+echo "=========================================="
+
+# Show progress summary: run the FinSearchComp progress checker on every run directory that exists
+echo "=========================================="
+echo "📊 Progress Summary:"
+echo "=========================================="
+for run_dir in "$RESULTS_DIR"/run_*/; do [ -d "$run_dir" ] && uv run utils/progress_check/check_finsearchcomp_progress.py "$run_dir"; done
+echo "=========================================="
+echo "🎯 Multiple runs FinSearchComp evaluation completed!"
+echo "📁 Check results in: $RESULTS_DIR"
+echo "📝 Check individual run logs: $RESULTS_DIR/run_*_output.log"
+echo "=========================================="
+echo ""
+echo "💡 Usage examples:"
+echo "   # Default: 3 runs, 1 task per run (set MAX_TASKS=null for the full dataset)"
+echo "   ./scripts/run_evaluate_multiple_runs_finsearchcomp.sh"
+echo ""
+echo "   # Custom parameters"
+echo "   NUM_RUNS=5 MAX_TASKS=10 MAX_CONCURRENT=3 ./scripts/run_evaluate_multiple_runs_finsearchcomp.sh"
+echo ""
+echo "   # Different agent configuration"
+echo "   AGENT_SET=agent_gaia-validation ./scripts/run_evaluate_multiple_runs_finsearchcomp.sh"
+echo ""
+echo "   # Limited tasks for testing"
+echo "   MAX_TASKS=5 ./scripts/run_evaluate_multiple_runs_finsearchcomp.sh"

From 835e590b992f5b72560bce062aec2efa4e4709d6 Mon Sep 17 00:00:00 2001
From: JubSteven <1120395085@qq.com>
Date: Thu, 25 Sep 2025 10:57:23 +0800
Subject: [PATCH 11/12] fix: check_finsearchcomp_progress not displaying
 results from greater china region.

---
 utils/progress_check/check_finsearchcomp_progress.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/utils/progress_check/check_finsearchcomp_progress.py b/utils/progress_check/check_finsearchcomp_progress.py
index a5b26595..2e45787f 100755
--- a/utils/progress_check/check_finsearchcomp_progress.py
+++ b/utils/progress_check/check_finsearchcomp_progress.py
@@ -48,15 +48,15 @@ def extract_region_from_label(label: str) -> str:
         label: Label string like "Complex_Historical_Investigation(Global)" or "Financial_Analysis(Greater_China)"
         
     Returns:
-        Region string ("Global", "Greater_China", or "Unknown")
+        Region string ("Global", "Greater China", or "Unknown")
     """
     if not label:
         return "Unknown"
     
     if "(Global)" in label:
         return "Global"
-    elif "(Greater_China)" in label:
-        return "Greater_China"
+    elif "(Greater China)" in label:
+        return "Greater China"
     else:
         return "Unknown"
 
@@ -97,7 +97,7 @@ def analyze_finsearchcomp_results(log_folder: str) -> Dict[str, any]:
                 "T2": {"total": 0, "completed": 0, "correct": 0, "incorrect": 0},
                 "T3": {"total": 0, "completed": 0, "correct": 0, "incorrect": 0}
             },
-            "Greater_China": {
+            "Greater China": {
                 "T2": {"total": 0, "completed": 0, "correct": 0, "incorrect": 0},
                 "T3": {"total": 0, "completed": 0, "correct": 0, "incorrect": 0}
             }
@@ -246,7 +246,7 @@ def display_results(
     print("REGIONAL BREAKDOWN (T2 & T3 TASKS)")
     print("-" * 70)
     
-    for region in ["Global", "Greater_China"]:
+    for region in ["Global", "Greater China"]:
         print(f"\n{region} Region:")
         for task_type in ["T2", "T3"]:
             breakdown = results["regional_breakdown"][region][task_type]

From 72e9bb6f82225f952d8cb204ac0738a9f3e25e7b Mon Sep 17 00:00:00 2001
From: JubSteven <1120395085@qq.com>
Date: Thu, 25 Sep 2025 17:16:30 +0800
Subject: [PATCH 12/12] fix: catch ContextLimitError in more observed cases.

---
 src/llm/providers/claude_openrouter_client.py  | 2 ++
 src/llm/providers/mirothinker_sglang_client.py | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/src/llm/providers/claude_openrouter_client.py b/src/llm/providers/claude_openrouter_client.py
index 4acd114e..fd441252 100644
--- a/src/llm/providers/claude_openrouter_client.py
+++ b/src/llm/providers/claude_openrouter_client.py
@@ -191,6 +191,8 @@ async def _create_message(
                 or "exceeds the maximum length" in error_str
                 or "exceeds the maximum allowed length" in error_str
                 or "Input tokens exceed the configured limit" in error_str
+                or "Requested token count exceeds the model's maximum context length" in error_str
+                or ("BadRequestError" in error_str and "context length" in error_str)
             ):
                 logger.debug(f"OpenRouter LLM Context limit exceeded: {error_str}")
                 raise ContextLimitError(f"Context limit exceeded: {error_str}")
diff --git a/src/llm/providers/mirothinker_sglang_client.py b/src/llm/providers/mirothinker_sglang_client.py
index f02309a7..6008f8ca 100644
--- a/src/llm/providers/mirothinker_sglang_client.py
+++ b/src/llm/providers/mirothinker_sglang_client.py
@@ -159,6 +159,8 @@ async def _create_message(
                 or "exceeds the maximum length" in error_str
                 or "exceeds the maximum allowed length" in error_str
                 or "Input tokens exceed the configured limit" in error_str
+                or "Requested token count exceeds the model's maximum context length" in error_str
+                or ("BadRequestError" in error_str and "context length" in error_str)
             ):
                 logger.debug(f"MiroThinker LLM Context limit exceeded: {error_str}")
                 raise ContextLimitError(f"Context limit exceeded: {error_str}")