From 56b235d0069cf72cbf9953ed30dcdebcc236c68e Mon Sep 17 00:00:00 2001 From: JubSteven <1120395085@qq.com> Date: Thu, 18 Sep 2025 10:35:02 +0800 Subject: [PATCH 01/12] upd: add futurex evaluation support. --- .gitignore | 1 + config/benchmark/futurex.yaml | 20 ++ utils/extract_futurex_results.py | 334 ++++++++++++++++++ utils/prepare_benchmark/gen_futurex.py | 55 +++ utils/prepare_benchmark/main.py | 9 + .../progress_check/check_futurex_progress.py | 218 ++++++++++++ 6 files changed, 637 insertions(+) create mode 100644 config/benchmark/futurex.yaml create mode 100644 utils/extract_futurex_results.py create mode 100644 utils/prepare_benchmark/gen_futurex.py create mode 100644 utils/progress_check/check_futurex_progress.py diff --git a/.gitignore b/.gitignore index aaf57c1e..8cbc94e8 100644 --- a/.gitignore +++ b/.gitignore @@ -208,6 +208,7 @@ marimo/_lsp/ __marimo__/ logs/ +tmp/ data/* !data/README.md diff --git a/config/benchmark/futurex.yaml b/config/benchmark/futurex.yaml new file mode 100644 index 00000000..97109882 --- /dev/null +++ b/config/benchmark/futurex.yaml @@ -0,0 +1,20 @@ +# config/benchmark/futurex.yaml +defaults: + - default + - _self_ + +name: "futurex" + +data: + data_dir: "${data_dir}/futurex" # Path to your dataset + metadata_file: "standardized_data.jsonl" # Metadata filename + whitelist: [] # Optional: List of specific task_ids to run + +execution: + max_tasks: null # null = no limit, or specify a number + max_concurrent: 5 # Number of parallel tasks + pass_at_k: 1 # Number of attempts per task + +# Set to skip evaluation since we don't have ground truth +openai_api_key: "skip_evaluation" + diff --git a/utils/extract_futurex_results.py b/utils/extract_futurex_results.py new file mode 100644 index 00000000..415b720c --- /dev/null +++ b/utils/extract_futurex_results.py @@ -0,0 +1,334 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +FutureX Results Extractor and Aggregator + +This script extracts predictions from MiroFlow benchmark 
results and can aggregate +multiple runs using majority voting to create FutureX submission files. + +Features: +1. Extract predictions from single benchmark results +2. Aggregate multiple runs with majority voting +3. Generate FutureX-compatible submission files +4. Support both single-run and multi-run scenarios + +Usage: + # Extract from single run + python extract_futurex_results.py logs/futurex-online-test + + # Aggregate multiple runs (if run_* subdirectories exist) + python extract_futurex_results.py logs/futurex-online-multi-runs + + # Specify output file + python extract_futurex_results.py logs/futurex-online-test -o my_submission.jsonl +""" + +import argparse +import json +import os +from collections import Counter, defaultdict +from typing import Dict, List, Tuple + + +def majority_vote( + preds: List[str], first_seen_order: Dict[str, int] +) -> Tuple[str, Dict[str, int]]: + """ + Compute the majority-vote prediction for a list of candidate predictions. + + Tie-breaking rules (deterministic): + 1) Highest frequency wins. + 2) If there is a tie on frequency, choose the candidate that appeared earliest + across all runs (based on the provided first_seen_order index). + 3) As a final guard (shouldn't be needed if first_seen_order is complete), + fall back to lexicographic order. + + Returns: + (chosen_prediction, counts_dict) + """ + counter = Counter(preds) + # Get the max vote count + max_count = max(counter.values()) + # All candidates that share the max vote count + tied = [c for c, cnt in counter.items() if cnt == max_count] + + if len(tied) == 1: + chosen = tied[0] + else: + # Prefer the one seen earliest globally + tied.sort(key=lambda x: (first_seen_order.get(x, float("inf")), x)) + chosen = tied[0] + + # Expose counts for optional debugging/inspection + return chosen, dict(counter) + + +def discover_runs(results_dir: str) -> List[str]: + """ + Discover subdirectories inside results_dir that potentially contain a + 'benchmark_results.jsonl'. 
We don't strictly require the subdir name to + start with 'run_', but we sort the list to keep processing deterministic. + """ + runs = [] + for name in sorted(os.listdir(results_dir)): + path = os.path.join(results_dir, name) + if os.path.isdir(path): + fpath = os.path.join(path, "benchmark_results.jsonl") + if os.path.isfile(fpath): + runs.append(path) + return runs + + +def extract_predictions_from_file(file_path: str) -> Dict[str, str]: + """ + Extract predictions from a single benchmark_results.jsonl file. + + Args: + file_path: Path to benchmark_results.jsonl file + + Returns: + Dictionary mapping task_id to prediction + """ + predictions = {} + + with open(file_path, "r", encoding="utf-8") as fin: + for line_num, line in enumerate(fin, 1): + line = line.strip() + if not line: + continue + + try: + rec = json.loads(line) + except json.JSONDecodeError as e: + print(f"Warning: Skipping malformed JSON at line {line_num} in {file_path}: {e}") + continue + + task_id = rec.get("task_id") + pred = rec.get("model_boxed_answer") + + # Only accept non-empty strings; coerce to str for safety + if task_id and pred is not None and str(pred).strip(): + pred_str = str(pred).strip() + predictions[task_id] = pred_str + + return predictions + + +def aggregate_multiple_runs(results_dir: str) -> Tuple[Dict[str, List[str]], Dict[str, int]]: + """ + Aggregate predictions from multiple runs in subdirectories. + + Args: + results_dir: Directory containing run_* subdirectories + + Returns: + Tuple of (predictions_by_task, first_seen_order) + """ + # Maps task_id -> list of predictions collected across runs + preds_by_task: Dict[str, List[str]] = defaultdict(list) + + # Track first-seen order index for each distinct prediction string across all runs. + # This enables deterministic tie-breaking. 
+ first_seen_order: Dict[str, int] = {} + next_order_idx = 0 + + runs = discover_runs(results_dir) + if not runs: + raise FileNotFoundError( + f"No run directories with 'benchmark_results.jsonl' found under: {results_dir}" + ) + + total_lines = 0 + used_lines = 0 + + # Read and aggregate predictions + for run_dir in runs: + fpath = os.path.join(run_dir, "benchmark_results.jsonl") + print(f"Reading: {fpath}") + + with open(fpath, "r", encoding="utf-8") as fin: + for line in fin: + total_lines += 1 + line = line.strip() + if not line: + continue + try: + rec = json.loads(line) + except json.JSONDecodeError: + # Skip malformed JSON lines, but keep going + continue + + task_id = rec.get("task_id") + pred = rec.get("model_boxed_answer") + + # Only accept non-empty strings; coerce to str for safety + if task_id and pred is not None and str(pred).strip(): + pred_str = str(pred).strip() + preds_by_task[task_id].append(pred_str) + if pred_str not in first_seen_order: + first_seen_order[pred_str] = next_order_idx + next_order_idx += 1 + used_lines += 1 + + print(f"Collected from {len(runs)} run(s).") + print(f"Read {total_lines} line(s), accepted {used_lines} record(s).") + + return preds_by_task, first_seen_order + + +def process_single_run(results_dir: str) -> Dict[str, str]: + """ + Process a single run (direct benchmark_results.jsonl file). 
+ + Args: + results_dir: Directory containing benchmark_results.jsonl + + Returns: + Dictionary mapping task_id to prediction + """ + file_path = os.path.join(results_dir, "benchmark_results.jsonl") + + if not os.path.isfile(file_path): + raise FileNotFoundError(f"benchmark_results.jsonl not found in: {results_dir}") + + print(f"Reading single run: {file_path}") + predictions = extract_predictions_from_file(file_path) + print(f"Extracted {len(predictions)} predictions from single run.") + + return predictions + + +def write_submission_file( + predictions: Dict[str, str], + output_file: str, + is_aggregated: bool = False, + vote_counts: Dict[str, Dict[str, int]] = None +) -> None: + """ + Write predictions to FutureX submission format. + + Args: + predictions: Dictionary mapping task_id to prediction + output_file: Output file path + is_aggregated: Whether this is from aggregated runs + vote_counts: Vote counts for each task (only for aggregated runs) + """ + num_tasks = 0 + with open(output_file, "w", encoding="utf-8") as out: + for task_id in sorted(predictions.keys()): + prediction = predictions[task_id] + + # Create submission record + record = {"id": task_id, "prediction": prediction} + + # Add vote information for aggregated runs + if is_aggregated and vote_counts and task_id in vote_counts: + record["vote_counts"] = vote_counts[task_id] + + out.write(json.dumps(record, ensure_ascii=False) + "\n") + num_tasks += 1 + + print(f"✅ Submission saved to {output_file}") + if is_aggregated: + print(f"Aggregated {num_tasks} unique task_id(s) from multiple runs.") + else: + print(f"Extracted {num_tasks} predictions from single run.") + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Extract predictions from MiroFlow benchmark results and create FutureX submission files. " + "Supports both single runs and multi-run aggregation with majority voting." 
+ ) + parser.add_argument( + "results_dir", + help="Path to results dir containing benchmark_results.jsonl or run_*/benchmark_results.jsonl", + ) + parser.add_argument( + "-o", + "--output", + default=None, + help="Output JSONL file path (default: /futurex_submission.jsonl)", + ) + parser.add_argument( + "--aggregate", + action="store_true", + help="Force aggregation mode (look for run_* subdirectories)", + ) + parser.add_argument( + "--single", + action="store_true", + help="Force single run mode (look for direct benchmark_results.jsonl)", + ) + return parser.parse_args() + + +def main() -> None: + args = parse_args() + + results_dir = os.path.abspath(args.results_dir) + if not os.path.isdir(results_dir): + raise FileNotFoundError(f"Results dir not found: {results_dir}") + + output_file = ( + os.path.abspath(args.output) + if args.output + else os.path.join(results_dir, "futurex_submission.jsonl") + ) + + # Determine processing mode + runs = discover_runs(results_dir) + single_file = os.path.join(results_dir, "benchmark_results.jsonl") + + if args.aggregate: + if not runs: + raise FileNotFoundError( + f"No run directories found for aggregation in: {results_dir}" + ) + mode = "aggregate" + elif args.single: + if not os.path.isfile(single_file): + raise FileNotFoundError( + f"benchmark_results.jsonl not found for single run in: {results_dir}" + ) + mode = "single" + else: + # Auto-detect mode + if runs and os.path.isfile(single_file): + print("Both single run and multiple runs detected. 
Using aggregation mode.") + print("Use --single to force single run mode.") + mode = "aggregate" + elif runs: + mode = "aggregate" + elif os.path.isfile(single_file): + mode = "single" + else: + raise FileNotFoundError( + f"No benchmark_results.jsonl files found in: {results_dir}" + ) + + print(f"Processing mode: {mode}") + + if mode == "aggregate": + # Multi-run aggregation with majority voting + preds_by_task, first_seen_order = aggregate_multiple_runs(results_dir) + + # Apply majority voting + final_predictions = {} + vote_counts = {} + + for task_id in preds_by_task: + voted_pred, counts = majority_vote(preds_by_task[task_id], first_seen_order) + final_predictions[task_id] = voted_pred + vote_counts[task_id] = counts + + write_submission_file(final_predictions, output_file, is_aggregated=True, vote_counts=vote_counts) + + else: + # Single run extraction + predictions = process_single_run(results_dir) + write_submission_file(predictions, output_file, is_aggregated=False) + + +if __name__ == "__main__": + main() diff --git a/utils/prepare_benchmark/gen_futurex.py b/utils/prepare_benchmark/gen_futurex.py new file mode 100644 index 00000000..0e2690d6 --- /dev/null +++ b/utils/prepare_benchmark/gen_futurex.py @@ -0,0 +1,55 @@ +# SPDX-FileCopyrightText: 2025 MiromindAI +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Generator, MutableMapping + +from datasets import load_dataset + +from utils.prepare_benchmark.common import Task + + +def gen_futurex(hf_token: str) -> Generator[Task, None, None]: + """ + Generate Futurex-Online dataset tasks in MiroFlow format + + Args: + hf_token: Hugging Face token for dataset access + + Yields: + Task: Standardized task objects + """ + # Load the Futurex-Online dataset + dataset = load_dataset("futurex-ai/Futurex-Online") + + # Process each split in the dataset + for split_name, split_data in dataset.items(): + for idx, sample in enumerate(split_data): + # Extract task information + task_id = sample.get("id", 
f"futurex_{split_name}_{idx}") + task_question = sample.get("prompt", "") + end_time = sample.get("end_time", "") + level = sample.get("level", "") + + # Create metadata dictionary + metadata: MutableMapping = { + "level": level, + "end_time": end_time, + "source": "futurex-ai/Futurex-Online", + "split": split_name, + "original_id": sample.get("id", ""), + "dataset_name": "Futurex-Online" + } + + # Create standardized Task object + task = Task( + task_id=task_id, + task_question=task_question, + ground_truth="", # Futurex-Online doesn't have ground truth + file_path=None, # No file attachments + metadata=metadata, + ) + + yield task + + return diff --git a/utils/prepare_benchmark/main.py b/utils/prepare_benchmark/main.py index 9712255a..2233bfdc 100644 --- a/utils/prepare_benchmark/main.py +++ b/utils/prepare_benchmark/main.py @@ -18,6 +18,7 @@ from utils.prepare_benchmark.gen_hle import gen_hle_test from utils.prepare_benchmark.gen_webwalkerqa import gen_webwalkerqa from utils.prepare_benchmark.gen_xbench_ds import gen_xbench_ds +from utils.prepare_benchmark.gen_futurex import gen_futurex @dataclasses.dataclass @@ -31,6 +32,7 @@ class _Env: "browsecomp-zh-test", "hle", "xbench-ds", + "futurex", ) meta_filename = "standardized_data.jsonl" data_dir: pathlib.Path @@ -108,6 +110,13 @@ def gen(): for x in gen_xbench_ds(env.hf_token): yield x + return gen + case "futurex": + + def gen(): + for x in gen_futurex(env.hf_token): + yield x + return gen case _: raise ValueError("not supported") diff --git a/utils/progress_check/check_futurex_progress.py b/utils/progress_check/check_futurex_progress.py new file mode 100644 index 00000000..d8783174 --- /dev/null +++ b/utils/progress_check/check_futurex_progress.py @@ -0,0 +1,218 @@ +#!/usr/bin/env python3 +""" +Futurex-Online Progress Checker + +This script analyzes Futurex-Online benchmark results in a log folder to count: +- Total files processed +- Files with status "completed" +- Files with predictions (final_boxed_answer) 
+- Files with errors + +Usage: + python check_futurex_progress.py [LOG_FOLDER_PATH] + +If no path is provided, uses the current directory. +""" + +import json +import sys +from pathlib import Path +from typing import Dict, List, Tuple + + +def analyze_futurex_results(log_folder: str) -> Dict[str, int]: + """ + Analyze Futurex-Online benchmark results from JSON log files. + + Args: + log_folder: Path to folder containing task_*.json files + + Returns: + Dictionary with counts of different categories + """ + log_path = Path(log_folder) + + if not log_path.exists(): + raise FileNotFoundError(f"Log folder not found: {log_folder}") + + # Find all task JSON files + json_files = list(log_path.glob("task_*_attempt_*.json")) + + results = { + "total_files": 0, + "completed_status": 0, + "running_status": 0, + "failed_status": 0, + "with_predictions": 0, + "without_predictions": 0, + "with_errors": 0, + "parse_errors": 0, + } + + completed_files = [] + running_files = [] + failed_files = [] + prediction_files = [] + error_files = [] + parse_error_files = [] + + print(f"Scanning {len(json_files)} files in {log_folder}...") + + for json_file in json_files: + results["total_files"] += 1 + + try: + with open(json_file, "r", encoding="utf-8") as f: + data = json.load(f) + + status = data.get("status", "").lower() + final_answer = data.get("final_boxed_answer", "") + error_msg = data.get("error", "") + judge_result = data.get("judge_result", "") + + # Count by status + if status == "completed": + results["completed_status"] += 1 + completed_files.append(json_file.name) + elif status == "running": + results["running_status"] += 1 + running_files.append(json_file.name) + elif status in ["failed", "error"]: + results["failed_status"] += 1 + failed_files.append(json_file.name) + else: + # Unknown status + results["failed_status"] += 1 + failed_files.append((json_file.name, f"Unknown status: {status}")) + + # Count by prediction availability + if final_answer and final_answer.strip(): 
+ results["with_predictions"] += 1 + prediction_files.append((json_file.name, final_answer[:100] + "..." if len(final_answer) > 100 else final_answer)) + else: + results["without_predictions"] += 1 + + # Count by error presence + if error_msg and error_msg.strip(): + results["with_errors"] += 1 + error_files.append((json_file.name, error_msg)) + + except (json.JSONDecodeError, KeyError, FileNotFoundError) as e: + results["parse_errors"] += 1 + parse_error_files.append((json_file.name, str(e))) + print(f"Error parsing {json_file.name}: {e}") + + return ( + results, + completed_files, + running_files, + failed_files, + prediction_files, + error_files, + parse_error_files, + ) + + +def display_results( + results: Dict[str, int], + completed_files: List[str], + running_files: List[str], + failed_files: List[str], + prediction_files: List[Tuple[str, str]], + error_files: List[Tuple[str, str]], + parse_error_files: List[Tuple[str, str]], +) -> None: + """Display the analysis results in a formatted way.""" + + print("\n" + "=" * 60) + print("FUTUREX-ONLINE BENCHMARK RESULTS SUMMARY") + print("=" * 60) + + total = results["total_files"] + completed = results["completed_status"] + running = results["running_status"] + failed = results["failed_status"] + with_predictions = results["with_predictions"] + with_errors = results["with_errors"] + + print(f"Total files processed: {total:3d}") + print(f"Files with status 'completed': {completed:3d} ({completed/total*100:.1f}%)") + print(f"Files with status 'running': {running:3d} ({running/total*100:.1f}%)") + print(f"Files with status 'failed': {failed:3d} ({failed/total*100:.1f}%)") + print(f"Files with predictions: {with_predictions:3d} ({with_predictions/total*100:.1f}%)") + print(f"Files with errors: {with_errors:3d} ({with_errors/total*100:.1f}%)") + print(f"Files with parse errors: {results['parse_errors']:3d}") + + if completed > 0: + prediction_rate = with_predictions / completed * 100 + print(f"\nPrediction rate 
(predictions/completed): {prediction_rate:.1f}%") + + print("\n" + "-" * 60) + print(f"SUMMARY: {completed} tasks completed, {with_predictions} with predictions") + print("-" * 60) + + # Show some example files for verification + if completed_files: + print("\nFirst 5 completed files:") + for i, filename in enumerate(completed_files[:5], 1): + print(f" {i}. {filename}") + if len(completed_files) > 5: + print(f" ... and {len(completed_files) - 5} more") + + if running_files: + print("\nFirst 5 running files:") + for i, filename in enumerate(running_files[:5], 1): + print(f" {i}. {filename}") + if len(running_files) > 5: + print(f" ... and {len(running_files) - 5} more") + + if prediction_files: + print("\nFirst 5 files with predictions:") + for i, (filename, prediction) in enumerate(prediction_files[:5], 1): + print(f" {i}. {filename}") + print(f" Prediction: {prediction}") + if len(prediction_files) > 5: + print(f" ... and {len(prediction_files) - 5} more") + + if error_files: + print("\nFiles with errors:") + for filename, error in error_files[:5]: + print(f" - {filename}: {error[:100]}...") + if len(error_files) > 5: + print(f" ... and {len(error_files) - 5} more") + + if parse_error_files: + print("\nFiles with parse errors:") + for filename, error in parse_error_files: + print(f" - {filename}: {error}") + + +def main(): + """Main function to run the analysis.""" + + # Check if folder path was provided as command line argument + if len(sys.argv) > 1: + log_folder = sys.argv[1] + print(f"Using provided folder path: {log_folder}") + else: + log_folder = "." 
+ print(f"No folder path provided, using current directory: {log_folder}") + + try: + print(f"Analyzing Futurex-Online benchmark results in: {log_folder}") + results, completed_files, running_files, failed_files, prediction_files, error_files, parse_error_files = analyze_futurex_results( + log_folder + ) + display_results(results, completed_files, running_files, failed_files, prediction_files, error_files, parse_error_files) + + except Exception as e: + print(f"Error: {e}") + print(f"\nUsage: python {sys.argv[0]} [LOG_FOLDER_PATH]") + print(f"Example: python {sys.argv[0]} logs/futurex-online-test") + return 1 + + return 0 + + +if __name__ == "__main__": + exit(main()) From 287a7bcc2ecdfd1972edb735e3a05e58ecdd0287 Mon Sep 17 00:00:00 2001 From: JubSteven <1120395085@qq.com> Date: Thu, 18 Sep 2025 11:57:15 +0800 Subject: [PATCH 02/12] upd: support multiple eval for futurex and add relavent doc. --- docs/mkdocs/docs/futurex.md | 258 ++++++++++++++++++ docs/mkdocs/mkdocs.yml | 1 + scripts/run_evaluate_multiple_runs_futurex.sh | 121 ++++++++ 3 files changed, 380 insertions(+) create mode 100644 docs/mkdocs/docs/futurex.md create mode 100755 scripts/run_evaluate_multiple_runs_futurex.sh diff --git a/docs/mkdocs/docs/futurex.md b/docs/mkdocs/docs/futurex.md new file mode 100644 index 00000000..bd022806 --- /dev/null +++ b/docs/mkdocs/docs/futurex.md @@ -0,0 +1,258 @@ +# Futurex-Online + +MiroFlow's evaluation on the Futurex-Online benchmark demonstrates capabilities in future event prediction tasks. + +--- + +## Dataset Overview + +!!! info "Futurex-Online Dataset" + The Futurex-Online dataset consists of 61 prediction tasks covering various future events including: + - Political events (referendums, elections) + - Sports outcomes (football matches) + - Legal proceedings + - Economic indicators + +!!! 
abstract "Key Dataset Characteristics" + - **Total Tasks**: 61 + - **Task Type**: Future event prediction + - **Answer Format**: Boxed answers (\\boxed{Yes/No} or \\boxed{A/B/C}) + - **Ground Truth**: Not available (prediction tasks) + - **Resolution Date**: Around 2025-09-21 (GMT+8) + +--- + +## Reproduction Guide + +!!! note "Reproducibility Instructions" + This section provides step-by-step instructions to reproduce our Futurex-Online benchmark evaluation. Since this is a prediction dataset without ground truth, we focus on execution traces and response generation. + +### Step 1: Prepare the Futurex-Online Dataset + +!!! tip "Dataset Setup" + Use the integrated prepare-benchmark command to download and process the dataset: + +```bash title="Download Futurex-Online Dataset" +uv run main.py prepare-benchmark get futurex +``` + +This will create the standardized dataset at `data/futurex/standardized_data.jsonl`. + +### Step 2: Configure API Keys + +!!! warning "API Key Configuration" + Set up the required API keys for model access and tool functionality. Update the `.env` file to include the following keys: + +```env title=".env Configuration" +# For searching and web scraping +SERPER_API_KEY="xxx" +JINA_API_KEY="xxx" + +# For Linux sandbox (code execution environment) +E2B_API_KEY="xxx" + +# We use Claude-3.5-Sonnet with OpenRouter backend to initialize the LLM +OPENROUTER_API_KEY="xxx" +OPENROUTER_BASE_URL="https://openrouter.ai/api/v1" + +# Used for Claude vision understanding +ANTHROPIC_API_KEY="xxx" + +# Used for Gemini vision +GEMINI_API_KEY="xxx" + +# Use for llm judge, reasoning, o3 hints, etc. +OPENAI_API_KEY="xxx" +OPENAI_BASE_URL="https://api.openai.com/v1" +``` + +### Step 3: Run the Evaluation + +!!! example "Evaluation Execution" + Execute the following command to run evaluation on the Futurex-Online dataset, currently the basic `agent_quickstart_1` is used. 
+ +```bash title="Run Futurex-Online Evaluation" +uv run main.py common-benchmark --config_file_name=agent_quickstart_1 benchmark=futurex output_dir="logs/futurex/$(date +"%Y%m%d_%H%M")" +``` + +!!! tip "Progress Monitoring and Resume" + To check the progress while running: + + ```bash title="Check Progress" + uv run utils/progress_check/check_futurex_progress.py $PATH_TO_LOG + ``` + + If you need to resume an interrupted evaluation, specify the same output directory to continue from where you left off. + + ```bash title="Resume Evaluation, e.g." + uv run main.py common-benchmark --config_file_name=agent_quickstart_1 benchmark=futurex output_dir="logs/futurex/20250918_1010" + ``` + +### Step 4: Extract Results + +!!! example "Result Extraction" + After evaluation completion, extract the results using the provided utility: + +```bash title="Extract Results" +uv run utils/extract_futurex_results.py --log_dir logs/futurex/$(date +"%Y%m%d_%H%M") +``` + +This will generate: +- `futurex_results.json`: Detailed results for each task +- `futurex_summary.json`: Summary statistics +- `futurex_predictions.csv`: Predictions in CSV format + +--- + +## Sample Task Examples + +### Political Prediction +``` +Task: "Will the 2025 Guinea referendum pass? (resolved around 2025-09-21 (GMT+8))" +Expected Format: \boxed{Yes} or \boxed{No} +``` + +### Sports Prediction +``` +Task: "Brighton vs. Tottenham (resolved around 2025-09-21 (GMT+8)) +A. Brighton win on 2025-09-20 +B. Brighton vs. Tottenham end in a draw +C. Tottenham win on 2025-09-20" +Expected Format: \boxed{A}, \boxed{B}, or \boxed{C} +``` + +--- + +## Multiple Runs and Voting + +!!! tip "Improving Prediction Accuracy" + For better prediction accuracy, you can run multiple evaluations and use voting mechanisms to aggregate results. This approach helps reduce randomness and improve the reliability of predictions. 
+ +### Step 1: Run Multiple Evaluations + +Use the multiple runs script to execute several independent evaluations: + +```bash title="Run Multiple Evaluations" +./scripts/run_evaluate_multiple_runs_futurex.sh +``` + +This script will: +- Run 3 independent evaluations by default (configurable with `NUM_RUNS`) +- Execute all tasks in parallel for efficiency +- Generate separate result files for each run in `run_1/`, `run_2/`, etc. +- Create a consolidated `futurex_submission.jsonl` file with voting results + +### Step 2: Customize Multiple Runs + +You can customize the evaluation parameters: + +```bash title="Custom Multiple Runs" +# Run 5 evaluations with limited tasks for testing +NUM_RUNS=5 MAX_TASKS=10 ./scripts/run_evaluate_multiple_runs_futurex.sh + +# Use different agent configuration +AGENT_SET=agent_gaia-validation ./scripts/run_evaluate_multiple_runs_futurex.sh + +# Adjust concurrency for resource management +MAX_CONCURRENT=3 ./scripts/run_evaluate_multiple_runs_futurex.sh +``` + +### Step 3: Voting and Aggregation + +After multiple runs, the system automatically: + +1. **Extracts predictions** from all runs using `utils/extract_futurex_results.py` +2. **Applies majority voting** to aggregate predictions across runs +3. **Generates submission file** in the format required by FutureX platform +4. 
**Provides voting statistics** showing prediction distribution across runs + +The voting process works as follows: +- **Majority Vote**: Most common prediction across all runs wins +- **Tie-breaking**: If tied, chooses the prediction that appeared earliest across all runs +- **Vote Counts**: Tracks how many runs predicted each option +- **Confidence Indicators**: High agreement indicates more reliable predictions + +### Step 4: Analyze Voting Results + +Check the generated files for voting analysis: + +```bash title="Check Voting Results" +# View submission file with voting results +cat logs/futurex/agent_quickstart_1_*/futurex_submission.jsonl + +# Check individual run results +ls logs/futurex/agent_quickstart_1_*/run_*/ + +# Check progress and voting statistics +uv run python utils/progress_check/check_futurex_progress.py logs/futurex/agent_quickstart_1_* +``` + +### Manual Voting Aggregation + +You can also manually run the voting aggregation: + +```bash title="Manual Voting Aggregation" +# Aggregate multiple runs with majority voting +uv run python utils/extract_futurex_results.py logs/futurex/agent_quickstart_1_* --aggregate + +# Force single run mode (if needed) +uv run python utils/extract_futurex_results.py logs/futurex/agent_quickstart_1_*/run_1 --single + +# Specify custom output file +uv run python utils/extract_futurex_results.py logs/futurex/agent_quickstart_1_* -o my_voted_predictions.jsonl +``` + +### Voting Output Format + +The voting aggregation generates a submission file with the following format: + +```json +{"id": "687104310a994c0060ef87a9", "prediction": "No", "vote_counts": {"No": 2}} +{"id": "68a9b46e961bd3003c8f006b", "prediction": "Yes", "vote_counts": {"Yes": 2}} +``` + +The output includes: +- **`id`**: Task identifier +- **`prediction`**: Final voted prediction (without `\boxed{}` wrapper) +- **`vote_counts`**: Dictionary showing how many runs predicted each option + +For example, `"vote_counts": {"No": 2}` means 2 out of 2 runs 
predicted "No", indicating high confidence. + +--- + +## Evaluation Notes + +!!! warning "No Ground Truth Available" + Since Futurex-Online is a prediction dataset, there are no ground truth answers available for evaluation. The focus is on: + - Response generation quality + - Reasoning process documentation + - Prediction confidence and methodology + +!!! info "Output Analysis" + The evaluation generates detailed execution traces showing: + - Research process for each prediction + - Information gathering from web sources + - Reasoning chains leading to predictions + - Final boxed answers in required format + +### Directory Structure + +After running multiple evaluations, you'll find the following structure: + +``` +logs/futurex/agent_quickstart_1_YYYYMMDD_HHMM/ +├── futurex_submission.jsonl # Final voted predictions +├── run_1/ # First run results +│ ├── benchmark_results.jsonl # Individual task results +│ ├── benchmark_results_pass_at_1_accuracy.txt +│ └── task_*_attempt_1.json # Detailed execution traces +├── run_2/ # Second run results +│ └── ... (same structure as run_1) +├── run_1_output.log # Run 1 execution log +└── run_2_output.log # Run 2 execution log +``` + +--- + +!!! 
info "Documentation Info" + **Last Updated:** September 2025 · **Doc Contributor:** Team @ MiroMind AI diff --git a/docs/mkdocs/mkdocs.yml b/docs/mkdocs/mkdocs.yml index bf17b63b..808282f7 100644 --- a/docs/mkdocs/mkdocs.yml +++ b/docs/mkdocs/mkdocs.yml @@ -52,6 +52,7 @@ nav: - Benchmarks: - GAIA-Validation: gaia_validation.md - GAIA-Test: gaia_test.md + - FutureX: futurex.md - Add New Benchmarks: contribute_benchmarks.md - Tools: diff --git a/scripts/run_evaluate_multiple_runs_futurex.sh b/scripts/run_evaluate_multiple_runs_futurex.sh new file mode 100755 index 00000000..c441696d --- /dev/null +++ b/scripts/run_evaluate_multiple_runs_futurex.sh @@ -0,0 +1,121 @@ +#!/bin/bash + +# SPDX-FileCopyrightText: 2025 MiromindAI +# +# SPDX-License-Identifier: Apache-2.0 + +# Multiple runs FutureX evaluation script +# Based on the working command: uv run main.py common-benchmark --config_file_name=agent_quickstart_1 benchmark=futurex output_dir=logs/futurex-test + +# Configuration parameters +NUM_RUNS=${NUM_RUNS:-3} +MAX_TASKS=${MAX_TASKS:-null} +MAX_CONCURRENT=${MAX_CONCURRENT:-5} +BENCHMARK_NAME="futurex" +AGENT_SET=${AGENT_SET:-"agent_quickstart_1"} + +# TODO: Add more settings like message ID and max turns, currently not supported using agent_quickstart_1 +# ADD_MESSAGE_ID=${ADD_MESSAGE_ID:-"false"} +# MAX_TURNS=${MAX_TURNS:-1} + +# Set results directory with timestamp +TIMESTAMP=$(date +%Y%m%d_%H%M) +RESULTS_DIR="logs/${BENCHMARK_NAME}/${AGENT_SET}_${TIMESTAMP}" + +export LOGGER_LEVEL="INFO" + +echo "🚀 Starting $NUM_RUNS runs of FutureX evaluation..." 
+echo "📊 Using max_tasks: $MAX_TASKS (set MAX_TASKS=null for full dataset)" +echo "📊 Using max_concurrent: $MAX_CONCURRENT" +echo "📁 Results will be saved in: $RESULTS_DIR" + +# Create results directory +mkdir -p "$RESULTS_DIR" + +# Launch all parallel tasks +for i in $(seq 1 $NUM_RUNS); do + echo "==========================================" + echo "🚀 Launching experiment $i/$NUM_RUNS" + echo "📝 Output log: $RESULTS_DIR/run_${i}_output.log" + echo "==========================================" + + # Set specific identifier for this run + RUN_ID="run_$i" + + # Run experiment (background execution) + ( + echo "Starting run $i at $(date)" + uv run main.py common-benchmark \ + --config_file_name=$AGENT_SET \ + benchmark=$BENCHMARK_NAME \ + benchmark.execution.max_tasks=$MAX_TASKS \ + benchmark.execution.max_concurrent=$MAX_CONCURRENT \ + benchmark.execution.pass_at_k=1 \ + output_dir=${RESULTS_DIR}/$RUN_ID \ + hydra.run.dir=${RESULTS_DIR}/$RUN_ID \ + > "$RESULTS_DIR/${RUN_ID}_output.log" 2>&1 + + # Check if run was successful + if [ $? -eq 0 ]; then + echo "✅ Run $i completed successfully at $(date)" + RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1) + if [ -f "$RESULT_FILE" ]; then + echo "📊 Results saved to $RESULT_FILE" + else + echo "⚠️ Warning: Result file not found for run $i" + fi + else + echo "❌ Run $i failed at $(date)!" + fi + ) & + + # Small delay between launches + sleep 2 +done + +echo "🎯 All $NUM_RUNS runs have been launched in parallel" +echo "⏳ Waiting for all runs to complete..." + +# Wait for all background tasks to complete +wait + +echo "==========================================" +echo "🎉 All $NUM_RUNS runs completed!" +echo "==========================================" + +# Extract predictions and format for FutureX submission +echo "📤 Extracting predictions and formatting for FutureX submission..." 
+uv run python utils/extract_futurex_results.py "$RESULTS_DIR" + +# Check status and provide user-friendly message +if [ $? -eq 0 ]; then + echo "✅ Submission file generated: $RESULTS_DIR/futurex_submission.jsonl" + echo "📋 You can now upload this file to the FutureX test server." +else + echo "❌ Failed to generate submission file. Please check the logs for details." +fi + +# Show progress summary +echo "==========================================" +echo "📊 Progress Summary:" +echo "==========================================" + +echo "==========================================" +echo "🎯 Multiple runs FutureX evaluation completed!" +echo "📁 Check results in: $RESULTS_DIR" +echo "📝 Check individual run logs: $RESULTS_DIR/run_*_output.log" +echo "📤 Check submission file: $RESULTS_DIR/futurex_submission.jsonl" +echo "==========================================" +echo "" +echo "💡 Usage examples:" +echo " # Default: 3 runs with full dataset" +echo " ./scripts/run_evaluate_multiple_runs_futurex.sh" +echo "" +echo " # Custom parameters" +echo " NUM_RUNS=5 MAX_TASKS=10 MAX_CONCURRENT=3 ./scripts/run_evaluate_multiple_runs_futurex.sh" +echo "" +echo " # Different agent configuration" +echo " AGENT_SET=agent_gaia-validation ./scripts/run_evaluate_multiple_runs_futurex.sh" +echo "" +echo " # Limited tasks for testing" +echo " MAX_TASKS=5 ./scripts/run_evaluate_multiple_runs_futurex.sh" \ No newline at end of file From bf43b373484c07291b37eb3bfb1382af1c6af546 Mon Sep 17 00:00:00 2001 From: JubSteven <1120395085@qq.com> Date: Thu, 18 Sep 2025 14:59:39 +0800 Subject: [PATCH 03/12] upd: fix bugs with doc for futurex. 
--- docs/mkdocs/docs/download_datasets.md | 2 ++ docs/mkdocs/docs/futurex.md | 21 +++++++++++++++------ scripts/run_prepare_benchmark.sh | 3 ++- 3 files changed, 19 insertions(+), 7 deletions(-) diff --git a/docs/mkdocs/docs/download_datasets.md b/docs/mkdocs/docs/download_datasets.md index 88ef11b7..bd67c2b5 100644 --- a/docs/mkdocs/docs/download_datasets.md +++ b/docs/mkdocs/docs/download_datasets.md @@ -79,6 +79,7 @@ uv run main.py prepare-benchmark get browsecomp-test uv run main.py prepare-benchmark get browsecomp-zh-test uv run main.py prepare-benchmark get hle uv run main.py prepare-benchmark get xbench-ds +uv run main.py prepare-benchmark get futurex ``` ### What This Script Does @@ -94,6 +95,7 @@ uv run main.py prepare-benchmark get xbench-ds - `browsecomp-zh-test` - Chinese BrowseComp test set - `hle` - HLE dataset - `xbench-ds` - xbench-DeepSearch dataset + - `futurex` - Futurex-Online dataset ### Customizing Dataset Selection diff --git a/docs/mkdocs/docs/futurex.md b/docs/mkdocs/docs/futurex.md index bd022806..ea44b78d 100644 --- a/docs/mkdocs/docs/futurex.md +++ b/docs/mkdocs/docs/futurex.md @@ -8,12 +8,15 @@ MiroFlow's evaluation on the Futurex-Online benchmark demonstrates capabilities !!! info "Futurex-Online Dataset" The Futurex-Online dataset consists of 61 prediction tasks covering various future events including: + - Political events (referendums, elections) - Sports outcomes (football matches) - Legal proceedings - Economic indicators + !!! abstract "Key Dataset Characteristics" + - **Total Tasks**: 61 - **Task Type**: Future event prediction - **Answer Format**: Boxed answers (\\boxed{Yes/No} or \\boxed{A/B/C}) @@ -22,10 +25,10 @@ MiroFlow's evaluation on the Futurex-Online benchmark demonstrates capabilities --- -## Reproduction Guide +## Quick Start Guide -!!! note "Reproducibility Instructions" - This section provides step-by-step instructions to reproduce our Futurex-Online benchmark evaluation. 
Since this is a prediction dataset without ground truth, we focus on execution traces and response generation. +!!! note "Quick Start Instructions" + This section provides step-by-step instructions to run the Futurex-Online benchmark and prepare submission results. Since this is a prediction dataset without ground truth, we focus on execution traces and response generation. **Note**: This is a quick start guide for running the benchmark, not for reproducing exact submitted results. ### Step 1: Prepare the Futurex-Online Dataset @@ -51,7 +54,7 @@ JINA_API_KEY="xxx" # For Linux sandbox (code execution environment) E2B_API_KEY="xxx" -# We use Claude-3.5-Sonnet with OpenRouter backend to initialize the LLM +# We use Claude-3.7-Sonnet with OpenRouter backend to initialize the LLM OPENROUTER_API_KEY="xxx" OPENROUTER_BASE_URL="https://openrouter.ai/api/v1" @@ -69,7 +72,7 @@ OPENAI_BASE_URL="https://api.openai.com/v1" ### Step 3: Run the Evaluation !!! example "Evaluation Execution" - Execute the following command to run evaluation on the Futurex-Online dataset, currently the basic `agent_quickstart_1` is used. + Execute the following command to run evaluation on the Futurex-Online dataset. This uses the basic `agent_quickstart_1` configuration for quick start purposes. ```bash title="Run Futurex-Online Evaluation" uv run main.py common-benchmark --config_file_name=agent_quickstart_1 benchmark=futurex output_dir="logs/futurex/$(date +"%Y%m%d_%H%M")" @@ -98,6 +101,7 @@ uv run utils/extract_futurex_results.py --log_dir logs/futurex/$(date +"%Y%m%d_% ``` This will generate: + - `futurex_results.json`: Detailed results for each task - `futurex_summary.json`: Summary statistics - `futurex_predictions.csv`: Predictions in CSV format @@ -126,7 +130,7 @@ Expected Format: \boxed{A}, \boxed{B}, or \boxed{C} ## Multiple Runs and Voting !!! 
tip "Improving Prediction Accuracy" - For better prediction accuracy, you can run multiple evaluations and use voting mechanisms to aggregate results. This approach helps reduce randomness and improve the reliability of predictions. + For better prediction accuracy, you can run multiple evaluations and use voting mechanisms to aggregate results. This approach helps reduce randomness and improve the reliability of predictions. **Note**: This is a quick start approach; production submissions may use more sophisticated configurations. ### Step 1: Run Multiple Evaluations @@ -137,6 +141,7 @@ Use the multiple runs script to execute several independent evaluations: ``` This script will: + - Run 3 independent evaluations by default (configurable with `NUM_RUNS`) - Execute all tasks in parallel for efficiency - Generate separate result files for each run in `run_1/`, `run_2/`, etc. @@ -167,6 +172,7 @@ After multiple runs, the system automatically: 4. **Provides voting statistics** showing prediction distribution across runs The voting process works as follows: + - **Majority Vote**: Most common prediction across all runs wins - **Tie-breaking**: If tied, chooses the prediction that appeared earliest across all runs - **Vote Counts**: Tracks how many runs predicted each option @@ -212,6 +218,7 @@ The voting aggregation generates a submission file with the following format: ``` The output includes: + - **`id`**: Task identifier - **`prediction`**: Final voted prediction (without `\boxed{}` wrapper) - **`vote_counts`**: Dictionary showing how many runs predicted each option @@ -224,12 +231,14 @@ For example, `"vote_counts": {"No": 2}` means 2 out of 2 runs predicted "No", in !!! warning "No Ground Truth Available" Since Futurex-Online is a prediction dataset, there are no ground truth answers available for evaluation. The focus is on: + - Response generation quality - Reasoning process documentation - Prediction confidence and methodology !!! 
info "Output Analysis" The evaluation generates detailed execution traces showing: + - Research process for each prediction - Information gathering from web sources - Reasoning chains leading to predictions diff --git a/scripts/run_prepare_benchmark.sh b/scripts/run_prepare_benchmark.sh index a00f7a1a..7574ed3e 100644 --- a/scripts/run_prepare_benchmark.sh +++ b/scripts/run_prepare_benchmark.sh @@ -20,4 +20,5 @@ uv run main.py prepare-benchmark get webwalkerqa uv run main.py prepare-benchmark get browsecomp-test uv run main.py prepare-benchmark get browsecomp-zh-test uv run main.py prepare-benchmark get hle -uv run main.py prepare-benchmark get xbench-ds \ No newline at end of file +uv run main.py prepare-benchmark get xbench-ds +uv run main.py prepare-benchmark get futurex \ No newline at end of file From d1e16375bfe83f537374e7c3f2fcfea85b0d3aa1 Mon Sep 17 00:00:00 2001 From: JubSteven <1120395085@qq.com> Date: Thu, 18 Sep 2025 15:12:04 +0800 Subject: [PATCH 04/12] debug: fix wrong calling path. --- docs/mkdocs/docs/futurex.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/mkdocs/docs/futurex.md b/docs/mkdocs/docs/futurex.md index ea44b78d..db20f4df 100644 --- a/docs/mkdocs/docs/futurex.md +++ b/docs/mkdocs/docs/futurex.md @@ -97,7 +97,7 @@ uv run main.py common-benchmark --config_file_name=agent_quickstart_1 benchmark= After evaluation completion, extract the results using the provided utility: ```bash title="Extract Results" -uv run utils/extract_futurex_results.py --log_dir logs/futurex/$(date +"%Y%m%d_%H%M") +uv run utils/extract_futurex_results.py logs/futurex/$(date +"%Y%m%d_%H%M") ``` This will generate: From eb6f302d7f357c4344b68084ab00271f59f40959 Mon Sep 17 00:00:00 2001 From: JubSteven <1120395085@qq.com> Date: Wed, 24 Sep 2025 10:03:25 +0800 Subject: [PATCH 05/12] add preparation for finsearchcomp. 
--- scripts/run_prepare_benchmark.sh | 3 +- utils/prepare_benchmark/gen_finsearchcomp.py | 55 ++++++++++++++++++++ utils/prepare_benchmark/main.py | 8 +++ 3 files changed, 65 insertions(+), 1 deletion(-) create mode 100644 utils/prepare_benchmark/gen_finsearchcomp.py diff --git a/scripts/run_prepare_benchmark.sh b/scripts/run_prepare_benchmark.sh index 7574ed3e..837b2e45 100644 --- a/scripts/run_prepare_benchmark.sh +++ b/scripts/run_prepare_benchmark.sh @@ -21,4 +21,5 @@ uv run main.py prepare-benchmark get browsecomp-test uv run main.py prepare-benchmark get browsecomp-zh-test uv run main.py prepare-benchmark get hle uv run main.py prepare-benchmark get xbench-ds -uv run main.py prepare-benchmark get futurex \ No newline at end of file +uv run main.py prepare-benchmark get futurex +uv run main.py prepare-benchmark get finsearchcomp \ No newline at end of file diff --git a/utils/prepare_benchmark/gen_finsearchcomp.py b/utils/prepare_benchmark/gen_finsearchcomp.py new file mode 100644 index 00000000..adc01451 --- /dev/null +++ b/utils/prepare_benchmark/gen_finsearchcomp.py @@ -0,0 +1,55 @@ +# SPDX-FileCopyrightText: 2025 MiromindAI +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Generator, MutableMapping + +from datasets import load_dataset + +from utils.prepare_benchmark.common import Task + +def gen_finsearchcomp(hf_token: str) -> Generator[Task, None, None]: + """ + Generate FinSearchComp dataset tasks in MiroFlow format + + Args: + hf_token: Hugging Face token for dataset access + + Yields: + Task: Standardized task objects + """ + dataset = load_dataset("ByteSeedXpert/FinSearchComp") + + for split_name, split_data in dataset.items(): + for idx, sample in enumerate(split_data): + # Extract task information + task_id = sample.get("prompt_id", f"finsearchcomp_{split_name}_{idx}") + task_question = sample.get("prompt", "") + response_reference = sample.get("response_reference", "") + judge_prompt_template = sample.get("judge_prompt_template", "") + 
judge_system_prompt = sample.get("judge_system_prompt", "") + label = sample.get("label", "") + + # Create metadata dictionary + metadata: MutableMapping = { + "judge_prompt_template": judge_prompt_template, + "judge_system_prompt": judge_system_prompt, + "label": label, + "source": "ByteSeedXpert/FinSearchComp", + "split": split_name, + "original_id": sample.get("prompt_id", ""), + "dataset_name": "FinSearchComp" + } + + # Create standardized Task object + task = Task( + task_id=task_id, + task_question=task_question, + ground_truth=response_reference, # Futurex-Online doesn't have ground truth + file_path=None, # No file attachments + metadata=metadata, + ) + + yield task + return + \ No newline at end of file diff --git a/utils/prepare_benchmark/main.py b/utils/prepare_benchmark/main.py index 2233bfdc..12db9cf4 100644 --- a/utils/prepare_benchmark/main.py +++ b/utils/prepare_benchmark/main.py @@ -19,6 +19,7 @@ from utils.prepare_benchmark.gen_webwalkerqa import gen_webwalkerqa from utils.prepare_benchmark.gen_xbench_ds import gen_xbench_ds from utils.prepare_benchmark.gen_futurex import gen_futurex +from utils.prepare_benchmark.gen_finsearchcomp import gen_finsearchcomp @dataclasses.dataclass @@ -33,6 +34,7 @@ class _Env: "hle", "xbench-ds", "futurex", + "finsearchcomp", ) meta_filename = "standardized_data.jsonl" data_dir: pathlib.Path @@ -117,6 +119,12 @@ def gen(): for x in gen_futurex(env.hf_token): yield x + return gen + case "finsearchcomp": + def gen(): + for x in gen_finsearchcomp(env.hf_token): + yield x + return gen case _: raise ValueError("not supported") From 4dabaee7bb86470cbca39264229fae629876e9a0 Mon Sep 17 00:00:00 2001 From: JubSteven <1120395085@qq.com> Date: Wed, 24 Sep 2025 11:26:14 +0800 Subject: [PATCH 06/12] update a premature version of finsearchcomp benchmark. 
--- common_benchmark.py | 2 + config/agent_finsearchcomp.yaml | 74 +++++++++++++++ config/benchmark/finsearchcomp.yaml | 19 ++++ src/core/pipeline.py | 7 +- utils/eval_answer_from_log.py | 3 + utils/eval_utils.py | 96 ++++++++++++++++++++ utils/prepare_benchmark/gen_finsearchcomp.py | 29 ++++-- utils/util_llm_parallel_thinking.py | 2 +- utils/util_llm_simple_voting.py | 2 +- 9 files changed, 222 insertions(+), 12 deletions(-) create mode 100644 config/agent_finsearchcomp.yaml create mode 100644 config/benchmark/finsearchcomp.yaml diff --git a/common_benchmark.py b/common_benchmark.py index 8c93e356..da268d27 100644 --- a/common_benchmark.py +++ b/common_benchmark.py @@ -210,6 +210,7 @@ async def run_single_task(self, task: BenchmarkTask) -> BenchmarkResult: sub_agent_tool_managers=self.sub_agent_tool_managers, output_formatter=self.output_formatter, ground_truth=task.ground_truth, + metadata=task.metadata, log_path=self.output_dir / f"task_{task.task_id}_attempt_{attempt}.json", ) @@ -242,6 +243,7 @@ async def run_single_task(self, task: BenchmarkTask) -> BenchmarkResult: question=task.task_question, target=task.ground_truth, predicted_answer=attempt_result["model_boxed_answer"], + metadata=task.metadata, ) attempt_result["judge_result"] = evaluation_result attempt_result["is_correct"] = evaluation_result == "CORRECT" diff --git a/config/agent_finsearchcomp.yaml b/config/agent_finsearchcomp.yaml new file mode 100644 index 00000000..37f42721 --- /dev/null +++ b/config/agent_finsearchcomp.yaml @@ -0,0 +1,74 @@ +defaults: + - benchmark: finsearchcomp + - override hydra/job_logging: none + - _self_ # Allow defining variables at the top of this file + + +main_agent: + prompt_class: MainAgentPrompt_GAIA + llm: + provider_class: "ClaudeOpenRouterClient" + model_name: "anthropic/claude-3.7-sonnet" + async_client: true + temperature: 0.3 + top_p: 0.95 + min_p: 0.0 + top_k: -1 + max_tokens: 32000 + openrouter_api_key: "${oc.env:OPENROUTER_API_KEY,???}" + openrouter_base_url: 
"${oc.env:OPENROUTER_BASE_URL,https://openrouter.ai/api/v1}" + openrouter_provider: "anthropic" + disable_cache_control: false + keep_tool_result: -1 + oai_tool_thinking: false + + tool_config: + - tool-reasoning + + max_turns: -1 # Maximum number of turns for main agent execution + max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn + + input_process: + o3_hint: true + output_process: + o3_final_answer: true + + openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for o3 hints and final answer extraction + add_message_id: true + keep_tool_result: -1 + chinese_context: "${oc.env:CHINESE_CONTEXT,false}" + + +sub_agents: + agent-worker: + prompt_class: SubAgentWorkerPrompt + llm: + provider_class: "ClaudeOpenRouterClient" + model_name: "anthropic/claude-3.7-sonnet" + async_client: true + temperature: 0.3 + top_p: 0.95 + min_p: 0.0 + top_k: -1 + max_tokens: 32000 + openrouter_api_key: "${oc.env:OPENROUTER_API_KEY,???}" + openrouter_base_url: "${oc.env:OPENROUTER_BASE_URL,https://openrouter.ai/api/v1}" + openrouter_provider: "anthropic" + disable_cache_control: false + keep_tool_result: -1 + oai_tool_thinking: false + + tool_config: + - tool-searching + - tool-image-video + - tool-reading + - tool-code + - tool-audio + + max_turns: -1 # Maximum number of turns for main agent execution + max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn + + +# Can define some top-level or default parameters here +output_dir: logs/ +data_dir: "${oc.env:DATA_DIR,data}" # Points to where data is stored diff --git a/config/benchmark/finsearchcomp.yaml b/config/benchmark/finsearchcomp.yaml new file mode 100644 index 00000000..0aaa8211 --- /dev/null +++ b/config/benchmark/finsearchcomp.yaml @@ -0,0 +1,19 @@ +# config/benchmark/finsearchcomp.yaml +defaults: + - default + - _self_ + +name: "finsearchcomp" + +data: + data_dir: "${data_dir}/finsearchcomp" # Path to finsearchcomp dataset + metadata_file: "standardized_data.jsonl" # Metadata filename + whitelist: 
[] # Optional: List of specific task_ids to run + +execution: + max_tasks: null # null = no limit, or specify a number + max_concurrent: 5 # Number of parallel tasks + pass_at_k: 1 # Number of attempts per task + +# OpenAI API key for evaluation (required for finsearchcomp since it has ground truth) +openai_api_key: "${oc.env:OPENAI_API_KEY,???}" diff --git a/src/core/pipeline.py b/src/core/pipeline.py index ea5b62c7..664ae47a 100644 --- a/src/core/pipeline.py +++ b/src/core/pipeline.py @@ -31,6 +31,7 @@ async def execute_task_pipeline( output_formatter: OutputFormatter, log_path: pathlib.Path, ground_truth: str | None = None, + metadata: dict | None = None, ) -> tuple[str, str, pathlib.Path]: """ Executes the full pipeline for a single task. @@ -61,7 +62,11 @@ async def execute_task_pipeline( task_id=task_id, task_file_name=task_file_name, ground_truth=ground_truth, - input={"task_description": task_description, "task_file_name": task_file_name}, + input={ + "task_description": task_description, + "task_file_name": task_file_name, + "metadata": metadata or {} + }, ) main_agent_llm_client = None diff --git a/utils/eval_answer_from_log.py b/utils/eval_answer_from_log.py index 88af7f0a..838da352 100644 --- a/utils/eval_answer_from_log.py +++ b/utils/eval_answer_from_log.py @@ -33,6 +33,8 @@ async def main(input_dir: str, benchmark_name: str): question = data.get("task_question", "") ground_truth = data.get("ground_truth", "") predicted_answer = data.get("final_boxed_answer", "") + metadata = data.get("input", {}).get("metadata", {}) + # If already has judge result, skip # if "judge_result" in data and data["judge_result"] in ("CORRECT", "INCORRECT"): # print(f"Log {log_file} already has judge result: {data['judge_result']}") @@ -44,6 +46,7 @@ async def main(input_dir: str, benchmark_name: str): question=question, target=ground_truth, predicted_answer=predicted_answer, + metadata=metadata, # Now metadata is available from log files ) 
print(f"{os.path.basename(log_file)}: {result}") # Optionally, update the log file with the result diff --git a/utils/eval_utils.py b/utils/eval_utils.py index 4529e56e..fd838763 100644 --- a/utils/eval_utils.py +++ b/utils/eval_utils.py @@ -374,18 +374,114 @@ def is_float(element: Any) -> bool: # return "NOT_ATTEMPTED" +@retry(wait=wait_exponential(multiplier=5), stop=stop_after_attempt(5)) +async def verify_answer_llm_finsearchcomp( + openai_client: AsyncOpenAI, + question: str, + target: str, + predicted_answer: str, + judge_prompt_template: str, + judge_system_prompt: str, + metadata: dict = None +) -> str: + """ + Use FinSearchComp-style LLM judge with dynamic prompts to verify if the predicted answer is correct. + + Args: + openai_client: OpenAI client for LLM calls + question: The question being answered + target: The correct/target answer (primary ground truth) + predicted_answer: The model's predicted answer + judge_prompt_template: The judge prompt template from metadata + judge_system_prompt: The judge system prompt from metadata + metadata: Additional metadata containing response_reference and ground_truth_finance + + Returns: + String indicating the evaluation result: "CORRECT", "INCORRECT", or "NOT_ATTEMPTED" + """ + # Get the appropriate ground truth based on the prompt template + response_reference = metadata.get("response_reference", "") if metadata else "" + ground_truth_finance = metadata.get("ground_truth_finance", "") if metadata else "" + + # Format the judge prompt template with the actual values + formatted_prompt = judge_prompt_template.format( + prompt=question, + response_reference=response_reference, + ground_truth=ground_truth_finance, + response=predicted_answer + ) + + # Create messages with system prompt and user prompt + messages = [ + {"role": "system", "content": judge_system_prompt}, + {"role": "user", "content": formatted_prompt} + ] + + try: + # NOTE: no explicit LLM model is specified here, so we use gpt-4o-mini for 
consistency + response = await openai_client.chat.completions.create( + model="gpt-4o-mini", + messages=messages, + max_completion_tokens=2048, + temperature=0.0 # Deterministic evaluation + ) + + content = response.choices[0].message.content + + # Print FinSearchComp judge reasoning + print(f"FinSearchComp LLM Judge Response: {content}") + + # Parse the response to determine if it's correct + # Look for common patterns in the response + content_lower = content.lower() + + # Check for JSON format responses + if "answer_score" in content_lower: + if '"answer_score": 1' in content or '"answer_score":1' in content: + return "CORRECT" + elif '"answer_score": 0' in content or '"answer_score":0' in content: + return "INCORRECT" + + # Check for score format responses + if "score" in content_lower: + if '"score": 1' in content or '"score":1' in content: + return "CORRECT" + elif '"score": 0' in content or '"score":0' in content: + return "INCORRECT" + + # If we can't parse the response, return NOT_ATTEMPTED + print(f"Warning: Could not parse FinSearchComp judge response: {content}") + return "NOT_ATTEMPTED" + + except Exception as e: + print(f"FinSearchComp LLM evaluation failed: {e}") + return "NOT_ATTEMPTED" + + async def verify_answer_for_datasets( openai_client: AsyncOpenAI, benchmark_name: str, question: str, target: str, predicted_answer: str, + metadata: dict = None, ) -> str: """ Verify the answer for a given dataset. 
""" try: + # Handle finsearchcomp with dynamic judge prompts + if "finsearchcomp" in benchmark_name and metadata: + judge_prompt_template = metadata.get("judge_prompt_template", "") + judge_system_prompt = metadata.get("judge_system_prompt", "") + + if judge_prompt_template and judge_system_prompt: + return await verify_answer_llm_finsearchcomp( + openai_client, question, target, predicted_answer, + judge_prompt_template, judge_system_prompt, metadata + ) + # for all questions, do gaia scorer first, if not return CORRECT, then do others gaia_scorer_answer = await verify_answer_gaia(target, predicted_answer) diff --git a/utils/prepare_benchmark/gen_finsearchcomp.py b/utils/prepare_benchmark/gen_finsearchcomp.py index adc01451..f56d675a 100644 --- a/utils/prepare_benchmark/gen_finsearchcomp.py +++ b/utils/prepare_benchmark/gen_finsearchcomp.py @@ -26,26 +26,37 @@ def gen_finsearchcomp(hf_token: str) -> Generator[Task, None, None]: task_id = sample.get("prompt_id", f"finsearchcomp_{split_name}_{idx}") task_question = sample.get("prompt", "") response_reference = sample.get("response_reference", "") - judge_prompt_template = sample.get("judge_prompt_template", "") - judge_system_prompt = sample.get("judge_system_prompt", "") - label = sample.get("label", "") + ground_truth_finance = sample.get("ground_truth", "") - # Create metadata dictionary + # Create metadata dictionary with all original fields metadata: MutableMapping = { - "judge_prompt_template": judge_prompt_template, - "judge_system_prompt": judge_system_prompt, - "label": label, "source": "ByteSeedXpert/FinSearchComp", "split": split_name, "original_id": sample.get("prompt_id", ""), - "dataset_name": "FinSearchComp" + "dataset_name": "FinSearchComp", + "response_reference": response_reference, + "ground_truth_finance": ground_truth_finance, } + # Add all other fields from sample to metadata (including judge prompts) + for key, value in sample.items(): + if key not in ["prompt_id", "prompt", 
"response_reference", "ground_truth"]: + metadata[key] = value + + # Determine the primary ground truth for evaluation + # Priority: response_reference > ground_truth_finance + if response_reference: + ground_truth_task = response_reference + elif ground_truth_finance: + ground_truth_task = ground_truth_finance + else: + ground_truth_task = "" # Fallback to empty string + # Create standardized Task object task = Task( task_id=task_id, task_question=task_question, - ground_truth=response_reference, # Futurex-Online doesn't have ground truth + ground_truth=ground_truth_task, file_path=None, # No file attachments metadata=metadata, ) diff --git a/utils/util_llm_parallel_thinking.py b/utils/util_llm_parallel_thinking.py index 7b5ede5c..7c2d446a 100644 --- a/utils/util_llm_parallel_thinking.py +++ b/utils/util_llm_parallel_thinking.py @@ -393,7 +393,7 @@ async def process_single_task( ) result = await verify_answer_for_datasets( - client, BENCHMARK_NAME, "", data[0]["ground_truth"], selected_solution + client, BENCHMARK_NAME, "", data[0]["ground_truth"], selected_solution, {} ) task_result = { diff --git a/utils/util_llm_simple_voting.py b/utils/util_llm_simple_voting.py index 7b6cee2f..aefa377f 100644 --- a/utils/util_llm_simple_voting.py +++ b/utils/util_llm_simple_voting.py @@ -299,7 +299,7 @@ async def process_single_task( selected_solution = response["final_answer"] reasoning = response["reasoning"] result = await verify_answer_for_datasets( - BENCHMARK_NAME, "", data[0]["ground_truth"], selected_solution + None, BENCHMARK_NAME, "", data[0]["ground_truth"], selected_solution, {} ) task_result = { From c086e414338fad5e1734c4b9763d19c8084d155e Mon Sep 17 00:00:00 2001 From: JubSteven <1120395085@qq.com> Date: Wed, 24 Sep 2025 13:11:27 +0800 Subject: [PATCH 07/12] clean redundent code in merging. 
--- docs/mkdocs/mkdocs.yml | 1 - scripts/run_prepare_benchmark.sh | 2 -- 2 files changed, 3 deletions(-) diff --git a/docs/mkdocs/mkdocs.yml b/docs/mkdocs/mkdocs.yml index ccdcd1b5..f508e144 100644 --- a/docs/mkdocs/mkdocs.yml +++ b/docs/mkdocs/mkdocs.yml @@ -53,7 +53,6 @@ nav: - GAIA-Validation-Text-Only: gaia_validation_text_only.md - GAIA-Test: gaia_test.md - FutureX: futurex.md - - FutureX: futurex.md - Download Datasets: download_datasets.md - Add New Benchmarks: contribute_benchmarks.md diff --git a/scripts/run_prepare_benchmark.sh b/scripts/run_prepare_benchmark.sh index 55c0a92f..837b2e45 100644 --- a/scripts/run_prepare_benchmark.sh +++ b/scripts/run_prepare_benchmark.sh @@ -21,7 +21,5 @@ uv run main.py prepare-benchmark get browsecomp-test uv run main.py prepare-benchmark get browsecomp-zh-test uv run main.py prepare-benchmark get hle uv run main.py prepare-benchmark get xbench-ds -uv run main.py prepare-benchmark get futurex - uv run main.py prepare-benchmark get futurex uv run main.py prepare-benchmark get finsearchcomp \ No newline at end of file From d6a871591e749a5cfff11fe5077e934b79082065 Mon Sep 17 00:00:00 2001 From: JubSteven <1120395085@qq.com> Date: Thu, 25 Sep 2025 09:21:52 +0800 Subject: [PATCH 08/12] upd: modify yaml to use Mirothinker as the main agent, add check progress file to exclude T1. 
--- config/agent_finsearchcomp.yaml | 33 +-- .../check_finsearchcomp_progress.py | 249 ++++++++++++++++++ 2 files changed, 263 insertions(+), 19 deletions(-) create mode 100755 utils/progress_check/check_finsearchcomp_progress.py diff --git a/config/agent_finsearchcomp.yaml b/config/agent_finsearchcomp.yaml index 37f42721..836588f3 100644 --- a/config/agent_finsearchcomp.yaml +++ b/config/agent_finsearchcomp.yaml @@ -7,25 +7,23 @@ defaults: main_agent: prompt_class: MainAgentPrompt_GAIA llm: - provider_class: "ClaudeOpenRouterClient" - model_name: "anthropic/claude-3.7-sonnet" + provider_class: "MiroThinkerSGLangClient" + model_name: "MODEL_NAME" async_client: true - temperature: 0.3 + temperature: 0.6 top_p: 0.95 min_p: 0.0 top_k: -1 - max_tokens: 32000 - openrouter_api_key: "${oc.env:OPENROUTER_API_KEY,???}" - openrouter_base_url: "${oc.env:OPENROUTER_BASE_URL,https://openrouter.ai/api/v1}" - openrouter_provider: "anthropic" - disable_cache_control: false + max_tokens: 8192 + oai_mirothinker_api_key: "${oc.env:OAI_MIROTHINKER_API_KEY,dummy_key}" + oai_mirothinker_base_url: "${oc.env:OAI_MIROTHINKER_BASE_URL,http://localhost:61005/v1}" keep_tool_result: -1 oai_tool_thinking: false tool_config: - tool-reasoning - max_turns: -1 # Maximum number of turns for main agent execution + max_turns: 20 # Maximum number of turns for main agent execution max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn input_process: @@ -43,18 +41,16 @@ sub_agents: agent-worker: prompt_class: SubAgentWorkerPrompt llm: - provider_class: "ClaudeOpenRouterClient" - model_name: "anthropic/claude-3.7-sonnet" + provider_class: "MiroThinkerSGLangClient" + model_name: "MODEL_NAME" async_client: true - temperature: 0.3 + temperature: 0.6 top_p: 0.95 min_p: 0.0 top_k: -1 - max_tokens: 32000 - openrouter_api_key: "${oc.env:OPENROUTER_API_KEY,???}" - openrouter_base_url: "${oc.env:OPENROUTER_BASE_URL,https://openrouter.ai/api/v1}" - openrouter_provider: "anthropic" - 
#!/usr/bin/env python3
"""
FinSearchComp Progress Checker

This script analyzes FinSearchComp benchmark results in a log folder to count:
- Total files processed
- Files with status "completed"
- Files with status "completed" AND judge_result "CORRECT" (excluding T1 tasks)
- Breakdown by task type (T1, T2, T3)

Note: T1 (Time-Sensitive Data Fetching) tasks are excluded from correctness evaluation
because their ground truth is outdated, but they are still counted as completed.

Usage:
    python check_finsearchcomp_progress.py [LOG_FOLDER_PATH]

If no path is provided, uses the current directory.
"""

import json
import re
import sys
from pathlib import Path
from typing import Any, Dict, List, Tuple

# Display names for the task categories printed in the breakdown section.
_TASK_TYPE_NAMES = {
    "T1": "Time-Sensitive Data Fetching",
    "T2": "Simple Historical Lookup",
    "T3": "Complex Historical Investigation",
    "Unknown": "Unrecognized task type",
}


def _empty_counts() -> Dict[str, int]:
    """Return a fresh counter dict for one task-type bucket."""
    return {"total": 0, "completed": 0, "correct": 0, "incorrect": 0}


def _percent(part: int, whole: int) -> float:
    """Return ``part / whole`` as a percentage, or 0.0 when ``whole`` is zero."""
    return part / whole * 100 if whole else 0.0


def extract_task_type(task_id: str) -> str:
    """
    Extract the task type from a task_id.

    Args:
        task_id: Task ID string like "(T1)Time_Sensitive_Data_Fetching_006"

    Returns:
        "T<digits>" taken from a leading "(T<digits>)" prefix (e.g. "T1"),
        or "Unknown" when the ID has no such prefix or is not a string.
    """
    if not isinstance(task_id, str):
        return "Unknown"
    match = re.match(r"^\(T(\d+)\)", task_id)
    if match:
        return f"T{match.group(1)}"
    return "Unknown"


def analyze_finsearchcomp_results(
    log_folder: str,
) -> Tuple[Dict[str, Any], List[str], List[Tuple[str, str]], List[Tuple[str, str]]]:
    """
    Analyze FinSearchComp benchmark results from JSON log files.

    Args:
        log_folder: Path to folder containing task_*_attempt_*.json files

    Returns:
        A 4-tuple ``(results, correct_files, incorrect_files, error_files)``:
        the dictionary of counters, the names of completed-and-correct files,
        ``(name, judge_result)`` pairs for completed-but-incorrect files, and
        ``(name, error message)`` pairs for files that could not be read.

    Raises:
        FileNotFoundError: If ``log_folder`` does not exist.
    """
    log_path = Path(log_folder)

    if not log_path.exists():
        raise FileNotFoundError(f"Log folder not found: {log_folder}")

    # Sorted so the "first 5" example listings are stable between invocations.
    json_files = sorted(log_path.glob("task_*_attempt_*.json"))

    breakdown = {name: _empty_counts() for name in ("T1", "T2", "T3", "Unknown")}
    results: Dict[str, Any] = {
        "total_files": 0,
        "completed_status": 0,
        "completed_and_correct": 0,
        "completed_and_incorrect": 0,
        "other_status": 0,
        "parse_errors": 0,
        "task_type_breakdown": breakdown,
    }

    completed_correct_files: List[str] = []
    completed_incorrect_files: List[Tuple[str, str]] = []
    parse_error_files: List[Tuple[str, str]] = []

    print(f"Scanning {len(json_files)} files in {log_folder}...")

    for json_file in json_files:
        results["total_files"] += 1

        try:
            with open(json_file, "r", encoding="utf-8") as f:
                data = json.load(f)
            if not isinstance(data, dict):
                raise ValueError(
                    f"expected a JSON object, got {type(data).__name__}"
                )
        except (ValueError, OSError) as e:
            # json.JSONDecodeError and UnicodeDecodeError are ValueErrors;
            # FileNotFoundError is an OSError.
            results["parse_errors"] += 1
            parse_error_files.append((json_file.name, str(e)))
            print(f"Error parsing {json_file.name}: {e}")
            continue

        # A key that is present with a JSON null comes back as None from
        # .get(); normalise to "" so .lower()/.upper() cannot blow up.
        task_type = extract_task_type(data.get("task_id") or "")
        if task_type not in breakdown:
            # e.g. "(T4)..." - count it instead of failing with a KeyError.
            task_type = "Unknown"
        status = str(data.get("status") or "").lower()
        judge_result = str(data.get("judge_result") or "").upper()

        counts = breakdown[task_type]
        counts["total"] += 1

        if status != "completed":
            results["other_status"] += 1
            continue

        results["completed_status"] += 1
        counts["completed"] += 1

        # T1 tasks count as completed but are not evaluated for correctness
        # because their ground truth is outdated.
        if task_type == "T1":
            continue

        if judge_result == "CORRECT":
            results["completed_and_correct"] += 1
            counts["correct"] += 1
            completed_correct_files.append(json_file.name)
        else:
            results["completed_and_incorrect"] += 1
            counts["incorrect"] += 1
            completed_incorrect_files.append((json_file.name, judge_result))

    return (
        results,
        completed_correct_files,
        completed_incorrect_files,
        parse_error_files,
    )


def display_results(
    results: Dict[str, Any],
    correct_files: List[str],
    incorrect_files: List[Tuple[str, str]],
    error_files: List[Tuple[str, str]],
) -> None:
    """
    Display the analysis results in a formatted way.

    Args:
        results: Counter dictionary produced by analyze_finsearchcomp_results.
        correct_files: Names of completed-and-correct result files.
        incorrect_files: (name, judge_result) pairs for incorrect files.
        error_files: (name, error message) pairs for unreadable files.
    """

    print("\n" + "=" * 70)
    print("FINSEARCHCOMP BENCHMARK RESULTS SUMMARY")
    print("=" * 70)

    total = results["total_files"]
    completed = results["completed_status"]
    correct = results["completed_and_correct"]
    incorrect = results["completed_and_incorrect"]
    breakdowns = results["task_type_breakdown"]

    # _percent() yields 0.0 for an empty folder instead of dividing by zero.
    print(f"Total files processed: {total:3d}")
    print(
        f"Files with status 'completed': {completed:3d} ({_percent(completed, total):.1f}%)"
    )
    print(f"Files completed AND correct: {correct:3d} ({_percent(correct, total):.1f}%)")
    print(
        f"Files completed but incorrect: {incorrect:3d} ({_percent(incorrect, total):.1f}%)"
    )
    print(f"Files with other status: {results['other_status']:3d}")
    print(f"Files with parse errors: {results['parse_errors']:3d}")

    # Calculate accuracy excluding T1 tasks
    t2_t3_completed = breakdowns["T2"]["completed"] + breakdowns["T3"]["completed"]
    t2_t3_correct = breakdowns["T2"]["correct"] + breakdowns["T3"]["correct"]

    if t2_t3_completed > 0:
        accuracy = t2_t3_correct / t2_t3_completed * 100
        print(f"\nAccuracy rate (T2+T3 correct/completed): {accuracy:.1f}%")
        print(" (T1 tasks excluded due to outdated ground truth)")

    # Task type breakdown
    print("\n" + "-" * 70)
    print("TASK TYPE BREAKDOWN")
    print("-" * 70)

    for task_type in ["T1", "T2", "T3", "Unknown"]:
        breakdown = breakdowns[task_type]
        if breakdown["total"] <= 0:
            continue
        completion_rate = _percent(breakdown["completed"], breakdown["total"])
        print(f"{task_type} ({_TASK_TYPE_NAMES[task_type]}):")
        print(
            f" Total: {breakdown['total']:3d}, Completed: {breakdown['completed']:3d} ({completion_rate:.1f}%)"
        )
        if task_type == "T1":
            print(" Note: Excluded from correctness evaluation (outdated ground truth)")
        else:
            accuracy_rate = _percent(breakdown["correct"], breakdown["completed"])
            print(
                f" Correct: {breakdown['correct']:3d}, Incorrect: {breakdown['incorrect']:3d}"
            )
            print(f" Accuracy: {accuracy_rate:.1f}%")

    print("\n" + "-" * 70)
    print(f"SUMMARY: {completed} tasks completed, {correct} T2+T3 tasks correct")
    print(
        f" (T1 tasks: {breakdowns['T1']['completed']} completed, excluded from evaluation)"
    )
    print("-" * 70)

    # Show some example files for verification
    if correct_files:
        print("\nFirst 5 correct files (T2+T3 only):")
        for i, filename in enumerate(correct_files[:5], 1):
            print(f" {i}. {filename}")
        if len(correct_files) > 5:
            print(f" ... and {len(correct_files) - 5} more")

    if incorrect_files:
        print("\nFirst 5 incorrect files (T2+T3 only):")
        for i, (filename, judge_result) in enumerate(incorrect_files[:5], 1):
            print(f" {i}. {filename} -> {judge_result}")
        if len(incorrect_files) > 5:
            print(f" ... and {len(incorrect_files) - 5} more")

    if error_files:
        print("\nFiles with parse errors:")
        for filename, error in error_files:
            print(f" - {filename}: {error}")


def main() -> int:
    """
    Run the analysis for the folder given on the command line.

    Returns:
        0 on success, 1 if the analysis failed (e.g. the folder is missing).
    """

    # Check if folder path was provided as command line argument
    if len(sys.argv) > 1:
        log_folder = sys.argv[1]
        print(f"Using provided folder path: {log_folder}")
    else:
        log_folder = "."
        print(f"No folder path provided, using current directory: {log_folder}")

    try:
        print(f"Analyzing FinSearchComp benchmark results in: {log_folder}")
        results, correct_files, incorrect_files, error_files = (
            analyze_finsearchcomp_results(log_folder)
        )
        display_results(results, correct_files, incorrect_files, error_files)

    except Exception as e:  # top-level boundary: report and exit non-zero
        print(f"Error: {e}")
        print(f"\nUsage: python {sys.argv[0]} [LOG_FOLDER_PATH]")
        print(
            f"Example: python {sys.argv[0]} logs/finsearchcomp/agent_finsearchcomp_20250924_1555"
        )
        return 1

    return 0


if __name__ == "__main__":
    # sys.exit is always available; the bare exit() helper comes from `site`.
    sys.exit(main())
def extract_region_from_label(label: str) -> str:
    """
    Extract region from the label field.

    Args:
        label: Label string like "Complex_Historical_Investigation(Global)"
            or "Financial_Analysis(Greater China)"

    Returns:
        Region string ("Global", "Greater_China", or "Unknown")
    """
    if not label:
        return "Unknown"

    if "(Global)" in label:
        return "Global"

    # The region suffix may be spelled with a space ("Greater China") rather
    # than an underscore; accept both so those tasks are not dropped into
    # "Unknown". The returned key stays "Greater_China" because that is the
    # key used to index results["regional_breakdown"] at this revision.
    if "(Greater_China)" in label or "(Greater China)" in label:
        return "Greater_China"

    return "Unknown"
@@ -70,6 +91,16 @@ def analyze_finsearchcomp_results(log_folder: str) -> Dict[str, any]: "T2": {"total": 0, "completed": 0, "correct": 0, "incorrect": 0}, "T3": {"total": 0, "completed": 0, "correct": 0, "incorrect": 0}, "Unknown": {"total": 0, "completed": 0, "correct": 0, "incorrect": 0} + }, + "regional_breakdown": { + "Global": { + "T2": {"total": 0, "completed": 0, "correct": 0, "incorrect": 0}, + "T3": {"total": 0, "completed": 0, "correct": 0, "incorrect": 0} + }, + "Greater_China": { + "T2": {"total": 0, "completed": 0, "correct": 0, "incorrect": 0}, + "T3": {"total": 0, "completed": 0, "correct": 0, "incorrect": 0} + } } } @@ -90,13 +121,25 @@ def analyze_finsearchcomp_results(log_folder: str) -> Dict[str, any]: task_type = extract_task_type(task_id) status = data.get("status", "").lower() judge_result = data.get("judge_result", "").upper() + + # Extract region from label + label = data.get("input", {}).get("metadata", {}).get("label", "") + region = extract_region_from_label(label) # Update task type breakdown results["task_type_breakdown"][task_type]["total"] += 1 + + # Update regional breakdown for T2 and T3 tasks + if task_type in ["T2", "T3"] and region in results["regional_breakdown"]: + results["regional_breakdown"][region][task_type]["total"] += 1 if status == "completed": results["completed_status"] += 1 results["task_type_breakdown"][task_type]["completed"] += 1 + + # Update regional breakdown for completed T2 and T3 tasks + if task_type in ["T2", "T3"] and region in results["regional_breakdown"]: + results["regional_breakdown"][region][task_type]["completed"] += 1 # For T1 tasks, exclude from correctness evaluation but count as completed if task_type == "T1": @@ -108,10 +151,16 @@ def analyze_finsearchcomp_results(log_folder: str) -> Dict[str, any]: if judge_result == "CORRECT": results["completed_and_correct"] += 1 results["task_type_breakdown"][task_type]["correct"] += 1 + # Update regional breakdown for correct T2 and T3 tasks + if task_type 
in ["T2", "T3"] and region in results["regional_breakdown"]: + results["regional_breakdown"][region][task_type]["correct"] += 1 completed_correct_files.append(json_file.name) else: results["completed_and_incorrect"] += 1 results["task_type_breakdown"][task_type]["incorrect"] += 1 + # Update regional breakdown for incorrect T2 and T3 tasks + if task_type in ["T2", "T3"] and region in results["regional_breakdown"]: + results["regional_breakdown"][region][task_type]["incorrect"] += 1 completed_incorrect_files.append((json_file.name, judge_result)) else: results["other_status"] += 1 @@ -192,6 +241,24 @@ def display_results( print(f" Correct: {breakdown['correct']:3d}, Incorrect: {breakdown['incorrect']:3d}") print(f" Accuracy: {accuracy_rate:.1f}%") + # Regional breakdown for T2 and T3 + print("\n" + "-" * 70) + print("REGIONAL BREAKDOWN (T2 & T3 TASKS)") + print("-" * 70) + + for region in ["Global", "Greater_China"]: + print(f"\n{region} Region:") + for task_type in ["T2", "T3"]: + breakdown = results["regional_breakdown"][region][task_type] + if breakdown["total"] > 0: + completion_rate = breakdown["completed"] / breakdown["total"] * 100 + accuracy_rate = breakdown["correct"] / breakdown["completed"] * 100 if breakdown["completed"] > 0 else 0 + task_name = "Simple Historical Lookup" if task_type == "T2" else "Complex Historical Investigation" + print(f" {task_type} ({task_name}):") + print(f" Total: {breakdown['total']:3d}, Completed: {breakdown['completed']:3d} ({completion_rate:.1f}%)") + print(f" Correct: {breakdown['correct']:3d}, Incorrect: {breakdown['incorrect']:3d}") + print(f" Accuracy: {accuracy_rate:.1f}%") + print("\n" + "-" * 70) print(f"SUMMARY: {completed} tasks completed, {correct} T2+T3 tasks correct") print(f" (T1 tasks: {results['task_type_breakdown']['T1']['completed']} completed, excluded from evaluation)") From 256ba2c2b3a382eec1c4156ebf362b3c7fd98915 Mon Sep 17 00:00:00 2001 From: JubSteven <1120395085@qq.com> Date: Thu, 25 Sep 2025 10:20:05 
+0800 Subject: [PATCH 10/12] upd: add docs and shell script for multiple runs. --- docs/mkdocs/docs/finsearchcomp.md | 178 ++++++++++++++++++ docs/mkdocs/mkdocs.yml | 1 + ...un_evaluate_multiple_runs_finsearchcomp.sh | 104 ++++++++++ 3 files changed, 283 insertions(+) create mode 100644 docs/mkdocs/docs/finsearchcomp.md create mode 100755 scripts/run_evaluate_multiple_runs_finsearchcomp.sh diff --git a/docs/mkdocs/docs/finsearchcomp.md b/docs/mkdocs/docs/finsearchcomp.md new file mode 100644 index 00000000..19eb6f3e --- /dev/null +++ b/docs/mkdocs/docs/finsearchcomp.md @@ -0,0 +1,178 @@ +# FinSearchComp + +MiroFlow's evaluation on the FinSearchComp benchmark demonstrates capabilities in financial information search and analysis tasks, showcasing advanced reasoning abilities in complex financial research scenarios. + +More details: [FinSearchComp Dataset](https://huggingface.co/datasets/ByteSeedXpert/FinSearchComp) + +--- + +## Dataset Overview + +!!! info "FinSearchComp Dataset" + The FinSearchComp dataset consists of financial search and analysis tasks that require comprehensive research capabilities including: + + - Financial data retrieval and analysis + - Market research and company analysis + - Investment decision support + - Financial news and report interpretation + - Time-sensitive financial information gathering + +!!! abstract "Key Dataset Characteristics" + + - **Total Tasks**: 635 (across T1, T2, T3 categories) + - **Task Types**: + - **T1**: Time-Sensitive Data Fetching + - **T2**: Simple Historical Lookup + - **T3**: Complex Historical Investigation + - **Answer Format**: Detailed financial analysis and research reports + - **Ground Truth**: Available for T2 and T3 tasks, changes dynamically for T1 tasks + - **Evaluation**: Judge-based evaluation with correctness assessment + +--- + +## Quick Start Guide + +!!!
note "Quick Start Instructions" + This section provides step-by-step instructions to run the FinSearchComp benchmark and prepare submission results. **Note**: This is a quick start guide for running the benchmark, not for reproducing exact submitted results. + +### Step 1: Prepare the FinSearchComp Dataset + +!!! tip "Dataset Setup" + Use the integrated prepare-benchmark command to download and process the dataset: + +```bash title="Download FinSearchComp Dataset" +uv run main.py prepare-benchmark get finsearchcomp +``` + +This will create the standardized dataset at `data/finsearchcomp/standardized_data.jsonl`. + +### Step 2: Configure API Keys + +!!! warning "API Key Configuration" + Set up the required API keys for model access and tool functionality. Update the `.env` file to include the following keys: + +```env title=".env Configuration" +# For searching and web scraping +SERPER_API_KEY="xxx" +JINA_API_KEY="xxx" + +# For Linux sandbox (code execution environment) +E2B_API_KEY="xxx" + +# We use MiroThinker model for financial analysis +OAI_MIROTHINKER_API_KEY="xxx" +OAI_MIROTHINKER_BASE_URL="http://localhost:61005/v1" + +# Used for o3 hints and final answer extraction +OPENAI_API_KEY="xxx" +OPENAI_BASE_URL="https://api.openai.com/v1" + +# Used for Claude vision understanding +ANTHROPIC_API_KEY="xxx" + +# Used for Gemini vision +GEMINI_API_KEY="xxx" +``` + +### Step 3: Run the Evaluation + +!!! example "Evaluation Execution" + Execute the following command to run evaluation on the FinSearchComp dataset: + +```bash title="Run FinSearchComp Evaluation" +uv run main.py common-benchmark --config_file_name=agent_finsearchcomp benchmark=finsearchcomp output_dir="logs/finsearchcomp/$(date +"%Y%m%d_%H%M")" +``` + +!!! 
tip "Progress Monitoring and Resume" + To check the progress while running: + + ```bash title="Check Progress" + uv run utils/progress_check/check_finsearchcomp_progress.py $PATH_TO_LOG + ``` + + If you need to resume an interrupted evaluation, specify the same output directory to continue from where you left off. + + ```bash title="Resume Evaluation, e.g." + uv run main.py common-benchmark --config_file_name=agent_finsearchcomp benchmark=finsearchcomp output_dir=${PATH_TO_LOG} + ``` + +### Step 4: Extract Results + +!!! example "Result Extraction" + After the evaluation completes, the results are automatically generated in the output directory: + +- `benchmark_results.jsonl`: Detailed results for each task +- `benchmark_results_pass_at_1_accuracy.txt`: Summary accuracy statistics +- `task_*_attempt_1.json`: Individual task execution traces + +--- + +## Evaluation Notes + +!!! warning "Task Type Considerations" + The FinSearchComp dataset includes different task types with varying evaluation criteria: + + - **T1 Tasks**: Time-Sensitive Data Fetching tasks are excluded from correctness evaluation due to outdated ground truth, but completion is still tracked + - **T2 Tasks**: Simple Historical Lookup tasks are evaluated for correctness + - **T3 Tasks**: Complex Historical Investigation tasks require comprehensive research and analysis and are evaluated for correctness + +!!!
info "Output Analysis" + The evaluation generates detailed execution traces showing: + + - Research process for each financial task + - Information gathering from multiple sources + - Financial calculations and analysis + - Comprehensive reports with insights and recommendations + +### Directory Structure + +After running evaluations, you'll find the following structure: + +``` +logs/finsearchcomp/agent_finsearchcomp_YYYYMMDD_HHMM/ +├── benchmark_results.jsonl # Task results summary +├── benchmark_results_pass_at_1_accuracy.txt # Accuracy statistics +├── task_(T1)Time_Sensitive_Data_Fetching_*.json # T1 task traces +├── task_(T2)Financial_Analysis_*.json # T2 task traces +├── task_(T3)Complex_Historical_Investigation_*.json # T3 task traces +└── output.log # Execution log +``` + +### Task Categories Breakdown + +The progress checker provides detailed statistics: + +- **Total Tasks**: Complete count across all categories +- **Completed Tasks**: Successfully finished tasks +- **Correct Tasks**: Tasks with judge_result "CORRECT" (T2 and T3 only) +- **Category Breakdown**: Separate counts for T1, T2, and T3 tasks +- **Accuracy Metrics**: Pass@1 accuracy for evaluable tasks + +--- + +## Usage Examples + +### Single Run Evaluation +```bash title="Basic Evaluation" +uv run main.py common-benchmark --config_file_name=agent_finsearchcomp benchmark=finsearchcomp output_dir="logs/finsearchcomp/$(date +"%Y%m%d_%H%M")" +``` + +### Limited Task Testing +```bash title="Test with Limited Tasks" +uv run main.py common-benchmark --config_file_name=agent_finsearchcomp benchmark=finsearchcomp benchmark.execution.max_tasks=5 output_dir="logs/finsearchcomp/$(date +"%Y%m%d_%H%M")" +``` + +### Custom Agent Configuration +```bash title="Different Agent Setup" +uv run main.py common-benchmark --config_file_name=agent_gaia-validation benchmark=finsearchcomp output_dir="logs/finsearchcomp/$(date +"%Y%m%d_%H%M")" +``` + +### Multiple Runs for Reliability +```bash title="Multiple Runs" 
#!/bin/bash

# SPDX-FileCopyrightText: 2025 MiromindAI
#
# SPDX-License-Identifier: Apache-2.0

# Multiple runs FinSearchComp evaluation script
# Based on the working command: uv run main.py common-benchmark --config_file_name=agent_finsearchcomp benchmark=finsearchcomp output_dir=logs/finsearchcomp/$(date +"%Y%m%d_%H%M")
#
# Launches NUM_RUNS evaluations in parallel (one background subshell each),
# waits for all of them, prints a per-run progress summary and exits non-zero
# if any run failed. Must be started from the repository root, because it
# invokes main.py and utils/progress_check/ by relative path.

# Configuration parameters (each can be overridden from the environment)
NUM_RUNS=${NUM_RUNS:-3}
MAX_TASKS=${MAX_TASKS:-1}
MAX_CONCURRENT=${MAX_CONCURRENT:-5}
BENCHMARK_NAME="finsearchcomp"
AGENT_SET=${AGENT_SET:-"agent_finsearchcomp"}

# Set results directory with timestamp
TIMESTAMP=$(date +%Y%m%d_%H%M)
RESULTS_DIR="logs/${BENCHMARK_NAME}/${AGENT_SET}_${TIMESTAMP}"

export LOGGER_LEVEL="INFO"

echo "🚀 Starting $NUM_RUNS runs of FinSearchComp evaluation..."
echo "📊 Using max_tasks: $MAX_TASKS (set MAX_TASKS=null for full dataset)"
echo "📊 Using max_concurrent: $MAX_CONCURRENT"
echo "📁 Results will be saved in: $RESULTS_DIR"

# Create results directory
mkdir -p "$RESULTS_DIR"

# PIDs of the background runs, so each exit status can be collected below.
PIDS=()

# Launch all parallel tasks
for i in $(seq 1 "$NUM_RUNS"); do
    echo "=========================================="
    echo "🚀 Launching experiment $i/$NUM_RUNS"
    echo "📝 Output log: $RESULTS_DIR/run_${i}_output.log"
    echo "=========================================="

    # Set specific identifier for this run
    RUN_ID="run_$i"

    # Run experiment (background execution)
    (
        echo "Starting run $i at $(date)"
        if uv run main.py common-benchmark \
            --config_file_name="$AGENT_SET" \
            benchmark="$BENCHMARK_NAME" \
            benchmark.execution.max_tasks="$MAX_TASKS" \
            benchmark.execution.max_concurrent="$MAX_CONCURRENT" \
            benchmark.execution.pass_at_k=1 \
            output_dir="${RESULTS_DIR}/$RUN_ID" \
            hydra.run.dir="${RESULTS_DIR}/$RUN_ID" \
            > "$RESULTS_DIR/${RUN_ID}_output.log" 2>&1; then
            echo "✅ Run $i completed successfully at $(date)"
            RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1)
            if [ -f "$RESULT_FILE" ]; then
                echo "📊 Results saved to $RESULT_FILE"
            else
                echo "⚠️ Warning: Result file not found for run $i"
            fi
        else
            echo "❌ Run $i failed at $(date)!"
            # Propagate the failure so the parent can count it via `wait`.
            exit 1
        fi
    ) &
    PIDS+=("$!")

    # Small delay between launches
    sleep 2
done

echo "🎯 All $NUM_RUNS runs have been launched in parallel"
echo "⏳ Waiting for all runs to complete..."

# Wait for every background run individually; a bare `wait` would discard
# their exit statuses.
FAILED_RUNS=0
for pid in "${PIDS[@]}"; do
    wait "$pid" || FAILED_RUNS=$((FAILED_RUNS + 1))
done

echo "=========================================="
echo "🎉 All $NUM_RUNS runs completed!"
echo "=========================================="

# Show progress summary
echo "=========================================="
echo "📊 Progress Summary:"
echo "=========================================="
for i in $(seq 1 "$NUM_RUNS"); do
    echo "--- run_$i ---"
    uv run utils/progress_check/check_finsearchcomp_progress.py "$RESULTS_DIR/run_$i" \
        || echo "⚠️ Warning: Could not summarize run_$i"
done
if [ "$FAILED_RUNS" -gt 0 ]; then
    echo "❌ $FAILED_RUNS of $NUM_RUNS runs failed"
fi

echo "=========================================="
echo "🎯 Multiple runs FinSearchComp evaluation completed!"
echo "📁 Check results in: $RESULTS_DIR"
echo "📝 Check individual run logs: $RESULTS_DIR/run_*_output.log"
echo "=========================================="
echo ""
echo "💡 Usage examples:"
echo " # Default: 3 runs with 1 task each"
echo " ./scripts/run_evaluate_multiple_runs_finsearchcomp.sh"
echo ""
echo " # Full dataset"
echo " MAX_TASKS=null ./scripts/run_evaluate_multiple_runs_finsearchcomp.sh"
echo ""
echo " # Custom parameters"
echo " NUM_RUNS=5 MAX_TASKS=10 MAX_CONCURRENT=3 ./scripts/run_evaluate_multiple_runs_finsearchcomp.sh"
echo ""
echo " # Different agent configuration"
echo " AGENT_SET=agent_gaia-validation ./scripts/run_evaluate_multiple_runs_finsearchcomp.sh"
echo ""
echo " # Limited tasks for testing"
echo " MAX_TASKS=5 ./scripts/run_evaluate_multiple_runs_finsearchcomp.sh"

# Non-zero exit when any run failed, so callers and CI can detect it.
[ "$FAILED_RUNS" -eq 0 ]
--- utils/progress_check/check_finsearchcomp_progress.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/utils/progress_check/check_finsearchcomp_progress.py b/utils/progress_check/check_finsearchcomp_progress.py index a5b26595..2e45787f 100755 --- a/utils/progress_check/check_finsearchcomp_progress.py +++ b/utils/progress_check/check_finsearchcomp_progress.py @@ -48,15 +48,15 @@ def extract_region_from_label(label: str) -> str: label: Label string like "Complex_Historical_Investigation(Global)" or "Financial_Analysis(Greater_China)" Returns: - Region string ("Global", "Greater_China", or "Unknown") + Region string ("Global", "Greater China", or "Unknown") """ if not label: return "Unknown" if "(Global)" in label: return "Global" - elif "(Greater_China)" in label: - return "Greater_China" + elif "(Greater China)" in label: + return "Greater China" else: return "Unknown" @@ -97,7 +97,7 @@ def analyze_finsearchcomp_results(log_folder: str) -> Dict[str, any]: "T2": {"total": 0, "completed": 0, "correct": 0, "incorrect": 0}, "T3": {"total": 0, "completed": 0, "correct": 0, "incorrect": 0} }, - "Greater_China": { + "Greater China": { "T2": {"total": 0, "completed": 0, "correct": 0, "incorrect": 0}, "T3": {"total": 0, "completed": 0, "correct": 0, "incorrect": 0} } @@ -246,7 +246,7 @@ def display_results( print("REGIONAL BREAKDOWN (T2 & T3 TASKS)") print("-" * 70) - for region in ["Global", "Greater_China"]: + for region in ["Global", "Greater China"]: print(f"\n{region} Region:") for task_type in ["T2", "T3"]: breakdown = results["regional_breakdown"][region][task_type] From 72e9bb6f82225f952d8cb204ac0738a9f3e25e7b Mon Sep 17 00:00:00 2001 From: JubSteven <1120395085@qq.com> Date: Thu, 25 Sep 2025 17:16:30 +0800 Subject: [PATCH 12/12] fix: catch ContextLimitError in more observed cases. 
--- src/llm/providers/claude_openrouter_client.py | 2 ++ src/llm/providers/mirothinker_sglang_client.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/src/llm/providers/claude_openrouter_client.py b/src/llm/providers/claude_openrouter_client.py index 4acd114e..fd441252 100644 --- a/src/llm/providers/claude_openrouter_client.py +++ b/src/llm/providers/claude_openrouter_client.py @@ -191,6 +191,8 @@ async def _create_message( or "exceeds the maximum length" in error_str or "exceeds the maximum allowed length" in error_str or "Input tokens exceed the configured limit" in error_str + or "Requested token count exceeds the model's maximum context length" in error_str + or "BadRequestError" in error_str and "context length" in error_str ): logger.debug(f"OpenRouter LLM Context limit exceeded: {error_str}") raise ContextLimitError(f"Context limit exceeded: {error_str}") diff --git a/src/llm/providers/mirothinker_sglang_client.py b/src/llm/providers/mirothinker_sglang_client.py index f02309a7..6008f8ca 100644 --- a/src/llm/providers/mirothinker_sglang_client.py +++ b/src/llm/providers/mirothinker_sglang_client.py @@ -159,6 +159,8 @@ async def _create_message( or "exceeds the maximum length" in error_str or "exceeds the maximum allowed length" in error_str or "Input tokens exceed the configured limit" in error_str + or "Requested token count exceeds the model's maximum context length" in error_str + or "BadRequestError" in error_str and "context length" in error_str ): logger.debug(f"MiroThinker LLM Context limit exceeded: {error_str}") raise ContextLimitError(f"Context limit exceeded: {error_str}")