diff --git a/.gitignore b/.gitignore index aaf57c1e..8cbc94e8 100644 --- a/.gitignore +++ b/.gitignore @@ -208,6 +208,7 @@ marimo/_lsp/ __marimo__/ logs/ +tmp/ data/* !data/README.md diff --git a/config/benchmark/futurex.yaml b/config/benchmark/futurex.yaml new file mode 100644 index 00000000..97109882 --- /dev/null +++ b/config/benchmark/futurex.yaml @@ -0,0 +1,20 @@ +# config/benchmark/futurex.yaml +defaults: + - default + - _self_ + +name: "futurex" + +data: + data_dir: "${data_dir}/futurex" # Path to your dataset + metadata_file: "standardized_data.jsonl" # Metadata filename + whitelist: [] # Optional: List of specific task_ids to run + +execution: + max_tasks: null # null = no limit, or specify a number + max_concurrent: 5 # Number of parallel tasks + pass_at_k: 1 # Number of attempts per task + +# Set to skip evaluation since we don't have ground truth +openai_api_key: "skip_evaluation" + diff --git a/docs/mkdocs/docs/download_datasets.md b/docs/mkdocs/docs/download_datasets.md index 88ef11b7..bd67c2b5 100644 --- a/docs/mkdocs/docs/download_datasets.md +++ b/docs/mkdocs/docs/download_datasets.md @@ -79,6 +79,7 @@ uv run main.py prepare-benchmark get browsecomp-test uv run main.py prepare-benchmark get browsecomp-zh-test uv run main.py prepare-benchmark get hle uv run main.py prepare-benchmark get xbench-ds +uv run main.py prepare-benchmark get futurex ``` ### What This Script Does @@ -94,6 +95,7 @@ uv run main.py prepare-benchmark get xbench-ds - `browsecomp-zh-test` - Chinese BrowseComp test set - `hle` - HLE dataset - `xbench-ds` - xbench-DeepSearch dataset + - `futurex` - Futurex-Online dataset ### Customizing Dataset Selection diff --git a/docs/mkdocs/docs/futurex.md b/docs/mkdocs/docs/futurex.md new file mode 100644 index 00000000..db20f4df --- /dev/null +++ b/docs/mkdocs/docs/futurex.md @@ -0,0 +1,267 @@ +# Futurex-Online + +MiroFlow's evaluation on the Futurex-Online benchmark demonstrates capabilities in future event prediction tasks. 
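Each task resolves to a single boxed answer (for example `\boxed{Yes}` or `\boxed{A}`). As a quick illustration, the final answer can be pulled out of a model response with a small regex; the `extract_boxed_answer` helper below is a sketch for this page, not part of the MiroFlow codebase.

```python
import re
from typing import Optional


def extract_boxed_answer(response: str) -> Optional[str]:
    """Return the contents of the last \\boxed{...} in a model response."""
    matches = re.findall(r"\\boxed\{([^{}]*)\}", response)
    return matches[-1] if matches else None


print(extract_boxed_answer("Reasoning...\nFinal answer: \\boxed{Yes}"))  # Yes
```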
+ +--- + +## Dataset Overview + +!!! info "Futurex-Online Dataset" + The Futurex-Online dataset consists of 61 prediction tasks covering various future events including: + + - Political events (referendums, elections) + - Sports outcomes (football matches) + - Legal proceedings + - Economic indicators + + +!!! abstract "Key Dataset Characteristics" + + - **Total Tasks**: 61 + - **Task Type**: Future event prediction + - **Answer Format**: Boxed answers (\\boxed{Yes/No} or \\boxed{A/B/C}) + - **Ground Truth**: Not available (prediction tasks) + - **Resolution Date**: Around 2025-09-21 (GMT+8) + +--- + +## Quick Start Guide + +!!! note "Quick Start Instructions" + This section provides step-by-step instructions to run the Futurex-Online benchmark and prepare submission results. Since this is a prediction dataset without ground truth, we focus on execution traces and response generation. **Note**: This is a quick start guide for running the benchmark, not for reproducing exact submitted results. + +### Step 1: Prepare the Futurex-Online Dataset + +!!! tip "Dataset Setup" + Use the integrated prepare-benchmark command to download and process the dataset: + +```bash title="Download Futurex-Online Dataset" +uv run main.py prepare-benchmark get futurex +``` + +This will create the standardized dataset at `data/futurex/standardized_data.jsonl`. + +### Step 2: Configure API Keys + +!!! warning "API Key Configuration" + Set up the required API keys for model access and tool functionality. 
Update the `.env` file to include the following keys: + +```env title=".env Configuration" +# For searching and web scraping +SERPER_API_KEY="xxx" +JINA_API_KEY="xxx" + +# For Linux sandbox (code execution environment) +E2B_API_KEY="xxx" + +# We use Claude-3.7-Sonnet with OpenRouter backend to initialize the LLM +OPENROUTER_API_KEY="xxx" +OPENROUTER_BASE_URL="https://openrouter.ai/api/v1" + +# Used for Claude vision understanding +ANTHROPIC_API_KEY="xxx" + +# Used for Gemini vision +GEMINI_API_KEY="xxx" + +# Use for llm judge, reasoning, o3 hints, etc. +OPENAI_API_KEY="xxx" +OPENAI_BASE_URL="https://api.openai.com/v1" +``` + +### Step 3: Run the Evaluation + +!!! example "Evaluation Execution" + Execute the following command to run evaluation on the Futurex-Online dataset. This uses the basic `agent_quickstart_1` configuration for quick start purposes. + +```bash title="Run Futurex-Online Evaluation" +uv run main.py common-benchmark --config_file_name=agent_quickstart_1 benchmark=futurex output_dir="logs/futurex/$(date +"%Y%m%d_%H%M")" +``` + +!!! tip "Progress Monitoring and Resume" + To check the progress while running: + + ```bash title="Check Progress" + uv run utils/progress_check/check_futurex_progress.py $PATH_TO_LOG + ``` + + If you need to resume an interrupted evaluation, specify the same output directory to continue from where you left off. + + ```bash title="Resume Evaluation, e.g." + uv run main.py common-benchmark --config_file_name=agent_quickstart_1 benchmark=futurex output_dir="logs/futurex/20250918_1010" + ``` + +### Step 4: Extract Results + +!!! 
example "Result Extraction" + After evaluation completion, extract the results using the provided utility: + +```bash title="Extract Results" +uv run utils/extract_futurex_results.py logs/futurex/$(date +"%Y%m%d_%H%M") +``` + +This will generate: + +- `futurex_results.json`: Detailed results for each task +- `futurex_summary.json`: Summary statistics +- `futurex_predictions.csv`: Predictions in CSV format + +--- + +## Sample Task Examples + +### Political Prediction +``` +Task: "Will the 2025 Guinea referendum pass? (resolved around 2025-09-21 (GMT+8))" +Expected Format: \boxed{Yes} or \boxed{No} +``` + +### Sports Prediction +``` +Task: "Brighton vs. Tottenham (resolved around 2025-09-21 (GMT+8)) +A. Brighton win on 2025-09-20 +B. Brighton vs. Tottenham end in a draw +C. Tottenham win on 2025-09-20" +Expected Format: \boxed{A}, \boxed{B}, or \boxed{C} +``` + +--- + +## Multiple Runs and Voting + +!!! tip "Improving Prediction Accuracy" + For better prediction accuracy, you can run multiple evaluations and use voting mechanisms to aggregate results. This approach helps reduce randomness and improve the reliability of predictions. **Note**: This is a quick start approach; production submissions may use more sophisticated configurations. + +### Step 1: Run Multiple Evaluations + +Use the multiple runs script to execute several independent evaluations: + +```bash title="Run Multiple Evaluations" +./scripts/run_evaluate_multiple_runs_futurex.sh +``` + +This script will: + +- Run 3 independent evaluations by default (configurable with `NUM_RUNS`) +- Execute all tasks in parallel for efficiency +- Generate separate result files for each run in `run_1/`, `run_2/`, etc. 
+- Create a consolidated `futurex_submission.jsonl` file with voting results + +### Step 2: Customize Multiple Runs + +You can customize the evaluation parameters: + +```bash title="Custom Multiple Runs" +# Run 5 evaluations with limited tasks for testing +NUM_RUNS=5 MAX_TASKS=10 ./scripts/run_evaluate_multiple_runs_futurex.sh + +# Use different agent configuration +AGENT_SET=agent_gaia-validation ./scripts/run_evaluate_multiple_runs_futurex.sh + +# Adjust concurrency for resource management +MAX_CONCURRENT=3 ./scripts/run_evaluate_multiple_runs_futurex.sh +``` + +### Step 3: Voting and Aggregation + +After multiple runs, the system automatically: + +1. **Extracts predictions** from all runs using `utils/extract_futurex_results.py` +2. **Applies majority voting** to aggregate predictions across runs +3. **Generates submission file** in the format required by FutureX platform +4. **Provides voting statistics** showing prediction distribution across runs + +The voting process works as follows: + +- **Majority Vote**: Most common prediction across all runs wins +- **Tie-breaking**: If tied, chooses the prediction that appeared earliest across all runs +- **Vote Counts**: Tracks how many runs predicted each option +- **Confidence Indicators**: High agreement indicates more reliable predictions + +### Step 4: Analyze Voting Results + +Check the generated files for voting analysis: + +```bash title="Check Voting Results" +# View submission file with voting results +cat logs/futurex/agent_quickstart_1_*/futurex_submission.jsonl + +# Check individual run results +ls logs/futurex/agent_quickstart_1_*/run_*/ + +# Check progress and voting statistics +uv run python utils/progress_check/check_futurex_progress.py logs/futurex/agent_quickstart_1_* +``` + +### Manual Voting Aggregation + +You can also manually run the voting aggregation: + +```bash title="Manual Voting Aggregation" +# Aggregate multiple runs with majority voting +uv run python utils/extract_futurex_results.py 
logs/futurex/agent_quickstart_1_* --aggregate + +# Force single run mode (if needed) +uv run python utils/extract_futurex_results.py logs/futurex/agent_quickstart_1_*/run_1 --single + +# Specify custom output file +uv run python utils/extract_futurex_results.py logs/futurex/agent_quickstart_1_* -o my_voted_predictions.jsonl +``` + +### Voting Output Format + +The voting aggregation generates a submission file with the following format: + +```json +{"id": "687104310a994c0060ef87a9", "prediction": "No", "vote_counts": {"No": 2}} +{"id": "68a9b46e961bd3003c8f006b", "prediction": "Yes", "vote_counts": {"Yes": 2}} +``` + +The output includes: + +- **`id`**: Task identifier +- **`prediction`**: Final voted prediction (without `\boxed{}` wrapper) +- **`vote_counts`**: Dictionary showing how many runs predicted each option + +For example, `"vote_counts": {"No": 2}` means 2 out of 2 runs predicted "No", indicating high confidence. + +--- + +## Evaluation Notes + +!!! warning "No Ground Truth Available" + Since Futurex-Online is a prediction dataset, there are no ground truth answers available for evaluation. The focus is on: + + - Response generation quality + - Reasoning process documentation + - Prediction confidence and methodology + +!!! info "Output Analysis" + The evaluation generates detailed execution traces showing: + + - Research process for each prediction + - Information gathering from web sources + - Reasoning chains leading to predictions + - Final boxed answers in required format + +### Directory Structure + +After running multiple evaluations, you'll find the following structure: + +``` +logs/futurex/agent_quickstart_1_YYYYMMDD_HHMM/ +├── futurex_submission.jsonl # Final voted predictions +├── run_1/ # First run results +│ ├── benchmark_results.jsonl # Individual task results +│ ├── benchmark_results_pass_at_1_accuracy.txt +│ └── task_*_attempt_1.json # Detailed execution traces +├── run_2/ # Second run results +│ └── ... 
(same structure as run_1) +├── run_1_output.log # Run 1 execution log +└── run_2_output.log # Run 2 execution log +``` + +--- + +!!! info "Documentation Info" + **Last Updated:** September 2025 · **Doc Contributor:** Team @ MiroMind AI diff --git a/docs/mkdocs/mkdocs.yml b/docs/mkdocs/mkdocs.yml index bf17b63b..808282f7 100644 --- a/docs/mkdocs/mkdocs.yml +++ b/docs/mkdocs/mkdocs.yml @@ -52,6 +52,7 @@ nav: - Benchmarks: - GAIA-Validation: gaia_validation.md - GAIA-Test: gaia_test.md + - FutureX: futurex.md - Add New Benchmarks: contribute_benchmarks.md - Tools: diff --git a/scripts/run_evaluate_multiple_runs_futurex.sh b/scripts/run_evaluate_multiple_runs_futurex.sh new file mode 100755 index 00000000..c441696d --- /dev/null +++ b/scripts/run_evaluate_multiple_runs_futurex.sh @@ -0,0 +1,121 @@ +#!/bin/bash + +# SPDX-FileCopyrightText: 2025 MiromindAI +# +# SPDX-License-Identifier: Apache-2.0 + +# Multiple runs FutureX evaluation script +# Based on the working command: uv run main.py common-benchmark --config_file_name=agent_quickstart_1 benchmark=futurex output_dir=logs/futurex-test + +# Configuration parameters +NUM_RUNS=${NUM_RUNS:-3} +MAX_TASKS=${MAX_TASKS:-null} +MAX_CONCURRENT=${MAX_CONCURRENT:-5} +BENCHMARK_NAME="futurex" +AGENT_SET=${AGENT_SET:-"agent_quickstart_1"} + +# TODO: Add more settings like message ID and max turns, currently not supported using agent_quickstart_1 +# ADD_MESSAGE_ID=${ADD_MESSAGE_ID:-"false"} +# MAX_TURNS=${MAX_TURNS:-1} + +# Set results directory with timestamp +TIMESTAMP=$(date +%Y%m%d_%H%M) +RESULTS_DIR="logs/${BENCHMARK_NAME}/${AGENT_SET}_${TIMESTAMP}" + +export LOGGER_LEVEL="INFO" + +echo "🚀 Starting $NUM_RUNS runs of FutureX evaluation..." 
+echo "📊 Using max_tasks: $MAX_TASKS (set MAX_TASKS=null for full dataset)" +echo "📊 Using max_concurrent: $MAX_CONCURRENT" +echo "📁 Results will be saved in: $RESULTS_DIR" + +# Create results directory +mkdir -p "$RESULTS_DIR" + +# Launch all parallel tasks +for i in $(seq 1 $NUM_RUNS); do + echo "==========================================" + echo "🚀 Launching experiment $i/$NUM_RUNS" + echo "📝 Output log: $RESULTS_DIR/run_${i}_output.log" + echo "==========================================" + + # Set specific identifier for this run + RUN_ID="run_$i" + + # Run experiment (background execution) + ( + echo "Starting run $i at $(date)" + uv run main.py common-benchmark \ + --config_file_name=$AGENT_SET \ + benchmark=$BENCHMARK_NAME \ + benchmark.execution.max_tasks=$MAX_TASKS \ + benchmark.execution.max_concurrent=$MAX_CONCURRENT \ + benchmark.execution.pass_at_k=1 \ + output_dir=${RESULTS_DIR}/$RUN_ID \ + hydra.run.dir=${RESULTS_DIR}/$RUN_ID \ + > "$RESULTS_DIR/${RUN_ID}_output.log" 2>&1 + + # Check if run was successful + if [ $? -eq 0 ]; then + echo "✅ Run $i completed successfully at $(date)" + RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1) + if [ -f "$RESULT_FILE" ]; then + echo "📊 Results saved to $RESULT_FILE" + else + echo "⚠️ Warning: Result file not found for run $i" + fi + else + echo "❌ Run $i failed at $(date)!" + fi + ) & + + # Small delay between launches + sleep 2 +done + +echo "🎯 All $NUM_RUNS runs have been launched in parallel" +echo "⏳ Waiting for all runs to complete..." + +# Wait for all background tasks to complete +wait + +echo "==========================================" +echo "🎉 All $NUM_RUNS runs completed!" +echo "==========================================" + +# Extract predictions and format for FutureX submission +echo "📤 Extracting predictions and formatting for FutureX submission..." 
+uv run python utils/extract_futurex_results.py "$RESULTS_DIR" + +# Check status and provide user-friendly message +if [ $? -eq 0 ]; then + echo "✅ Submission file generated: $RESULTS_DIR/futurex_submission.jsonl" + echo "📋 You can now upload this file to the FutureX test server." +else + echo "❌ Failed to generate submission file. Please check the logs for details." +fi + +# Show progress summary +echo "==========================================" +echo "📊 Progress Summary:" +echo "==========================================" + +echo "==========================================" +echo "🎯 Multiple runs FutureX evaluation completed!" +echo "📁 Check results in: $RESULTS_DIR" +echo "📝 Check individual run logs: $RESULTS_DIR/run_*_output.log" +echo "📤 Check submission file: $RESULTS_DIR/futurex_submission.jsonl" +echo "==========================================" +echo "" +echo "💡 Usage examples:" +echo " # Default: 3 runs with full dataset" +echo " ./scripts/run_evaluate_multiple_runs_futurex.sh" +echo "" +echo " # Custom parameters" +echo " NUM_RUNS=5 MAX_TASKS=10 MAX_CONCURRENT=3 ./scripts/run_evaluate_multiple_runs_futurex.sh" +echo "" +echo " # Different agent configuration" +echo " AGENT_SET=agent_gaia-validation ./scripts/run_evaluate_multiple_runs_futurex.sh" +echo "" +echo " # Limited tasks for testing" +echo " MAX_TASKS=5 ./scripts/run_evaluate_multiple_runs_futurex.sh" \ No newline at end of file diff --git a/scripts/run_prepare_benchmark.sh b/scripts/run_prepare_benchmark.sh index a00f7a1a..7574ed3e 100644 --- a/scripts/run_prepare_benchmark.sh +++ b/scripts/run_prepare_benchmark.sh @@ -20,4 +20,5 @@ uv run main.py prepare-benchmark get webwalkerqa uv run main.py prepare-benchmark get browsecomp-test uv run main.py prepare-benchmark get browsecomp-zh-test uv run main.py prepare-benchmark get hle -uv run main.py prepare-benchmark get xbench-ds \ No newline at end of file +uv run main.py prepare-benchmark get xbench-ds +uv run main.py prepare-benchmark get 
futurex \ No newline at end of file diff --git a/utils/extract_futurex_results.py b/utils/extract_futurex_results.py new file mode 100644 index 00000000..415b720c --- /dev/null +++ b/utils/extract_futurex_results.py @@ -0,0 +1,334 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +FutureX Results Extractor and Aggregator + +This script extracts predictions from MiroFlow benchmark results and can aggregate +multiple runs using majority voting to create FutureX submission files. + +Features: +1. Extract predictions from single benchmark results +2. Aggregate multiple runs with majority voting +3. Generate FutureX-compatible submission files +4. Support both single-run and multi-run scenarios + +Usage: + # Extract from single run + python extract_futurex_results.py logs/futurex-online-test + + # Aggregate multiple runs (if run_* subdirectories exist) + python extract_futurex_results.py logs/futurex-online-multi-runs + + # Specify output file + python extract_futurex_results.py logs/futurex-online-test -o my_submission.jsonl +""" + +import argparse +import json +import os +from collections import Counter, defaultdict +from typing import Dict, List, Tuple + + +def majority_vote( + preds: List[str], first_seen_order: Dict[str, int] +) -> Tuple[str, Dict[str, int]]: + """ + Compute the majority-vote prediction for a list of candidate predictions. + + Tie-breaking rules (deterministic): + 1) Highest frequency wins. + 2) If there is a tie on frequency, choose the candidate that appeared earliest + across all runs (based on the provided first_seen_order index). + 3) As a final guard (shouldn't be needed if first_seen_order is complete), + fall back to lexicographic order. 
+ + Returns: + (chosen_prediction, counts_dict) + """ + counter = Counter(preds) + # Get the max vote count + max_count = max(counter.values()) + # All candidates that share the max vote count + tied = [c for c, cnt in counter.items() if cnt == max_count] + + if len(tied) == 1: + chosen = tied[0] + else: + # Prefer the one seen earliest globally + tied.sort(key=lambda x: (first_seen_order.get(x, float("inf")), x)) + chosen = tied[0] + + # Expose counts for optional debugging/inspection + return chosen, dict(counter) + + +def discover_runs(results_dir: str) -> List[str]: + """ + Discover subdirectories inside results_dir that potentially contain a + 'benchmark_results.jsonl'. We don't strictly require the subdir name to + start with 'run_', but we sort the list to keep processing deterministic. + """ + runs = [] + for name in sorted(os.listdir(results_dir)): + path = os.path.join(results_dir, name) + if os.path.isdir(path): + fpath = os.path.join(path, "benchmark_results.jsonl") + if os.path.isfile(fpath): + runs.append(path) + return runs + + +def extract_predictions_from_file(file_path: str) -> Dict[str, str]: + """ + Extract predictions from a single benchmark_results.jsonl file. 
+ + Args: + file_path: Path to benchmark_results.jsonl file + + Returns: + Dictionary mapping task_id to prediction + """ + predictions = {} + + with open(file_path, "r", encoding="utf-8") as fin: + for line_num, line in enumerate(fin, 1): + line = line.strip() + if not line: + continue + + try: + rec = json.loads(line) + except json.JSONDecodeError as e: + print(f"Warning: Skipping malformed JSON at line {line_num} in {file_path}: {e}") + continue + + task_id = rec.get("task_id") + pred = rec.get("model_boxed_answer") + + # Only accept non-empty strings; coerce to str for safety + if task_id and pred is not None and str(pred).strip(): + pred_str = str(pred).strip() + predictions[task_id] = pred_str + + return predictions + + +def aggregate_multiple_runs(results_dir: str) -> Tuple[Dict[str, List[str]], Dict[str, int]]: + """ + Aggregate predictions from multiple runs in subdirectories. + + Args: + results_dir: Directory containing run_* subdirectories + + Returns: + Tuple of (predictions_by_task, first_seen_order) + """ + # Maps task_id -> list of predictions collected across runs + preds_by_task: Dict[str, List[str]] = defaultdict(list) + + # Track first-seen order index for each distinct prediction string across all runs. + # This enables deterministic tie-breaking. 
+ first_seen_order: Dict[str, int] = {} + next_order_idx = 0 + + runs = discover_runs(results_dir) + if not runs: + raise FileNotFoundError( + f"No run directories with 'benchmark_results.jsonl' found under: {results_dir}" + ) + + total_lines = 0 + used_lines = 0 + + # Read and aggregate predictions + for run_dir in runs: + fpath = os.path.join(run_dir, "benchmark_results.jsonl") + print(f"Reading: {fpath}") + + with open(fpath, "r", encoding="utf-8") as fin: + for line in fin: + total_lines += 1 + line = line.strip() + if not line: + continue + try: + rec = json.loads(line) + except json.JSONDecodeError: + # Skip malformed JSON lines, but keep going + continue + + task_id = rec.get("task_id") + pred = rec.get("model_boxed_answer") + + # Only accept non-empty strings; coerce to str for safety + if task_id and pred is not None and str(pred).strip(): + pred_str = str(pred).strip() + preds_by_task[task_id].append(pred_str) + if pred_str not in first_seen_order: + first_seen_order[pred_str] = next_order_idx + next_order_idx += 1 + used_lines += 1 + + print(f"Collected from {len(runs)} run(s).") + print(f"Read {total_lines} line(s), accepted {used_lines} record(s).") + + return preds_by_task, first_seen_order + + +def process_single_run(results_dir: str) -> Dict[str, str]: + """ + Process a single run (direct benchmark_results.jsonl file). 
+ + Args: + results_dir: Directory containing benchmark_results.jsonl + + Returns: + Dictionary mapping task_id to prediction + """ + file_path = os.path.join(results_dir, "benchmark_results.jsonl") + + if not os.path.isfile(file_path): + raise FileNotFoundError(f"benchmark_results.jsonl not found in: {results_dir}") + + print(f"Reading single run: {file_path}") + predictions = extract_predictions_from_file(file_path) + print(f"Extracted {len(predictions)} predictions from single run.") + + return predictions + + +def write_submission_file( + predictions: Dict[str, str], + output_file: str, + is_aggregated: bool = False, + vote_counts: Dict[str, Dict[str, int]] = None +) -> None: + """ + Write predictions to FutureX submission format. + + Args: + predictions: Dictionary mapping task_id to prediction + output_file: Output file path + is_aggregated: Whether this is from aggregated runs + vote_counts: Vote counts for each task (only for aggregated runs) + """ + num_tasks = 0 + with open(output_file, "w", encoding="utf-8") as out: + for task_id in sorted(predictions.keys()): + prediction = predictions[task_id] + + # Create submission record + record = {"id": task_id, "prediction": prediction} + + # Add vote information for aggregated runs + if is_aggregated and vote_counts and task_id in vote_counts: + record["vote_counts"] = vote_counts[task_id] + + out.write(json.dumps(record, ensure_ascii=False) + "\n") + num_tasks += 1 + + print(f"✅ Submission saved to {output_file}") + if is_aggregated: + print(f"Aggregated {num_tasks} unique task_id(s) from multiple runs.") + else: + print(f"Extracted {num_tasks} predictions from single run.") + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Extract predictions from MiroFlow benchmark results and create FutureX submission files. " + "Supports both single runs and multi-run aggregation with majority voting." 
+ ) + parser.add_argument( + "results_dir", + help="Path to results dir containing benchmark_results.jsonl or run_*/benchmark_results.jsonl", + ) + parser.add_argument( + "-o", + "--output", + default=None, + help="Output JSONL file path (default: /futurex_submission.jsonl)", + ) + parser.add_argument( + "--aggregate", + action="store_true", + help="Force aggregation mode (look for run_* subdirectories)", + ) + parser.add_argument( + "--single", + action="store_true", + help="Force single run mode (look for direct benchmark_results.jsonl)", + ) + return parser.parse_args() + + +def main() -> None: + args = parse_args() + + results_dir = os.path.abspath(args.results_dir) + if not os.path.isdir(results_dir): + raise FileNotFoundError(f"Results dir not found: {results_dir}") + + output_file = ( + os.path.abspath(args.output) + if args.output + else os.path.join(results_dir, "futurex_submission.jsonl") + ) + + # Determine processing mode + runs = discover_runs(results_dir) + single_file = os.path.join(results_dir, "benchmark_results.jsonl") + + if args.aggregate: + if not runs: + raise FileNotFoundError( + f"No run directories found for aggregation in: {results_dir}" + ) + mode = "aggregate" + elif args.single: + if not os.path.isfile(single_file): + raise FileNotFoundError( + f"benchmark_results.jsonl not found for single run in: {results_dir}" + ) + mode = "single" + else: + # Auto-detect mode + if runs and os.path.isfile(single_file): + print("Both single run and multiple runs detected. 
Using aggregation mode.") + print("Use --single to force single run mode.") + mode = "aggregate" + elif runs: + mode = "aggregate" + elif os.path.isfile(single_file): + mode = "single" + else: + raise FileNotFoundError( + f"No benchmark_results.jsonl files found in: {results_dir}" + ) + + print(f"Processing mode: {mode}") + + if mode == "aggregate": + # Multi-run aggregation with majority voting + preds_by_task, first_seen_order = aggregate_multiple_runs(results_dir) + + # Apply majority voting + final_predictions = {} + vote_counts = {} + + for task_id in preds_by_task: + voted_pred, counts = majority_vote(preds_by_task[task_id], first_seen_order) + final_predictions[task_id] = voted_pred + vote_counts[task_id] = counts + + write_submission_file(final_predictions, output_file, is_aggregated=True, vote_counts=vote_counts) + + else: + # Single run extraction + predictions = process_single_run(results_dir) + write_submission_file(predictions, output_file, is_aggregated=False) + + +if __name__ == "__main__": + main() diff --git a/utils/prepare_benchmark/gen_futurex.py b/utils/prepare_benchmark/gen_futurex.py new file mode 100644 index 00000000..0e2690d6 --- /dev/null +++ b/utils/prepare_benchmark/gen_futurex.py @@ -0,0 +1,55 @@ +# SPDX-FileCopyrightText: 2025 MiromindAI +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Generator, MutableMapping + +from datasets import load_dataset + +from utils.prepare_benchmark.common import Task + + +def gen_futurex(hf_token: str) -> Generator[Task, None, None]: + """ + Generate Futurex-Online dataset tasks in MiroFlow format + + Args: + hf_token: Hugging Face token for dataset access + + Yields: + Task: Standardized task objects + """ + # Load the Futurex-Online dataset + dataset = load_dataset("futurex-ai/Futurex-Online") + + # Process each split in the dataset + for split_name, split_data in dataset.items(): + for idx, sample in enumerate(split_data): + # Extract task information + task_id = sample.get("id", 
f"futurex_{split_name}_{idx}") + task_question = sample.get("prompt", "") + end_time = sample.get("end_time", "") + level = sample.get("level", "") + + # Create metadata dictionary + metadata: MutableMapping = { + "level": level, + "end_time": end_time, + "source": "futurex-ai/Futurex-Online", + "split": split_name, + "original_id": sample.get("id", ""), + "dataset_name": "Futurex-Online" + } + + # Create standardized Task object + task = Task( + task_id=task_id, + task_question=task_question, + ground_truth="", # Futurex-Online doesn't have ground truth + file_path=None, # No file attachments + metadata=metadata, + ) + + yield task + + return diff --git a/utils/prepare_benchmark/main.py b/utils/prepare_benchmark/main.py index 9712255a..2233bfdc 100644 --- a/utils/prepare_benchmark/main.py +++ b/utils/prepare_benchmark/main.py @@ -18,6 +18,7 @@ from utils.prepare_benchmark.gen_hle import gen_hle_test from utils.prepare_benchmark.gen_webwalkerqa import gen_webwalkerqa from utils.prepare_benchmark.gen_xbench_ds import gen_xbench_ds +from utils.prepare_benchmark.gen_futurex import gen_futurex @dataclasses.dataclass @@ -31,6 +32,7 @@ class _Env: "browsecomp-zh-test", "hle", "xbench-ds", + "futurex", ) meta_filename = "standardized_data.jsonl" data_dir: pathlib.Path @@ -108,6 +110,13 @@ def gen(): for x in gen_xbench_ds(env.hf_token): yield x + return gen + case "futurex": + + def gen(): + for x in gen_futurex(env.hf_token): + yield x + return gen case _: raise ValueError("not supported") diff --git a/utils/progress_check/check_futurex_progress.py b/utils/progress_check/check_futurex_progress.py new file mode 100644 index 00000000..d8783174 --- /dev/null +++ b/utils/progress_check/check_futurex_progress.py @@ -0,0 +1,218 @@ +#!/usr/bin/env python3 +""" +Futurex-Online Progress Checker + +This script analyzes Futurex-Online benchmark results in a log folder to count: +- Total files processed +- Files with status "completed" +- Files with predictions (final_boxed_answer) 
- Files with errors

Usage:
    python check_futurex_progress.py [LOG_FOLDER_PATH]

If no path is provided, uses the current directory.
"""

import json
import sys
from pathlib import Path
from typing import Dict, List, Tuple


def analyze_futurex_results(
    log_folder: str,
) -> Tuple[Dict[str, int], List, List, List, List, List, List]:
    """
    Analyze Futurex-Online benchmark results from JSON log files.

    Args:
        log_folder: Path to folder containing task_*.json files

    Returns:
        Tuple of (counts dict, completed, running, failed, prediction,
        error, and parse-error file lists)
    """
    log_path = Path(log_folder)

    if not log_path.exists():
        raise FileNotFoundError(f"Log folder not found: {log_folder}")

    # Find all task JSON files
    json_files = list(log_path.glob("task_*_attempt_*.json"))

    results = {
        "total_files": 0,
        "completed_status": 0,
        "running_status": 0,
        "failed_status": 0,
        "with_predictions": 0,
        "without_predictions": 0,
        "with_errors": 0,
        "parse_errors": 0,
    }

    completed_files = []
    running_files = []
    failed_files = []
    prediction_files = []
    error_files = []
    parse_error_files = []

    print(f"Scanning {len(json_files)} files in {log_folder}...")

    for json_file in json_files:
        results["total_files"] += 1

        try:
            with open(json_file, "r", encoding="utf-8") as f:
                data = json.load(f)

            status = data.get("status", "").lower()
            final_answer = data.get("final_boxed_answer", "")
            error_msg = data.get("error", "")

            # Count by status
            if status == "completed":
                results["completed_status"] += 1
                completed_files.append(json_file.name)
            elif status == "running":
                results["running_status"] += 1
                running_files.append(json_file.name)
            elif status in ["failed", "error"]:
                results["failed_status"] += 1
                failed_files.append(json_file.name)
            else:
                # Unknown status: count as failed but record the reason
                results["failed_status"] += 1
                failed_files.append(f"{json_file.name} (unknown status: {status})")

            # Count by prediction availability
            if final_answer and final_answer.strip(): 
results["with_predictions"] += 1
                prediction_files.append((json_file.name, final_answer[:100] + "..." if len(final_answer) > 100 else final_answer))
            else:
                results["without_predictions"] += 1

            # Count by error presence
            if error_msg and error_msg.strip():
                results["with_errors"] += 1
                error_files.append((json_file.name, error_msg))

        except (json.JSONDecodeError, KeyError, FileNotFoundError) as e:
            results["parse_errors"] += 1
            parse_error_files.append((json_file.name, str(e)))
            print(f"Error parsing {json_file.name}: {e}")

    return (
        results,
        completed_files,
        running_files,
        failed_files,
        prediction_files,
        error_files,
        parse_error_files,
    )


def display_results(
    results: Dict[str, int],
    completed_files: List[str],
    running_files: List[str],
    failed_files: List[str],
    prediction_files: List[Tuple[str, str]],
    error_files: List[Tuple[str, str]],
    parse_error_files: List[Tuple[str, str]],
) -> None:
    """Display the analysis results in a formatted way."""

    print("\n" + "=" * 60)
    print("FUTUREX-ONLINE BENCHMARK RESULTS SUMMARY")
    print("=" * 60)

    total = results["total_files"]
    completed = results["completed_status"]
    running = results["running_status"]
    failed = results["failed_status"]
    with_predictions = results["with_predictions"]
    with_errors = results["with_errors"]

    # Guard against division by zero when no task files were found
    if total == 0:
        print("No task files found; nothing to report.")
        return

    print(f"Total files processed: {total:3d}")
    print(f"Files with status 'completed': {completed:3d} ({completed/total*100:.1f}%)")
    print(f"Files with status 'running': {running:3d} ({running/total*100:.1f}%)")
    print(f"Files with status 'failed': {failed:3d} ({failed/total*100:.1f}%)")
    print(f"Files with predictions: {with_predictions:3d} ({with_predictions/total*100:.1f}%)")
    print(f"Files with errors: {with_errors:3d} ({with_errors/total*100:.1f}%)")
    print(f"Files with parse errors: {results['parse_errors']:3d}")

    if completed > 0:
        prediction_rate = with_predictions / completed * 100
        print(f"\nPrediction rate 
(predictions/completed): {prediction_rate:.1f}%")

    print("\n" + "-" * 60)
    print(f"SUMMARY: {completed} tasks completed, {with_predictions} with predictions")
    print("-" * 60)

    # Show some example files for verification
    if completed_files:
        print("\nFirst 5 completed files:")
        for i, filename in enumerate(completed_files[:5], 1):
            print(f" {i}. {filename}")
        if len(completed_files) > 5:
            print(f" ... and {len(completed_files) - 5} more")

    if running_files:
        print("\nFirst 5 running files:")
        for i, filename in enumerate(running_files[:5], 1):
            print(f" {i}. {filename}")
        if len(running_files) > 5:
            print(f" ... and {len(running_files) - 5} more")

    if prediction_files:
        print("\nFirst 5 files with predictions:")
        for i, (filename, prediction) in enumerate(prediction_files[:5], 1):
            print(f" {i}. {filename}")
            print(f"    Prediction: {prediction}")
        if len(prediction_files) > 5:
            print(f" ... and {len(prediction_files) - 5} more")

    if error_files:
        print("\nFiles with errors:")
        for filename, error in error_files[:5]:
            print(f" - {filename}: {error[:100]}...")
        if len(error_files) > 5:
            print(f" ... and {len(error_files) - 5} more")

    if parse_error_files:
        print("\nFiles with parse errors:")
        for filename, error in parse_error_files:
            print(f" - {filename}: {error}")


def main():
    """Main function to run the analysis."""

    # Check if folder path was provided as command line argument
    if len(sys.argv) > 1:
        log_folder = sys.argv[1]
        print(f"Using provided folder path: {log_folder}")
    else:
        log_folder = "." 
+ print(f"No folder path provided, using current directory: {log_folder}") + + try: + print(f"Analyzing Futurex-Online benchmark results in: {log_folder}") + results, completed_files, running_files, failed_files, prediction_files, error_files, parse_error_files = analyze_futurex_results( + log_folder + ) + display_results(results, completed_files, running_files, failed_files, prediction_files, error_files, parse_error_files) + + except Exception as e: + print(f"Error: {e}") + print(f"\nUsage: python {sys.argv[0]} [LOG_FOLDER_PATH]") + print(f"Example: python {sys.argv[0]} logs/futurex-online-test") + return 1 + + return 0 + + +if __name__ == "__main__": + exit(main())