diff --git a/bench/comprehensive_bench.sh b/bench/comprehensive_bench.sh index 5054d537..761f78b3 100755 --- a/bench/comprehensive_bench.sh +++ b/bench/comprehensive_bench.sh @@ -12,6 +12,7 @@ ROUTER_ENDPOINT="http://127.0.0.1:8801/v1" VLLM_ENDPOINT="http://127.0.0.1:8000/v1" VLLM_MODEL="" # Will be auto-detected from endpoint if not specified ROUTER_MODEL="auto" +CONCURRENT_REQUESTS=8 OUTPUT_BASE="results/comprehensive_research_$(date +%Y%m%d_%H%M%S)" # Parse command line arguments @@ -105,12 +106,28 @@ PERSISTENT_RESEARCH_CSV="results/research_results_master.csv" # Dataset configurations (dataset_name:samples_per_category) # Balanced for statistical significance vs runtime declare -A DATASET_CONFIGS=( - ["mmlu"]=10 # 57 subjects × 10 = 570 samples - ["arc"]=15 # 1 category × 15 = 15 samples - ["gpqa"]=20 # 1 category × 20 = 20 samples - ["truthfulqa"]=15 # 1 category × 15 = 15 samples - ["commonsenseqa"]=20 # 1 category × 20 = 20 samples - ["hellaswag"]=8 # ~50 activities × 8 = ~400 samples + # Core proven datasets + ["gpqa"]=20 # 1 category × 20 = 20 samples - OUTSTANDING reasoning differentiation + ["mmlu"]=10 # 57 subjects × 10 = 570 samples - EXCELLENT reasoning differentiation + ["truthfulqa"]=15 # Truthfulness evaluation - some reasoning differentiation (60% → 73.3%) + + # Mathematical reasoning datasets + # ["math"]=15 # Competition mathematics - DISABLED: Dataset not available on HF Hub + ["gsm8k"]=25 # Elementary math word problems - EXPECTED good reasoning differentiation + ["aqua-rat"]=20 # Algebraic word problems with rationales - EXPECTED good differentiation + + # Multi-step reasoning datasets + ["drop"]=20 # Reading comprehension with discrete reasoning - EXPECTED excellent differentiation + ["strategyqa"]=20 # Multi-step implicit reasoning - EXPECTED good differentiation + + # Scientific reasoning datasets + ["sciq"]=25 # Science questions requiring reasoning - EXPECTED moderate differentiation + ["openbookqa"]=20 # Elementary science with fact reasoning - EXPECTED moderate differentiation + + # Disabled datasets with poor reasoning differentiation: + # ["arc-challenge"]=15 # 100% accuracy across all modes, minimal benefit + # ["commonsenseqa"]=20 # Same accuracy across modes, small token difference + # ["hellaswag"]=2 # Minimal differentiation, not reasoning-focused ) echo -e "${BLUE}🔬 COMPREHENSIVE MULTI-DATASET BENCHMARK FOR RESEARCH${NC}" @@ -136,14 +153,17 @@ source "$VENV_PATH/bin/activate" mkdir -p "$OUTPUT_BASE" mkdir -p "$(dirname "$PERSISTENT_RESEARCH_CSV")" -# Initialize persistent research results CSV (create header only if file doesn't exist) -if [[ ! 
-f "$PERSISTENT_RESEARCH_CSV" ]]; then - echo "Dataset,Mode,Model,Accuracy,Avg_Latency_ms,Avg_Total_Tokens,Sample_Count,Timestamp" > "$PERSISTENT_RESEARCH_CSV" - echo -e "${GREEN}📊 Created new master research CSV: $PERSISTENT_RESEARCH_CSV${NC}" -else - echo -e "${BLUE}📊 Using existing master research CSV: $PERSISTENT_RESEARCH_CSV${NC}" +# Backup and clear master research CSV for fresh results +if [[ -f "$PERSISTENT_RESEARCH_CSV" ]]; then + BACKUP_CSV="${PERSISTENT_RESEARCH_CSV}.backup_$(date +%Y%m%d_%H%M%S)" + cp "$PERSISTENT_RESEARCH_CSV" "$BACKUP_CSV" + echo -e "${GREEN}📊 Backed up existing master CSV to: $BACKUP_CSV${NC}" fi +# Create fresh master research CSV with header only +echo "Dataset,Mode,Model,Accuracy,Avg_Latency_ms,Avg_Total_Tokens,Sample_Count,Timestamp" > "$PERSISTENT_RESEARCH_CSV" +echo -e "${GREEN}📊 Created fresh master research CSV: $PERSISTENT_RESEARCH_CSV${NC}" + # Also create a timestamped copy for this run RESEARCH_CSV="$OUTPUT_BASE/research_results.csv" cp "$PERSISTENT_RESEARCH_CSV" "$RESEARCH_CSV" @@ -225,9 +245,12 @@ try: model_name = '$VLLM_MODEL' # For vLLM, we might have multiple modes (NR, NR_REASONING) - if '$mode' == 'vllm' and 'mode' in df.columns: - for mode_type in df['mode'].unique(): - mode_df = df[df['mode'] == mode_type] + # Check both 'mode' and 'mode_label' columns for mode information + if '$mode' == 'vllm' and ('mode' in df.columns or 'mode_label' in df.columns): + # Use mode_label if available (more descriptive), otherwise use mode + mode_column = 'mode_label' if 'mode_label' in df.columns else 'mode' + for mode_type in df[mode_column].unique(): + mode_df = df[df[mode_column] == mode_type] # Recalculate metrics for this specific mode using correct column names if 'is_correct' in mode_df.columns: @@ -253,7 +276,17 @@ try: mode_samples = len(mode_df) - csv_line = f'$dataset,vLLM_{mode_type},{model_name},{mode_accuracy:.3f},{mode_latency:.1f},{mode_tokens:.1f},{mode_samples},$timestamp' + # Map technical mode names to descriptive names + if mode_type == 'VLLM_NR': + display_mode = 'vLLM_No_Reasoning' + elif mode_type == 'VLLM_NR_REASONING': + display_mode = 'vLLM_All_Reasoning' + elif mode_type == 'VLLM_XC': + display_mode = 'vLLM_CoT' + else: + display_mode = mode_type # Use the mode_label as-is if not recognized + + csv_line = f'$dataset,{display_mode},{model_name},{mode_accuracy:.3f},{mode_latency:.1f},{mode_tokens:.1f},{mode_samples},$timestamp' print(f' 📝 Writing to CSV: {csv_line}', file=sys.stderr) print(csv_line) else: @@ -283,7 +316,7 @@ run_dataset_benchmark() { echo -e "${GREEN}📊 Benchmarking $dataset dataset ($samples samples per category)...${NC}" - # Router benchmark + # Router benchmark (pass vLLM info for consistent token calculation) echo -e "${YELLOW} 🤖 Running router evaluation...${NC}" python3 -m vllm_semantic_router_bench.router_reason_bench_multi_dataset \ --dataset "$dataset" \ @@ -291,6 +324,9 @@ run_dataset_benchmark() { --run-router \ --router-endpoint "$ROUTER_ENDPOINT" \ --router-models "$ROUTER_MODEL" \ + --vllm-endpoint "$VLLM_ENDPOINT" \ + --vllm-models "$VLLM_MODEL" \ + --concurrent-requests "$CONCURRENT_REQUESTS" \ --output-dir "$OUTPUT_BASE/router_$dataset" \ --seed 42 @@ -307,41 +343,104 @@ run_dataset_benchmark() { --vllm-models "$VLLM_MODEL" \ --vllm-exec-modes NR NR_REASONING \ --output-dir "$OUTPUT_BASE/vllm_$dataset" \ + --concurrent-requests "$CONCURRENT_REQUESTS" \ --seed 42 # Extract and save vLLM metrics immediately extract_and_save_metrics "$dataset" "vllm" "$OUTPUT_BASE/vllm_$dataset" - echo -e "${GREEN} 
✅ Completed $dataset benchmark${NC}" + # Generate updated comprehensive plots for current dataset + echo -e "${BLUE} 📈 Updating comprehensive plots with $dataset results...${NC}" + generate_comprehensive_plot "$dataset" + + echo -e "${GREEN} ✅ Completed $dataset benchmark and comprehensive plots updated${NC}" + echo -e "${GREEN} 📈 CSV data updated in: $PERSISTENT_RESEARCH_CSV${NC}" echo "" } -# Function to generate comparison plots -generate_plots() { - echo -e "${BLUE}📈 Generating comparison plots...${NC}" +# Function to generate comprehensive plot with all completed datasets (called after each dataset completes) +generate_comprehensive_plot() { + local current_dataset=$1 - for dataset in "${!DATASET_CONFIGS[@]}"; do - echo -e "${YELLOW} 📊 Plotting $dataset results...${NC}" + if [[ -n "$current_dataset" ]]; then + echo -e "${YELLOW} 📊 Generating plot for current dataset: $current_dataset...${NC}" + else + echo -e "${YELLOW} 📊 Generating comprehensive plot with all completed datasets...${NC}" + fi + + # Use the plot_comprehensive_results.py script to generate updated charts + if [[ -f "plot_comprehensive_results.py" ]]; then + echo -e "${BLUE} Running comprehensive plotting script...${NC}" + # Use the current run's CSV instead of the master CSV to show only this run's results + PLOT_CMD="python3 plot_comprehensive_results.py \ + --csv \"$RESEARCH_CSV\" \ + --output-dir \"$OUTPUT_BASE\" \ + --model-filter \"$VLLM_MODEL\"" + + # Add dataset filter if specified + if [[ -n "$current_dataset" ]]; then + PLOT_CMD="$PLOT_CMD --dataset-filter \"$current_dataset\"" + fi + + eval $PLOT_CMD - # Find the summary.json files - ROUTER_SUMMARY=$(find "$OUTPUT_BASE/router_$dataset" -name "summary.json" -type f | head -1) - VLLM_SUMMARY=$(find "$OUTPUT_BASE/vllm_$dataset" -name "summary.json" -type f | head -1) + echo -e "${GREEN} ✅ Comprehensive plots updated in $OUTPUT_BASE${NC}" + + # Print actual paths of generated charts + if [[ -f "$OUTPUT_BASE/accuracy_comparison.png" ]]; then + echo -e "${GREEN} 📊 Accuracy Chart: $OUTPUT_BASE/accuracy_comparison.png${NC}" + fi + if [[ -f "$OUTPUT_BASE/token_usage_comparison.png" ]]; then + echo -e "${GREEN} 📊 Token Usage Chart: $OUTPUT_BASE/token_usage_comparison.png${NC}" + fi + if [[ -f "$OUTPUT_BASE/efficiency_analysis.png" ]]; then + echo -e "${GREEN} 📊 Efficiency Chart: $OUTPUT_BASE/efficiency_analysis.png${NC}" + fi + else + echo -e "${RED} ⚠️ plot_comprehensive_results.py not found, skipping comprehensive plots${NC}" + fi +} + +# Function to generate plot for a single dataset (kept for compatibility) +generate_dataset_plot() { + local dataset=$1 + + echo -e "${YELLOW} 📊 Plotting $dataset results...${NC}" + + # Find the summary.json files + ROUTER_SUMMARY=$(find "$OUTPUT_BASE/router_$dataset" -name "summary.json" -type f | head -1) + VLLM_SUMMARY=$(find "$OUTPUT_BASE/vllm_$dataset" -name "summary.json" -type f | head -1) - if [[ -f "$VLLM_SUMMARY" ]]; then - PLOT_CMD="python3 -m vllm_semantic_router_bench.bench_plot --summary \"$VLLM_SUMMARY\" --out-dir \"$OUTPUT_BASE/plots_$dataset\"" + if [[ -f "$VLLM_SUMMARY" ]]; then + PLOT_CMD="python3 -m vllm_semantic_router_bench.bench_plot --summary \"$VLLM_SUMMARY\" --out-dir \"$OUTPUT_BASE/plots_$dataset\"" - if [[ -f "$ROUTER_SUMMARY" ]]; then - PLOT_CMD="$PLOT_CMD --router-summary \"$ROUTER_SUMMARY\"" - fi + if [[ -f "$ROUTER_SUMMARY" ]]; then + PLOT_CMD="$PLOT_CMD --router-summary \"$ROUTER_SUMMARY\"" + fi + + echo -e "${BLUE} Running: $PLOT_CMD${NC}" + eval $PLOT_CMD + echo -e "${GREEN} ✅ $dataset plots generated 
in $OUTPUT_BASE/plots_$dataset${NC}" + else + echo -e "${RED} ⚠️ No vLLM summary.json found for $dataset, skipping plots${NC}" + fi +} - echo -e "${BLUE} Running: $PLOT_CMD${NC}" - eval $PLOT_CMD +# Function to generate comparison plots (now just calls individual dataset plots) +generate_plots() { + echo -e "${BLUE}📈 Generating any remaining comparison plots...${NC}" + + for dataset in "${!DATASET_CONFIGS[@]}"; do + # Check if plots already exist + if [[ ! -d "$OUTPUT_BASE/plots_$dataset" ]]; then + echo -e "${YELLOW} 📊 Generating missing plots for $dataset...${NC}" + generate_dataset_plot "$dataset" else - echo -e "${RED} ⚠️ No vLLM summary.json found for $dataset, skipping plots${NC}" + echo -e "${GREEN} ✅ Plots for $dataset already exist${NC}" fi done - echo -e "${GREEN} ✅ All plots generated${NC}" + echo -e "${GREEN} ✅ All plots verified/generated${NC}" echo "" } @@ -372,8 +471,8 @@ EOF "mmlu") echo "| MMLU | $samples | ~570 | 57 subjects | Academic Knowledge |" >> "$summary_file" ;; - "arc") - echo "| ARC | $samples | $samples | 1 (Science) | Scientific Reasoning |" >> "$summary_file" + "arc-challenge") + echo "| ARC-Challenge | $samples | $samples | 1 (Science) | Scientific Reasoning (Hard) |" >> "$summary_file" ;; "gpqa") echo "| GPQA | $samples | $samples | 1 (Graduate) | Graduate-level Q&A |" >> "$summary_file" @@ -385,7 +484,7 @@ EOF echo "| CommonsenseQA | $samples | $samples | 1 (Common Sense) | Commonsense Reasoning |" >> "$summary_file" ;; "hellaswag") - echo "| HellaSwag | $samples | ~400 | ~50 activities | Commonsense NLI |" >> "$summary_file" + echo "| HellaSwag | $samples | ~100 | ~50 activities | Commonsense NLI |" >> "$summary_file" ;; esac done @@ -398,8 +497,8 @@ EOF ### Accuracy Comparison - Router (auto model with reasoning): See research_results.csv -- vLLM Direct (NR mode): See research_results.csv -- vLLM Direct (NR_REASONING mode): See research_results.csv +- vLLM Direct (No Reasoning): See research_results.csv +- vLLM Direct (All Reasoning): See research_results.csv ### Token Usage Analysis - Average tokens per response by dataset and mode (in research_results.csv) @@ -448,7 +547,7 @@ EOF - **Seed**: 42 (for reproducibility) - **Router Mode**: Auto model selection with reasoning -- **vLLM Modes**: NR (neutral) and NR_REASONING (with reasoning) +- **vLLM Modes**: No Reasoning and All Reasoning - **Sample Strategy**: Stratified sampling per category - **Evaluation**: Exact match accuracy and token usage @@ -462,9 +561,24 @@ EOF echo -e "${BLUE}🚀 Starting comprehensive benchmark...${NC}" start_time=$(date +%s) -# Run benchmarks for all datasets -for dataset in "${!DATASET_CONFIGS[@]}"; do +# Run benchmarks for reasoning-focused datasets (GPQA first for quick feedback) +DATASET_ORDER=("gpqa" "truthfulqa" "gsm8k" "aqua-rat" "sciq" "openbookqa" "strategyqa" "drop" "mmlu") +dataset_count=0 +total_datasets=${#DATASET_ORDER[@]} + +for dataset in "${DATASET_ORDER[@]}"; do + # Skip if dataset not configured + if [[ -z "${DATASET_CONFIGS[$dataset]}" ]]; then + echo -e "${YELLOW}⚠️ Dataset $dataset not configured, skipping...${NC}" + continue + fi + + dataset_count=$((dataset_count + 1)) + echo -e "${BLUE}🚀 Progress: Dataset $dataset_count/$total_datasets - Starting $dataset${NC}" run_dataset_benchmark "$dataset" + echo -e "${GREEN}🎉 Progress: Dataset $dataset_count/$total_datasets - Completed $dataset${NC}" + echo -e "${YELLOW}📊 Remaining datasets: $((total_datasets - dataset_count))${NC}" + echo "" done # Generate plots @@ -489,7 +603,16 @@ echo -e "${BLUE}📋 Next 
Steps:${NC}" echo "1. 📊 **Master research data**: $PERSISTENT_RESEARCH_CSV" echo "2. 📊 **This run's data**: $OUTPUT_BASE/research_results.csv" echo "3. 📋 Review research summary: $OUTPUT_BASE/RESEARCH_SUMMARY.md" -echo "4. 📈 Examine plots for visual insights" +echo "4. 📈 **View comprehensive charts**:" +if [[ -f "$OUTPUT_BASE/accuracy_comparison.png" ]]; then + echo " 📊 Accuracy: $OUTPUT_BASE/accuracy_comparison.png" +fi +if [[ -f "$OUTPUT_BASE/token_usage_comparison.png" ]]; then + echo " 📊 Token Usage: $OUTPUT_BASE/token_usage_comparison.png" +fi +if [[ -f "$OUTPUT_BASE/efficiency_analysis.png" ]]; then + echo " 📊 Efficiency: $OUTPUT_BASE/efficiency_analysis.png" +fi echo "5. 📄 Analyze detailed CSV files if needed" echo "" echo -e "${GREEN}🎓 Research CSV Format:${NC}" diff --git a/bench/plot_comprehensive_results.py b/bench/plot_comprehensive_results.py new file mode 100755 index 00000000..676bf1d1 --- /dev/null +++ b/bench/plot_comprehensive_results.py @@ -0,0 +1,443 @@ +#!/usr/bin/env python3 +""" +Comprehensive Results Plotting Script + +This script creates comparison plots showing: +1. Accuracy comparison across datasets and modes +2. Token usage comparison across datasets and modes + +Modes compared: +- Router (auto model with reasoning) +- vLLM Direct (No Reasoning) +- vLLM Direct (All Reasoning) +""" + +import argparse +import sys +from pathlib import Path + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import seaborn as sns + +# Set style for better-looking plots +plt.style.use("seaborn-v0_8") +sns.set_palette("husl") + + +def load_and_clean_data(csv_path): + """Load and clean the research results CSV.""" + try: + df = pd.read_csv(csv_path) + print(f"✅ Loaded {len(df)} records from {csv_path}") + + # Show available modes + print(f"📊 Available modes: {df['Mode'].unique().tolist()}") + print(f"📊 Available datasets: {df['Dataset'].unique().tolist()}") + print(f"📊 Available models: {df['Model'].unique().tolist()}") + + # Map old mode names to new descriptive names for backward compatibility + mode_mapping = { + "vLLM_NR": "vLLM_No_Reasoning", + "vLLM_XC": "vLLM_All_Reasoning", + "vLLM_NR_REASONING": "vLLM_All_Reasoning", + } + df["Mode"] = df["Mode"].replace(mode_mapping) + + # Clean data + df = df.dropna(subset=["Accuracy", "Avg_Total_Tokens"]) + df = df[df["Accuracy"] >= 0] # Remove invalid accuracy values + + # Get the latest results for each dataset/mode/model combination + df["Timestamp"] = pd.to_datetime(df["Timestamp"]) + df_latest = ( + df.sort_values("Timestamp").groupby(["Dataset", "Mode", "Model"]).tail(1) + ) + + print(f"✅ Using {len(df_latest)} latest records after cleaning") + return df_latest + + except Exception as e: + print(f"❌ Error loading data: {e}") + sys.exit(1) + + +def create_accuracy_plot(df, output_dir): + """Create accuracy comparison plot.""" + plt.figure(figsize=(14, 8)) + + # Prepare data for plotting + datasets = sorted(df["Dataset"].unique()) + modes = ["Router", "vLLM_No_Reasoning", "vLLM_All_Reasoning"] + + # Filter for available modes + available_modes = [mode for mode in modes if mode in df["Mode"].unique()] + + # Create subplot data + x = np.arange(len(datasets)) + width = 0.25 + + # Colors for each mode + colors = { + "Router": "#2E86AB", + "vLLM_No_Reasoning": "#A23B72", + "vLLM_All_Reasoning": "#F18F01", + } + + # Plot bars for each mode + for i, mode in enumerate(available_modes): + mode_data = df[df["Mode"] == mode] + accuracies = [] + + for dataset in datasets: + dataset_data = mode_data[mode_data["Dataset"] == 
dataset] + if not dataset_data.empty: + # Use the latest model's accuracy + accuracy = dataset_data.iloc[-1]["Accuracy"] + accuracies.append(accuracy) + else: + accuracies.append(0) + + # Clean mode name for display + display_name = mode.replace("vLLM_", "vLLM ").replace("_", " ") + + plt.bar( + x + i * width, + accuracies, + width, + label=display_name, + color=colors.get(mode, f"C{i}"), + alpha=0.8, + ) + + # Add value labels on bars + for j, acc in enumerate(accuracies): + if acc > 0: + plt.text( + x[j] + i * width, + acc + 0.01, + f"{acc:.3f}", + ha="center", + va="bottom", + fontsize=9, + fontweight="bold", + ) + + plt.xlabel("Dataset", fontsize=12, fontweight="bold") + plt.ylabel("Accuracy", fontsize=12, fontweight="bold") + plt.title( + "Accuracy Comparison: Router vs vLLM Direct\n(No Reasoning vs All Reasoning)", + fontsize=14, + fontweight="bold", + pad=20, + ) + plt.xticks(x + width, [d.upper() for d in datasets], rotation=45, ha="right") + plt.legend(loc="upper left", frameon=True, fancybox=True, shadow=True) + plt.grid(True, alpha=0.3, axis="y") + plt.ylim(0, 1.1) + + # Add model info + models = df["Model"].unique() + model_text = f"Models: {', '.join(models)}" + plt.figtext(0.02, 0.02, model_text, fontsize=8, style="italic") + + plt.tight_layout() + + # Save plot + accuracy_path = output_dir / "accuracy_comparison.png" + plt.savefig(accuracy_path, dpi=300, bbox_inches="tight") + print(f"📊 Accuracy plot saved: {accuracy_path}") + plt.close() + + +def create_token_usage_plot(df, output_dir): + """Create token usage comparison plot.""" + plt.figure(figsize=(14, 8)) + + # Prepare data for plotting + datasets = sorted(df["Dataset"].unique()) + modes = ["Router", "vLLM_No_Reasoning", "vLLM_All_Reasoning"] + + # Filter for available modes + available_modes = [mode for mode in modes if mode in df["Mode"].unique()] + + # Create subplot data + x = np.arange(len(datasets)) + width = 0.25 + + # Colors for each mode + colors = { + "Router": "#2E86AB", + "vLLM_No_Reasoning": "#A23B72", + "vLLM_All_Reasoning": "#F18F01", + } + + # Plot bars for each mode + for i, mode in enumerate(available_modes): + mode_data = df[df["Mode"] == mode] + token_usage = [] + + for dataset in datasets: + dataset_data = mode_data[mode_data["Dataset"] == dataset] + if not dataset_data.empty: + # Use the latest model's token usage + tokens = dataset_data.iloc[-1]["Avg_Total_Tokens"] + token_usage.append(tokens) + else: + token_usage.append(0) + + # Clean mode name for display + display_name = mode.replace("vLLM_", "vLLM ").replace("_", " ") + + plt.bar( + x + i * width, + token_usage, + width, + label=display_name, + color=colors.get(mode, f"C{i}"), + alpha=0.8, + ) + + # Add value labels on bars + for j, tokens in enumerate(token_usage): + if tokens > 0: + plt.text( + x[j] + i * width, + tokens + max(token_usage) * 0.01, + f"{tokens:.0f}", + ha="center", + va="bottom", + fontsize=9, + fontweight="bold", + ) + + plt.xlabel("Dataset", fontsize=12, fontweight="bold") + plt.ylabel("Average Total Tokens", fontsize=12, fontweight="bold") + plt.title( + "Token Usage Comparison: Router vs vLLM Direct\n(No Reasoning vs All Reasoning)", + fontsize=14, + fontweight="bold", + pad=20, + ) + plt.xticks(x + width, [d.upper() for d in datasets], rotation=45, ha="right") + plt.legend(loc="upper left", frameon=True, fancybox=True, shadow=True) + plt.grid(True, alpha=0.3, axis="y") + + # Add model info + models = df["Model"].unique() + model_text = f"Models: {', '.join(models)}" + plt.figtext(0.02, 0.02, model_text, fontsize=8, 
style="italic") + + plt.tight_layout() + + # Save plot + token_path = output_dir / "token_usage_comparison.png" + plt.savefig(token_path, dpi=300, bbox_inches="tight") + print(f"📊 Token usage plot saved: {token_path}") + plt.close() + + +def create_efficiency_plot(df, output_dir): + """Create efficiency scatter plot (accuracy vs tokens).""" + plt.figure(figsize=(12, 8)) + + modes = ["Router", "vLLM_No_Reasoning", "vLLM_All_Reasoning"] + available_modes = [mode for mode in modes if mode in df["Mode"].unique()] + + colors = { + "Router": "#2E86AB", + "vLLM_No_Reasoning": "#A23B72", + "vLLM_All_Reasoning": "#F18F01", + } + + markers = {"Router": "o", "vLLM_No_Reasoning": "s", "vLLM_All_Reasoning": "^"} + + for mode in available_modes: + mode_data = df[df["Mode"] == mode] + + display_name = mode.replace("vLLM_", "vLLM ").replace("_", " ") + + plt.scatter( + mode_data["Avg_Total_Tokens"], + mode_data["Accuracy"], + c=colors.get(mode, "gray"), + marker=markers.get(mode, "o"), + s=100, + alpha=0.7, + label=display_name, + edgecolors="black", + linewidth=1, + ) + + # Add dataset labels + for _, row in mode_data.iterrows(): + plt.annotate( + row["Dataset"].upper(), + (row["Avg_Total_Tokens"], row["Accuracy"]), + xytext=(5, 5), + textcoords="offset points", + fontsize=8, + alpha=0.8, + ) + + plt.xlabel("Average Total Tokens", fontsize=12, fontweight="bold") + plt.ylabel("Accuracy", fontsize=12, fontweight="bold") + plt.title( + "Efficiency Analysis: Accuracy vs Token Usage\n(Higher accuracy with lower tokens is better)", + fontsize=14, + fontweight="bold", + pad=20, + ) + plt.legend(frameon=True, fancybox=True, shadow=True) + plt.grid(True, alpha=0.3) + + # Add model info + models = df["Model"].unique() + model_text = f"Models: {', '.join(models)}" + plt.figtext(0.02, 0.02, model_text, fontsize=8, style="italic") + + plt.tight_layout() + + # Save plot + efficiency_path = output_dir / "efficiency_analysis.png" + plt.savefig(efficiency_path, dpi=300, bbox_inches="tight") + print(f"📊 Efficiency plot saved: {efficiency_path}") + plt.close() + + +def create_summary_table(df, output_dir): + """Create a summary table of results.""" + + # Check if we're dealing with a single dataset + unique_datasets = df["Dataset"].nunique() + dataset_name = ( + df["Dataset"].iloc[0] if unique_datasets == 1 else "Multiple Datasets" + ) + + if unique_datasets == 1: + print(f"\n📋 RESULTS SUMMARY - {dataset_name.upper()}") + else: + print(f"\n📋 RESULTS SUMMARY - AGGREGATE ACROSS {unique_datasets} DATASETS") + print("=" * 80) + + # For single dataset, show individual values; for multiple datasets, show statistics + if unique_datasets == 1: + # Show individual values for each mode + print( + f"{'Mode':<20} {'Accuracy':<10} {'Tokens':<10} {'Latency(ms)':<12} {'Samples':<8}" + ) + print("-" * 65) + + for mode in sorted(df["Mode"].unique()): + mode_data = df[df["Mode"] == mode].iloc[0] + print( + f"{mode:<20} {mode_data['Accuracy']:<10.3f} {mode_data['Avg_Total_Tokens']:<10.1f} {mode_data['Avg_Latency_ms']:<12.1f} {mode_data['Sample_Count']:<8}" + ) + + summary = df.groupby("Mode")[ + ["Accuracy", "Avg_Total_Tokens", "Avg_Latency_ms"] + ].first() + else: + # Group by mode and calculate averages for multiple datasets + summary = ( + df.groupby("Mode") + .agg( + { + "Accuracy": ["mean", "std", "count"], + "Avg_Total_Tokens": ["mean", "std"], + "Avg_Latency_ms": ["mean", "std"], + } + ) + .round(3) + ) + + print(summary) + + # Save detailed results table + detailed_table = df.pivot_table( + index="Dataset", + columns="Mode", + 
values=["Accuracy", "Avg_Total_Tokens"], + aggfunc="mean", + ).round(3) + + table_path = output_dir / "results_summary_table.csv" + detailed_table.to_csv(table_path) + print(f"\n📊 Detailed results table saved: {table_path}") + + return summary + + +def main(): + parser = argparse.ArgumentParser(description="Plot comprehensive benchmark results") + parser.add_argument( + "--csv", + type=str, + default="research_results_master.csv", + help="Path to research results CSV file", + ) + parser.add_argument( + "--output-dir", + type=str, + default="research_plots", + help="Output directory for plots", + ) + parser.add_argument( + "--model-filter", + type=str, + default=None, + help='Filter results for specific model (e.g., "Qwen/Qwen3-30B-A3B")', + ) + parser.add_argument( + "--dataset-filter", + type=str, + default=None, + help='Filter results for specific dataset (e.g., "truthfulqa")', + ) + + args = parser.parse_args() + + # Create output directory + output_dir = Path(args.output_dir) + output_dir.mkdir(exist_ok=True) + + print(f"🎨 Creating comprehensive benchmark plots...") + print(f"📊 Input CSV: {args.csv}") + print(f"📁 Output directory: {output_dir}") + + # Load data + df = load_and_clean_data(args.csv) + + # Filter by model if specified + if args.model_filter: + df = df[df["Model"].str.contains(args.model_filter, na=False)] + print(f"🔍 Filtered to model: {args.model_filter} ({len(df)} records)") + + # Filter by dataset if specified + if args.dataset_filter: + df = df[df["Dataset"].str.contains(args.dataset_filter, case=False, na=False)] + print(f"🔍 Filtered to dataset: {args.dataset_filter} ({len(df)} records)") + + if df.empty: + print("❌ No data available after filtering!") + sys.exit(1) + + # Create plots + create_accuracy_plot(df, output_dir) + create_token_usage_plot(df, output_dir) + create_efficiency_plot(df, output_dir) + + # Create summary + summary = create_summary_table(df, output_dir) + + print(f"\n🎉 All plots created successfully!") + print(f"📁 Check the '{output_dir}' directory for:") + print(f" - accuracy_comparison.png") + print(f" - token_usage_comparison.png") + print(f" - efficiency_analysis.png") + print(f" - results_summary_table.csv") + + +if __name__ == "__main__": + main() diff --git a/bench/vllm_semantic_router_bench/cli.py b/bench/vllm_semantic_router_bench/cli.py index b8fdab63..2bb347c8 100644 --- a/bench/vllm_semantic_router_bench/cli.py +++ b/bench/vllm_semantic_router_bench/cli.py @@ -21,7 +21,7 @@ def main(): semantic-router-bench test --dataset mmlu --samples 5 # Full benchmark comparison - semantic-router-bench compare --dataset arc --samples 10 + semantic-router-bench compare --dataset arc-challenge --samples 10 # List available datasets semantic-router-bench list-datasets @@ -40,7 +40,15 @@ def main(): test_parser.add_argument( "--dataset", required=True, - choices=["mmlu", "arc", "gpqa", "truthfulqa", "commonsenseqa", "hellaswag"], + choices=[ + "mmlu", + "arc", + "arc-challenge", + "gpqa", + "truthfulqa", + "commonsenseqa", + "hellaswag", + ], help="Dataset to test", ) test_parser.add_argument( @@ -68,7 +76,15 @@ def main(): compare_parser.add_argument( "--dataset", required=True, - choices=["mmlu", "arc", "gpqa", "truthfulqa", "commonsenseqa", "hellaswag"], + choices=[ + "mmlu", + "arc", + "arc-challenge", + "gpqa", + "truthfulqa", + "commonsenseqa", + "hellaswag", + ], help="Dataset to benchmark", ) compare_parser.add_argument( @@ -119,7 +135,14 @@ def main(): comprehensive_parser.add_argument( "--datasets", nargs="+", - default=["mmlu", "arc", 
"gpqa", "truthfulqa", "commonsenseqa", "hellaswag"], + default=[ + "mmlu", + "arc-challenge", + "gpqa", + "truthfulqa", + "commonsenseqa", + "hellaswag", + ], help="Datasets to benchmark", ) comprehensive_parser.add_argument( @@ -227,7 +250,7 @@ def list_datasets(): print("\nUsage examples:") print(" semantic-router-bench test --dataset mmlu --samples 5") - print(" semantic-router-bench compare --dataset arc --samples 10") + print(" semantic-router-bench compare --dataset arc-challenge --samples 10") return 0 except ImportError as e: diff --git a/bench/vllm_semantic_router_bench/dataset_factory.py b/bench/vllm_semantic_router_bench/dataset_factory.py index 429faf9e..499118de 100644 --- a/bench/vllm_semantic_router_bench/dataset_factory.py +++ b/bench/vllm_semantic_router_bench/dataset_factory.py @@ -7,20 +7,28 @@ from typing import Dict, List, Optional, Type +from .dataset_implementations.aqua_rat_dataset import AquaRatDataset from .dataset_implementations.arc_dataset import ( ARCChallengeDataset, ARCDataset, ARCEasyDataset, ) from .dataset_implementations.commonsenseqa_dataset import CommonsenseQADataset +from .dataset_implementations.drop_dataset import DROPDataset from .dataset_implementations.gpqa_dataset import ( GPQADataset, GPQADiamondDataset, GPQAExtendedDataset, GPQAMainDataset, ) +from .dataset_implementations.gsm8k_dataset import GSM8KDataset from .dataset_implementations.hellaswag_dataset import HellaSwagDataset + +# from .dataset_implementations.math_dataset import MATHDataset # Disabled - dataset not available from .dataset_implementations.mmlu_dataset import MMLUDataset +from .dataset_implementations.openbookqa_dataset import OpenBookQADataset +from .dataset_implementations.sciq_dataset import SciQDataset +from .dataset_implementations.strategyqa_dataset import StrategyQADataset from .dataset_implementations.truthfulqa_dataset import TruthfulQADataset from .dataset_interface import DatasetInterface @@ -101,6 +109,19 @@ def get_dataset_info(cls, name: str) -> Dict[str, str]: DatasetFactory.register_dataset("gpqa-extended", GPQAExtendedDataset) DatasetFactory.register_dataset("gpqa-diamond", GPQADiamondDataset) +# Register mathematical reasoning datasets +# DatasetFactory.register_dataset("math", MATHDataset) # Disabled - dataset not available +DatasetFactory.register_dataset("gsm8k", GSM8KDataset) +DatasetFactory.register_dataset("aqua-rat", AquaRatDataset) + +# Register multi-step reasoning datasets +DatasetFactory.register_dataset("drop", DROPDataset) +DatasetFactory.register_dataset("strategyqa", StrategyQADataset) + +# Register scientific reasoning datasets +DatasetFactory.register_dataset("sciq", SciQDataset) +DatasetFactory.register_dataset("openbookqa", OpenBookQADataset) + # Register hard reasoning datasets DatasetFactory.register_dataset("truthfulqa", TruthfulQADataset) DatasetFactory.register_dataset("commonsenseqa", CommonsenseQADataset) diff --git a/bench/vllm_semantic_router_bench/dataset_implementations/aqua_rat_dataset.py b/bench/vllm_semantic_router_bench/dataset_implementations/aqua_rat_dataset.py new file mode 100644 index 00000000..8e99e0a5 --- /dev/null +++ b/bench/vllm_semantic_router_bench/dataset_implementations/aqua_rat_dataset.py @@ -0,0 +1,173 @@ +""" +AQUA-RAT Dataset Implementation + +Algebraic Question Answering with Rationales - algebraic word problems +with step-by-step rationales for mathematical reasoning evaluation. 
+""" + +import os +import random +import sys +from typing import List, Optional, Tuple + +import numpy as np +import pandas as pd +from datasets import load_dataset + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from ..dataset_interface import DatasetInfo, DatasetInterface, Question + + +class AquaRatDataset(DatasetInterface): + """AQUA-RAT dataset implementation for algebraic reasoning with rationales.""" + + def __init__(self): + """Initialize AQUA-RAT dataset.""" + self._dataset_cache = None + self._categories_cache = None + + @property + def dataset_name(self) -> str: + return "AQUA-RAT" + + @property + def supports_cot(self) -> bool: + return True # AQUA-RAT has rationales + + def _load_raw_dataset(self): + """Load raw AQUA-RAT dataset from Hugging Face.""" + if self._dataset_cache is not None: + return self._dataset_cache + + # Load the test split + dataset = load_dataset("aqua_rat", split="test") + self._dataset_cache = pd.DataFrame(dataset) + return self._dataset_cache + + def _get_categories(self) -> List[str]: + """Get available categories in AQUA-RAT dataset.""" + if self._categories_cache is not None: + return self._categories_cache + + # AQUA-RAT doesn't have category columns, treat as single dataset + self._categories_cache = ["default"] + return self._categories_cache + + def get_available_categories(self) -> List[str]: + """Get list of all available categories in the dataset.""" + return self._get_categories() + + def load_dataset( + self, + categories: Optional[List[str]] = None, + samples_per_category: Optional[int] = None, + seed: int = 42, + ) -> Tuple[List[Question], DatasetInfo]: + """Load AQUA-RAT dataset with optional filtering and sampling.""" + df = self._load_raw_dataset() + available_categories = self._get_categories() + + # Filter categories if specified + if categories: + missing_categories = set(categories) - set(available_categories) + if missing_categories: + raise ValueError( + f"Categories not found: {missing_categories}. " + f"Available: {available_categories}" + ) + selected_categories = categories + else: + selected_categories = available_categories + + # Sample questions if specified + if samples_per_category: + np.random.seed(seed) + random.seed(seed) + + sample_size = min(samples_per_category, len(df)) + df = df.sample(n=sample_size, random_state=seed) + + # Convert to Question objects + questions = [] + for _, row in df.iterrows(): + question_text = row["question"] + raw_options = row["options"] # List of 5 options (A, B, C, D, E) + correct_answer = row["correct"] # Letter (A, B, C, D, E) + rationale = row["rationale"] # Step-by-step explanation + + # Clean options by removing letter prefixes (e.g., "A)500" -> "500") + options = [] + for option in raw_options: + # Remove letter prefix like "A)", "B)", etc. 
+ import re + + cleaned = re.sub(r"^[A-E]\)", "", option).strip() + options.append(cleaned) + + question = Question( + question_id=f"aqua_rat_{len(questions)}", + question=question_text, + options=options, + correct_answer=correct_answer, + category="default", + cot_content=rationale, + metadata={ + "difficulty": "Moderate", + "type": "algebraic_word_problem", + "rationale": rationale, + }, + ) + questions.append(question) + + dataset_info = DatasetInfo( + name="AQUA-RAT", + description="Algebraic word problems with step-by-step rationales", + categories=selected_categories, + total_questions=len(questions), + format_type="multiple_choice", + difficulty_level="Moderate", + ) + + return questions, dataset_info + + def format_prompt(self, question: Question, prompt_style: str = "plain") -> str: + """Format prompt for AQUA-RAT questions.""" + options_text = "\n".join( + [f"{chr(65+i)}) {opt}" for i, opt in enumerate(question.options)] + ) + + if prompt_style == "plain": + return f"""Solve this algebraic word problem: + +{question.question} + +{options_text} + +Please provide your answer in the following structured format: +ANSWER: [letter] + +For example: ANSWER: A""" + + elif prompt_style == "explicit_cot": + return f"""Solve this algebraic word problem step by step: + +Problem: {question.question} + +Options: +{options_text} + +Please work through this step-by-step: +1. Identify the variables and what is being asked +2. Set up the algebraic equations +3. Solve the equations step by step +4. Check your answer against the options +5. Select the correct answer + +Please provide your final answer in the following structured format: +ANSWER: [letter] + +For example: ANSWER: A""" + + else: + raise ValueError(f"Unknown prompt style: {prompt_style}") diff --git a/bench/vllm_semantic_router_bench/dataset_implementations/drop_dataset.py b/bench/vllm_semantic_router_bench/dataset_implementations/drop_dataset.py new file mode 100644 index 00000000..e492f169 --- /dev/null +++ b/bench/vllm_semantic_router_bench/dataset_implementations/drop_dataset.py @@ -0,0 +1,161 @@ +""" +DROP Dataset Implementation + +Discrete Reasoning Over Paragraphs - reading comprehension requiring +discrete reasoning operations over text passages. 
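+
+Illustrative usage sketch (registered with the DatasetFactory as "drop" in this
+PR; import path assumed from the repo's package layout):
+
+    from vllm_semantic_router_bench.dataset_implementations.drop_dataset import DROPDataset
+
+    ds = DROPDataset()
+    questions, info = ds.load_dataset(samples_per_category=5, seed=42)
+    # Free-form format: each question embeds its passage, and correct_answer is
+    # the first gold answer span from the validation split.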
+""" + +import os +import random +import sys +from typing import List, Optional, Tuple + +import numpy as np +import pandas as pd +from datasets import load_dataset + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from ..dataset_interface import DatasetInfo, DatasetInterface, Question + + +class DROPDataset(DatasetInterface): + """DROP dataset implementation for discrete reasoning over paragraphs.""" + + def __init__(self): + """Initialize DROP dataset.""" + self._dataset_cache = None + self._categories_cache = None + + @property + def dataset_name(self) -> str: + return "DROP" + + @property + def supports_cot(self) -> bool: + return False # DROP doesn't have built-in CoT content + + def _load_raw_dataset(self): + """Load raw DROP dataset from Hugging Face.""" + if self._dataset_cache is not None: + return self._dataset_cache + + # Load the validation split (test split is not public) + dataset = load_dataset("ucinlp/drop", split="validation") + self._dataset_cache = pd.DataFrame(dataset) + return self._dataset_cache + + def _get_categories(self) -> List[str]: + """Get available categories in DROP dataset.""" + if self._categories_cache is not None: + return self._categories_cache + + # DROP doesn't have category columns, treat as single dataset + self._categories_cache = ["default"] + return self._categories_cache + + def get_available_categories(self) -> List[str]: + """Get list of all available categories in the dataset.""" + return self._get_categories() + + def load_dataset( + self, + categories: Optional[List[str]] = None, + samples_per_category: Optional[int] = None, + seed: int = 42, + ) -> Tuple[List[Question], DatasetInfo]: + """Load DROP dataset with optional filtering and sampling.""" + df = self._load_raw_dataset() + available_categories = self._get_categories() + + # Filter categories if specified + if categories: + missing_categories = set(categories) - set(available_categories) + if missing_categories: + raise ValueError( + f"Categories not found: {missing_categories}. 
" + f"Available: {available_categories}" + ) + selected_categories = categories + else: + selected_categories = available_categories + + # Sample questions if specified + if samples_per_category: + np.random.seed(seed) + random.seed(seed) + + sample_size = min(samples_per_category, len(df)) + df = df.sample(n=sample_size, random_state=seed) + + # Convert to Question objects + questions = [] + for _, row in df.iterrows(): + passage = row["passage"] + question_text = row["question"] + # DROP has multiple possible answers + answers_spans = row["answers_spans"] + if answers_spans and len(answers_spans["spans"]) > 0: + correct_answer = answers_spans["spans"][0] # Take first valid answer + else: + correct_answer = "Unknown" + + # Combine passage and question + full_question = f"Passage: {passage}\n\nQuestion: {question_text}" + + question = Question( + question_id=f"drop_{len(questions)}", + question=full_question, + options=[], # DROP is free-form, no multiple choice + correct_answer=correct_answer, + category="default", + cot_content=None, + metadata={ + "difficulty": "Hard", + "type": "discrete_reasoning", + "passage": passage, + "question_only": question_text, + }, + ) + questions.append(question) + + dataset_info = DatasetInfo( + name="DROP", + description="Reading comprehension requiring discrete reasoning over paragraphs", + categories=selected_categories, + total_questions=len(questions), + format_type="free_form", + difficulty_level="Hard", + ) + + return questions, dataset_info + + def format_prompt(self, question: Question, prompt_style: str = "plain") -> str: + """Format prompt for DROP questions.""" + if prompt_style == "plain": + return f"""{question.question} + +Please read the passage carefully and answer the question based on the information provided. + +Please provide your answer in the following structured format: +ANSWER: [your answer] + +For example: ANSWER: 68.5 or ANSWER: germans or ANSWER: Centenary Medal""" + + elif prompt_style == "explicit_cot": + return f"""{question.question} + +Please work through this step-by-step: +1. Read the passage carefully +2. Identify the key information relevant to the question +3. Determine what type of reasoning is required (counting, arithmetic, comparison, etc.) +4. Apply the necessary reasoning operations +5. Provide your final answer + +Work through your reasoning step by step, then provide your final answer in the following structured format: +ANSWER: [your answer] + +For example: ANSWER: 68.5 or ANSWER: germans or ANSWER: Centenary Medal""" + + else: + raise ValueError(f"Unknown prompt style: {prompt_style}") diff --git a/bench/vllm_semantic_router_bench/dataset_implementations/gsm8k_dataset.py b/bench/vllm_semantic_router_bench/dataset_implementations/gsm8k_dataset.py new file mode 100644 index 00000000..4518bb0c --- /dev/null +++ b/bench/vllm_semantic_router_bench/dataset_implementations/gsm8k_dataset.py @@ -0,0 +1,160 @@ +""" +GSM8K Dataset Implementation + +Grade School Math 8K - 8,500 elementary mathematics word problems +requiring multi-step reasoning and basic arithmetic. 
+""" + +import os +import random +import sys +from typing import List, Optional, Tuple + +import numpy as np +import pandas as pd +from datasets import load_dataset + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from ..dataset_interface import DatasetInfo, DatasetInterface, Question + + +class GSM8KDataset(DatasetInterface): + """GSM8K dataset implementation for elementary mathematical reasoning.""" + + def __init__(self): + """Initialize GSM8K dataset.""" + self._dataset_cache = None + self._categories_cache = None + + @property + def dataset_name(self) -> str: + return "GSM8K" + + @property + def supports_cot(self) -> bool: + return True # GSM8K has step-by-step solutions + + def _load_raw_dataset(self): + """Load raw GSM8K dataset from Hugging Face.""" + if self._dataset_cache is not None: + return self._dataset_cache + + # Load the test split + dataset = load_dataset("gsm8k", "main", split="test") + self._dataset_cache = pd.DataFrame(dataset) + return self._dataset_cache + + def _get_categories(self) -> List[str]: + """Get available categories in GSM8K dataset.""" + if self._categories_cache is not None: + return self._categories_cache + + # GSM8K doesn't have category columns, treat as single dataset + self._categories_cache = ["default"] + return self._categories_cache + + def get_available_categories(self) -> List[str]: + """Get list of all available categories in the dataset.""" + return self._get_categories() + + def load_dataset( + self, + categories: Optional[List[str]] = None, + samples_per_category: Optional[int] = None, + seed: int = 42, + ) -> Tuple[List[Question], DatasetInfo]: + """Load GSM8K dataset with optional filtering and sampling.""" + df = self._load_raw_dataset() + available_categories = self._get_categories() + + # Filter categories if specified (though GSM8K only has one category) + if categories: + missing_categories = set(categories) - set(available_categories) + if missing_categories: + raise ValueError( + f"Categories not found: {missing_categories}. 
" + f"Available: {available_categories}" + ) + selected_categories = categories + else: + selected_categories = available_categories + + # Sample questions if specified + if samples_per_category: + np.random.seed(seed) + random.seed(seed) + + sample_size = min(samples_per_category, len(df)) + df = df.sample(n=sample_size, random_state=seed) + + # Convert to Question objects + questions = [] + for _, row in df.iterrows(): + question_text = row["question"] + answer_text = row["answer"] + + # Extract the final numerical answer from the solution + import re + + # GSM8K answers end with "#### [number]" + answer_match = re.search(r"####\s*([0-9,.-]+)", answer_text) + correct_answer = answer_match.group(1) if answer_match else "Unknown" + + question = Question( + question_id=f"gsm8k_{len(questions)}", + question=question_text, + options=[], # GSM8K is free-form, no multiple choice + correct_answer=correct_answer, + category="default", + cot_content=answer_text, # Full solution as CoT + metadata={ + "difficulty": "Elementary", + "type": "word_problem", + "solution": answer_text, + }, + ) + questions.append(question) + + dataset_info = DatasetInfo( + name="GSM8K", + description="Grade school mathematics word problems requiring multi-step reasoning", + categories=selected_categories, + total_questions=len(questions), + format_type="free_form", + difficulty_level="Elementary", + ) + + return questions, dataset_info + + def format_prompt(self, question: Question, prompt_style: str = "plain") -> str: + """Format prompt for GSM8K questions.""" + if prompt_style == "plain": + return f"""Solve this math word problem: + +{question.question} + +Please provide your final answer in the following structured format: +ANSWER: [number] + +For example: ANSWER: 42""" + + elif prompt_style == "explicit_cot": + return f"""Solve this math word problem step by step, showing all your work: + +Problem: {question.question} + +Please work through this step-by-step: +1. Read the problem carefully and identify what is being asked +2. Identify the given information +3. Determine what operations are needed +4. Solve step by step, showing your calculations +5. State your final answer clearly + +Please provide your final answer in the following structured format: +ANSWER: [number] + +For example: ANSWER: 42""" + + else: + raise ValueError(f"Unknown prompt style: {prompt_style}") diff --git a/bench/vllm_semantic_router_bench/dataset_implementations/math_dataset.py b/bench/vllm_semantic_router_bench/dataset_implementations/math_dataset.py new file mode 100644 index 00000000..5ab3cce9 --- /dev/null +++ b/bench/vllm_semantic_router_bench/dataset_implementations/math_dataset.py @@ -0,0 +1,171 @@ +""" +MATH Dataset Implementation + +Hendrycks et al. MATH dataset - 12,500 competition mathematics problems +requiring advanced mathematical reasoning across algebra, calculus, geometry, etc. +""" + +import os +import random +import sys +from typing import List, Optional, Tuple + +import numpy as np +import pandas as pd +from datasets import load_dataset + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from ..dataset_interface import DatasetInfo, DatasetInterface, Question + + +class MATHDataset(DatasetInterface): + """MATH (Hendrycks et al.) 
dataset implementation for mathematical reasoning.""" + + def __init__(self): + """Initialize MATH dataset.""" + self._dataset_cache = None + self._categories_cache = None + + @property + def dataset_name(self) -> str: + return "MATH" + + @property + def supports_cot(self) -> bool: + return True # MATH has step-by-step solutions + + def _load_raw_dataset(self): + """Load raw MATH dataset from Hugging Face.""" + if self._dataset_cache is not None: + return self._dataset_cache + + # Load the test split - try different possible dataset names + try: + dataset = load_dataset("hendrycks/math", split="test") + except Exception: + try: + dataset = load_dataset("lighteval/MATH", split="test") + except Exception: + dataset = load_dataset("competition_math", split="test") + self._dataset_cache = pd.DataFrame(dataset) + return self._dataset_cache + + def _get_categories(self) -> List[str]: + """Get available categories (subjects) in MATH dataset.""" + if self._categories_cache is not None: + return self._categories_cache + + df = self._load_raw_dataset() + # MATH has 'type' field for subject areas + self._categories_cache = sorted(df["type"].unique().tolist()) + return self._categories_cache + + def get_available_categories(self) -> List[str]: + """Get list of all available categories in the dataset.""" + return self._get_categories() + + def load_dataset( + self, + categories: Optional[List[str]] = None, + samples_per_category: Optional[int] = None, + seed: int = 42, + ) -> Tuple[List[Question], DatasetInfo]: + """Load MATH dataset with optional filtering and sampling.""" + df = self._load_raw_dataset() + available_categories = self._get_categories() + + # Filter categories if specified + if categories: + missing_categories = set(categories) - set(available_categories) + if missing_categories: + raise ValueError( + f"Categories not found: {missing_categories}. 
" + f"Available: {available_categories}" + ) + df = df[df["type"].isin(categories)] + selected_categories = categories + else: + selected_categories = available_categories + + # Sample questions per category + if samples_per_category: + sampled_dfs = [] + np.random.seed(seed) + random.seed(seed) + + for category in selected_categories: + category_df = df[df["type"] == category] + if len(category_df) == 0: + continue + + sample_size = min(samples_per_category, len(category_df)) + sampled_df = category_df.sample(n=sample_size, random_state=seed) + sampled_dfs.append(sampled_df) + + if sampled_dfs: + df = pd.concat(sampled_dfs, ignore_index=True) + else: + df = pd.DataFrame() + + # Convert to Question objects + questions = [] + for _, row in df.iterrows(): + # MATH problems are free-form, but we need to extract the final answer + # The solution contains the final answer in \boxed{} format + question_text = row["problem"] + solution = row["solution"] + + # Extract boxed answer as the correct answer + import re + + boxed_match = re.search(r"\\boxed\{([^}]+)\}", solution) + correct_answer = boxed_match.group(1) if boxed_match else "Unknown" + + question = Question( + question_id=f"math_{len(questions)}", + question=question_text, + options=[], # MATH is free-form, no multiple choice + correct_answer=correct_answer, + category=row["type"], + cot_content=solution, # Full solution as CoT + metadata={ + "level": row.get("level", "Unknown"), + "subject": row["type"], + "solution": solution, + }, + ) + questions.append(question) + + dataset_info = DatasetInfo( + name="MATH", + description="Competition mathematics problems requiring advanced reasoning", + categories=selected_categories, + total_questions=len(questions), + format_type="free_form", + difficulty_level="Graduate", # Competition math is very hard + ) + + return questions, dataset_info + + def format_prompt(self, question: Question, prompt_style: str = "plain") -> str: + """Format prompt for MATH questions.""" + if prompt_style == "plain": + return f"Solve this mathematics problem step by step:\n\n{question.question}\n\nProvide your final answer in the format: Answer: [your answer]" + + elif prompt_style == "explicit_cot": + return f"""Solve this mathematics problem step by step, showing all your work: + +Problem: {question.question} + +Please work through this step-by-step: +1. Identify what is being asked +2. Determine the relevant mathematical concepts +3. Set up the problem +4. Solve step by step +5. Verify your answer + +Provide your final answer in the format: Answer: [your answer]""" + + else: + raise ValueError(f"Unknown prompt style: {prompt_style}") diff --git a/bench/vllm_semantic_router_bench/dataset_implementations/openbookqa_dataset.py b/bench/vllm_semantic_router_bench/dataset_implementations/openbookqa_dataset.py new file mode 100644 index 00000000..48bf35be --- /dev/null +++ b/bench/vllm_semantic_router_bench/dataset_implementations/openbookqa_dataset.py @@ -0,0 +1,167 @@ +""" +OpenBookQA Dataset Implementation + +Elementary science questions requiring reasoning over a "book" of facts. +Tests ability to combine multiple facts and apply scientific reasoning. 
+""" + +import os +import random +import sys +from typing import List, Optional, Tuple + +import numpy as np +import pandas as pd +from datasets import load_dataset + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from ..dataset_interface import DatasetInfo, DatasetInterface, Question + + +class OpenBookQADataset(DatasetInterface): + """OpenBookQA dataset implementation for scientific reasoning with facts.""" + + def __init__(self): + """Initialize OpenBookQA dataset.""" + self._dataset_cache = None + self._categories_cache = None + + @property + def dataset_name(self) -> str: + return "OpenBookQA" + + @property + def supports_cot(self) -> bool: + return False # OpenBookQA doesn't have built-in CoT content + + def _load_raw_dataset(self): + """Load raw OpenBookQA dataset from Hugging Face.""" + if self._dataset_cache is not None: + return self._dataset_cache + + # Load the test split + dataset = load_dataset("openbookqa", split="test") + self._dataset_cache = pd.DataFrame(dataset) + return self._dataset_cache + + def _get_categories(self) -> List[str]: + """Get available categories in OpenBookQA dataset.""" + if self._categories_cache is not None: + return self._categories_cache + + # OpenBookQA doesn't have category columns, treat as single dataset + self._categories_cache = ["default"] + return self._categories_cache + + def get_available_categories(self) -> List[str]: + """Get list of all available categories in the dataset.""" + return self._get_categories() + + def load_dataset( + self, + categories: Optional[List[str]] = None, + samples_per_category: Optional[int] = None, + seed: int = 42, + ) -> Tuple[List[Question], DatasetInfo]: + """Load OpenBookQA dataset with optional filtering and sampling.""" + df = self._load_raw_dataset() + available_categories = self._get_categories() + + # Filter categories if specified + if categories: + missing_categories = set(categories) - set(available_categories) + if missing_categories: + raise ValueError( + f"Categories not found: {missing_categories}. 
" + f"Available: {available_categories}" + ) + selected_categories = categories + else: + selected_categories = available_categories + + # Sample questions if specified + if samples_per_category: + np.random.seed(seed) + random.seed(seed) + + sample_size = min(samples_per_category, len(df)) + df = df.sample(n=sample_size, random_state=seed) + + # Convert to Question objects + questions = [] + for _, row in df.iterrows(): + question_stem = row["question_stem"] + choices = row["choices"] + answer_key = row["answerKey"] # A, B, C, D + + # Extract options from choices + # Handle different possible structures for choices + if isinstance(choices, dict) and "text" in choices: + options = choices["text"] + elif isinstance(choices, list): + options = [ + choice["text"] if isinstance(choice, dict) else choice + for choice in choices + ] + else: + options = [str(choices)] # Fallback + + question = Question( + question_id=f"openbookqa_{len(questions)}", + question=question_stem, + options=options, + correct_answer=answer_key, + category="default", + cot_content=None, + metadata={ + "difficulty": "Elementary", + "type": "science_reasoning", + "requires_fact_combination": True, + }, + ) + questions.append(question) + + dataset_info = DatasetInfo( + name="OpenBookQA", + description="Elementary science questions requiring reasoning over scientific facts", + categories=selected_categories, + total_questions=len(questions), + format_type="multiple_choice", + difficulty_level="Elementary", + ) + + return questions, dataset_info + + def format_prompt(self, question: Question, prompt_style: str = "plain") -> str: + """Format prompt for OpenBookQA questions.""" + options_text = "\n".join( + [f"{chr(65+i)}) {opt}" for i, opt in enumerate(question.options)] + ) + + if prompt_style == "plain": + return f"""Question: {question.question} + +{options_text} + +Think about what scientific facts and principles apply to this question. + +Provide your answer in the format 'Answer: [letter]'.""" + + elif prompt_style == "explicit_cot": + return f"""Question: {question.question} + +Options: +{options_text} + +Please work through this step-by-step: +1. Identify what scientific concept or principle the question is testing +2. Think about relevant scientific facts that might apply +3. Consider how different facts might combine to answer the question +4. Apply scientific reasoning to eliminate incorrect options +5. Select the best answer based on scientific principles + +Show your scientific reasoning step by step, then provide your answer in the format 'Answer: [letter]'.""" + + else: + raise ValueError(f"Unknown prompt style: {prompt_style}") diff --git a/bench/vllm_semantic_router_bench/dataset_implementations/sciq_dataset.py b/bench/vllm_semantic_router_bench/dataset_implementations/sciq_dataset.py new file mode 100644 index 00000000..f5c9c8e7 --- /dev/null +++ b/bench/vllm_semantic_router_bench/dataset_implementations/sciq_dataset.py @@ -0,0 +1,173 @@ +""" +SciQ Dataset Implementation + +Science Questions - multiple choice science questions requiring +scientific reasoning and knowledge application. 
+"""
+
+import os
+import random
+import sys
+from typing import List, Optional, Tuple
+
+import numpy as np
+import pandas as pd
+from datasets import load_dataset
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from ..dataset_interface import DatasetInfo, DatasetInterface, Question
+
+
+class SciQDataset(DatasetInterface):
+    """SciQ dataset implementation for scientific reasoning."""
+
+    def __init__(self):
+        """Initialize SciQ dataset."""
+        self._dataset_cache = None
+        self._categories_cache = None
+
+    @property
+    def dataset_name(self) -> str:
+        return "SciQ"
+
+    @property
+    def supports_cot(self) -> bool:
+        return False  # SciQ doesn't have built-in CoT content
+
+    def _load_raw_dataset(self):
+        """Load raw SciQ dataset from Hugging Face."""
+        if self._dataset_cache is not None:
+            return self._dataset_cache
+
+        # Load the test split
+        dataset = load_dataset("sciq", split="test")
+        self._dataset_cache = pd.DataFrame(dataset)
+        return self._dataset_cache
+
+    def _get_categories(self) -> List[str]:
+        """Get available categories in SciQ dataset."""
+        if self._categories_cache is not None:
+            return self._categories_cache
+
+        # SciQ doesn't have category columns, treat as single dataset
+        self._categories_cache = ["default"]
+        return self._categories_cache
+
+    def get_available_categories(self) -> List[str]:
+        """Get list of all available categories in the dataset."""
+        return self._get_categories()
+
+    def load_dataset(
+        self,
+        categories: Optional[List[str]] = None,
+        samples_per_category: Optional[int] = None,
+        seed: int = 42,
+    ) -> Tuple[List[Question], DatasetInfo]:
+        """Load SciQ dataset with optional filtering and sampling."""
+        df = self._load_raw_dataset()
+        available_categories = self._get_categories()
+
+        # Filter categories if specified
+        if categories:
+            missing_categories = set(categories) - set(available_categories)
+            if missing_categories:
+                raise ValueError(
+                    f"Categories not found: {missing_categories}. "
+                    f"Available: {available_categories}"
+                )
+            selected_categories = categories
+        else:
+            selected_categories = available_categories
+
+        # Sample questions if specified
+        if samples_per_category:
+            np.random.seed(seed)
+            random.seed(seed)
+
+            sample_size = min(samples_per_category, len(df))
+            df = df.sample(n=sample_size, random_state=seed)
+
+        # Convert to Question objects
+        questions = []
+        for _, row in df.iterrows():
+            question_text = row["question"]
+            correct_answer = row["correct_answer"]
+
+            # Build options list
+            options = [
+                row["correct_answer"],
+                row["distractor1"],
+                row["distractor2"],
+                row["distractor3"],
+            ]
+            # Shuffle options deterministically per question and find correct index
+            # (re-seeding with a constant here would give every question the same order)
+            shuffled_options = options.copy()
+            random.Random(seed + len(questions)).shuffle(shuffled_options)
+            correct_idx = shuffled_options.index(correct_answer)
+            correct_letter = chr(65 + correct_idx)  # A, B, C, D
+
+            question = Question(
+                question_id=f"sciq_{len(questions)}",
+                question=question_text,
+                options=shuffled_options,
+                correct_answer=correct_letter,
+                category="default",
+                cot_content=None,
+                metadata={
+                    "difficulty": "Moderate",
+                    "type": "science_multiple_choice",
+                    "support": row.get(
+                        "support", ""
+                    ),  # Background passage if available
+                },
+            )
+            questions.append(question)
+
+        dataset_info = DatasetInfo(
+            name="SciQ",
+            description="Science questions requiring scientific reasoning and knowledge",
+            categories=selected_categories,
+            total_questions=len(questions),
+            format_type="multiple_choice",
+            difficulty_level="Moderate",
+        )
+
+        return questions, dataset_info
+
+    def format_prompt(self, question: Question, prompt_style: str = "plain") -> str:
+        """Format prompt for SciQ questions."""
+        options_text = "\n".join(
+            [f"{chr(65+i)}) {opt}" for i, opt in enumerate(question.options)]
+        )
+
+        # Add support passage if available
+        support_text = ""
+        if question.metadata and question.metadata.get("support"):
+            support_text = f"Background: {question.metadata['support']}\n\n"
+
+        if prompt_style == "plain":
+            return f"""{support_text}Question: {question.question}
+
+{options_text}
+
+Provide your answer in the format 'Answer: [letter]'."""
+
+        elif prompt_style == "explicit_cot":
+            return f"""{support_text}Question: {question.question}
+
+Options:
+{options_text}
+
+Please work through this step-by-step:
+1. Read the question carefully and identify what scientific concept is being tested
+2. Consider any background information provided
+3. Apply relevant scientific principles and knowledge
+4. Eliminate incorrect options through reasoning
+5. Select the best answer
+
+Show your scientific reasoning step by step, then provide your answer in the format 'Answer: [letter]'."""
+
+        else:
+            raise ValueError(f"Unknown prompt style: {prompt_style}")
diff --git a/bench/vllm_semantic_router_bench/dataset_implementations/strategyqa_dataset.py b/bench/vllm_semantic_router_bench/dataset_implementations/strategyqa_dataset.py
new file mode 100644
index 00000000..03a7618a
--- /dev/null
+++ b/bench/vllm_semantic_router_bench/dataset_implementations/strategyqa_dataset.py
@@ -0,0 +1,161 @@
+"""
+StrategyQA Dataset Implementation
+
+Multi-step reasoning questions requiring implicit reasoning steps
+and strategic thinking to answer yes/no questions.
+""" + +import os +import random +import sys +from typing import List, Optional, Tuple + +import numpy as np +import pandas as pd +from datasets import load_dataset + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from ..dataset_interface import DatasetInfo, DatasetInterface, Question + + +class StrategyQADataset(DatasetInterface): + """StrategyQA dataset implementation for multi-step implicit reasoning.""" + + def __init__(self): + """Initialize StrategyQA dataset.""" + self._dataset_cache = None + self._categories_cache = None + + @property + def dataset_name(self) -> str: + return "StrategyQA" + + @property + def supports_cot(self) -> bool: + return True # StrategyQA has decomposition and evidence + + def _load_raw_dataset(self): + """Load raw StrategyQA dataset from Hugging Face.""" + if self._dataset_cache is not None: + return self._dataset_cache + + # Load the test split + dataset = load_dataset("ChilleD/StrategyQA", split="test") + self._dataset_cache = pd.DataFrame(dataset) + return self._dataset_cache + + def _get_categories(self) -> List[str]: + """Get available categories in StrategyQA dataset.""" + if self._categories_cache is not None: + return self._categories_cache + + # StrategyQA doesn't have category columns, treat as single dataset + self._categories_cache = ["default"] + return self._categories_cache + + def get_available_categories(self) -> List[str]: + """Get list of all available categories in the dataset.""" + return self._get_categories() + + def load_dataset( + self, + categories: Optional[List[str]] = None, + samples_per_category: Optional[int] = None, + seed: int = 42, + ) -> Tuple[List[Question], DatasetInfo]: + """Load StrategyQA dataset with optional filtering and sampling.""" + df = self._load_raw_dataset() + available_categories = self._get_categories() + + # Filter categories if specified + if categories: + missing_categories = set(categories) - set(available_categories) + if missing_categories: + raise ValueError( + f"Categories not found: {missing_categories}. " + f"Available: {available_categories}" + ) + selected_categories = categories + else: + selected_categories = available_categories + + # Sample questions if specified + if samples_per_category: + np.random.seed(seed) + random.seed(seed) + + sample_size = min(samples_per_category, len(df)) + df = df.sample(n=sample_size, random_state=seed) + + # Convert to Question objects + questions = [] + for _, row in df.iterrows(): + question_text = row["question"] + answer = row["answer"] # Boolean + correct_answer = "Yes" if answer else "No" + + # Build CoT from decomposition and evidence if available + cot_content = None + if "decomposition" in row and row["decomposition"]: + decomp = row["decomposition"] + if isinstance(decomp, list): + cot_content = "Reasoning steps:\n" + "\n".join( + [f"{i+1}. 
{step}" for i, step in enumerate(decomp)] + ) + else: + cot_content = f"Reasoning: {decomp}" + + question = Question( + question_id=f"strategyqa_{len(questions)}", + question=question_text, + options=["Yes", "No"], # Binary choice + correct_answer=correct_answer, + category="default", + cot_content=cot_content, + metadata={ + "difficulty": "Hard", + "type": "multi_step_reasoning", + "requires_implicit_steps": True, + }, + ) + questions.append(question) + + dataset_info = DatasetInfo( + name="StrategyQA", + description="Multi-step reasoning questions requiring implicit reasoning steps", + categories=selected_categories, + total_questions=len(questions), + format_type="binary_choice", + difficulty_level="Hard", + ) + + return questions, dataset_info + + def format_prompt(self, question: Question, prompt_style: str = "plain") -> str: + """Format prompt for StrategyQA questions.""" + if prompt_style == "plain": + return f"""Answer this question with Yes or No: + +{question.question} + +Think carefully about what information and reasoning steps are needed to answer this question. + +Answer: """ + + elif prompt_style == "explicit_cot": + return f"""Answer this question with Yes or No, showing your reasoning: + +Question: {question.question} + +Please work through this step-by-step: +1. Break down what the question is really asking +2. Identify what facts or knowledge are needed +3. Work through the logical steps required +4. Consider any implicit assumptions or connections +5. Reach your conclusion + +Show your reasoning step by step, then provide your final answer (Yes or No).""" + + else: + raise ValueError(f"Unknown prompt style: {prompt_style}") diff --git a/bench/vllm_semantic_router_bench/router_reason_bench_multi_dataset.py b/bench/vllm_semantic_router_bench/router_reason_bench_multi_dataset.py index 5a963ff0..995b49c0 100644 --- a/bench/vllm_semantic_router_bench/router_reason_bench_multi_dataset.py +++ b/bench/vllm_semantic_router_bench/router_reason_bench_multi_dataset.py @@ -32,10 +32,15 @@ from .dataset_interface import DatasetInfo, Question, questions_to_dataframe # Robust answer extraction patterns for structured response parsing -ANSWER_PATTERN_PRIMARY = re.compile(r"(?:answer\s*:?\s*)([A-Z])", re.IGNORECASE) -ANSWER_PATTERN_FINAL = re.compile(r"(?:final\s*answer\s*:?\s*)([A-Z])", re.IGNORECASE) +ANSWER_PATTERN_PRIMARY = re.compile( + r"(?:answer\s*:?\s+)([A-Z])(?:\s|[.!?)]|$)", re.IGNORECASE +) +ANSWER_PATTERN_FINAL = re.compile( + r"(?:final\s*answer\s*:?\s+)([A-Z])(?:\s|[.!?)]|$)", re.IGNORECASE +) ANSWER_PATTERN_CONCLUSION = re.compile( - r"(?:therefore|thus|so).*?([A-Z])", re.IGNORECASE + r"(?:therefore|thus|so).*?(?:answer\s+is\s+|is\s+)([A-Z])(?:\s|[.!?)]|$)", + re.IGNORECASE, ) @@ -196,25 +201,46 @@ def get_dataset_optimal_tokens(dataset_info, model_name=None): model_multiplier = 1.0 if model_name: model_lower = model_name.lower() + print(f" 🔍 Model detection: '{model_name}' -> '{model_lower}'") if "qwen" in model_lower: # Qwen models are more efficient and can handle longer contexts model_multiplier = 1.5 + print(f" ✅ Qwen model detected, using multiplier: {model_multiplier}") elif "deepseek" in model_lower: # DeepSeek models (e.g., V3.1) are capable and can handle longer contexts model_multiplier = 1.5 + print(f" ✅ DeepSeek model detected, using multiplier: {model_multiplier}") elif "gpt-oss" in model_lower: # GPT-OSS models use baseline token limits model_multiplier = 1.0 + print(f" ✅ GPT-OSS model detected, using multiplier: {model_multiplier}") + else: + print( + f" 
⚠️ Unknown model type, using baseline multiplier: {model_multiplier}" + ) # Default to baseline for unknown models - # Base token limits per dataset (optimized for gpt-oss20b baseline) + # Base token limits per dataset (optimized for reasoning tasks with generous headroom) base_dataset_tokens = { - "gpqa": 3000, # Graduate-level scientific reasoning (increased for complex multi-step reasoning) - "truthfulqa": 800, # Misconception analysis - "hellaswag": 800, # Natural continuation reasoning - "arc": 800, # Elementary/middle school science - "commonsenseqa": 1000, # Common sense reasoning - "mmlu": 3000, # Academic knowledge (increased for complex technical domains like engineering/chemistry) + # Proven optimal datasets + "gpqa": 4000, # Graduate-level scientific reasoning (proven optimal from results) + "mmlu": 4000, # Academic knowledge (proven optimal from results) + "truthfulqa": 2500, # Misconception analysis (proven adequate from results) + # Mathematical reasoning datasets + # "math": 6000, # Competition mathematics - DISABLED: dataset not available + "gsm8k": 2500, # Elementary math word problems - simpler than competition math + "aqua-rat": 3000, # Algebraic word problems with rationales + # Multi-step reasoning datasets + "drop": 4000, # Reading comprehension with discrete reasoning - complex passages + "strategyqa": 3500, # Multi-step implicit reasoning - requires detailed thinking + # Scientific reasoning datasets + "sciq": 2000, # Science questions - moderate complexity + "openbookqa": 2500, # Elementary science with fact reasoning + # Other datasets + "hellaswag": 2000, # Natural continuation reasoning + "arc": 2000, # Elementary/middle school science + "arc-challenge": 3000, # Harder ARC questions + "commonsenseqa": 2500, # Common sense reasoning } # Find matching dataset and apply model multiplier @@ -229,16 +255,34 @@ def get_dataset_optimal_tokens(dataset_info, model_name=None): difficulty_tokens = {"graduate": 300, "hard": 300, "moderate": 200, "easy": 150} base_tokens = difficulty_tokens.get(difficulty, 200) + # Special case: Qwen3 models need higher tokens for complex reasoning datasets + if model_name and "qwen" in model_name.lower(): + if "mmlu" in dataset_name or "gpqa" in dataset_name: + final_tokens = 10240 + dataset_type = "MMLU" if "mmlu" in dataset_name else "GPQA" + print( + f" 🎯 Special case: Qwen3 + {dataset_type} = {final_tokens} tokens (fixed requirement)" + ) + return final_tokens + # elif "math" in dataset_name: # DISABLED: dataset not available + # final_tokens = 8000 # Competition math needs extensive proofs + # print(f" 🎯 Special case: Qwen3 + MATH = {final_tokens} tokens (competition math requirement)") + # return final_tokens + # Apply model-specific multiplier and round to nearest 50 final_tokens = int(base_tokens * model_multiplier) final_tokens = ((final_tokens + 25) // 50) * 50 # Round to nearest 50 + print( + f" 🧮 Token calculation: {base_tokens} × {model_multiplier} = {int(base_tokens * model_multiplier)} → {final_tokens} (rounded)" + ) + return final_tokens def get_available_models(endpoint: str, api_key: str = "") -> List[str]: """Get available models from an endpoint.""" - client = OpenAI(base_url=endpoint, api_key=api_key or None) + client = OpenAI(base_url=endpoint, api_key=api_key or None, timeout=300.0) try: models = client.models.list() return [m.id for m in models.data] @@ -247,8 +291,8 @@ def get_available_models(endpoint: str, api_key: str = "") -> List[str]: return [] -def extract_answer(response: Any) -> Optional[str]: - """Extract 
answer from model response.""" +def extract_answer(response: Any, question: Optional[Question] = None) -> Optional[str]: + """Extract answer from model response based on question format.""" # Normalize non-string responses into a string to be robust to providers # that return structured content (e.g., lists of parts or dicts). if response is None: @@ -285,6 +329,39 @@ def extract_answer(response: Any) -> Optional[str]: except Exception: response = str(response) + # First, try to extract structured answer format "ANSWER: [value]" + structured_answer = extract_structured_answer(response) + if structured_answer: + return structured_answer + + # Determine answer format based on question type + if question and hasattr(question, "options") and question.options: + if len(question.options) == 2 and set(question.options) == {"Yes", "No"}: + # Binary Yes/No questions (StrategyQA) + return extract_binary_answer(response) + else: + # Multiple choice questions (GPQA, MMLU, etc.) + return extract_multiple_choice_answer(response) + else: + # Free-form questions (GSM8K, DROP, etc.) + return extract_free_form_answer(response) + + +def extract_structured_answer(response: str) -> Optional[str]: + """Extract answer from structured 'ANSWER: [value]' format.""" + # Look for "ANSWER: [value]" pattern (case insensitive) + pattern = re.compile(r"ANSWER:\s*(.+?)(?:\n|$)", re.IGNORECASE) + match = pattern.search(response) + if match: + answer = match.group(1).strip() + # Clean up common trailing punctuation + answer = re.sub(r"[.!?]+$", "", answer) + return answer + return None + + +def extract_multiple_choice_answer(response: str) -> Optional[str]: + """Extract multiple choice answer (A, B, C, D, etc.).""" # Try multiple extraction patterns in order of preference patterns = [ANSWER_PATTERN_PRIMARY, ANSWER_PATTERN_FINAL, ANSWER_PATTERN_CONCLUSION] @@ -293,6 +370,20 @@ def extract_answer(response: Any) -> Optional[str]: if match: return match.group(1).upper() + # Additional patterns for common answer formats + additional_patterns = [ + r"(?:correct\s+answer\s+is\s+)([A-Z])", # "correct answer is E" + r"(?:option\s+)([A-Z])", # "option E" + r"(?:choice\s+)([A-Z])", # "choice E" + r"([A-Z])\)", # "E)" format + r"([A-Z])\s*[.!]?\s*$", # Letter at end of line + ] + + for pattern in additional_patterns: + match = re.search(pattern, response, re.IGNORECASE) + if match: + return match.group(1).upper() + # Fallback 1: Look for standalone letters at end of response lines = response.strip().split("\n") for line in reversed(lines[-3:]): # Check last 3 lines @@ -300,14 +391,206 @@ def extract_answer(response: Any) -> Optional[str]: if len(line) == 1 and line.upper() in "ABCDEFGHIJKLMNOPQRSTUVWXYZ": return line.upper() - # Fallback 2: Find last letter in entire response - for char in reversed(response): - if char.upper() in "ABCDEFGHIJKLMNOPQRSTUVWXYZ": - return char.upper() + # Fallback 2: Look for letters in specific contexts (more targeted) + # Check for patterns like "is E" or "answer E" in last few lines + for line in reversed(lines[-3:]): + line = line.strip() + # Look for letter after common words + context_match = re.search( + r"(?:is|answer|option|choice)\s+([A-Z])(?:\s|[.!?]|$)", line, re.IGNORECASE + ) + if context_match: + return context_match.group(1).upper() + + # Final fallback: Find last letter that appears to be an answer (not in middle of words) + # Only consider letters that are standalone or followed by punctuation + for match in re.finditer(r"\b([A-Z])(?:\s|[.!?)]|$)", response): + letter = 
match.group(1).upper() + if letter in "ABCDEFGHIJKLMNOPQRSTUVWXYZ": + return letter # Return the last match found + + return None + + +def extract_binary_answer(response: str) -> Optional[str]: + """Extract Yes/No answer from response.""" + response_lower = response.lower() + + # Look for explicit yes/no patterns + yes_patterns = [r"\byes\b", r"\btrue\b", r"\bcorrect\b", r"\baffirmative\b"] + no_patterns = [r"\bno\b", r"\bfalse\b", r"\bincorrect\b", r"\bnegative\b"] + + # Check last few lines first (most likely to contain final answer) + lines = response.strip().split("\n") + for line in reversed(lines[-3:]): + line_lower = line.lower().strip() + + for pattern in yes_patterns: + if re.search(pattern, line_lower): + return "Yes" + + for pattern in no_patterns: + if re.search(pattern, line_lower): + return "No" + + # Fallback: check entire response + for pattern in yes_patterns: + if re.search(pattern, response_lower): + return "Yes" + + for pattern in no_patterns: + if re.search(pattern, response_lower): + return "No" + + return None + + +def extract_free_form_answer(response: str) -> Optional[str]: + """Extract free-form answer (numbers, text, etc.).""" + # For numerical answers, look for numbers with improved patterns + number_patterns = [ + r"(?:answer\s*:?\s*)([0-9,.-]+)", # "Answer: 42" or "Answer 42" + r"####\s*([0-9,.-]+)", # GSM8K format "#### 42" + r"\$([0-9,.-]+)", # Money format "$42" + r"([0-9,.-]+)\s*(?:dollars?|cents?|%|percent)", # "42 dollars" + r"(?:is\s+)([0-9,.-]+)", # "is 42" or "is 68.5" + r"(?:was\s+)([0-9,.-]+)", # "was 42" + r"(?:were\s+)([0-9,.-]+)", # "were 42" + r"([0-9,.-]+)(?:\s+(?:people|units|items|years|days|months|miles|kilometers|percent|%|dollars?|cents?))", # "68.5 people" + ] + + # Check last few lines first (most likely to contain final answer) + lines = response.strip().split("\n") + for line in reversed(lines[-3:]): + line = line.strip() + + for pattern in number_patterns: + match = re.search(pattern, line, re.IGNORECASE) + if match: + return match.group(1).replace(",", "") # Remove commas from numbers + + # Fallback: check entire response for numbers + for pattern in number_patterns: + match = re.search(pattern, response, re.IGNORECASE) + if match: + return match.group(1).replace(",", "") + + # For non-numerical free-form answers (like "germans", "Centenary Medal") + # Look for explicit answer patterns first + text_patterns = [ + r"(?:answer\s*:?\s*)([a-zA-Z][a-zA-Z0-9\s-]+?)(?:\s*[.!?]|$)", # "Answer: germans" or "Answer: Centenary Medal" + r"(?:is\s+)([a-zA-Z][a-zA-Z0-9\s-]+?)(?:\s*[.!?]|$)", # "is germans" + r"(?:was\s+)([a-zA-Z][a-zA-Z0-9\s-]+?)(?:\s*[.!?]|$)", # "was Centenary Medal" + r"(?:were\s+)([a-zA-Z][a-zA-Z0-9\s-]+?)(?:\s*[.!?]|$)", # "were germans" + r"(?:awarded\s+(?:him\s+)?(?:the\s+)?)([A-Z][a-zA-Z0-9\s-]+?)(?:\s*[.!?]|$)", # "awarded the Centenary Medal" + r"(?:received\s+(?:the\s+)?)([A-Z][a-zA-Z0-9\s-]+?)(?:\s*[.!?]|$)", # "received the Centenary Medal" + r"(?:called\s+)([a-zA-Z][a-zA-Z0-9\s-]+?)(?:\s*[.!?]|$)", # "called germans" + r"(?:named\s+)([a-zA-Z][a-zA-Z0-9\s-]+?)(?:\s*[.!?]|$)", # "named Centenary Medal" + ] + + # Check last few lines for text answers + for line in reversed(lines[-3:]): + line = line.strip() + + for pattern in text_patterns: + match = re.search(pattern, line, re.IGNORECASE) + if match: + answer = match.group(1).strip() + # Clean up common suffixes but preserve important words + answer = re.sub( + r"\s+(?:in\s+\d+|for\s+service).*$", "", answer, flags=re.IGNORECASE + ) + # Limit to reasonable length (1-4 
words for most DROP answers) + words = answer.split() + if len(words) <= 4: + return answer + else: + return " ".join(words[:2]) # Take first 2 words for long matches + + # Final fallback: extract last meaningful line + for line in reversed(lines[-3:]): + line = line.strip() + if line and not line.startswith( + ( + "Question:", + "Answer:", + "Therefore", + "So", + "Thus", + "Based on", + "Looking at", + ) + ): + # Remove common prefixes and return clean answer + line = re.sub( + r"^(?:the\s+)?(?:answer\s+is\s+)?", "", line, flags=re.IGNORECASE + ) + # Take first few words if it's a long sentence + words = line.split() + if len(words) > 5: + return " ".join(words[:3]) # Take first 3 words + return line.strip() return None +def compare_free_form_answers(predicted: str, correct: str) -> bool: + """Compare free-form answers with normalization.""" + if not predicted or not correct: + return False + + # Normalize both answers + predicted_norm = normalize_answer(predicted) + correct_norm = normalize_answer(correct) + + # Direct match + if predicted_norm == correct_norm: + return True + + # For numerical answers, try parsing as numbers + try: + pred_num = float(predicted_norm.replace(",", "")) + correct_num = float(correct_norm.replace(",", "")) + # Allow small floating point differences + return abs(pred_num - correct_num) < 1e-6 + except (ValueError, AttributeError): + pass + + # For text answers, check if predicted contains correct or vice versa + if len(predicted_norm) > 3 and len(correct_norm) > 3: + return predicted_norm in correct_norm or correct_norm in predicted_norm + + return False + + +def normalize_answer(answer: str) -> str: + """Normalize answer for comparison.""" + if not isinstance(answer, str): + answer = str(answer) + + # Convert to lowercase and strip + answer = answer.lower().strip() + + # Remove common punctuation and extra spaces + answer = re.sub(r"[^\w\s.-]", "", answer) + answer = re.sub(r"\s+", " ", answer).strip() + + # Remove common prefixes + prefixes = [ + "the answer is", + "answer:", + "the answer:", + "answer is", + "final answer:", + "therefore", + ] + for prefix in prefixes: + if answer.startswith(prefix): + answer = answer[len(prefix) :].strip() + + return answer + + def call_model( client: OpenAI, model: str, @@ -343,41 +626,41 @@ def build_extra_body_for_model( ) -> Optional[Dict[str, Any]]: """Return an extra_body dict to toggle reasoning for a given model. + This function matches the exact pattern from reasoning_eval_consolidated.py + to ensure compatibility and consistent behavior. 
+ - DeepSeek v3.1: {"chat_template_kwargs": {"thinking": true/false}} - - GPT-OSS: {"reasoning_effort": "low|medium|high"} when ON; if not provided, then low + - Qwen3: {"chat_template_kwargs": {"enable_thinking": true/false}} + - GPT-OSS: {"reasoning_effort": "low|high"} based on reasoning flag """ - # reasoning: True -> ON, False -> OFF, None -> base (default behavior) + # reasoning: True -> ON, False -> OFF, None -> no reasoning parameters + if reasoning is None: + return None lower = model_name.lower() + + # DeepSeek v3.1 family (matches reasoning_eval_consolidated.py pattern) if (("ds" in lower) or ("deepseek" in lower)) and ( "v31" in lower or "v3.1" in lower or "v3" in lower ): - if reasoning is True: - return {"chat_template_kwargs": {"thinking": True}} - elif reasoning is False: - return {"chat_template_kwargs": {"thinking": False}} - else: # reasoning is None (base mode) - # Base: do not set thinking for DeepSeek - let it use default behavior - return None - - # Qwen3 family + return {"chat_template_kwargs": {"thinking": reasoning}} + + # Qwen3 family (matches reasoning_eval_consolidated.py pattern) if "qwen3" in lower: - if reasoning is True: - return {"chat_template_kwargs": {"enable_thinking": True}} - if reasoning is False: - return {"chat_template_kwargs": {"enable_thinking": False}} - return None + return {"chat_template_kwargs": {"enable_thinking": reasoning}} - # GPT OSS family + # GPT-OSS family (matches reasoning_eval_consolidated.py pattern) if "gpt-oss" in lower or "openai/gpt-oss" in lower or "gpt_oss" in lower: - if reasoning is True: + if reasoning: return {"reasoning_effort": "high"} - elif reasoning is False: + else: return {"reasoning_effort": "low"} - else: # reasoning is None (base mode) - # Base: do not set reasoning_effort - let it use default behavior - return None + # OpenAI models with reasoning parameter + if "gpt" in lower or "o1" in lower: + return {"reasoning": reasoning} + + # Model does not support reasoning parameters return None @@ -396,13 +679,17 @@ def process_question_single( # Format prompt based on mode if prompt_mode == "XC": prompt = dataset.format_prompt(question, "explicit_cot") - extra_body = None + extra_body = ( + None # XC mode never uses reasoning parameters (CoT prompt instead) + ) elif prompt_mode == "AR": prompt = dataset.format_prompt(question, "plain") extra_body = ar_extra_body - else: # NR or Router-Transparent + else: # NR mode (could be Router-Transparent or direct vLLM) prompt = dataset.format_prompt(question, "plain") - extra_body = None + # For Router-Transparent: ar_extra_body=None (router decides reasoning) + # For direct vLLM: ar_extra_body contains reasoning parameters + extra_body = ar_extra_body start_time = time.time() response_text, success, prompt_tokens, completion_tokens, total_tokens = call_model( @@ -410,21 +697,29 @@ def process_question_single( ) end_time = time.time() - predicted_answer = extract_answer(response_text) if success else None - - # Compare predicted answer with correct answer (handle both letter and index formats) - if predicted_answer and predicted_answer in "ABCDEFGHIJKLMNOPQRSTUVWXYZ": - if isinstance(question.correct_answer, str): - # Dataset stores answer as letter (e.g., MMLU: "F") - is_correct = predicted_answer == question.correct_answer - elif isinstance(question.correct_answer, int): - # Dataset stores answer as index (e.g., CommonsenseQA: 1, ARC: 0) - predicted_idx = ord(predicted_answer) - ord("A") - is_correct = predicted_idx == question.correct_answer + predicted_answer = 
extract_answer(response_text, question) if success else None + + # Compare predicted answer with correct answer (handle multiple formats) + is_correct = False + if predicted_answer: + if hasattr(question, "options") and question.options: + if len(question.options) == 2 and set(question.options) == {"Yes", "No"}: + # Binary Yes/No questions (StrategyQA) + is_correct = predicted_answer == question.correct_answer + elif predicted_answer in "ABCDEFGHIJKLMNOPQRSTUVWXYZ": + # Multiple choice questions (GPQA, MMLU, etc.) + if isinstance(question.correct_answer, str): + # Dataset stores answer as letter (e.g., MMLU: "F") + is_correct = predicted_answer == question.correct_answer + elif isinstance(question.correct_answer, int): + # Dataset stores answer as index (e.g., CommonsenseQA: 1, ARC: 0) + predicted_idx = ord(predicted_answer) - ord("A") + is_correct = predicted_idx == question.correct_answer else: - is_correct = False - else: - is_correct = False + # Free-form questions (GSM8K, DROP, etc.) + is_correct = compare_free_form_answers( + predicted_answer, question.correct_answer + ) return { "mode": prompt_mode, @@ -456,7 +751,7 @@ def evaluate_model_router_transparent( temperature: float, ) -> pd.DataFrame: """Evaluate model in router-transparent mode.""" - client = OpenAI(base_url=endpoint, api_key=api_key or None) + client = OpenAI(base_url=endpoint, api_key=api_key or None, timeout=300.0) print(f"Using model: {model}, endpoint: {endpoint}") results: List[Dict[str, Any]] = [] @@ -526,7 +821,7 @@ def evaluate_model_vllm_multimode( 2. XC - CoT prompt, no reasoning toggle (prompt-based reasoning) - ONLY if dataset has CoT 3. NR_REASONING - Plain prompt, reasoning toggle ON (model-based reasoning) - ALWAYS included """ - client = OpenAI(base_url=endpoint, api_key=api_key or "dummy-key") + client = OpenAI(base_url=endpoint, api_key=api_key or "dummy-key", timeout=300.0) print(f"Using vLLM model: {model}, endpoint: {endpoint}") # Check if dataset has actual CoT content by examining sample questions @@ -565,35 +860,22 @@ def evaluate_model_vllm_multimode( ) or ("qwen3" in model_lower) # Base modes (always included) - if is_deepseek_or_qwen: - mode_variants: List[Tuple[str, str, Optional[bool]]] = [ - ("VLLM_NR", "NR", False), # Plain prompt, reasoning OFF (baseline) - ( - "VLLM_NR_REASONING", - "NR", - True, - ), # Plain prompt, reasoning ON (model reasoning) - ] - else: - mode_variants: List[Tuple[str, str, Optional[bool]]] = [ - ("VLLM_NR", "NR", None), # Plain prompt, no toggle (baseline) - ( - "VLLM_NR_REASONING", - "NR", - True, - ), # Plain prompt, reasoning toggle ON (model reasoning) - ] + # Always use explicit True/False for reasoning-capable models to ensure consistent behavior + mode_variants: List[Tuple[str, str, Optional[bool]]] = [ + ("VLLM_NR", "NR", False), # Plain prompt, reasoning OFF (baseline) + ( + "VLLM_NR_REASONING", + "NR", + True, + ), # Plain prompt, reasoning ON (model reasoning) + ] # Add XC mode only if dataset has CoT content if has_cot_content: - if is_deepseek_or_qwen: - mode_variants.insert( - 1, ("VLLM_XC", "XC", False) - ) # Insert between NR and NR_REASONING - else: - mode_variants.insert( - 1, ("VLLM_XC", "XC", None) - ) # Insert between NR and NR_REASONING + # Always use explicit False for XC mode (CoT prompt with reasoning OFF) + mode_variants.insert( + 1, ("VLLM_XC", "XC", False) + ) # Insert between NR and NR_REASONING def run_variants(q: Question) -> List[Dict[str, Any]]: local_records: List[Dict[str, Any]] = [] @@ -869,13 +1151,23 @@ def main(): 
print(f"vLLM models: {vllm_models}") # Function to get optimal tokens for a specific model - # For fair comparison, use consistent token limits regardless of model name + # Use model-aware token allocation for optimal performance def get_model_optimal_tokens(model_name): if args.max_tokens: return args.max_tokens else: - # Use base dataset tokens without model-specific multipliers for fair comparison - return get_dataset_optimal_tokens(dataset_info, model_name=None) + # For router evaluation, use the first vLLM model for token calculation if available + # This ensures consistent token allocation between router and vLLM evaluations + reference_model = None + if vllm_models and len(vllm_models) > 0: + reference_model = vllm_models[0] + print( + f" 🔗 Using vLLM model '{reference_model}' for router token calculation" + ) + elif model_name and model_name != "auto": + reference_model = model_name + + return get_dataset_optimal_tokens(dataset_info, model_name=reference_model) # Router evaluation (NR-only) if args.run_router and router_endpoint and router_models: