diff --git a/bench/comprehensive_bench.sh b/bench/comprehensive_bench.sh index 5054d537..761f78b3 100755 --- a/bench/comprehensive_bench.sh +++ b/bench/comprehensive_bench.sh @@ -12,6 +12,7 @@ ROUTER_ENDPOINT="http://127.0.0.1:8801/v1" VLLM_ENDPOINT="http://127.0.0.1:8000/v1" VLLM_MODEL="" # Will be auto-detected from endpoint if not specified ROUTER_MODEL="auto" +CONCURRENT_REQUESTS=8 OUTPUT_BASE="results/comprehensive_research_$(date +%Y%m%d_%H%M%S)" # Parse command line arguments @@ -105,12 +106,28 @@ PERSISTENT_RESEARCH_CSV="results/research_results_master.csv" # Dataset configurations (dataset_name:samples_per_category) # Balanced for statistical significance vs runtime declare -A DATASET_CONFIGS=( - ["mmlu"]=10 # 57 subjects × 10 = 570 samples - ["arc"]=15 # 1 category × 15 = 15 samples - ["gpqa"]=20 # 1 category × 20 = 20 samples - ["truthfulqa"]=15 # 1 category × 15 = 15 samples - ["commonsenseqa"]=20 # 1 category × 20 = 20 samples - ["hellaswag"]=8 # ~50 activities × 8 = ~400 samples + # Core proven datasets + ["gpqa"]=20 # 1 category × 20 = 20 samples - OUTSTANDING reasoning differentiation + ["mmlu"]=10 # 57 subjects × 10 = 570 samples - EXCELLENT reasoning differentiation + ["truthfulqa"]=15 # Truthfulness evaluation - some reasoning differentiation (60% → 73.3%) + + # Mathematical reasoning datasets + # ["math"]=15 # Competition mathematics - DISABLED: Dataset not available on HF Hub + ["gsm8k"]=25 # Elementary math word problems - EXPECTED good reasoning differentiation + ["aqua-rat"]=20 # Algebraic word problems with rationales - EXPECTED good differentiation + + # Multi-step reasoning datasets + ["drop"]=20 # Reading comprehension with discrete reasoning - EXPECTED excellent differentiation + ["strategyqa"]=20 # Multi-step implicit reasoning - EXPECTED good differentiation + + # Scientific reasoning datasets + ["sciq"]=25 # Science questions requiring reasoning - EXPECTED moderate differentiation + ["openbookqa"]=20 # Elementary science with fact reasoning - EXPECTED moderate differentiation + + # Disabled datasets with poor reasoning differentiation: + # ["arc-challenge"]=15 # 100% accuracy across all modes, minimal benefit + # ["commonsenseqa"]=20 # Same accuracy across modes, small token difference + # ["hellaswag"]=2 # Minimal differentiation, not reasoning-focused ) echo -e "${BLUE}🔬 COMPREHENSIVE MULTI-DATASET BENCHMARK FOR RESEARCH${NC}" @@ -136,14 +153,17 @@ source "$VENV_PATH/bin/activate" mkdir -p "$OUTPUT_BASE" mkdir -p "$(dirname "$PERSISTENT_RESEARCH_CSV")" -# Initialize persistent research results CSV (create header only if file doesn't exist) -if [[ ! 
-f "$PERSISTENT_RESEARCH_CSV" ]]; then - echo "Dataset,Mode,Model,Accuracy,Avg_Latency_ms,Avg_Total_Tokens,Sample_Count,Timestamp" > "$PERSISTENT_RESEARCH_CSV" - echo -e "${GREEN}📊 Created new master research CSV: $PERSISTENT_RESEARCH_CSV${NC}" -else - echo -e "${BLUE}📊 Using existing master research CSV: $PERSISTENT_RESEARCH_CSV${NC}" +# Backup and clear master research CSV for fresh results +if [[ -f "$PERSISTENT_RESEARCH_CSV" ]]; then + BACKUP_CSV="${PERSISTENT_RESEARCH_CSV}.backup_$(date +%Y%m%d_%H%M%S)" + cp "$PERSISTENT_RESEARCH_CSV" "$BACKUP_CSV" + echo -e "${GREEN}📊 Backed up existing master CSV to: $BACKUP_CSV${NC}" fi +# Create fresh master research CSV with header only +echo "Dataset,Mode,Model,Accuracy,Avg_Latency_ms,Avg_Total_Tokens,Sample_Count,Timestamp" > "$PERSISTENT_RESEARCH_CSV" +echo -e "${GREEN}📊 Created fresh master research CSV: $PERSISTENT_RESEARCH_CSV${NC}" + # Also create a timestamped copy for this run RESEARCH_CSV="$OUTPUT_BASE/research_results.csv" cp "$PERSISTENT_RESEARCH_CSV" "$RESEARCH_CSV" @@ -225,9 +245,12 @@ try: model_name = '$VLLM_MODEL' # For vLLM, we might have multiple modes (NR, NR_REASONING) - if '$mode' == 'vllm' and 'mode' in df.columns: - for mode_type in df['mode'].unique(): - mode_df = df[df['mode'] == mode_type] + # Check both 'mode' and 'mode_label' columns for mode information + if '$mode' == 'vllm' and ('mode' in df.columns or 'mode_label' in df.columns): + # Use mode_label if available (more descriptive), otherwise use mode + mode_column = 'mode_label' if 'mode_label' in df.columns else 'mode' + for mode_type in df[mode_column].unique(): + mode_df = df[df[mode_column] == mode_type] # Recalculate metrics for this specific mode using correct column names if 'is_correct' in mode_df.columns: @@ -253,7 +276,17 @@ try: mode_samples = len(mode_df) - csv_line = f'$dataset,vLLM_{mode_type},{model_name},{mode_accuracy:.3f},{mode_latency:.1f},{mode_tokens:.1f},{mode_samples},$timestamp' + # Map technical mode names to descriptive names + if mode_type == 'VLLM_NR': + display_mode = 'vLLM_No_Reasoning' + elif mode_type == 'VLLM_NR_REASONING': + display_mode = 'vLLM_All_Reasoning' + elif mode_type == 'VLLM_XC': + display_mode = 'vLLM_CoT' + else: + display_mode = mode_type # Use the mode_label as-is if not recognized + + csv_line = f'$dataset,{display_mode},{model_name},{mode_accuracy:.3f},{mode_latency:.1f},{mode_tokens:.1f},{mode_samples},$timestamp' print(f' 📝 Writing to CSV: {csv_line}', file=sys.stderr) print(csv_line) else: @@ -283,7 +316,7 @@ run_dataset_benchmark() { echo -e "${GREEN}📊 Benchmarking $dataset dataset ($samples samples per category)...${NC}" - # Router benchmark + # Router benchmark (pass vLLM info for consistent token calculation) echo -e "${YELLOW} 🤖 Running router evaluation...${NC}" python3 -m vllm_semantic_router_bench.router_reason_bench_multi_dataset \ --dataset "$dataset" \ @@ -291,6 +324,9 @@ run_dataset_benchmark() { --run-router \ --router-endpoint "$ROUTER_ENDPOINT" \ --router-models "$ROUTER_MODEL" \ + --vllm-endpoint "$VLLM_ENDPOINT" \ + --vllm-models "$VLLM_MODEL" \ + --concurrent-requests "$CONCURRENT_REQUESTS" \ --output-dir "$OUTPUT_BASE/router_$dataset" \ --seed 42 @@ -307,41 +343,104 @@ run_dataset_benchmark() { --vllm-models "$VLLM_MODEL" \ --vllm-exec-modes NR NR_REASONING \ --output-dir "$OUTPUT_BASE/vllm_$dataset" \ + --concurrent-requests "$CONCURRENT_REQUESTS" \ --seed 42 # Extract and save vLLM metrics immediately extract_and_save_metrics "$dataset" "vllm" "$OUTPUT_BASE/vllm_$dataset" - echo -e "${GREEN} 
✅ Completed $dataset benchmark${NC}" + # Generate updated comprehensive plots for current dataset + echo -e "${BLUE} 📈 Updating comprehensive plots with $dataset results...${NC}" + generate_comprehensive_plot "$dataset" + + echo -e "${GREEN} ✅ Completed $dataset benchmark and comprehensive plots updated${NC}" + echo -e "${GREEN} 📈 CSV data updated in: $PERSISTENT_RESEARCH_CSV${NC}" echo "" } -# Function to generate comparison plots -generate_plots() { - echo -e "${BLUE}📈 Generating comparison plots...${NC}" +# Function to generate comprehensive plot with all completed datasets (called after each dataset completes) +generate_comprehensive_plot() { + local current_dataset=$1 - for dataset in "${!DATASET_CONFIGS[@]}"; do - echo -e "${YELLOW} 📊 Plotting $dataset results...${NC}" + if [[ -n "$current_dataset" ]]; then + echo -e "${YELLOW} 📊 Generating plot for current dataset: $current_dataset...${NC}" + else + echo -e "${YELLOW} 📊 Generating comprehensive plot with all completed datasets...${NC}" + fi + + # Use the plot_comprehensive_results.py script to generate updated charts + if [[ -f "plot_comprehensive_results.py" ]]; then + echo -e "${BLUE} Running comprehensive plotting script...${NC}" + # Use the current run's CSV instead of the master CSV to show only this run's results + PLOT_CMD="python3 plot_comprehensive_results.py \ + --csv \"$RESEARCH_CSV\" \ + --output-dir \"$OUTPUT_BASE\" \ + --model-filter \"$VLLM_MODEL\"" + + # Add dataset filter if specified + if [[ -n "$current_dataset" ]]; then + PLOT_CMD="$PLOT_CMD --dataset-filter \"$current_dataset\"" + fi + + eval $PLOT_CMD - # Find the summary.json files - ROUTER_SUMMARY=$(find "$OUTPUT_BASE/router_$dataset" -name "summary.json" -type f | head -1) - VLLM_SUMMARY=$(find "$OUTPUT_BASE/vllm_$dataset" -name "summary.json" -type f | head -1) + echo -e "${GREEN} ✅ Comprehensive plots updated in $OUTPUT_BASE${NC}" + + # Print actual paths of generated charts + if [[ -f "$OUTPUT_BASE/accuracy_comparison.png" ]]; then + echo -e "${GREEN} 📊 Accuracy Chart: $OUTPUT_BASE/accuracy_comparison.png${NC}" + fi + if [[ -f "$OUTPUT_BASE/token_usage_comparison.png" ]]; then + echo -e "${GREEN} 📊 Token Usage Chart: $OUTPUT_BASE/token_usage_comparison.png${NC}" + fi + if [[ -f "$OUTPUT_BASE/efficiency_analysis.png" ]]; then + echo -e "${GREEN} 📊 Efficiency Chart: $OUTPUT_BASE/efficiency_analysis.png${NC}" + fi + else + echo -e "${RED} ⚠️ plot_comprehensive_results.py not found, skipping comprehensive plots${NC}" + fi +} + +# Function to generate plot for a single dataset (kept for compatibility) +generate_dataset_plot() { + local dataset=$1 + + echo -e "${YELLOW} 📊 Plotting $dataset results...${NC}" + + # Find the summary.json files + ROUTER_SUMMARY=$(find "$OUTPUT_BASE/router_$dataset" -name "summary.json" -type f | head -1) + VLLM_SUMMARY=$(find "$OUTPUT_BASE/vllm_$dataset" -name "summary.json" -type f | head -1) - if [[ -f "$VLLM_SUMMARY" ]]; then - PLOT_CMD="python3 -m vllm_semantic_router_bench.bench_plot --summary \"$VLLM_SUMMARY\" --out-dir \"$OUTPUT_BASE/plots_$dataset\"" + if [[ -f "$VLLM_SUMMARY" ]]; then + PLOT_CMD="python3 -m vllm_semantic_router_bench.bench_plot --summary \"$VLLM_SUMMARY\" --out-dir \"$OUTPUT_BASE/plots_$dataset\"" - if [[ -f "$ROUTER_SUMMARY" ]]; then - PLOT_CMD="$PLOT_CMD --router-summary \"$ROUTER_SUMMARY\"" - fi + if [[ -f "$ROUTER_SUMMARY" ]]; then + PLOT_CMD="$PLOT_CMD --router-summary \"$ROUTER_SUMMARY\"" + fi + + echo -e "${BLUE} Running: $PLOT_CMD${NC}" + eval $PLOT_CMD + echo -e "${GREEN} ✅ $dataset plots generated 
in $OUTPUT_BASE/plots_$dataset${NC}" + else + echo -e "${RED} ⚠️ No vLLM summary.json found for $dataset, skipping plots${NC}" + fi +} - echo -e "${BLUE} Running: $PLOT_CMD${NC}" - eval $PLOT_CMD +# Function to generate comparison plots (now just calls individual dataset plots) +generate_plots() { + echo -e "${BLUE}📈 Generating any remaining comparison plots...${NC}" + + for dataset in "${!DATASET_CONFIGS[@]}"; do + # Check if plots already exist + if [[ ! -d "$OUTPUT_BASE/plots_$dataset" ]]; then + echo -e "${YELLOW} 📊 Generating missing plots for $dataset...${NC}" + generate_dataset_plot "$dataset" else - echo -e "${RED} ⚠️ No vLLM summary.json found for $dataset, skipping plots${NC}" + echo -e "${GREEN} ✅ Plots for $dataset already exist${NC}" fi done - echo -e "${GREEN} ✅ All plots generated${NC}" + echo -e "${GREEN} ✅ All plots verified/generated${NC}" echo "" } @@ -372,8 +471,8 @@ EOF "mmlu") echo "| MMLU | $samples | ~570 | 57 subjects | Academic Knowledge |" >> "$summary_file" ;; - "arc") - echo "| ARC | $samples | $samples | 1 (Science) | Scientific Reasoning |" >> "$summary_file" + "arc-challenge") + echo "| ARC-Challenge | $samples | $samples | 1 (Science) | Scientific Reasoning (Hard) |" >> "$summary_file" ;; "gpqa") echo "| GPQA | $samples | $samples | 1 (Graduate) | Graduate-level Q&A |" >> "$summary_file" @@ -385,7 +484,7 @@ EOF echo "| CommonsenseQA | $samples | $samples | 1 (Common Sense) | Commonsense Reasoning |" >> "$summary_file" ;; "hellaswag") - echo "| HellaSwag | $samples | ~400 | ~50 activities | Commonsense NLI |" >> "$summary_file" + echo "| HellaSwag | $samples | ~100 | ~50 activities | Commonsense NLI |" >> "$summary_file" ;; esac done @@ -398,8 +497,8 @@ EOF ### Accuracy Comparison - Router (auto model with reasoning): See research_results.csv -- vLLM Direct (NR mode): See research_results.csv -- vLLM Direct (NR_REASONING mode): See research_results.csv +- vLLM Direct (No Reasoning): See research_results.csv +- vLLM Direct (All Reasoning): See research_results.csv ### Token Usage Analysis - Average tokens per response by dataset and mode (in research_results.csv) @@ -448,7 +547,7 @@ EOF - **Seed**: 42 (for reproducibility) - **Router Mode**: Auto model selection with reasoning -- **vLLM Modes**: NR (neutral) and NR_REASONING (with reasoning) +- **vLLM Modes**: No Reasoning and All Reasoning - **Sample Strategy**: Stratified sampling per category - **Evaluation**: Exact match accuracy and token usage @@ -462,9 +561,24 @@ EOF echo -e "${BLUE}🚀 Starting comprehensive benchmark...${NC}" start_time=$(date +%s) -# Run benchmarks for all datasets -for dataset in "${!DATASET_CONFIGS[@]}"; do +# Run benchmarks for reasoning-focused datasets (GPQA first for quick feedback) +DATASET_ORDER=("gpqa" "truthfulqa" "gsm8k" "aqua-rat" "sciq" "openbookqa" "strategyqa" "drop" "mmlu") +dataset_count=0 +total_datasets=${#DATASET_ORDER[@]} + +for dataset in "${DATASET_ORDER[@]}"; do + # Skip if dataset not configured + if [[ -z "${DATASET_CONFIGS[$dataset]}" ]]; then + echo -e "${YELLOW}⚠️ Dataset $dataset not configured, skipping...${NC}" + continue + fi + + dataset_count=$((dataset_count + 1)) + echo -e "${BLUE}🚀 Progress: Dataset $dataset_count/$total_datasets - Starting $dataset${NC}" run_dataset_benchmark "$dataset" + echo -e "${GREEN}🎉 Progress: Dataset $dataset_count/$total_datasets - Completed $dataset${NC}" + echo -e "${YELLOW}📊 Remaining datasets: $((total_datasets - dataset_count))${NC}" + echo "" done # Generate plots @@ -489,7 +603,16 @@ echo -e "${BLUE}📋 Next 
Steps:${NC}" echo "1. 📊 **Master research data**: $PERSISTENT_RESEARCH_CSV" echo "2. 📊 **This run's data**: $OUTPUT_BASE/research_results.csv" echo "3. 📋 Review research summary: $OUTPUT_BASE/RESEARCH_SUMMARY.md" -echo "4. 📈 Examine plots for visual insights" +echo "4. 📈 **View comprehensive charts**:" +if [[ -f "$OUTPUT_BASE/accuracy_comparison.png" ]]; then + echo " 📊 Accuracy: $OUTPUT_BASE/accuracy_comparison.png" +fi +if [[ -f "$OUTPUT_BASE/token_usage_comparison.png" ]]; then + echo " 📊 Token Usage: $OUTPUT_BASE/token_usage_comparison.png" +fi +if [[ -f "$OUTPUT_BASE/efficiency_analysis.png" ]]; then + echo " 📊 Efficiency: $OUTPUT_BASE/efficiency_analysis.png" +fi echo "5. 📄 Analyze detailed CSV files if needed" echo "" echo -e "${GREEN}🎓 Research CSV Format:${NC}" diff --git a/bench/plot_comprehensive_results.py b/bench/plot_comprehensive_results.py new file mode 100755 index 00000000..676bf1d1 --- /dev/null +++ b/bench/plot_comprehensive_results.py @@ -0,0 +1,443 @@ +#!/usr/bin/env python3 +""" +Comprehensive Results Plotting Script + +This script creates comparison plots showing: +1. Accuracy comparison across datasets and modes +2. Token usage comparison across datasets and modes + +Modes compared: +- Router (auto model with reasoning) +- vLLM Direct (No Reasoning) +- vLLM Direct (All Reasoning) +""" + +import argparse +import sys +from pathlib import Path + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import seaborn as sns + +# Set style for better-looking plots +plt.style.use("seaborn-v0_8") +sns.set_palette("husl") + + +def load_and_clean_data(csv_path): + """Load and clean the research results CSV.""" + try: + df = pd.read_csv(csv_path) + print(f"✅ Loaded {len(df)} records from {csv_path}") + + # Show available modes + print(f"📊 Available modes: {df['Mode'].unique().tolist()}") + print(f"📊 Available datasets: {df['Dataset'].unique().tolist()}") + print(f"📊 Available models: {df['Model'].unique().tolist()}") + + # Map old mode names to new descriptive names for backward compatibility + mode_mapping = { + "vLLM_NR": "vLLM_No_Reasoning", + "vLLM_XC": "vLLM_All_Reasoning", + "vLLM_NR_REASONING": "vLLM_All_Reasoning", + } + df["Mode"] = df["Mode"].replace(mode_mapping) + + # Clean data + df = df.dropna(subset=["Accuracy", "Avg_Total_Tokens"]) + df = df[df["Accuracy"] >= 0] # Remove invalid accuracy values + + # Get the latest results for each dataset/mode/model combination + df["Timestamp"] = pd.to_datetime(df["Timestamp"]) + df_latest = ( + df.sort_values("Timestamp").groupby(["Dataset", "Mode", "Model"]).tail(1) + ) + + print(f"✅ Using {len(df_latest)} latest records after cleaning") + return df_latest + + except Exception as e: + print(f"❌ Error loading data: {e}") + sys.exit(1) + + +def create_accuracy_plot(df, output_dir): + """Create accuracy comparison plot.""" + plt.figure(figsize=(14, 8)) + + # Prepare data for plotting + datasets = sorted(df["Dataset"].unique()) + modes = ["Router", "vLLM_No_Reasoning", "vLLM_All_Reasoning"] + + # Filter for available modes + available_modes = [mode for mode in modes if mode in df["Mode"].unique()] + + # Create subplot data + x = np.arange(len(datasets)) + width = 0.25 + + # Colors for each mode + colors = { + "Router": "#2E86AB", + "vLLM_No_Reasoning": "#A23B72", + "vLLM_All_Reasoning": "#F18F01", + } + + # Plot bars for each mode + for i, mode in enumerate(available_modes): + mode_data = df[df["Mode"] == mode] + accuracies = [] + + for dataset in datasets: + dataset_data = mode_data[mode_data["Dataset"] == 
dataset] + if not dataset_data.empty: + # Use the latest model's accuracy + accuracy = dataset_data.iloc[-1]["Accuracy"] + accuracies.append(accuracy) + else: + accuracies.append(0) + + # Clean mode name for display + display_name = mode.replace("vLLM_", "vLLM ").replace("_", " ") + + plt.bar( + x + i * width, + accuracies, + width, + label=display_name, + color=colors.get(mode, f"C{i}"), + alpha=0.8, + ) + + # Add value labels on bars + for j, acc in enumerate(accuracies): + if acc > 0: + plt.text( + x[j] + i * width, + acc + 0.01, + f"{acc:.3f}", + ha="center", + va="bottom", + fontsize=9, + fontweight="bold", + ) + + plt.xlabel("Dataset", fontsize=12, fontweight="bold") + plt.ylabel("Accuracy", fontsize=12, fontweight="bold") + plt.title( + "Accuracy Comparison: Router vs vLLM Direct\n(No Reasoning vs All Reasoning)", + fontsize=14, + fontweight="bold", + pad=20, + ) + plt.xticks(x + width, [d.upper() for d in datasets], rotation=45, ha="right") + plt.legend(loc="upper left", frameon=True, fancybox=True, shadow=True) + plt.grid(True, alpha=0.3, axis="y") + plt.ylim(0, 1.1) + + # Add model info + models = df["Model"].unique() + model_text = f"Models: {', '.join(models)}" + plt.figtext(0.02, 0.02, model_text, fontsize=8, style="italic") + + plt.tight_layout() + + # Save plot + accuracy_path = output_dir / "accuracy_comparison.png" + plt.savefig(accuracy_path, dpi=300, bbox_inches="tight") + print(f"📊 Accuracy plot saved: {accuracy_path}") + plt.close() + + +def create_token_usage_plot(df, output_dir): + """Create token usage comparison plot.""" + plt.figure(figsize=(14, 8)) + + # Prepare data for plotting + datasets = sorted(df["Dataset"].unique()) + modes = ["Router", "vLLM_No_Reasoning", "vLLM_All_Reasoning"] + + # Filter for available modes + available_modes = [mode for mode in modes if mode in df["Mode"].unique()] + + # Create subplot data + x = np.arange(len(datasets)) + width = 0.25 + + # Colors for each mode + colors = { + "Router": "#2E86AB", + "vLLM_No_Reasoning": "#A23B72", + "vLLM_All_Reasoning": "#F18F01", + } + + # Plot bars for each mode + for i, mode in enumerate(available_modes): + mode_data = df[df["Mode"] == mode] + token_usage = [] + + for dataset in datasets: + dataset_data = mode_data[mode_data["Dataset"] == dataset] + if not dataset_data.empty: + # Use the latest model's token usage + tokens = dataset_data.iloc[-1]["Avg_Total_Tokens"] + token_usage.append(tokens) + else: + token_usage.append(0) + + # Clean mode name for display + display_name = mode.replace("vLLM_", "vLLM ").replace("_", " ") + + plt.bar( + x + i * width, + token_usage, + width, + label=display_name, + color=colors.get(mode, f"C{i}"), + alpha=0.8, + ) + + # Add value labels on bars + for j, tokens in enumerate(token_usage): + if tokens > 0: + plt.text( + x[j] + i * width, + tokens + max(token_usage) * 0.01, + f"{tokens:.0f}", + ha="center", + va="bottom", + fontsize=9, + fontweight="bold", + ) + + plt.xlabel("Dataset", fontsize=12, fontweight="bold") + plt.ylabel("Average Total Tokens", fontsize=12, fontweight="bold") + plt.title( + "Token Usage Comparison: Router vs vLLM Direct\n(No Reasoning vs All Reasoning)", + fontsize=14, + fontweight="bold", + pad=20, + ) + plt.xticks(x + width, [d.upper() for d in datasets], rotation=45, ha="right") + plt.legend(loc="upper left", frameon=True, fancybox=True, shadow=True) + plt.grid(True, alpha=0.3, axis="y") + + # Add model info + models = df["Model"].unique() + model_text = f"Models: {', '.join(models)}" + plt.figtext(0.02, 0.02, model_text, fontsize=8, 
style="italic") + + plt.tight_layout() + + # Save plot + token_path = output_dir / "token_usage_comparison.png" + plt.savefig(token_path, dpi=300, bbox_inches="tight") + print(f"📊 Token usage plot saved: {token_path}") + plt.close() + + +def create_efficiency_plot(df, output_dir): + """Create efficiency scatter plot (accuracy vs tokens).""" + plt.figure(figsize=(12, 8)) + + modes = ["Router", "vLLM_No_Reasoning", "vLLM_All_Reasoning"] + available_modes = [mode for mode in modes if mode in df["Mode"].unique()] + + colors = { + "Router": "#2E86AB", + "vLLM_No_Reasoning": "#A23B72", + "vLLM_All_Reasoning": "#F18F01", + } + + markers = {"Router": "o", "vLLM_No_Reasoning": "s", "vLLM_All_Reasoning": "^"} + + for mode in available_modes: + mode_data = df[df["Mode"] == mode] + + display_name = mode.replace("vLLM_", "vLLM ").replace("_", " ") + + plt.scatter( + mode_data["Avg_Total_Tokens"], + mode_data["Accuracy"], + c=colors.get(mode, "gray"), + marker=markers.get(mode, "o"), + s=100, + alpha=0.7, + label=display_name, + edgecolors="black", + linewidth=1, + ) + + # Add dataset labels + for _, row in mode_data.iterrows(): + plt.annotate( + row["Dataset"].upper(), + (row["Avg_Total_Tokens"], row["Accuracy"]), + xytext=(5, 5), + textcoords="offset points", + fontsize=8, + alpha=0.8, + ) + + plt.xlabel("Average Total Tokens", fontsize=12, fontweight="bold") + plt.ylabel("Accuracy", fontsize=12, fontweight="bold") + plt.title( + "Efficiency Analysis: Accuracy vs Token Usage\n(Higher accuracy with lower tokens is better)", + fontsize=14, + fontweight="bold", + pad=20, + ) + plt.legend(frameon=True, fancybox=True, shadow=True) + plt.grid(True, alpha=0.3) + + # Add model info + models = df["Model"].unique() + model_text = f"Models: {', '.join(models)}" + plt.figtext(0.02, 0.02, model_text, fontsize=8, style="italic") + + plt.tight_layout() + + # Save plot + efficiency_path = output_dir / "efficiency_analysis.png" + plt.savefig(efficiency_path, dpi=300, bbox_inches="tight") + print(f"📊 Efficiency plot saved: {efficiency_path}") + plt.close() + + +def create_summary_table(df, output_dir): + """Create a summary table of results.""" + + # Check if we're dealing with a single dataset + unique_datasets = df["Dataset"].nunique() + dataset_name = ( + df["Dataset"].iloc[0] if unique_datasets == 1 else "Multiple Datasets" + ) + + if unique_datasets == 1: + print(f"\n📋 RESULTS SUMMARY - {dataset_name.upper()}") + else: + print(f"\n📋 RESULTS SUMMARY - AGGREGATE ACROSS {unique_datasets} DATASETS") + print("=" * 80) + + # For single dataset, show individual values; for multiple datasets, show statistics + if unique_datasets == 1: + # Show individual values for each mode + print( + f"{'Mode':<20} {'Accuracy':<10} {'Tokens':<10} {'Latency(ms)':<12} {'Samples':<8}" + ) + print("-" * 65) + + for mode in sorted(df["Mode"].unique()): + mode_data = df[df["Mode"] == mode].iloc[0] + print( + f"{mode:<20} {mode_data['Accuracy']:<10.3f} {mode_data['Avg_Total_Tokens']:<10.1f} {mode_data['Avg_Latency_ms']:<12.1f} {mode_data['Sample_Count']:<8}" + ) + + summary = df.groupby("Mode")[ + ["Accuracy", "Avg_Total_Tokens", "Avg_Latency_ms"] + ].first() + else: + # Group by mode and calculate averages for multiple datasets + summary = ( + df.groupby("Mode") + .agg( + { + "Accuracy": ["mean", "std", "count"], + "Avg_Total_Tokens": ["mean", "std"], + "Avg_Latency_ms": ["mean", "std"], + } + ) + .round(3) + ) + + print(summary) + + # Save detailed results table + detailed_table = df.pivot_table( + index="Dataset", + columns="Mode", + 
values=["Accuracy", "Avg_Total_Tokens"], + aggfunc="mean", + ).round(3) + + table_path = output_dir / "results_summary_table.csv" + detailed_table.to_csv(table_path) + print(f"\n📊 Detailed results table saved: {table_path}") + + return summary + + +def main(): + parser = argparse.ArgumentParser(description="Plot comprehensive benchmark results") + parser.add_argument( + "--csv", + type=str, + default="research_results_master.csv", + help="Path to research results CSV file", + ) + parser.add_argument( + "--output-dir", + type=str, + default="research_plots", + help="Output directory for plots", + ) + parser.add_argument( + "--model-filter", + type=str, + default=None, + help='Filter results for specific model (e.g., "Qwen/Qwen3-30B-A3B")', + ) + parser.add_argument( + "--dataset-filter", + type=str, + default=None, + help='Filter results for specific dataset (e.g., "truthfulqa")', + ) + + args = parser.parse_args() + + # Create output directory + output_dir = Path(args.output_dir) + output_dir.mkdir(exist_ok=True) + + print(f"🎨 Creating comprehensive benchmark plots...") + print(f"📊 Input CSV: {args.csv}") + print(f"📁 Output directory: {output_dir}") + + # Load data + df = load_and_clean_data(args.csv) + + # Filter by model if specified + if args.model_filter: + df = df[df["Model"].str.contains(args.model_filter, na=False)] + print(f"🔍 Filtered to model: {args.model_filter} ({len(df)} records)") + + # Filter by dataset if specified + if args.dataset_filter: + df = df[df["Dataset"].str.contains(args.dataset_filter, case=False, na=False)] + print(f"🔍 Filtered to dataset: {args.dataset_filter} ({len(df)} records)") + + if df.empty: + print("❌ No data available after filtering!") + sys.exit(1) + + # Create plots + create_accuracy_plot(df, output_dir) + create_token_usage_plot(df, output_dir) + create_efficiency_plot(df, output_dir) + + # Create summary + summary = create_summary_table(df, output_dir) + + print(f"\n🎉 All plots created successfully!") + print(f"📁 Check the '{output_dir}' directory for:") + print(f" - accuracy_comparison.png") + print(f" - token_usage_comparison.png") + print(f" - efficiency_analysis.png") + print(f" - results_summary_table.csv") + + +if __name__ == "__main__": + main() diff --git a/bench/vllm_semantic_router_bench/cli.py b/bench/vllm_semantic_router_bench/cli.py index b8fdab63..2bb347c8 100644 --- a/bench/vllm_semantic_router_bench/cli.py +++ b/bench/vllm_semantic_router_bench/cli.py @@ -21,7 +21,7 @@ def main(): semantic-router-bench test --dataset mmlu --samples 5 # Full benchmark comparison - semantic-router-bench compare --dataset arc --samples 10 + semantic-router-bench compare --dataset arc-challenge --samples 10 # List available datasets semantic-router-bench list-datasets @@ -40,7 +40,15 @@ def main(): test_parser.add_argument( "--dataset", required=True, - choices=["mmlu", "arc", "gpqa", "truthfulqa", "commonsenseqa", "hellaswag"], + choices=[ + "mmlu", + "arc", + "arc-challenge", + "gpqa", + "truthfulqa", + "commonsenseqa", + "hellaswag", + ], help="Dataset to test", ) test_parser.add_argument( @@ -68,7 +76,15 @@ def main(): compare_parser.add_argument( "--dataset", required=True, - choices=["mmlu", "arc", "gpqa", "truthfulqa", "commonsenseqa", "hellaswag"], + choices=[ + "mmlu", + "arc", + "arc-challenge", + "gpqa", + "truthfulqa", + "commonsenseqa", + "hellaswag", + ], help="Dataset to benchmark", ) compare_parser.add_argument( @@ -119,7 +135,14 @@ def main(): comprehensive_parser.add_argument( "--datasets", nargs="+", - default=["mmlu", "arc", 
"gpqa", "truthfulqa", "commonsenseqa", "hellaswag"], + default=[ + "mmlu", + "arc-challenge", + "gpqa", + "truthfulqa", + "commonsenseqa", + "hellaswag", + ], help="Datasets to benchmark", ) comprehensive_parser.add_argument( @@ -227,7 +250,7 @@ def list_datasets(): print("\nUsage examples:") print(" semantic-router-bench test --dataset mmlu --samples 5") - print(" semantic-router-bench compare --dataset arc --samples 10") + print(" semantic-router-bench compare --dataset arc-challenge --samples 10") return 0 except ImportError as e: diff --git a/bench/vllm_semantic_router_bench/dataset_factory.py b/bench/vllm_semantic_router_bench/dataset_factory.py index 429faf9e..499118de 100644 --- a/bench/vllm_semantic_router_bench/dataset_factory.py +++ b/bench/vllm_semantic_router_bench/dataset_factory.py @@ -7,20 +7,28 @@ from typing import Dict, List, Optional, Type +from .dataset_implementations.aqua_rat_dataset import AquaRatDataset from .dataset_implementations.arc_dataset import ( ARCChallengeDataset, ARCDataset, ARCEasyDataset, ) from .dataset_implementations.commonsenseqa_dataset import CommonsenseQADataset +from .dataset_implementations.drop_dataset import DROPDataset from .dataset_implementations.gpqa_dataset import ( GPQADataset, GPQADiamondDataset, GPQAExtendedDataset, GPQAMainDataset, ) +from .dataset_implementations.gsm8k_dataset import GSM8KDataset from .dataset_implementations.hellaswag_dataset import HellaSwagDataset + +# from .dataset_implementations.math_dataset import MATHDataset # Disabled - dataset not available from .dataset_implementations.mmlu_dataset import MMLUDataset +from .dataset_implementations.openbookqa_dataset import OpenBookQADataset +from .dataset_implementations.sciq_dataset import SciQDataset +from .dataset_implementations.strategyqa_dataset import StrategyQADataset from .dataset_implementations.truthfulqa_dataset import TruthfulQADataset from .dataset_interface import DatasetInterface @@ -101,6 +109,19 @@ def get_dataset_info(cls, name: str) -> Dict[str, str]: DatasetFactory.register_dataset("gpqa-extended", GPQAExtendedDataset) DatasetFactory.register_dataset("gpqa-diamond", GPQADiamondDataset) +# Register mathematical reasoning datasets +# DatasetFactory.register_dataset("math", MATHDataset) # Disabled - dataset not available +DatasetFactory.register_dataset("gsm8k", GSM8KDataset) +DatasetFactory.register_dataset("aqua-rat", AquaRatDataset) + +# Register multi-step reasoning datasets +DatasetFactory.register_dataset("drop", DROPDataset) +DatasetFactory.register_dataset("strategyqa", StrategyQADataset) + +# Register scientific reasoning datasets +DatasetFactory.register_dataset("sciq", SciQDataset) +DatasetFactory.register_dataset("openbookqa", OpenBookQADataset) + # Register hard reasoning datasets DatasetFactory.register_dataset("truthfulqa", TruthfulQADataset) DatasetFactory.register_dataset("commonsenseqa", CommonsenseQADataset) diff --git a/bench/vllm_semantic_router_bench/dataset_implementations/aqua_rat_dataset.py b/bench/vllm_semantic_router_bench/dataset_implementations/aqua_rat_dataset.py new file mode 100644 index 00000000..8e99e0a5 --- /dev/null +++ b/bench/vllm_semantic_router_bench/dataset_implementations/aqua_rat_dataset.py @@ -0,0 +1,173 @@ +""" +AQUA-RAT Dataset Implementation + +Algebraic Question Answering with Rationales - algebraic word problems +with step-by-step rationales for mathematical reasoning evaluation. 
+""" + +import os +import random +import sys +from typing import List, Optional, Tuple + +import numpy as np +import pandas as pd +from datasets import load_dataset + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from ..dataset_interface import DatasetInfo, DatasetInterface, Question + + +class AquaRatDataset(DatasetInterface): + """AQUA-RAT dataset implementation for algebraic reasoning with rationales.""" + + def __init__(self): + """Initialize AQUA-RAT dataset.""" + self._dataset_cache = None + self._categories_cache = None + + @property + def dataset_name(self) -> str: + return "AQUA-RAT" + + @property + def supports_cot(self) -> bool: + return True # AQUA-RAT has rationales + + def _load_raw_dataset(self): + """Load raw AQUA-RAT dataset from Hugging Face.""" + if self._dataset_cache is not None: + return self._dataset_cache + + # Load the test split + dataset = load_dataset("aqua_rat", split="test") + self._dataset_cache = pd.DataFrame(dataset) + return self._dataset_cache + + def _get_categories(self) -> List[str]: + """Get available categories in AQUA-RAT dataset.""" + if self._categories_cache is not None: + return self._categories_cache + + # AQUA-RAT doesn't have category columns, treat as single dataset + self._categories_cache = ["default"] + return self._categories_cache + + def get_available_categories(self) -> List[str]: + """Get list of all available categories in the dataset.""" + return self._get_categories() + + def load_dataset( + self, + categories: Optional[List[str]] = None, + samples_per_category: Optional[int] = None, + seed: int = 42, + ) -> Tuple[List[Question], DatasetInfo]: + """Load AQUA-RAT dataset with optional filtering and sampling.""" + df = self._load_raw_dataset() + available_categories = self._get_categories() + + # Filter categories if specified + if categories: + missing_categories = set(categories) - set(available_categories) + if missing_categories: + raise ValueError( + f"Categories not found: {missing_categories}. " + f"Available: {available_categories}" + ) + selected_categories = categories + else: + selected_categories = available_categories + + # Sample questions if specified + if samples_per_category: + np.random.seed(seed) + random.seed(seed) + + sample_size = min(samples_per_category, len(df)) + df = df.sample(n=sample_size, random_state=seed) + + # Convert to Question objects + questions = [] + for _, row in df.iterrows(): + question_text = row["question"] + raw_options = row["options"] # List of 5 options (A, B, C, D, E) + correct_answer = row["correct"] # Letter (A, B, C, D, E) + rationale = row["rationale"] # Step-by-step explanation + + # Clean options by removing letter prefixes (e.g., "A)500" -> "500") + options = [] + for option in raw_options: + # Remove letter prefix like "A)", "B)", etc. 
+ import re + + cleaned = re.sub(r"^[A-E]\)", "", option).strip() + options.append(cleaned) + + question = Question( + question_id=f"aqua_rat_{len(questions)}", + question=question_text, + options=options, + correct_answer=correct_answer, + category="default", + cot_content=rationale, + metadata={ + "difficulty": "Moderate", + "type": "algebraic_word_problem", + "rationale": rationale, + }, + ) + questions.append(question) + + dataset_info = DatasetInfo( + name="AQUA-RAT", + description="Algebraic word problems with step-by-step rationales", + categories=selected_categories, + total_questions=len(questions), + format_type="multiple_choice", + difficulty_level="Moderate", + ) + + return questions, dataset_info + + def format_prompt(self, question: Question, prompt_style: str = "plain") -> str: + """Format prompt for AQUA-RAT questions.""" + options_text = "\n".join( + [f"{chr(65+i)}) {opt}" for i, opt in enumerate(question.options)] + ) + + if prompt_style == "plain": + return f"""Solve this algebraic word problem: + +{question.question} + +{options_text} + +Please provide your answer in the following structured format: +ANSWER: [letter] + +For example: ANSWER: A""" + + elif prompt_style == "explicit_cot": + return f"""Solve this algebraic word problem step by step: + +Problem: {question.question} + +Options: +{options_text} + +Please work through this step-by-step: +1. Identify the variables and what is being asked +2. Set up the algebraic equations +3. Solve the equations step by step +4. Check your answer against the options +5. Select the correct answer + +Please provide your final answer in the following structured format: +ANSWER: [letter] + +For example: ANSWER: A""" + + else: + raise ValueError(f"Unknown prompt style: {prompt_style}") diff --git a/bench/vllm_semantic_router_bench/dataset_implementations/drop_dataset.py b/bench/vllm_semantic_router_bench/dataset_implementations/drop_dataset.py new file mode 100644 index 00000000..e492f169 --- /dev/null +++ b/bench/vllm_semantic_router_bench/dataset_implementations/drop_dataset.py @@ -0,0 +1,161 @@ +""" +DROP Dataset Implementation + +Discrete Reasoning Over Paragraphs - reading comprehension requiring +discrete reasoning operations over text passages. 
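+
+Illustrative usage sketch (registered with the DatasetFactory as "drop" in this
+PR; import path assumed from the repo's package layout):
+
+    from vllm_semantic_router_bench.dataset_implementations.drop_dataset import DROPDataset
+
+    ds = DROPDataset()
+    questions, info = ds.load_dataset(samples_per_category=5, seed=42)
+    # Free-form format: each question embeds its passage, and correct_answer is
+    # the first gold answer span from the validation split.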
+""" + +import os +import random +import sys +from typing import List, Optional, Tuple + +import numpy as np +import pandas as pd +from datasets import load_dataset + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from ..dataset_interface import DatasetInfo, DatasetInterface, Question + + +class DROPDataset(DatasetInterface): + """DROP dataset implementation for discrete reasoning over paragraphs.""" + + def __init__(self): + """Initialize DROP dataset.""" + self._dataset_cache = None + self._categories_cache = None + + @property + def dataset_name(self) -> str: + return "DROP" + + @property + def supports_cot(self) -> bool: + return False # DROP doesn't have built-in CoT content + + def _load_raw_dataset(self): + """Load raw DROP dataset from Hugging Face.""" + if self._dataset_cache is not None: + return self._dataset_cache + + # Load the validation split (test split is not public) + dataset = load_dataset("ucinlp/drop", split="validation") + self._dataset_cache = pd.DataFrame(dataset) + return self._dataset_cache + + def _get_categories(self) -> List[str]: + """Get available categories in DROP dataset.""" + if self._categories_cache is not None: + return self._categories_cache + + # DROP doesn't have category columns, treat as single dataset + self._categories_cache = ["default"] + return self._categories_cache + + def get_available_categories(self) -> List[str]: + """Get list of all available categories in the dataset.""" + return self._get_categories() + + def load_dataset( + self, + categories: Optional[List[str]] = None, + samples_per_category: Optional[int] = None, + seed: int = 42, + ) -> Tuple[List[Question], DatasetInfo]: + """Load DROP dataset with optional filtering and sampling.""" + df = self._load_raw_dataset() + available_categories = self._get_categories() + + # Filter categories if specified + if categories: + missing_categories = set(categories) - set(available_categories) + if missing_categories: + raise ValueError( + f"Categories not found: {missing_categories}. 
" + f"Available: {available_categories}" + ) + selected_categories = categories + else: + selected_categories = available_categories + + # Sample questions if specified + if samples_per_category: + np.random.seed(seed) + random.seed(seed) + + sample_size = min(samples_per_category, len(df)) + df = df.sample(n=sample_size, random_state=seed) + + # Convert to Question objects + questions = [] + for _, row in df.iterrows(): + passage = row["passage"] + question_text = row["question"] + # DROP has multiple possible answers + answers_spans = row["answers_spans"] + if answers_spans and len(answers_spans["spans"]) > 0: + correct_answer = answers_spans["spans"][0] # Take first valid answer + else: + correct_answer = "Unknown" + + # Combine passage and question + full_question = f"Passage: {passage}\n\nQuestion: {question_text}" + + question = Question( + question_id=f"drop_{len(questions)}", + question=full_question, + options=[], # DROP is free-form, no multiple choice + correct_answer=correct_answer, + category="default", + cot_content=None, + metadata={ + "difficulty": "Hard", + "type": "discrete_reasoning", + "passage": passage, + "question_only": question_text, + }, + ) + questions.append(question) + + dataset_info = DatasetInfo( + name="DROP", + description="Reading comprehension requiring discrete reasoning over paragraphs", + categories=selected_categories, + total_questions=len(questions), + format_type="free_form", + difficulty_level="Hard", + ) + + return questions, dataset_info + + def format_prompt(self, question: Question, prompt_style: str = "plain") -> str: + """Format prompt for DROP questions.""" + if prompt_style == "plain": + return f"""{question.question} + +Please read the passage carefully and answer the question based on the information provided. + +Please provide your answer in the following structured format: +ANSWER: [your answer] + +For example: ANSWER: 68.5 or ANSWER: germans or ANSWER: Centenary Medal""" + + elif prompt_style == "explicit_cot": + return f"""{question.question} + +Please work through this step-by-step: +1. Read the passage carefully +2. Identify the key information relevant to the question +3. Determine what type of reasoning is required (counting, arithmetic, comparison, etc.) +4. Apply the necessary reasoning operations +5. Provide your final answer + +Work through your reasoning step by step, then provide your final answer in the following structured format: +ANSWER: [your answer] + +For example: ANSWER: 68.5 or ANSWER: germans or ANSWER: Centenary Medal""" + + else: + raise ValueError(f"Unknown prompt style: {prompt_style}") diff --git a/bench/vllm_semantic_router_bench/dataset_implementations/gsm8k_dataset.py b/bench/vllm_semantic_router_bench/dataset_implementations/gsm8k_dataset.py new file mode 100644 index 00000000..4518bb0c --- /dev/null +++ b/bench/vllm_semantic_router_bench/dataset_implementations/gsm8k_dataset.py @@ -0,0 +1,160 @@ +""" +GSM8K Dataset Implementation + +Grade School Math 8K - 8,500 elementary mathematics word problems +requiring multi-step reasoning and basic arithmetic. 
+""" + +import os +import random +import sys +from typing import List, Optional, Tuple + +import numpy as np +import pandas as pd +from datasets import load_dataset + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from ..dataset_interface import DatasetInfo, DatasetInterface, Question + + +class GSM8KDataset(DatasetInterface): + """GSM8K dataset implementation for elementary mathematical reasoning.""" + + def __init__(self): + """Initialize GSM8K dataset.""" + self._dataset_cache = None + self._categories_cache = None + + @property + def dataset_name(self) -> str: + return "GSM8K" + + @property + def supports_cot(self) -> bool: + return True # GSM8K has step-by-step solutions + + def _load_raw_dataset(self): + """Load raw GSM8K dataset from Hugging Face.""" + if self._dataset_cache is not None: + return self._dataset_cache + + # Load the test split + dataset = load_dataset("gsm8k", "main", split="test") + self._dataset_cache = pd.DataFrame(dataset) + return self._dataset_cache + + def _get_categories(self) -> List[str]: + """Get available categories in GSM8K dataset.""" + if self._categories_cache is not None: + return self._categories_cache + + # GSM8K doesn't have category columns, treat as single dataset + self._categories_cache = ["default"] + return self._categories_cache + + def get_available_categories(self) -> List[str]: + """Get list of all available categories in the dataset.""" + return self._get_categories() + + def load_dataset( + self, + categories: Optional[List[str]] = None, + samples_per_category: Optional[int] = None, + seed: int = 42, + ) -> Tuple[List[Question], DatasetInfo]: + """Load GSM8K dataset with optional filtering and sampling.""" + df = self._load_raw_dataset() + available_categories = self._get_categories() + + # Filter categories if specified (though GSM8K only has one category) + if categories: + missing_categories = set(categories) - set(available_categories) + if missing_categories: + raise ValueError( + f"Categories not found: {missing_categories}. 
" + f"Available: {available_categories}" + ) + selected_categories = categories + else: + selected_categories = available_categories + + # Sample questions if specified + if samples_per_category: + np.random.seed(seed) + random.seed(seed) + + sample_size = min(samples_per_category, len(df)) + df = df.sample(n=sample_size, random_state=seed) + + # Convert to Question objects + questions = [] + for _, row in df.iterrows(): + question_text = row["question"] + answer_text = row["answer"] + + # Extract the final numerical answer from the solution + import re + + # GSM8K answers end with "#### [number]" + answer_match = re.search(r"####\s*([0-9,.-]+)", answer_text) + correct_answer = answer_match.group(1) if answer_match else "Unknown" + + question = Question( + question_id=f"gsm8k_{len(questions)}", + question=question_text, + options=[], # GSM8K is free-form, no multiple choice + correct_answer=correct_answer, + category="default", + cot_content=answer_text, # Full solution as CoT + metadata={ + "difficulty": "Elementary", + "type": "word_problem", + "solution": answer_text, + }, + ) + questions.append(question) + + dataset_info = DatasetInfo( + name="GSM8K", + description="Grade school mathematics word problems requiring multi-step reasoning", + categories=selected_categories, + total_questions=len(questions), + format_type="free_form", + difficulty_level="Elementary", + ) + + return questions, dataset_info + + def format_prompt(self, question: Question, prompt_style: str = "plain") -> str: + """Format prompt for GSM8K questions.""" + if prompt_style == "plain": + return f"""Solve this math word problem: + +{question.question} + +Please provide your final answer in the following structured format: +ANSWER: [number] + +For example: ANSWER: 42""" + + elif prompt_style == "explicit_cot": + return f"""Solve this math word problem step by step, showing all your work: + +Problem: {question.question} + +Please work through this step-by-step: +1. Read the problem carefully and identify what is being asked +2. Identify the given information +3. Determine what operations are needed +4. Solve step by step, showing your calculations +5. State your final answer clearly + +Please provide your final answer in the following structured format: +ANSWER: [number] + +For example: ANSWER: 42""" + + else: + raise ValueError(f"Unknown prompt style: {prompt_style}") diff --git a/bench/vllm_semantic_router_bench/dataset_implementations/math_dataset.py b/bench/vllm_semantic_router_bench/dataset_implementations/math_dataset.py new file mode 100644 index 00000000..5ab3cce9 --- /dev/null +++ b/bench/vllm_semantic_router_bench/dataset_implementations/math_dataset.py @@ -0,0 +1,171 @@ +""" +MATH Dataset Implementation + +Hendrycks et al. MATH dataset - 12,500 competition mathematics problems +requiring advanced mathematical reasoning across algebra, calculus, geometry, etc. +""" + +import os +import random +import sys +from typing import List, Optional, Tuple + +import numpy as np +import pandas as pd +from datasets import load_dataset + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from ..dataset_interface import DatasetInfo, DatasetInterface, Question + + +class MATHDataset(DatasetInterface): + """MATH (Hendrycks et al.) 
dataset implementation for mathematical reasoning.""" + + def __init__(self): + """Initialize MATH dataset.""" + self._dataset_cache = None + self._categories_cache = None + + @property + def dataset_name(self) -> str: + return "MATH" + + @property + def supports_cot(self) -> bool: + return True # MATH has step-by-step solutions + + def _load_raw_dataset(self): + """Load raw MATH dataset from Hugging Face.""" + if self._dataset_cache is not None: + return self._dataset_cache + + # Load the test split - try different possible dataset names + try: + dataset = load_dataset("hendrycks/math", split="test") + except Exception: + try: + dataset = load_dataset("lighteval/MATH", split="test") + except Exception: + dataset = load_dataset("competition_math", split="test") + self._dataset_cache = pd.DataFrame(dataset) + return self._dataset_cache + + def _get_categories(self) -> List[str]: + """Get available categories (subjects) in MATH dataset.""" + if self._categories_cache is not None: + return self._categories_cache + + df = self._load_raw_dataset() + # MATH has 'type' field for subject areas + self._categories_cache = sorted(df["type"].unique().tolist()) + return self._categories_cache + + def get_available_categories(self) -> List[str]: + """Get list of all available categories in the dataset.""" + return self._get_categories() + + def load_dataset( + self, + categories: Optional[List[str]] = None, + samples_per_category: Optional[int] = None, + seed: int = 42, + ) -> Tuple[List[Question], DatasetInfo]: + """Load MATH dataset with optional filtering and sampling.""" + df = self._load_raw_dataset() + available_categories = self._get_categories() + + # Filter categories if specified + if categories: + missing_categories = set(categories) - set(available_categories) + if missing_categories: + raise ValueError( + f"Categories not found: {missing_categories}. 
" + f"Available: {available_categories}" + ) + df = df[df["type"].isin(categories)] + selected_categories = categories + else: + selected_categories = available_categories + + # Sample questions per category + if samples_per_category: + sampled_dfs = [] + np.random.seed(seed) + random.seed(seed) + + for category in selected_categories: + category_df = df[df["type"] == category] + if len(category_df) == 0: + continue + + sample_size = min(samples_per_category, len(category_df)) + sampled_df = category_df.sample(n=sample_size, random_state=seed) + sampled_dfs.append(sampled_df) + + if sampled_dfs: + df = pd.concat(sampled_dfs, ignore_index=True) + else: + df = pd.DataFrame() + + # Convert to Question objects + questions = [] + for _, row in df.iterrows(): + # MATH problems are free-form, but we need to extract the final answer + # The solution contains the final answer in \boxed{} format + question_text = row["problem"] + solution = row["solution"] + + # Extract boxed answer as the correct answer + import re + + boxed_match = re.search(r"\\boxed\{([^}]+)\}", solution) + correct_answer = boxed_match.group(1) if boxed_match else "Unknown" + + question = Question( + question_id=f"math_{len(questions)}", + question=question_text, + options=[], # MATH is free-form, no multiple choice + correct_answer=correct_answer, + category=row["type"], + cot_content=solution, # Full solution as CoT + metadata={ + "level": row.get("level", "Unknown"), + "subject": row["type"], + "solution": solution, + }, + ) + questions.append(question) + + dataset_info = DatasetInfo( + name="MATH", + description="Competition mathematics problems requiring advanced reasoning", + categories=selected_categories, + total_questions=len(questions), + format_type="free_form", + difficulty_level="Graduate", # Competition math is very hard + ) + + return questions, dataset_info + + def format_prompt(self, question: Question, prompt_style: str = "plain") -> str: + """Format prompt for MATH questions.""" + if prompt_style == "plain": + return f"Solve this mathematics problem step by step:\n\n{question.question}\n\nProvide your final answer in the format: Answer: [your answer]" + + elif prompt_style == "explicit_cot": + return f"""Solve this mathematics problem step by step, showing all your work: + +Problem: {question.question} + +Please work through this step-by-step: +1. Identify what is being asked +2. Determine the relevant mathematical concepts +3. Set up the problem +4. Solve step by step +5. Verify your answer + +Provide your final answer in the format: Answer: [your answer]""" + + else: + raise ValueError(f"Unknown prompt style: {prompt_style}") diff --git a/bench/vllm_semantic_router_bench/dataset_implementations/openbookqa_dataset.py b/bench/vllm_semantic_router_bench/dataset_implementations/openbookqa_dataset.py new file mode 100644 index 00000000..48bf35be --- /dev/null +++ b/bench/vllm_semantic_router_bench/dataset_implementations/openbookqa_dataset.py @@ -0,0 +1,167 @@ +""" +OpenBookQA Dataset Implementation + +Elementary science questions requiring reasoning over a "book" of facts. +Tests ability to combine multiple facts and apply scientific reasoning. 
+""" + +import os +import random +import sys +from typing import List, Optional, Tuple + +import numpy as np +import pandas as pd +from datasets import load_dataset + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from ..dataset_interface import DatasetInfo, DatasetInterface, Question + + +class OpenBookQADataset(DatasetInterface): + """OpenBookQA dataset implementation for scientific reasoning with facts.""" + + def __init__(self): + """Initialize OpenBookQA dataset.""" + self._dataset_cache = None + self._categories_cache = None + + @property + def dataset_name(self) -> str: + return "OpenBookQA" + + @property + def supports_cot(self) -> bool: + return False # OpenBookQA doesn't have built-in CoT content + + def _load_raw_dataset(self): + """Load raw OpenBookQA dataset from Hugging Face.""" + if self._dataset_cache is not None: + return self._dataset_cache + + # Load the test split + dataset = load_dataset("openbookqa", split="test") + self._dataset_cache = pd.DataFrame(dataset) + return self._dataset_cache + + def _get_categories(self) -> List[str]: + """Get available categories in OpenBookQA dataset.""" + if self._categories_cache is not None: + return self._categories_cache + + # OpenBookQA doesn't have category columns, treat as single dataset + self._categories_cache = ["default"] + return self._categories_cache + + def get_available_categories(self) -> List[str]: + """Get list of all available categories in the dataset.""" + return self._get_categories() + + def load_dataset( + self, + categories: Optional[List[str]] = None, + samples_per_category: Optional[int] = None, + seed: int = 42, + ) -> Tuple[List[Question], DatasetInfo]: + """Load OpenBookQA dataset with optional filtering and sampling.""" + df = self._load_raw_dataset() + available_categories = self._get_categories() + + # Filter categories if specified + if categories: + missing_categories = set(categories) - set(available_categories) + if missing_categories: + raise ValueError( + f"Categories not found: {missing_categories}. 
" + f"Available: {available_categories}" + ) + selected_categories = categories + else: + selected_categories = available_categories + + # Sample questions if specified + if samples_per_category: + np.random.seed(seed) + random.seed(seed) + + sample_size = min(samples_per_category, len(df)) + df = df.sample(n=sample_size, random_state=seed) + + # Convert to Question objects + questions = [] + for _, row in df.iterrows(): + question_stem = row["question_stem"] + choices = row["choices"] + answer_key = row["answerKey"] # A, B, C, D + + # Extract options from choices + # Handle different possible structures for choices + if isinstance(choices, dict) and "text" in choices: + options = choices["text"] + elif isinstance(choices, list): + options = [ + choice["text"] if isinstance(choice, dict) else choice + for choice in choices + ] + else: + options = [str(choices)] # Fallback + + question = Question( + question_id=f"openbookqa_{len(questions)}", + question=question_stem, + options=options, + correct_answer=answer_key, + category="default", + cot_content=None, + metadata={ + "difficulty": "Elementary", + "type": "science_reasoning", + "requires_fact_combination": True, + }, + ) + questions.append(question) + + dataset_info = DatasetInfo( + name="OpenBookQA", + description="Elementary science questions requiring reasoning over scientific facts", + categories=selected_categories, + total_questions=len(questions), + format_type="multiple_choice", + difficulty_level="Elementary", + ) + + return questions, dataset_info + + def format_prompt(self, question: Question, prompt_style: str = "plain") -> str: + """Format prompt for OpenBookQA questions.""" + options_text = "\n".join( + [f"{chr(65+i)}) {opt}" for i, opt in enumerate(question.options)] + ) + + if prompt_style == "plain": + return f"""Question: {question.question} + +{options_text} + +Think about what scientific facts and principles apply to this question. + +Provide your answer in the format 'Answer: [letter]'.""" + + elif prompt_style == "explicit_cot": + return f"""Question: {question.question} + +Options: +{options_text} + +Please work through this step-by-step: +1. Identify what scientific concept or principle the question is testing +2. Think about relevant scientific facts that might apply +3. Consider how different facts might combine to answer the question +4. Apply scientific reasoning to eliminate incorrect options +5. Select the best answer based on scientific principles + +Show your scientific reasoning step by step, then provide your answer in the format 'Answer: [letter]'.""" + + else: + raise ValueError(f"Unknown prompt style: {prompt_style}") diff --git a/bench/vllm_semantic_router_bench/dataset_implementations/sciq_dataset.py b/bench/vllm_semantic_router_bench/dataset_implementations/sciq_dataset.py new file mode 100644 index 00000000..f5c9c8e7 --- /dev/null +++ b/bench/vllm_semantic_router_bench/dataset_implementations/sciq_dataset.py @@ -0,0 +1,173 @@ +""" +SciQ Dataset Implementation + +Science Questions - multiple choice science questions requiring +scientific reasoning and knowledge application. 
+"""
+
+import os
+import random
+import sys
+from typing import List, Optional, Tuple
+
+import numpy as np
+import pandas as pd
+from datasets import load_dataset
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from ..dataset_interface import DatasetInfo, DatasetInterface, Question
+
+
+class SciQDataset(DatasetInterface):
+    """SciQ dataset implementation for scientific reasoning."""
+
+    def __init__(self):
+        """Initialize SciQ dataset."""
+        self._dataset_cache = None
+        self._categories_cache = None
+
+    @property
+    def dataset_name(self) -> str:
+        return "SciQ"
+
+    @property
+    def supports_cot(self) -> bool:
+        return False  # SciQ doesn't have built-in CoT content
+
+    def _load_raw_dataset(self):
+        """Load raw SciQ dataset from Hugging Face."""
+        if self._dataset_cache is not None:
+            return self._dataset_cache
+
+        # Load the test split
+        dataset = load_dataset("sciq", split="test")
+        self._dataset_cache = pd.DataFrame(dataset)
+        return self._dataset_cache
+
+    def _get_categories(self) -> List[str]:
+        """Get available categories in SciQ dataset."""
+        if self._categories_cache is not None:
+            return self._categories_cache
+
+        # SciQ doesn't have category columns, treat as single dataset
+        self._categories_cache = ["default"]
+        return self._categories_cache
+
+    def get_available_categories(self) -> List[str]:
+        """Get list of all available categories in the dataset."""
+        return self._get_categories()
+
+    def load_dataset(
+        self,
+        categories: Optional[List[str]] = None,
+        samples_per_category: Optional[int] = None,
+        seed: int = 42,
+    ) -> Tuple[List[Question], DatasetInfo]:
+        """Load SciQ dataset with optional filtering and sampling."""
+        df = self._load_raw_dataset()
+        available_categories = self._get_categories()
+
+        # Filter categories if specified
+        if categories:
+            missing_categories = set(categories) - set(available_categories)
+            if missing_categories:
+                raise ValueError(
+                    f"Categories not found: {missing_categories}. "
+                    f"Available: {available_categories}"
+                )
+            selected_categories = categories
+        else:
+            selected_categories = available_categories
+
+        # Sample questions if specified
+        if samples_per_category:
+            np.random.seed(seed)
+            random.seed(seed)
+
+            sample_size = min(samples_per_category, len(df))
+            df = df.sample(n=sample_size, random_state=seed)
+
+        # Convert to Question objects
+        questions = []
+        for _, row in df.iterrows():
+            question_text = row["question"]
+            correct_answer = row["correct_answer"]
+
+            # Build options list
+            options = [
+                row["correct_answer"],
+                row["distractor1"],
+                row["distractor2"],
+                row["distractor3"],
+            ]
+            # Shuffle options deterministically per question and find correct index
+            # (re-seeding with a constant here would give every question the same order)
+            shuffled_options = options.copy()
+            random.Random(seed + len(questions)).shuffle(shuffled_options)
+            correct_idx = shuffled_options.index(correct_answer)
+            correct_letter = chr(65 + correct_idx)  # A, B, C, D
+
+            question = Question(
+                question_id=f"sciq_{len(questions)}",
+                question=question_text,
+                options=shuffled_options,
+                correct_answer=correct_letter,
+                category="default",
+                cot_content=None,
+                metadata={
+                    "difficulty": "Moderate",
+                    "type": "science_multiple_choice",
+                    "support": row.get(
+                        "support", ""
+                    ),  # Background passage if available
+                },
+            )
+            questions.append(question)
+
+        dataset_info = DatasetInfo(
+            name="SciQ",
+            description="Science questions requiring scientific reasoning and knowledge",
+            categories=selected_categories,
+            total_questions=len(questions),
+            format_type="multiple_choice",
+            difficulty_level="Moderate",
+        )
+
+        return questions, dataset_info
+
+    def format_prompt(self, question: Question, prompt_style: str = "plain") -> str:
+        """Format prompt for SciQ questions."""
+        options_text = "\n".join(
+            [f"{chr(65+i)}) {opt}" for i, opt in enumerate(question.options)]
+        )
+
+        # Add support passage if available
+        support_text = ""
+        if question.metadata and question.metadata.get("support"):
+            support_text = f"Background: {question.metadata['support']}\n\n"
+
+        if prompt_style == "plain":
+            return f"""{support_text}Question: {question.question}
+
+{options_text}
+
+Provide your answer in the format 'Answer: [letter]'."""
+
+        elif prompt_style == "explicit_cot":
+            return f"""{support_text}Question: {question.question}
+
+Options:
+{options_text}
+
+Please work through this step-by-step:
+1. Read the question carefully and identify what scientific concept is being tested
+2. Consider any background information provided
+3. Apply relevant scientific principles and knowledge
+4. Eliminate incorrect options through reasoning
+5. Select the best answer
+
+Show your scientific reasoning step by step, then provide your answer in the format 'Answer: [letter]'."""
+
+        else:
+            raise ValueError(f"Unknown prompt style: {prompt_style}")
diff --git a/bench/vllm_semantic_router_bench/dataset_implementations/strategyqa_dataset.py b/bench/vllm_semantic_router_bench/dataset_implementations/strategyqa_dataset.py
new file mode 100644
index 00000000..03a7618a
--- /dev/null
+++ b/bench/vllm_semantic_router_bench/dataset_implementations/strategyqa_dataset.py
@@ -0,0 +1,161 @@
+"""
+StrategyQA Dataset Implementation
+
+Multi-step reasoning questions requiring implicit reasoning steps
+and strategic thinking to answer yes/no questions.
+""" + +import os +import random +import sys +from typing import List, Optional, Tuple + +import numpy as np +import pandas as pd +from datasets import load_dataset + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from ..dataset_interface import DatasetInfo, DatasetInterface, Question + + +class StrategyQADataset(DatasetInterface): + """StrategyQA dataset implementation for multi-step implicit reasoning.""" + + def __init__(self): + """Initialize StrategyQA dataset.""" + self._dataset_cache = None + self._categories_cache = None + + @property + def dataset_name(self) -> str: + return "StrategyQA" + + @property + def supports_cot(self) -> bool: + return True # StrategyQA has decomposition and evidence + + def _load_raw_dataset(self): + """Load raw StrategyQA dataset from Hugging Face.""" + if self._dataset_cache is not None: + return self._dataset_cache + + # Load the test split + dataset = load_dataset("ChilleD/StrategyQA", split="test") + self._dataset_cache = pd.DataFrame(dataset) + return self._dataset_cache + + def _get_categories(self) -> List[str]: + """Get available categories in StrategyQA dataset.""" + if self._categories_cache is not None: + return self._categories_cache + + # StrategyQA doesn't have category columns, treat as single dataset + self._categories_cache = ["default"] + return self._categories_cache + + def get_available_categories(self) -> List[str]: + """Get list of all available categories in the dataset.""" + return self._get_categories() + + def load_dataset( + self, + categories: Optional[List[str]] = None, + samples_per_category: Optional[int] = None, + seed: int = 42, + ) -> Tuple[List[Question], DatasetInfo]: + """Load StrategyQA dataset with optional filtering and sampling.""" + df = self._load_raw_dataset() + available_categories = self._get_categories() + + # Filter categories if specified + if categories: + missing_categories = set(categories) - set(available_categories) + if missing_categories: + raise ValueError( + f"Categories not found: {missing_categories}. " + f"Available: {available_categories}" + ) + selected_categories = categories + else: + selected_categories = available_categories + + # Sample questions if specified + if samples_per_category: + np.random.seed(seed) + random.seed(seed) + + sample_size = min(samples_per_category, len(df)) + df = df.sample(n=sample_size, random_state=seed) + + # Convert to Question objects + questions = [] + for _, row in df.iterrows(): + question_text = row["question"] + answer = row["answer"] # Boolean + correct_answer = "Yes" if answer else "No" + + # Build CoT from decomposition and evidence if available + cot_content = None + if "decomposition" in row and row["decomposition"]: + decomp = row["decomposition"] + if isinstance(decomp, list): + cot_content = "Reasoning steps:\n" + "\n".join( + [f"{i+1}. 
{step}" for i, step in enumerate(decomp)] + ) + else: + cot_content = f"Reasoning: {decomp}" + + question = Question( + question_id=f"strategyqa_{len(questions)}", + question=question_text, + options=["Yes", "No"], # Binary choice + correct_answer=correct_answer, + category="default", + cot_content=cot_content, + metadata={ + "difficulty": "Hard", + "type": "multi_step_reasoning", + "requires_implicit_steps": True, + }, + ) + questions.append(question) + + dataset_info = DatasetInfo( + name="StrategyQA", + description="Multi-step reasoning questions requiring implicit reasoning steps", + categories=selected_categories, + total_questions=len(questions), + format_type="binary_choice", + difficulty_level="Hard", + ) + + return questions, dataset_info + + def format_prompt(self, question: Question, prompt_style: str = "plain") -> str: + """Format prompt for StrategyQA questions.""" + if prompt_style == "plain": + return f"""Answer this question with Yes or No: + +{question.question} + +Think carefully about what information and reasoning steps are needed to answer this question. + +Answer: """ + + elif prompt_style == "explicit_cot": + return f"""Answer this question with Yes or No, showing your reasoning: + +Question: {question.question} + +Please work through this step-by-step: +1. Break down what the question is really asking +2. Identify what facts or knowledge are needed +3. Work through the logical steps required +4. Consider any implicit assumptions or connections +5. Reach your conclusion + +Show your reasoning step by step, then provide your final answer (Yes or No).""" + + else: + raise ValueError(f"Unknown prompt style: {prompt_style}") diff --git a/bench/vllm_semantic_router_bench/router_reason_bench_multi_dataset.py b/bench/vllm_semantic_router_bench/router_reason_bench_multi_dataset.py index 5a963ff0..995b49c0 100644 --- a/bench/vllm_semantic_router_bench/router_reason_bench_multi_dataset.py +++ b/bench/vllm_semantic_router_bench/router_reason_bench_multi_dataset.py @@ -32,10 +32,15 @@ from .dataset_interface import DatasetInfo, Question, questions_to_dataframe # Robust answer extraction patterns for structured response parsing -ANSWER_PATTERN_PRIMARY = re.compile(r"(?:answer\s*:?\s*)([A-Z])", re.IGNORECASE) -ANSWER_PATTERN_FINAL = re.compile(r"(?:final\s*answer\s*:?\s*)([A-Z])", re.IGNORECASE) +ANSWER_PATTERN_PRIMARY = re.compile( + r"(?:answer\s*:?\s+)([A-Z])(?:\s|[.!?)]|$)", re.IGNORECASE +) +ANSWER_PATTERN_FINAL = re.compile( + r"(?:final\s*answer\s*:?\s+)([A-Z])(?:\s|[.!?)]|$)", re.IGNORECASE +) ANSWER_PATTERN_CONCLUSION = re.compile( - r"(?:therefore|thus|so).*?([A-Z])", re.IGNORECASE + r"(?:therefore|thus|so).*?(?:answer\s+is\s+|is\s+)([A-Z])(?:\s|[.!?)]|$)", + re.IGNORECASE, ) @@ -196,25 +201,46 @@ def get_dataset_optimal_tokens(dataset_info, model_name=None): model_multiplier = 1.0 if model_name: model_lower = model_name.lower() + print(f" 🔍 Model detection: '{model_name}' -> '{model_lower}'") if "qwen" in model_lower: # Qwen models are more efficient and can handle longer contexts model_multiplier = 1.5 + print(f" ✅ Qwen model detected, using multiplier: {model_multiplier}") elif "deepseek" in model_lower: # DeepSeek models (e.g., V3.1) are capable and can handle longer contexts model_multiplier = 1.5 + print(f" ✅ DeepSeek model detected, using multiplier: {model_multiplier}") elif "gpt-oss" in model_lower: # GPT-OSS models use baseline token limits model_multiplier = 1.0 + print(f" ✅ GPT-OSS model detected, using multiplier: {model_multiplier}") + else: + print( + f" 
⚠️ Unknown model type, using baseline multiplier: {model_multiplier}" + ) # Default to baseline for unknown models - # Base token limits per dataset (optimized for gpt-oss20b baseline) + # Base token limits per dataset (optimized for reasoning tasks with generous headroom) base_dataset_tokens = { - "gpqa": 3000, # Graduate-level scientific reasoning (increased for complex multi-step reasoning) - "truthfulqa": 800, # Misconception analysis - "hellaswag": 800, # Natural continuation reasoning - "arc": 800, # Elementary/middle school science - "commonsenseqa": 1000, # Common sense reasoning - "mmlu": 3000, # Academic knowledge (increased for complex technical domains like engineering/chemistry) + # Proven optimal datasets + "gpqa": 4000, # Graduate-level scientific reasoning (proven optimal from results) + "mmlu": 4000, # Academic knowledge (proven optimal from results) + "truthfulqa": 2500, # Misconception analysis (proven adequate from results) + # Mathematical reasoning datasets + # "math": 6000, # Competition mathematics - DISABLED: dataset not available + "gsm8k": 2500, # Elementary math word problems - simpler than competition math + "aqua-rat": 3000, # Algebraic word problems with rationales + # Multi-step reasoning datasets + "drop": 4000, # Reading comprehension with discrete reasoning - complex passages + "strategyqa": 3500, # Multi-step implicit reasoning - requires detailed thinking + # Scientific reasoning datasets + "sciq": 2000, # Science questions - moderate complexity + "openbookqa": 2500, # Elementary science with fact reasoning + # Other datasets + "hellaswag": 2000, # Natural continuation reasoning + "arc": 2000, # Elementary/middle school science + "arc-challenge": 3000, # Harder ARC questions + "commonsenseqa": 2500, # Common sense reasoning } # Find matching dataset and apply model multiplier @@ -229,16 +255,34 @@ def get_dataset_optimal_tokens(dataset_info, model_name=None): difficulty_tokens = {"graduate": 300, "hard": 300, "moderate": 200, "easy": 150} base_tokens = difficulty_tokens.get(difficulty, 200) + # Special case: Qwen3 models need higher tokens for complex reasoning datasets + if model_name and "qwen" in model_name.lower(): + if "mmlu" in dataset_name or "gpqa" in dataset_name: + final_tokens = 10240 + dataset_type = "MMLU" if "mmlu" in dataset_name else "GPQA" + print( + f" 🎯 Special case: Qwen3 + {dataset_type} = {final_tokens} tokens (fixed requirement)" + ) + return final_tokens + # elif "math" in dataset_name: # DISABLED: dataset not available + # final_tokens = 8000 # Competition math needs extensive proofs + # print(f" 🎯 Special case: Qwen3 + MATH = {final_tokens} tokens (competition math requirement)") + # return final_tokens + # Apply model-specific multiplier and round to nearest 50 final_tokens = int(base_tokens * model_multiplier) final_tokens = ((final_tokens + 25) // 50) * 50 # Round to nearest 50 + print( + f" 🧮 Token calculation: {base_tokens} × {model_multiplier} = {int(base_tokens * model_multiplier)} → {final_tokens} (rounded)" + ) + return final_tokens def get_available_models(endpoint: str, api_key: str = "") -> List[str]: """Get available models from an endpoint.""" - client = OpenAI(base_url=endpoint, api_key=api_key or None) + client = OpenAI(base_url=endpoint, api_key=api_key or None, timeout=300.0) try: models = client.models.list() return [m.id for m in models.data] @@ -247,8 +291,8 @@ def get_available_models(endpoint: str, api_key: str = "") -> List[str]: return [] -def extract_answer(response: Any) -> Optional[str]: - """Extract 
answer from model response.""" +def extract_answer(response: Any, question: Optional[Question] = None) -> Optional[str]: + """Extract answer from model response based on question format.""" # Normalize non-string responses into a string to be robust to providers # that return structured content (e.g., lists of parts or dicts). if response is None: @@ -285,6 +329,39 @@ def extract_answer(response: Any) -> Optional[str]: except Exception: response = str(response) + # First, try to extract structured answer format "ANSWER: [value]" + structured_answer = extract_structured_answer(response) + if structured_answer: + return structured_answer + + # Determine answer format based on question type + if question and hasattr(question, "options") and question.options: + if len(question.options) == 2 and set(question.options) == {"Yes", "No"}: + # Binary Yes/No questions (StrategyQA) + return extract_binary_answer(response) + else: + # Multiple choice questions (GPQA, MMLU, etc.) + return extract_multiple_choice_answer(response) + else: + # Free-form questions (GSM8K, DROP, etc.) + return extract_free_form_answer(response) + + +def extract_structured_answer(response: str) -> Optional[str]: + """Extract answer from structured 'ANSWER: [value]' format.""" + # Look for "ANSWER: [value]" pattern (case insensitive) + pattern = re.compile(r"ANSWER:\s*(.+?)(?:\n|$)", re.IGNORECASE) + match = pattern.search(response) + if match: + answer = match.group(1).strip() + # Clean up common trailing punctuation + answer = re.sub(r"[.!?]+$", "", answer) + return answer + return None + + +def extract_multiple_choice_answer(response: str) -> Optional[str]: + """Extract multiple choice answer (A, B, C, D, etc.).""" # Try multiple extraction patterns in order of preference patterns = [ANSWER_PATTERN_PRIMARY, ANSWER_PATTERN_FINAL, ANSWER_PATTERN_CONCLUSION] @@ -293,6 +370,20 @@ def extract_answer(response: Any) -> Optional[str]: if match: return match.group(1).upper() + # Additional patterns for common answer formats + additional_patterns = [ + r"(?:correct\s+answer\s+is\s+)([A-Z])", # "correct answer is E" + r"(?:option\s+)([A-Z])", # "option E" + r"(?:choice\s+)([A-Z])", # "choice E" + r"([A-Z])\)", # "E)" format + r"([A-Z])\s*[.!]?\s*$", # Letter at end of line + ] + + for pattern in additional_patterns: + match = re.search(pattern, response, re.IGNORECASE) + if match: + return match.group(1).upper() + # Fallback 1: Look for standalone letters at end of response lines = response.strip().split("\n") for line in reversed(lines[-3:]): # Check last 3 lines @@ -300,14 +391,206 @@ def extract_answer(response: Any) -> Optional[str]: if len(line) == 1 and line.upper() in "ABCDEFGHIJKLMNOPQRSTUVWXYZ": return line.upper() - # Fallback 2: Find last letter in entire response - for char in reversed(response): - if char.upper() in "ABCDEFGHIJKLMNOPQRSTUVWXYZ": - return char.upper() + # Fallback 2: Look for letters in specific contexts (more targeted) + # Check for patterns like "is E" or "answer E" in last few lines + for line in reversed(lines[-3:]): + line = line.strip() + # Look for letter after common words + context_match = re.search( + r"(?:is|answer|option|choice)\s+([A-Z])(?:\s|[.!?]|$)", line, re.IGNORECASE + ) + if context_match: + return context_match.group(1).upper() + + # Final fallback: Find last letter that appears to be an answer (not in middle of words) + # Only consider letters that are standalone or followed by punctuation + for match in re.finditer(r"\b([A-Z])(?:\s|[.!?)]|$)", response): + letter = 
match.group(1).upper() + if letter in "ABCDEFGHIJKLMNOPQRSTUVWXYZ": + return letter # Return the last match found + + return None + + +def extract_binary_answer(response: str) -> Optional[str]: + """Extract Yes/No answer from response.""" + response_lower = response.lower() + + # Look for explicit yes/no patterns + yes_patterns = [r"\byes\b", r"\btrue\b", r"\bcorrect\b", r"\baffirmative\b"] + no_patterns = [r"\bno\b", r"\bfalse\b", r"\bincorrect\b", r"\bnegative\b"] + + # Check last few lines first (most likely to contain final answer) + lines = response.strip().split("\n") + for line in reversed(lines[-3:]): + line_lower = line.lower().strip() + + for pattern in yes_patterns: + if re.search(pattern, line_lower): + return "Yes" + + for pattern in no_patterns: + if re.search(pattern, line_lower): + return "No" + + # Fallback: check entire response + for pattern in yes_patterns: + if re.search(pattern, response_lower): + return "Yes" + + for pattern in no_patterns: + if re.search(pattern, response_lower): + return "No" + + return None + + +def extract_free_form_answer(response: str) -> Optional[str]: + """Extract free-form answer (numbers, text, etc.).""" + # For numerical answers, look for numbers with improved patterns + number_patterns = [ + r"(?:answer\s*:?\s*)([0-9,.-]+)", # "Answer: 42" or "Answer 42" + r"####\s*([0-9,.-]+)", # GSM8K format "#### 42" + r"\$([0-9,.-]+)", # Money format "$42" + r"([0-9,.-]+)\s*(?:dollars?|cents?|%|percent)", # "42 dollars" + r"(?:is\s+)([0-9,.-]+)", # "is 42" or "is 68.5" + r"(?:was\s+)([0-9,.-]+)", # "was 42" + r"(?:were\s+)([0-9,.-]+)", # "were 42" + r"([0-9,.-]+)(?:\s+(?:people|units|items|years|days|months|miles|kilometers|percent|%|dollars?|cents?))", # "68.5 people" + ] + + # Check last few lines first (most likely to contain final answer) + lines = response.strip().split("\n") + for line in reversed(lines[-3:]): + line = line.strip() + + for pattern in number_patterns: + match = re.search(pattern, line, re.IGNORECASE) + if match: + return match.group(1).replace(",", "") # Remove commas from numbers + + # Fallback: check entire response for numbers + for pattern in number_patterns: + match = re.search(pattern, response, re.IGNORECASE) + if match: + return match.group(1).replace(",", "") + + # For non-numerical free-form answers (like "germans", "Centenary Medal") + # Look for explicit answer patterns first + text_patterns = [ + r"(?:answer\s*:?\s*)([a-zA-Z][a-zA-Z0-9\s-]+?)(?:\s*[.!?]|$)", # "Answer: germans" or "Answer: Centenary Medal" + r"(?:is\s+)([a-zA-Z][a-zA-Z0-9\s-]+?)(?:\s*[.!?]|$)", # "is germans" + r"(?:was\s+)([a-zA-Z][a-zA-Z0-9\s-]+?)(?:\s*[.!?]|$)", # "was Centenary Medal" + r"(?:were\s+)([a-zA-Z][a-zA-Z0-9\s-]+?)(?:\s*[.!?]|$)", # "were germans" + r"(?:awarded\s+(?:him\s+)?(?:the\s+)?)([A-Z][a-zA-Z0-9\s-]+?)(?:\s*[.!?]|$)", # "awarded the Centenary Medal" + r"(?:received\s+(?:the\s+)?)([A-Z][a-zA-Z0-9\s-]+?)(?:\s*[.!?]|$)", # "received the Centenary Medal" + r"(?:called\s+)([a-zA-Z][a-zA-Z0-9\s-]+?)(?:\s*[.!?]|$)", # "called germans" + r"(?:named\s+)([a-zA-Z][a-zA-Z0-9\s-]+?)(?:\s*[.!?]|$)", # "named Centenary Medal" + ] + + # Check last few lines for text answers + for line in reversed(lines[-3:]): + line = line.strip() + + for pattern in text_patterns: + match = re.search(pattern, line, re.IGNORECASE) + if match: + answer = match.group(1).strip() + # Clean up common suffixes but preserve important words + answer = re.sub( + r"\s+(?:in\s+\d+|for\s+service).*$", "", answer, flags=re.IGNORECASE + ) + # Limit to reasonable length (1-4 
words for most DROP answers) + words = answer.split() + if len(words) <= 4: + return answer + else: + return " ".join(words[:2]) # Take first 2 words for long matches + + # Final fallback: extract last meaningful line + for line in reversed(lines[-3:]): + line = line.strip() + if line and not line.startswith( + ( + "Question:", + "Answer:", + "Therefore", + "So", + "Thus", + "Based on", + "Looking at", + ) + ): + # Remove common prefixes and return clean answer + line = re.sub( + r"^(?:the\s+)?(?:answer\s+is\s+)?", "", line, flags=re.IGNORECASE + ) + # Take first few words if it's a long sentence + words = line.split() + if len(words) > 5: + return " ".join(words[:3]) # Take first 3 words + return line.strip() return None +def compare_free_form_answers(predicted: str, correct: str) -> bool: + """Compare free-form answers with normalization.""" + if not predicted or not correct: + return False + + # Normalize both answers + predicted_norm = normalize_answer(predicted) + correct_norm = normalize_answer(correct) + + # Direct match + if predicted_norm == correct_norm: + return True + + # For numerical answers, try parsing as numbers + try: + pred_num = float(predicted_norm.replace(",", "")) + correct_num = float(correct_norm.replace(",", "")) + # Allow small floating point differences + return abs(pred_num - correct_num) < 1e-6 + except (ValueError, AttributeError): + pass + + # For text answers, check if predicted contains correct or vice versa + if len(predicted_norm) > 3 and len(correct_norm) > 3: + return predicted_norm in correct_norm or correct_norm in predicted_norm + + return False + + +def normalize_answer(answer: str) -> str: + """Normalize answer for comparison.""" + if not isinstance(answer, str): + answer = str(answer) + + # Convert to lowercase and strip + answer = answer.lower().strip() + + # Remove common punctuation and extra spaces + answer = re.sub(r"[^\w\s.-]", "", answer) + answer = re.sub(r"\s+", " ", answer).strip() + + # Remove common prefixes + prefixes = [ + "the answer is", + "answer:", + "the answer:", + "answer is", + "final answer:", + "therefore", + ] + for prefix in prefixes: + if answer.startswith(prefix): + answer = answer[len(prefix) :].strip() + + return answer + + def call_model( client: OpenAI, model: str, @@ -343,41 +626,41 @@ def build_extra_body_for_model( ) -> Optional[Dict[str, Any]]: """Return an extra_body dict to toggle reasoning for a given model. + This function matches the exact pattern from reasoning_eval_consolidated.py + to ensure compatibility and consistent behavior. 
+ - DeepSeek v3.1: {"chat_template_kwargs": {"thinking": true/false}} - - GPT-OSS: {"reasoning_effort": "low|medium|high"} when ON; if not provided, then low + - Qwen3: {"chat_template_kwargs": {"enable_thinking": true/false}} + - GPT-OSS: {"reasoning_effort": "low|high"} based on reasoning flag """ - # reasoning: True -> ON, False -> OFF, None -> base (default behavior) + # reasoning: True -> ON, False -> OFF, None -> no reasoning parameters + if reasoning is None: + return None lower = model_name.lower() + + # DeepSeek v3.1 family (matches reasoning_eval_consolidated.py pattern) if (("ds" in lower) or ("deepseek" in lower)) and ( "v31" in lower or "v3.1" in lower or "v3" in lower ): - if reasoning is True: - return {"chat_template_kwargs": {"thinking": True}} - elif reasoning is False: - return {"chat_template_kwargs": {"thinking": False}} - else: # reasoning is None (base mode) - # Base: do not set thinking for DeepSeek - let it use default behavior - return None - - # Qwen3 family + return {"chat_template_kwargs": {"thinking": reasoning}} + + # Qwen3 family (matches reasoning_eval_consolidated.py pattern) if "qwen3" in lower: - if reasoning is True: - return {"chat_template_kwargs": {"enable_thinking": True}} - if reasoning is False: - return {"chat_template_kwargs": {"enable_thinking": False}} - return None + return {"chat_template_kwargs": {"enable_thinking": reasoning}} - # GPT OSS family + # GPT-OSS family (matches reasoning_eval_consolidated.py pattern) if "gpt-oss" in lower or "openai/gpt-oss" in lower or "gpt_oss" in lower: - if reasoning is True: + if reasoning: return {"reasoning_effort": "high"} - elif reasoning is False: + else: return {"reasoning_effort": "low"} - else: # reasoning is None (base mode) - # Base: do not set reasoning_effort - let it use default behavior - return None + # OpenAI models with reasoning parameter + if "gpt" in lower or "o1" in lower: + return {"reasoning": reasoning} + + # Model does not support reasoning parameters return None @@ -396,13 +679,17 @@ def process_question_single( # Format prompt based on mode if prompt_mode == "XC": prompt = dataset.format_prompt(question, "explicit_cot") - extra_body = None + extra_body = ( + None # XC mode never uses reasoning parameters (CoT prompt instead) + ) elif prompt_mode == "AR": prompt = dataset.format_prompt(question, "plain") extra_body = ar_extra_body - else: # NR or Router-Transparent + else: # NR mode (could be Router-Transparent or direct vLLM) prompt = dataset.format_prompt(question, "plain") - extra_body = None + # For Router-Transparent: ar_extra_body=None (router decides reasoning) + # For direct vLLM: ar_extra_body contains reasoning parameters + extra_body = ar_extra_body start_time = time.time() response_text, success, prompt_tokens, completion_tokens, total_tokens = call_model( @@ -410,21 +697,29 @@ def process_question_single( ) end_time = time.time() - predicted_answer = extract_answer(response_text) if success else None - - # Compare predicted answer with correct answer (handle both letter and index formats) - if predicted_answer and predicted_answer in "ABCDEFGHIJKLMNOPQRSTUVWXYZ": - if isinstance(question.correct_answer, str): - # Dataset stores answer as letter (e.g., MMLU: "F") - is_correct = predicted_answer == question.correct_answer - elif isinstance(question.correct_answer, int): - # Dataset stores answer as index (e.g., CommonsenseQA: 1, ARC: 0) - predicted_idx = ord(predicted_answer) - ord("A") - is_correct = predicted_idx == question.correct_answer + predicted_answer = 
extract_answer(response_text, question) if success else None + + # Compare predicted answer with correct answer (handle multiple formats) + is_correct = False + if predicted_answer: + if hasattr(question, "options") and question.options: + if len(question.options) == 2 and set(question.options) == {"Yes", "No"}: + # Binary Yes/No questions (StrategyQA) + is_correct = predicted_answer == question.correct_answer + elif predicted_answer in "ABCDEFGHIJKLMNOPQRSTUVWXYZ": + # Multiple choice questions (GPQA, MMLU, etc.) + if isinstance(question.correct_answer, str): + # Dataset stores answer as letter (e.g., MMLU: "F") + is_correct = predicted_answer == question.correct_answer + elif isinstance(question.correct_answer, int): + # Dataset stores answer as index (e.g., CommonsenseQA: 1, ARC: 0) + predicted_idx = ord(predicted_answer) - ord("A") + is_correct = predicted_idx == question.correct_answer else: - is_correct = False - else: - is_correct = False + # Free-form questions (GSM8K, DROP, etc.) + is_correct = compare_free_form_answers( + predicted_answer, question.correct_answer + ) return { "mode": prompt_mode, @@ -456,7 +751,7 @@ def evaluate_model_router_transparent( temperature: float, ) -> pd.DataFrame: """Evaluate model in router-transparent mode.""" - client = OpenAI(base_url=endpoint, api_key=api_key or None) + client = OpenAI(base_url=endpoint, api_key=api_key or None, timeout=300.0) print(f"Using model: {model}, endpoint: {endpoint}") results: List[Dict[str, Any]] = [] @@ -526,7 +821,7 @@ def evaluate_model_vllm_multimode( 2. XC - CoT prompt, no reasoning toggle (prompt-based reasoning) - ONLY if dataset has CoT 3. NR_REASONING - Plain prompt, reasoning toggle ON (model-based reasoning) - ALWAYS included """ - client = OpenAI(base_url=endpoint, api_key=api_key or "dummy-key") + client = OpenAI(base_url=endpoint, api_key=api_key or "dummy-key", timeout=300.0) print(f"Using vLLM model: {model}, endpoint: {endpoint}") # Check if dataset has actual CoT content by examining sample questions @@ -565,35 +860,22 @@ def evaluate_model_vllm_multimode( ) or ("qwen3" in model_lower) # Base modes (always included) - if is_deepseek_or_qwen: - mode_variants: List[Tuple[str, str, Optional[bool]]] = [ - ("VLLM_NR", "NR", False), # Plain prompt, reasoning OFF (baseline) - ( - "VLLM_NR_REASONING", - "NR", - True, - ), # Plain prompt, reasoning ON (model reasoning) - ] - else: - mode_variants: List[Tuple[str, str, Optional[bool]]] = [ - ("VLLM_NR", "NR", None), # Plain prompt, no toggle (baseline) - ( - "VLLM_NR_REASONING", - "NR", - True, - ), # Plain prompt, reasoning toggle ON (model reasoning) - ] + # Always use explicit True/False for reasoning-capable models to ensure consistent behavior + mode_variants: List[Tuple[str, str, Optional[bool]]] = [ + ("VLLM_NR", "NR", False), # Plain prompt, reasoning OFF (baseline) + ( + "VLLM_NR_REASONING", + "NR", + True, + ), # Plain prompt, reasoning ON (model reasoning) + ] # Add XC mode only if dataset has CoT content if has_cot_content: - if is_deepseek_or_qwen: - mode_variants.insert( - 1, ("VLLM_XC", "XC", False) - ) # Insert between NR and NR_REASONING - else: - mode_variants.insert( - 1, ("VLLM_XC", "XC", None) - ) # Insert between NR and NR_REASONING + # Always use explicit False for XC mode (CoT prompt with reasoning OFF) + mode_variants.insert( + 1, ("VLLM_XC", "XC", False) + ) # Insert between NR and NR_REASONING def run_variants(q: Question) -> List[Dict[str, Any]]: local_records: List[Dict[str, Any]] = [] @@ -869,13 +1151,23 @@ def main(): 
print(f"vLLM models: {vllm_models}") # Function to get optimal tokens for a specific model - # For fair comparison, use consistent token limits regardless of model name + # Use model-aware token allocation for optimal performance def get_model_optimal_tokens(model_name): if args.max_tokens: return args.max_tokens else: - # Use base dataset tokens without model-specific multipliers for fair comparison - return get_dataset_optimal_tokens(dataset_info, model_name=None) + # For router evaluation, use the first vLLM model for token calculation if available + # This ensures consistent token allocation between router and vLLM evaluations + reference_model = None + if vllm_models and len(vllm_models) > 0: + reference_model = vllm_models[0] + print( + f" 🔗 Using vLLM model '{reference_model}' for router token calculation" + ) + elif model_name and model_name != "auto": + reference_model = model_name + + return get_dataset_optimal_tokens(dataset_info, model_name=reference_model) # Router evaluation (NR-only) if args.run_router and router_endpoint and router_models: