From a7aafcda7dd9605972114b042d27f319fe91df7d Mon Sep 17 00:00:00 2001 From: Huamin Chen Date: Fri, 12 Sep 2025 13:28:16 +0000 Subject: [PATCH 1/4] feat: implement dataset-agnostic benchmark with multi-category evaluation support. Add ARC, GPQA, TruthfulQA, CommonsenseQA, and HellaSwag datasets with optimized token limits and robust answer extraction. Signed-off-by: Huamin Chen --- bench/LICENSE | 203 +++++ bench/MANIFEST.in | 43 + bench/README.md | 200 ++++ bench/benchmark_comparison.sh | 201 +++++ bench/build_and_test.sh | 50 + bench/comprehensive_bench.sh | 410 +++++++++ bench/pyproject.toml | 132 +++ bench/requirements.txt | 18 + bench/router_reason_bench.py | 107 ++- bench/run_bench.sh | 88 -- bench/setup.py | 104 +++ bench/vllm_semantic_router_bench/__init__.py | 39 + .../bench_plot.py | 118 ++- bench/vllm_semantic_router_bench/cli.py | 288 ++++++ .../dataset_factory.py | 137 +++ .../dataset_implementations/__init__.py | 28 + .../dataset_implementations/arc_dataset.py | 227 +++++ .../commonsenseqa_dataset.py | 190 ++++ .../dataset_implementations/gpqa_dataset.py | 280 ++++++ .../hellaswag_dataset.py | 232 +++++ .../dataset_implementations/mmlu_dataset.py | 159 ++++ .../truthfulqa_dataset.py | 226 +++++ .../dataset_interface.py | 356 ++++++++ .../router_reason_bench_multi_dataset.py | 851 ++++++++++++++++++ 24 files changed, 4520 insertions(+), 167 deletions(-) create mode 100644 bench/LICENSE create mode 100644 bench/MANIFEST.in create mode 100644 bench/README.md create mode 100755 bench/benchmark_comparison.sh create mode 100755 bench/build_and_test.sh create mode 100755 bench/comprehensive_bench.sh create mode 100644 bench/pyproject.toml create mode 100644 bench/requirements.txt delete mode 100755 bench/run_bench.sh create mode 100644 bench/setup.py create mode 100644 bench/vllm_semantic_router_bench/__init__.py rename bench/{ => vllm_semantic_router_bench}/bench_plot.py (86%) create mode 100644 bench/vllm_semantic_router_bench/cli.py create mode 100644 bench/vllm_semantic_router_bench/dataset_factory.py create mode 100644 bench/vllm_semantic_router_bench/dataset_implementations/__init__.py create mode 100644 bench/vllm_semantic_router_bench/dataset_implementations/arc_dataset.py create mode 100644 bench/vllm_semantic_router_bench/dataset_implementations/commonsenseqa_dataset.py create mode 100644 bench/vllm_semantic_router_bench/dataset_implementations/gpqa_dataset.py create mode 100644 bench/vllm_semantic_router_bench/dataset_implementations/hellaswag_dataset.py create mode 100644 bench/vllm_semantic_router_bench/dataset_implementations/mmlu_dataset.py create mode 100644 bench/vllm_semantic_router_bench/dataset_implementations/truthfulqa_dataset.py create mode 100644 bench/vllm_semantic_router_bench/dataset_interface.py create mode 100644 bench/vllm_semantic_router_bench/router_reason_bench_multi_dataset.py diff --git a/bench/LICENSE b/bench/LICENSE new file mode 100644 index 00000000..36308b6b --- /dev/null +++ b/bench/LICENSE @@ -0,0 +1,203 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity granting the License. 
+ + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (which shall not include communications that are clearly marked or + otherwise designated in writing by the copyright owner as "Not a Work"). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based upon (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and derivative works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control + systems, and issue tracking systems that are managed by, or on behalf + of, the Licensor for the purpose of discussing and improving the Work, + but excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to use, reproduce, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Work, and to + permit persons to whom the Work is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Work. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright notice to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Support. You may choose to offer, and to + charge a fee for, warranty, support, indemnity or other liability + obligations and/or rights consistent with this License. However, in + accepting such obligations, You may act only on Your own behalf and + on Your sole responsibility, not on behalf of any other Contributor, + and only if You agree to indemnify, defend, and hold each Contributor + harmless for any liability incurred by, or claims asserted against, + such Contributor by reason of your accepting any such warranty or support. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same page as the copyright notice for easier identification within + third-party archives. + + Copyright 2024 Semantic Router Team + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/bench/MANIFEST.in b/bench/MANIFEST.in new file mode 100644 index 00000000..4d05ab6a --- /dev/null +++ b/bench/MANIFEST.in @@ -0,0 +1,43 @@ +# Include package metadata and documentation +include README.md +include LICENSE +include CHANGELOG.md +include requirements.txt +include pyproject.toml +include setup.py + +# Include shell scripts +include *.sh +include comprehensive_bench.sh +include benchmark_comparison.sh + +# Include dataset implementations +recursive-include dataset_implementations *.py + +# Include example configurations and documentation +include quick_comparison.md + +# Exclude development and testing files +exclude test_*.py +exclude *_test.py +exclude test_*.sh +exclude .gitignore +exclude .pre-commit-config.yaml + +# Exclude build artifacts +global-exclude *.pyc +global-exclude *.pyo +global-exclude *.pyd +global-exclude __pycache__ +global-exclude .git* +global-exclude .DS_Store +global-exclude *.so +global-exclude .pytest_cache +global-exclude .mypy_cache +global-exclude .coverage +global-exclude htmlcov + +# Exclude results and temporary files +global-exclude results/ +global-exclude *.log +global-exclude *.tmp diff --git a/bench/README.md b/bench/README.md new file mode 100644 index 00000000..5ae84c15 --- /dev/null +++ b/bench/README.md @@ -0,0 +1,200 @@ +# vLLM Semantic Router Benchmark Suite + +[![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/) +[![License: Apache 2.0](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) + +A comprehensive benchmark suite for evaluating **semantic router** performance against **direct vLLM** across multiple reasoning datasets. Perfect for researchers and developers working on LLM routing, evaluation, and performance optimization. 
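+
+The comparison in a nutshell: both targets speak the OpenAI-compatible Chat Completions API, so the benchmark sends the same questions to two base URLs and records accuracy, latency, and token usage. The snippet below is a minimal illustration only, assuming the default endpoints and model names used by the bundled scripts (router behind Envoy at `http://127.0.0.1:8801/v1` with model `auto`, direct vLLM at `http://127.0.0.1:8000/v1` serving `openai/gpt-oss-20b`, placeholder API keys); adjust these to your own deployment, and note the sample question is made up for illustration.
+
+```python
+from openai import OpenAI
+
+QUESTION = (
+    "Which gas do plants primarily absorb during photosynthesis? "
+    "A) Oxygen  B) Carbon dioxide  C) Nitrogen  D) Helium. "
+    "Answer with the letter only."
+)
+
+# Router path: Envoy + semantic router decide the model and reasoning behavior.
+router = OpenAI(base_url="http://127.0.0.1:8801/v1", api_key="1234")
+# Direct path: talk to vLLM and name the served model explicitly.
+vllm = OpenAI(base_url="http://127.0.0.1:8000/v1", api_key="1234")
+
+for label, client, model in [
+    ("router", router, "auto"),
+    ("vllm", vllm, "openai/gpt-oss-20b"),
+]:
+    resp = client.chat.completions.create(
+        model=model,
+        messages=[{"role": "user", "content": QUESTION}],
+        temperature=0.0,
+    )
+    print(f"[{label}] {resp.choices[0].message.content}")
+```
+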
+ +## ๐ŸŽฏ Key Features + +- **6 Major Reasoning Datasets**: MMLU-Pro, ARC, GPQA, TruthfulQA, CommonsenseQA, HellaSwag +- **Router vs vLLM Comparison**: Side-by-side performance evaluation +- **Multiple Evaluation Modes**: NR (neutral), XC (explicit CoT), NR_REASONING (auto-reasoning) +- **Research-Ready Output**: CSV files and publication-quality plots +- **Dataset-Agnostic Architecture**: Easy to extend with new datasets +- **CLI Tools**: Simple command-line interface for common operations + +## ๐Ÿš€ Quick Start + +### Installation + +```bash +pip install vllm-semantic-router-bench +``` + +### Basic Usage + +```bash +# Quick test on MMLU dataset +vllm-semantic-router-bench test --dataset mmlu --samples 5 + +# Full comparison between router and vLLM +vllm-semantic-router-bench compare --dataset arc --samples 10 + +# List available datasets +vllm-semantic-router-bench list-datasets + +# Run comprehensive multi-dataset benchmark +vllm-semantic-router-bench comprehensive +``` + +### Python API + +```python +from vllm_semantic_router_bench import DatasetFactory, list_available_datasets + +# Load a dataset +factory = DatasetFactory() +dataset = factory.create_dataset("mmlu") +questions, info = dataset.load_dataset(samples_per_category=10) + +print(f"Loaded {len(questions)} questions from {info.name}") +print(f"Categories: {info.categories}") +``` + +## ๐Ÿ“Š Supported Datasets + +| Dataset | Domain | Categories | Difficulty | CoT Support | +|---------|--------|------------|------------|-------------| +| **MMLU-Pro** | Academic Knowledge | 57 subjects | Undergraduate | โœ… | +| **ARC** | Scientific Reasoning | Science | Grade School | โŒ | +| **GPQA** | Graduate Q&A | Graduate-level | Graduate | โŒ | +| **TruthfulQA** | Truthfulness | Truthfulness | Hard | โŒ | +| **CommonsenseQA** | Common Sense | Common Sense | Hard | โŒ | +| **HellaSwag** | Commonsense NLI | ~50 activities | Moderate | โŒ | + +## ๐Ÿ”ง Advanced Usage + +### Custom Evaluation Script + +```python +import subprocess +import sys + +# Run detailed benchmark with custom parameters +cmd = [ + "router-bench", # Main benchmark script + "--dataset", "mmlu", + "--samples-per-category", "20", + "--run-router", "--router-models", "auto", + "--run-vllm", "--vllm-models", "openai/gpt-oss-20b", + "--vllm-exec-modes", "NR", "NR_REASONING", + "--output-dir", "results/custom_test" +] + +subprocess.run(cmd) +``` + +### Plotting Results + +```bash +# Generate plots from benchmark results +bench-plot --router-dir results/router_mmlu \ + --vllm-dir results/vllm_mmlu \ + --output-dir results/plots \ + --dataset-name "MMLU-Pro" +``` + +## ๐Ÿ“ˆ Research Output + +The benchmark generates research-ready outputs: + +- **CSV Files**: Detailed per-question results and aggregated metrics +- **Master CSV**: Combined results across all test runs +- **Plots**: Accuracy and token usage comparisons +- **Summary Reports**: Markdown reports with key findings + +### Example Output Structure + +``` +results/ +โ”œโ”€โ”€ research_results_master.csv # Main research data +โ”œโ”€โ”€ comparison_20250115_143022/ +โ”‚ โ”œโ”€โ”€ router_mmlu/ +โ”‚ โ”‚ โ””โ”€โ”€ detailed_results.csv +โ”‚ โ”œโ”€โ”€ vllm_mmlu/ +โ”‚ โ”‚ โ””โ”€โ”€ detailed_results.csv +โ”‚ โ”œโ”€โ”€ plots/ +โ”‚ โ”‚ โ”œโ”€โ”€ accuracy_comparison.png +โ”‚ โ”‚ โ””โ”€โ”€ token_usage_comparison.png +โ”‚ โ””โ”€โ”€ RESEARCH_SUMMARY.md +``` + +## ๐Ÿ› ๏ธ Development + +### Local Installation + +```bash +git clone https://github.com/vllm-project/semantic-router +cd semantic-router/bench +pip install -e ".[dev]" +``` + 
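+### Verify the Installation
+
+A quick way to confirm the editable install is wired up is to import the package and list the registered datasets. This is a minimal sketch using only the symbols exported by `vllm_semantic_router_bench/__init__.py` (`__version__`, `list_available_datasets`); it assumes `list_available_datasets()` takes no arguments, as its name suggests.
+
+```python
+# Sanity check for a local editable install.
+import vllm_semantic_router_bench as bench
+
+print("version:", bench.__version__)                  # expected: 1.0.0
+print("datasets:", bench.list_available_datasets())   # e.g. mmlu, arc, gpqa, ...
+```
+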
+### Adding New Datasets + +1. Create a new dataset implementation in `dataset_implementations/` +2. Inherit from `DatasetInterface` +3. Register in `dataset_factory.py` +4. Add tests and documentation + +```python +from vllm_semantic_router_bench import DatasetInterface, Question, DatasetInfo + +class MyDataset(DatasetInterface): + def load_dataset(self, **kwargs): + # Implementation here + pass + + def format_prompt(self, question, style="plain"): + # Implementation here + pass +``` + +## ๐Ÿ“‹ Requirements + +- Python 3.8+ +- OpenAI API access (for model evaluation) +- Hugging Face account (for dataset access) +- 4GB+ RAM (for larger datasets) + +### Dependencies + +- `openai>=1.0.0` - OpenAI API client +- `datasets>=2.14.0` - Hugging Face datasets +- `pandas>=1.5.0` - Data manipulation +- `matplotlib>=3.5.0` - Plotting +- `seaborn>=0.11.0` - Advanced plotting +- `tqdm>=4.64.0` - Progress bars + +## ๐Ÿค Contributing + +We welcome contributions! Please see our [Contributing Guidelines](CONTRIBUTING.md) for details. + +### Common Contributions + +- Adding new datasets +- Improving evaluation metrics +- Enhancing visualization +- Performance optimizations +- Documentation improvements + +## ๐Ÿ“„ License + +This project is licensed under the Apache License 2.0 - see the [LICENSE](LICENSE) file for details. + +## ๐Ÿ”— Links + +- **Documentation**: https://vllm-semantic-router.com +- **GitHub**: https://github.com/vllm-project/semantic-router +- **Issues**: https://github.com/vllm-project/semantic-router/issues +- **PyPI**: https://pypi.org/project/vllm-semantic-router-bench/ + +## ๐Ÿ“ž Support + +- **GitHub Issues**: Bug reports and feature requests +- **Documentation**: Comprehensive guides and API reference +- **Community**: Join our discussions and get help from other users + +--- + +**Made with โค๏ธ by the vLLM Semantic Router Team** diff --git a/bench/benchmark_comparison.sh b/bench/benchmark_comparison.sh new file mode 100755 index 00000000..94e2e862 --- /dev/null +++ b/bench/benchmark_comparison.sh @@ -0,0 +1,201 @@ +#!/bin/bash + +# Multi-Dataset Reasoning Benchmark Comparison +# +# Comprehensive evaluation framework comparing semantic router performance +# against direct vLLM inference across reasoning datasets. +# +# Usage: ./benchmark_comparison.sh [dataset] [samples_per_category] [concurrent_requests] +# Example: ./benchmark_comparison.sh gpqa 5 2 + +set -e + +# Configuration parameters +DATASET=${1:-"arc"} +SAMPLES_PER_CATEGORY=${2:-5} +CONCURRENT_REQUESTS=${3:-2} + +# Semantic router configuration +ROUTER_ENDPOINT="http://127.0.0.1:8801/v1" +ROUTER_API_KEY="1234" +ROUTER_MODEL="auto" + +# Direct vLLM configuration +VLLM_ENDPOINT="http://127.0.0.1:8000/v1" +VLLM_API_KEY="1234" +VLLM_MODEL="openai/gpt-oss-20b" + +# Evaluation parameters +TEMPERATURE=0.0 +OUTPUT_DIR="results/comparison_$(date +%Y%m%d_%H%M%S)" + +echo "๐ŸŽฏ MULTI-DATASET REASONING BENCHMARK" +echo "=====================================" +echo "Dataset: $DATASET" +echo "Samples per category: $SAMPLES_PER_CATEGORY" +echo "Concurrent requests: $CONCURRENT_REQUESTS" +echo "Output directory: $OUTPUT_DIR" +echo "" + +# Ensure we're in the bench directory +cd "$(dirname "$0")" + +# Activate virtual environment if it exists +if [ -f "../.venv/bin/activate" ]; then + echo "๐Ÿ“ฆ Activating virtual environment..." 
+ source ../.venv/bin/activate +fi + +# Create output directory +mkdir -p "$OUTPUT_DIR" + +echo "๐Ÿ”„ PHASE 1: ROUTER EVALUATION (via Envoy)" +echo "------------------------------------------" +echo "Endpoint: $ROUTER_ENDPOINT" +echo "Model: $ROUTER_MODEL (router decides)" +echo "" + +# Run router benchmark +python3 -m vllm_semantic_router_bench.router_reason_bench_multi_dataset \ + --dataset "$DATASET" \ + --samples-per-category "$SAMPLES_PER_CATEGORY" \ + --concurrent-requests "$CONCURRENT_REQUESTS" \ + --router-endpoint "$ROUTER_ENDPOINT" \ + --router-api-key "$ROUTER_API_KEY" \ + --router-models "$ROUTER_MODEL" \ + --temperature "$TEMPERATURE" \ + --output-dir "$OUTPUT_DIR" \ + --run-router + +echo "" +echo "๐Ÿ”„ PHASE 2: DIRECT vLLM EVALUATION" +echo "-----------------------------------" +echo "Endpoint: $VLLM_ENDPOINT" +echo "Model: $VLLM_MODEL (direct access)" +echo "" + +# Run direct vLLM benchmark +python3 -m vllm_semantic_router_bench.router_reason_bench_multi_dataset \ + --dataset "$DATASET" \ + --samples-per-category "$SAMPLES_PER_CATEGORY" \ + --concurrent-requests "$CONCURRENT_REQUESTS" \ + --vllm-endpoint "$VLLM_ENDPOINT" \ + --vllm-api-key "$VLLM_API_KEY" \ + --vllm-models "$VLLM_MODEL" \ + --vllm-exec-modes "NR" "XC" \ + --temperature "$TEMPERATURE" \ + --output-dir "$OUTPUT_DIR" \ + --run-vllm + +echo "" +echo "๐ŸŽจ PHASE 3: GENERATING COMPARISON PLOTS" +echo "----------------------------------------" + +# Generate plots comparing router vs vLLM +ROUTER_RESULT=$(find "$OUTPUT_DIR" -name "*router*auto*" -type d | head -1) +VLLM_RESULT=$(find "$OUTPUT_DIR" -name "*vllm*gpt-oss*" -type d | head -1) + +if [ -n "$ROUTER_RESULT" ] && [ -f "$ROUTER_RESULT/summary.json" ] && [ -n "$VLLM_RESULT" ] && [ -f "$VLLM_RESULT/summary.json" ]; then + echo "Creating comparison plots (router plotted first for visibility)..." + + # Create plots directory + PLOTS_DIR="$OUTPUT_DIR/plots" + mkdir -p "$PLOTS_DIR" + + # Generate vLLM plots with router overlay (router plotted first) + python3 -m vllm_semantic_router_bench.bench_plot \ + --summary "$VLLM_RESULT/summary.json" \ + --router-summary "$ROUTER_RESULT/summary.json" \ + --out-dir "$PLOTS_DIR" \ + --metrics accuracy avg_response_time avg_total_tokens \ + --font-scale 1.4 \ + --dpi 300 + + echo "โœ… Plots generated in: $PLOTS_DIR" + echo " - bench_plot_accuracy.png (+ PDF)" + echo " - bench_plot_avg_response_time.png (+ PDF)" + echo " - bench_plot_avg_total_tokens.png (+ PDF)" + echo " ๐Ÿ“Š Router trend lines plotted first to remain visible even with overlapping dots" +else + echo "โš ๏ธ Skipping plots - missing result files" +fi + +echo "" +echo "๐Ÿ“Š BENCHMARK COMPLETED!" 
+echo "=======================" +echo "Results saved to: $OUTPUT_DIR" +echo "" + +# Display quick summary if results exist +echo "๐Ÿ“ˆ QUICK SUMMARY:" +echo "-----------------" + +# Find and display router results +ROUTER_RESULT=$(find "$OUTPUT_DIR" -name "*router*auto*" -type d | head -1) +if [ -n "$ROUTER_RESULT" ] && [ -f "$ROUTER_RESULT/summary.json" ]; then + echo "๐Ÿ”€ Router (via Envoy):" + python3 -c " +import json, sys +try: + with open('$ROUTER_RESULT/summary.json') as f: + data = json.load(f) + print(f\" Accuracy: {data.get('overall_accuracy', 0):.3f}\") + print(f\" Avg Latency: {data.get('avg_response_time', 0):.2f}s\") + print(f\" Avg Tokens: {data.get('avg_total_tokens', 0):.0f}\") + print(f\" Questions: {data.get('successful_queries', 0)}/{data.get('total_questions', 0)}\") +except Exception as e: + print(f\" Error reading router results: {e}\") +" +fi + +# Find and display vLLM results +VLLM_RESULT=$(find "$OUTPUT_DIR" -name "*vllm*gpt-oss*" -type d | head -1) +if [ -n "$VLLM_RESULT" ] && [ -f "$VLLM_RESULT/summary.json" ]; then + echo "๐ŸŽฏ Direct vLLM:" + python3 -c " +import json, sys +try: + with open('$VLLM_RESULT/summary.json') as f: + data = json.load(f) + print(f\" Accuracy: {data.get('overall_accuracy', 0):.3f}\") + print(f\" Avg Latency: {data.get('avg_response_time', 0):.2f}s\") + print(f\" Avg Tokens: {data.get('avg_total_tokens', 0):.0f}\") + print(f\" Questions: {data.get('successful_queries', 0)}/{data.get('total_questions', 0)}\") + + # Show breakdown by mode if available + by_mode = data.get('by_mode', {}) + if by_mode: + print(\" Mode Breakdown:\") + for mode, metrics in by_mode.items(): + if 'accuracy' in metrics: + print(f\" {mode}: {metrics['accuracy']:.3f} acc, {metrics.get('avg_response_time', 0):.2f}s\") +except Exception as e: + print(f\" Error reading vLLM results: {e}\") +" +fi + +echo "" +echo "๐Ÿ” DETAILED ANALYSIS:" +echo "--------------------" +echo "- Router results: $ROUTER_RESULT" +echo "- vLLM results: $VLLM_RESULT" +echo "- Comparison plots: $OUTPUT_DIR/plots/" +echo "- Compare CSV files for detailed question-by-question analysis" +echo "- Check summary.json files for comprehensive metrics" +echo "" + +echo "๐Ÿ“Š VISUALIZATION FILES:" +echo "----------------------" +if [ -d "$OUTPUT_DIR/plots" ]; then + echo "- Accuracy comparison: $OUTPUT_DIR/plots/bench_plot_accuracy.png" + echo "- Response time comparison: $OUTPUT_DIR/plots/bench_plot_avg_response_time.png" + echo "- Token usage comparison: $OUTPUT_DIR/plots/bench_plot_avg_total_tokens.png" + echo "- PDF versions also available in same directory" +else + echo "- No plots generated (check for errors above)" +fi +echo "" + +echo "โœ… Benchmark comparison complete!" +echo "Run with different datasets: $0 mmlu 10" +echo "Run with different datasets: $0 arc-challenge 3" diff --git a/bench/build_and_test.sh b/bench/build_and_test.sh new file mode 100755 index 00000000..fc27a3aa --- /dev/null +++ b/bench/build_and_test.sh @@ -0,0 +1,50 @@ +#!/bin/bash + +# Build and test script for vLLM Semantic Router Bench PyPI package + +set -e + +echo "๐Ÿ”จ Building vLLM Semantic Router Bench Package" +echo "==============================================" + +# Clean previous builds +echo "๐Ÿงน Cleaning previous builds..." +rm -rf build/ dist/ *.egg-info/ +find vllm_semantic_router_bench/ -name "__pycache__" -type d -exec rm -rf {} + 2>/dev/null || true +find vllm_semantic_router_bench/ -name "*.pyc" -delete 2>/dev/null || true + +# Build the package +echo "๐Ÿ“ฆ Building package..." 
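+# Note (assumption): `python -m build` requires the PyPA `build` package, and the
+# PyPI upload step suggested at the end of this script uses `twine`; neither is
+# listed in requirements.txt, so install them beforehand if they are missing:
+#   pip install build twine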
+python -m build + +# Test installation in virtual environment +echo "๐Ÿงช Testing installation..." +python -m venv test_env +source test_env/bin/activate + +# Install the built package +pip install dist/*.whl + +# Test imports +echo "๐Ÿ” Testing imports..." +python -m vllm_semantic_router_bench.test_package + +# Test CLI commands +echo "๐Ÿ–ฅ๏ธ Testing CLI commands..." +echo "Available commands:" +vllm-semantic-router-bench --help | head -10 + +# Clean up +deactivate +rm -rf test_env/ + +echo "" +echo "โœ… Package build and test completed successfully!" +echo "" +echo "๐Ÿ“‹ Next steps:" +echo "1. Review the built package in dist/" +echo "2. Test installation: pip install dist/*.whl" +echo "3. Upload to PyPI: twine upload dist/*" +echo "" +echo "๐Ÿ“ฆ Files ready for PyPI:" +ls -la dist/ diff --git a/bench/comprehensive_bench.sh b/bench/comprehensive_bench.sh new file mode 100755 index 00000000..bd262798 --- /dev/null +++ b/bench/comprehensive_bench.sh @@ -0,0 +1,410 @@ +#!/bin/bash + +# Comprehensive Multi-Dataset Benchmark Script for Research Report +# This script benchmarks all available datasets with reasonable sample sizes +# for statistical significance while maintaining manageable runtime. + +set -e + +# Configuration +VENV_PATH="../.venv" +ROUTER_ENDPOINT="http://127.0.0.1:8801/v1" +VLLM_ENDPOINT="http://127.0.0.1:8000/v1" +VLLM_MODEL="openai/gpt-oss-20b" +ROUTER_MODEL="auto" +OUTPUT_BASE="results/comprehensive_research_$(date +%Y%m%d_%H%M%S)" + +# Single persistent CSV file for all research results +PERSISTENT_RESEARCH_CSV="results/research_results_master.csv" + +# Dataset configurations (dataset_name:samples_per_category) +# Balanced for statistical significance vs runtime +declare -A DATASET_CONFIGS=( + ["mmlu"]=10 # 57 subjects ร— 10 = 570 samples + ["arc"]=15 # 1 category ร— 15 = 15 samples + ["gpqa"]=20 # 1 category ร— 20 = 20 samples + ["truthfulqa"]=15 # 1 category ร— 15 = 15 samples + ["commonsenseqa"]=20 # 1 category ร— 20 = 20 samples + ["hellaswag"]=8 # ~50 activities ร— 8 = ~400 samples +) + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +BLUE='\033[0;34m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +echo -e "${BLUE}๐Ÿ”ฌ COMPREHENSIVE MULTI-DATASET BENCHMARK FOR RESEARCH${NC}" +echo -e "${BLUE}====================================================${NC}" +echo "" +echo -e "${YELLOW}Configuration:${NC}" +echo " Router Endpoint: $ROUTER_ENDPOINT" +echo " vLLM Endpoint: $VLLM_ENDPOINT" +echo " vLLM Model: $VLLM_MODEL" +echo " Output Directory: $OUTPUT_BASE" +echo "" +echo -e "${YELLOW}Dataset Sample Sizes:${NC}" +for dataset in "${!DATASET_CONFIGS[@]}"; do + echo " $dataset: ${DATASET_CONFIGS[$dataset]} samples per category" +done +echo "" + +# Activate virtual environment +echo -e "${BLUE}๐Ÿ”ง Activating virtual environment...${NC}" +source "$VENV_PATH/bin/activate" + +# Create output directory +mkdir -p "$OUTPUT_BASE" +mkdir -p "$(dirname "$PERSISTENT_RESEARCH_CSV")" + +# Initialize persistent research results CSV (create header only if file doesn't exist) +if [[ ! 
-f "$PERSISTENT_RESEARCH_CSV" ]]; then + echo "Dataset,Mode,Model,Accuracy,Avg_Latency_ms,Avg_Total_Tokens,Sample_Count,Timestamp" > "$PERSISTENT_RESEARCH_CSV" + echo -e "${GREEN}๐Ÿ“Š Created new master research CSV: $PERSISTENT_RESEARCH_CSV${NC}" +else + echo -e "${BLUE}๐Ÿ“Š Using existing master research CSV: $PERSISTENT_RESEARCH_CSV${NC}" +fi + +# Also create a timestamped copy for this run +RESEARCH_CSV="$OUTPUT_BASE/research_results.csv" +cp "$PERSISTENT_RESEARCH_CSV" "$RESEARCH_CSV" + +# Function to extract metrics from results and append to research CSV +extract_and_save_metrics() { + local dataset=$1 + local mode=$2 # "router" or "vllm" + local results_dir=$3 + local timestamp=$(date '+%Y-%m-%d %H:%M:%S') + + # Find the results files (handle nested directory structure) + local summary_file="" + local detailed_file="" + + # Look for files in nested directories + if [[ -d "$results_dir" ]]; then + summary_file=$(find "$results_dir" -name "results_summary.csv" -type f | head -1) + if [[ -z "$summary_file" ]]; then + detailed_file=$(find "$results_dir" -name "detailed_results.csv" -type f | head -1) + fi + fi + + # Use whichever file we found + local target_file="" + if [[ -f "$summary_file" ]]; then + target_file="$summary_file" + elif [[ -f "$detailed_file" ]]; then + target_file="$detailed_file" + fi + + if [[ -n "$target_file" && -f "$target_file" ]]; then + echo -e "${YELLOW} ๐Ÿ“Š Extracting metrics from $target_file...${NC}" + + # Extract overall metrics from the CSV file + # Skip header and get the last line (overall summary) or calculate averages + local temp_file="/tmp/metrics_$dataset_$mode.txt" + + # Use Python to calculate averages from the CSV + python3 -c " +import pandas as pd +import sys + +try: + df = pd.read_csv('$target_file') + + # Calculate overall metrics (handle different CSV formats) + if len(df) > 0: + # Handle accuracy column (is_correct vs accuracy) + if 'is_correct' in df.columns: + avg_accuracy = df['is_correct'].mean() + elif 'accuracy' in df.columns: + avg_accuracy = df['accuracy'].mean() + else: + avg_accuracy = 0.0 + + # Handle latency column (response_time vs avg_latency_ms) + if 'response_time' in df.columns: + avg_latency = df['response_time'].mean() * 1000 # Convert to ms + elif 'avg_latency_ms' in df.columns: + avg_latency = df['avg_latency_ms'].mean() + else: + avg_latency = 0.0 + + # Handle token column (total_tokens vs avg_total_tokens) + if 'total_tokens' in df.columns: + avg_tokens = df['total_tokens'].mean() + elif 'avg_total_tokens' in df.columns: + avg_tokens = df['avg_total_tokens'].mean() + else: + avg_tokens = 0.0 + + sample_count = len(df) + + # Determine model name + if '$mode' == 'router': + model_name = 'auto' + else: + model_name = 'openai/gpt-oss-20b' + + # For vLLM, we might have multiple modes (NR, NR_REASONING) + if '$mode' == 'vllm' and 'mode' in df.columns: + for mode_type in df['mode'].unique(): + mode_df = df[df['mode'] == mode_type] + + # Recalculate metrics for this specific mode using correct column names + if 'is_correct' in mode_df.columns: + mode_accuracy = mode_df['is_correct'].mean() + elif 'accuracy' in mode_df.columns: + mode_accuracy = mode_df['accuracy'].mean() + else: + mode_accuracy = 0.0 + + if 'response_time' in mode_df.columns: + mode_latency = mode_df['response_time'].mean() * 1000 + elif 'avg_latency_ms' in mode_df.columns: + mode_latency = mode_df['avg_latency_ms'].mean() + else: + mode_latency = 0.0 + + if 'total_tokens' in mode_df.columns: + mode_tokens = mode_df['total_tokens'].mean() + elif 
'avg_total_tokens' in mode_df.columns: + mode_tokens = mode_df['avg_total_tokens'].mean() + else: + mode_tokens = 0.0 + + mode_samples = len(mode_df) + + csv_line = f'$dataset,vLLM_{mode_type},{model_name},{mode_accuracy:.3f},{mode_latency:.1f},{mode_tokens:.1f},{mode_samples},$timestamp' + print(f' ๐Ÿ“ Writing to CSV: {csv_line}', file=sys.stderr) + print(csv_line) + else: + csv_line = f'$dataset,$mode,{model_name},{avg_accuracy:.3f},{avg_latency:.1f},{avg_tokens:.1f},{sample_count},$timestamp' + print(f' ๐Ÿ“ Writing to CSV: {csv_line}', file=sys.stderr) + print(csv_line) + else: + print(f'$dataset,$mode,unknown,0.000,0.0,0.0,0,$timestamp', file=sys.stderr) + +except Exception as e: + print(f'Error processing $target_file: {e}', file=sys.stderr) + print(f'$dataset,$mode,unknown,0.000,0.0,0.0,0,$timestamp', file=sys.stderr) +" | tee -a "$RESEARCH_CSV" >> "$PERSISTENT_RESEARCH_CSV" + + echo -e "${GREEN} โœ… Metrics saved to both timestamped and master research CSV${NC}" + else + echo -e "${RED} โŒ Warning: No results files found in $results_dir${NC}" + # Add a placeholder entry to both files + echo "$dataset,$mode,unknown,0.000,0.0,0.0,0,$timestamp" | tee -a "$RESEARCH_CSV" >> "$PERSISTENT_RESEARCH_CSV" + fi +} + +# Function to run benchmark for a dataset +run_dataset_benchmark() { + local dataset=$1 + local samples=${DATASET_CONFIGS[$dataset]} + + echo -e "${GREEN}๐Ÿ“Š Benchmarking $dataset dataset ($samples samples per category)...${NC}" + + # Router benchmark + echo -e "${YELLOW} ๐Ÿค– Running router evaluation...${NC}" + python3 -m vllm_semantic_router_bench.router_reason_bench_multi_dataset \ + --dataset "$dataset" \ + --samples-per-category "$samples" \ + --run-router \ + --router-endpoint "$ROUTER_ENDPOINT" \ + --router-models "$ROUTER_MODEL" \ + --output-dir "$OUTPUT_BASE/router_$dataset" \ + --seed 42 + + # Extract and save router metrics immediately + extract_and_save_metrics "$dataset" "Router" "$OUTPUT_BASE/router_$dataset" + + # vLLM benchmark + echo -e "${YELLOW} โšก Running vLLM evaluation...${NC}" + python3 -m vllm_semantic_router_bench.router_reason_bench_multi_dataset \ + --dataset "$dataset" \ + --samples-per-category "$samples" \ + --run-vllm \ + --vllm-endpoint "$VLLM_ENDPOINT" \ + --vllm-models "$VLLM_MODEL" \ + --vllm-exec-modes NR NR_REASONING \ + --output-dir "$OUTPUT_BASE/vllm_$dataset" \ + --seed 42 + + # Extract and save vLLM metrics immediately + extract_and_save_metrics "$dataset" "vllm" "$OUTPUT_BASE/vllm_$dataset" + + echo -e "${GREEN} โœ… Completed $dataset benchmark${NC}" + echo "" +} + +# Function to generate comparison plots +generate_plots() { + echo -e "${BLUE}๐Ÿ“ˆ Generating comparison plots...${NC}" + + for dataset in "${!DATASET_CONFIGS[@]}"; do + echo -e "${YELLOW} ๐Ÿ“Š Plotting $dataset results...${NC}" + + python3 -m vllm_semantic_router_bench.bench_plot \ + --router-dir "$OUTPUT_BASE/router_$dataset" \ + --vllm-dir "$OUTPUT_BASE/vllm_$dataset" \ + --output-dir "$OUTPUT_BASE/plots_$dataset" \ + --dataset-name "$dataset" + done + + echo -e "${GREEN} โœ… All plots generated${NC}" + echo "" +} + +# Function to generate summary report +generate_summary() { + echo -e "${BLUE}๐Ÿ“‹ Generating research summary...${NC}" + + local summary_file="$OUTPUT_BASE/RESEARCH_SUMMARY.md" + + cat > "$summary_file" << EOF +# Multi-Dataset Benchmark Research Report + +**Generated:** $(date) +**Configuration:** Router vs vLLM Direct Comparison +**Router Model:** $ROUTER_MODEL +**vLLM Model:** $VLLM_MODEL + +## Dataset Overview + +| Dataset | Samples per Category | 
Total Samples | Categories | Domain | +|---------|---------------------|---------------|------------|---------| +EOF + + # Add dataset details to summary + for dataset in "${!DATASET_CONFIGS[@]}"; do + samples=${DATASET_CONFIGS[$dataset]} + case $dataset in + "mmlu") + echo "| MMLU | $samples | ~570 | 57 subjects | Academic Knowledge |" >> "$summary_file" + ;; + "arc") + echo "| ARC | $samples | $samples | 1 (Science) | Scientific Reasoning |" >> "$summary_file" + ;; + "gpqa") + echo "| GPQA | $samples | $samples | 1 (Graduate) | Graduate-level Q&A |" >> "$summary_file" + ;; + "truthfulqa") + echo "| TruthfulQA | $samples | $samples | 1 (Truthfulness) | Truthful Responses |" >> "$summary_file" + ;; + "commonsenseqa") + echo "| CommonsenseQA | $samples | $samples | 1 (Common Sense) | Commonsense Reasoning |" >> "$summary_file" + ;; + "hellaswag") + echo "| HellaSwag | $samples | ~400 | ~50 activities | Commonsense NLI |" >> "$summary_file" + ;; + esac + done + + cat >> "$summary_file" << EOF + +## Results Summary + +**๐Ÿ“Š Main Research Data**: \`research_results.csv\` - Contains aggregated metrics for all datasets and modes + +### Accuracy Comparison +- Router (auto model with reasoning): See research_results.csv +- vLLM Direct (NR mode): See research_results.csv +- vLLM Direct (NR_REASONING mode): See research_results.csv + +### Token Usage Analysis +- Average tokens per response by dataset and mode (in research_results.csv) +- Efficiency comparison between router and direct vLLM + +### Key Findings +1. **Performance**: [To be filled based on results] +2. **Efficiency**: [To be filled based on token usage] +3. **Dataset-specific Insights**: [To be analyzed from plots] + +## Files Generated + +### Research Data (Primary) +- \`research_results.csv\` - **Main aggregated results for research paper** + +### CSV Results (Detailed) +EOF + + # List all CSV files that will be generated + for dataset in "${!DATASET_CONFIGS[@]}"; do + echo "- \`router_$dataset/results_summary.csv\`" >> "$summary_file" + echo "- \`vllm_$dataset/results_summary.csv\`" >> "$summary_file" + done + + cat >> "$summary_file" << EOF + +### Plots +EOF + + # List all plot files that will be generated + for dataset in "${!DATASET_CONFIGS[@]}"; do + echo "- \`plots_$dataset/bench_plot_accuracy.png\`" >> "$summary_file" + echo "- \`plots_$dataset/bench_plot_avg_total_tokens.png\`" >> "$summary_file" + done + + cat >> "$summary_file" << EOF + +## Usage Instructions + +1. **Review CSV files** for detailed numerical results +2. **Examine plots** for visual comparison trends +3. **Analyze token usage** for efficiency insights +4. 
**Compare across datasets** for model capability assessment + +## Methodology + +- **Seed**: 42 (for reproducibility) +- **Router Mode**: Auto model selection with reasoning +- **vLLM Modes**: NR (neutral) and NR_REASONING (with reasoning) +- **Sample Strategy**: Stratified sampling per category +- **Evaluation**: Exact match accuracy and token usage + +EOF + + echo -e "${GREEN} โœ… Research summary generated: $summary_file${NC}" + echo "" +} + +# Main execution +echo -e "${BLUE}๐Ÿš€ Starting comprehensive benchmark...${NC}" +start_time=$(date +%s) + +# Run benchmarks for all datasets +for dataset in "${!DATASET_CONFIGS[@]}"; do + run_dataset_benchmark "$dataset" +done + +# Generate plots +generate_plots + +# Generate summary +generate_summary + +# Calculate total runtime +end_time=$(date +%s) +runtime=$((end_time - start_time)) +minutes=$((runtime / 60)) +seconds=$((runtime % 60)) + +echo -e "${GREEN}๐ŸŽ‰ COMPREHENSIVE BENCHMARK COMPLETED!${NC}" +echo -e "${GREEN}====================================${NC}" +echo "" +echo -e "${YELLOW}๐Ÿ“Š Results Location:${NC} $OUTPUT_BASE" +echo -e "${YELLOW}โฑ๏ธ Total Runtime:${NC} ${minutes}m ${seconds}s" +echo "" +echo -e "${BLUE}๐Ÿ“‹ Next Steps:${NC}" +echo "1. ๐Ÿ“Š **Master research data**: $PERSISTENT_RESEARCH_CSV" +echo "2. ๐Ÿ“Š **This run's data**: $OUTPUT_BASE/research_results.csv" +echo "3. ๐Ÿ“‹ Review research summary: $OUTPUT_BASE/RESEARCH_SUMMARY.md" +echo "4. ๐Ÿ“ˆ Examine plots for visual insights" +echo "5. ๐Ÿ“„ Analyze detailed CSV files if needed" +echo "" +echo -e "${GREEN}๐ŸŽ“ Research CSV Format:${NC}" +echo " Dataset | Mode | Model | Accuracy | Avg_Latency_ms | Avg_Total_Tokens | Sample_Count | Timestamp" +echo "" +echo -e "${GREEN}๐Ÿ“ˆ Master CSV grows with each test run - perfect for longitudinal analysis!${NC}" +echo -e "${GREEN}โœจ Ready for research report writing!${NC}" diff --git a/bench/pyproject.toml b/bench/pyproject.toml new file mode 100644 index 00000000..ce79128f --- /dev/null +++ b/bench/pyproject.toml @@ -0,0 +1,132 @@ +[build-system] +requires = ["setuptools>=45", "wheel", "setuptools_scm[toml]>=6.2"] +build-backend = "setuptools.build_meta" + +[project] +name = "vllm-semantic-router-bench" +version = "1.0.0" +description = "Comprehensive benchmark suite for semantic router vs direct vLLM evaluation across multiple reasoning datasets" +readme = "README.md" +requires-python = ">=3.8" +license = {text = "Apache-2.0"} +authors = [ + {name = "vLLM Semantic Router Team"}, +] +keywords = [ + "vllm-semantic-router", + "benchmark", + "vllm", + "llm", + "evaluation", + "reasoning", + "multiple-choice", + "mmlu", + "arc", + "gpqa", + "commonsense", + "hellaswag", + "truthfulqa", +] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: Software Development :: Testing", + "Topic :: System :: Benchmark", +] +dependencies = [ + "openai>=1.0.0", + "datasets>=2.14.0", + "pandas>=1.5.0", + "numpy>=1.21.0", + "tqdm>=4.64.0", + "requests>=2.28.0", + "matplotlib>=3.5.0", + "seaborn>=0.11.0", +] + +[project.optional-dependencies] +dev 
= [ + "pytest>=6.0", + "black>=22.0", + "flake8>=4.0", + "mypy>=0.950", + "pre-commit>=2.15.0", +] +plotting = [ + "matplotlib>=3.5.0", + "seaborn>=0.11.0", +] + +[project.urls] +Homepage = "https://github.com/vllm-project/semantic-router" +Documentation = "https://vllm-semantic-router.com" +Repository = "https://github.com/vllm-project/semantic-router" +"Bug Tracker" = "https://github.com/vllm-project/semantic-router/issues" + +[project.scripts] +vllm-semantic-router-bench = "vllm_semantic_router_bench.cli:main" +router-bench = "vllm_semantic_router_bench.router_reason_bench_multi_dataset:main" +bench-plot = "vllm_semantic_router_bench.bench_plot:main" + +[tool.setuptools.packages.find] +where = ["."] +include = ["vllm_semantic_router_bench*"] + +[tool.setuptools.package-data] +vllm_semantic_router_bench = ["*.md", "dataset_implementations/*.py"] + +[tool.black] +line-length = 88 +target-version = ['py38', 'py39', 'py310', 'py311', 'py312'] +include = '\.pyi?$' +extend-exclude = ''' +/( + # directories + \.eggs + | \.git + | \.hg + | \.mypy_cache + | \.tox + | \.venv + | build + | dist +)/ +''' + +[tool.isort] +profile = "black" +multi_line_output = 3 +line_length = 88 +known_first_party = ["vllm_semantic_router_bench"] + +[tool.mypy] +python_version = "3.8" +warn_return_any = true +warn_unused_configs = true +disallow_untyped_defs = true +disallow_incomplete_defs = true +check_untyped_defs = true +disallow_untyped_decorators = true +no_implicit_optional = true +warn_redundant_casts = true +warn_unused_ignores = true +warn_no_return = true +warn_unreachable = true +strict_equality = true + +[tool.pytest.ini_options] +testpaths = ["tests"] +python_files = ["test_*.py", "*_test.py"] +python_classes = ["Test*"] +python_functions = ["test_*"] +addopts = "-v --tb=short" diff --git a/bench/requirements.txt b/bench/requirements.txt new file mode 100644 index 00000000..f889bc14 --- /dev/null +++ b/bench/requirements.txt @@ -0,0 +1,18 @@ +# Core dependencies for semantic-router-bench +openai>=1.0.0 +datasets>=2.14.0 +pandas>=1.5.0 +numpy>=1.21.0 +tqdm>=4.64.0 +requests>=2.28.0 + +# Plotting dependencies +matplotlib>=3.5.0 +seaborn>=0.11.0 + +# Optional: Development dependencies (install with pip install -e ".[dev]") +# pytest>=6.0 +# black>=22.0 +# flake8>=4.0 +# mypy>=0.950 +# pre-commit>=2.15.0 diff --git a/bench/router_reason_bench.py b/bench/router_reason_bench.py index 1bf666a4..f3567f98 100644 --- a/bench/router_reason_bench.py +++ b/bench/router_reason_bench.py @@ -17,8 +17,10 @@ # This benchmark supports two usage patterns: # 1) Router-transparent: send a single neutral prompt; router/model decides reasoning. -# 2) Policy evaluation: run NR (neutral), XC (explicit CoT), and optionally AR (automatic reasoning via extra_body) -# per question, then aggregate according to policies like Always-NR, Always-XC, CR-XC, Oracle, etc. 
+# 2) vLLM 3-case evaluation: run realistic scenarios that match router decision patterns: +# - NR: Plain prompt, no reasoning toggle (baseline/fast) +# - XC: CoT prompt, no reasoning toggle (prompt-based reasoning) +# - NR_REASONING: Plain prompt, reasoning toggle ON (model-based reasoning) ANSWER_PATTERN = re.compile(r"(?:answer(?:\sis)?:?\s*)([A-J])", re.IGNORECASE) @@ -76,7 +78,7 @@ def parse_args(): type=str, nargs="+", default=["NR", "XC"], - help="Prompt styles to run on vLLM: NR (neutral), XC (explicit CoT)", + help="DEPRECATED: vLLM now runs 3 fixed realistic modes: NR (plain), XC (CoT), NR_REASONING (plain+toggle)", ) parser.add_argument( "--run-router", @@ -340,7 +342,17 @@ def call_model( total_tokens = getattr(usage, "total_tokens", None) if usage else None return text, True, prompt_tokens, completion_tokens, total_tokens except Exception as e: - print(f"Model call failed: {e}") + print(f"โŒ Model call failed: {e}") + print(f" Error type: {type(e).__name__}") + print(f" Model: {model}") + print(f" Endpoint: {getattr(client, '_base_url', 'unknown')}") + print(f" API key set: {'Yes' if getattr(client, 'api_key', None) else 'No'}") + if hasattr(e, "response"): + print(f" HTTP status: {getattr(e.response, 'status_code', 'unknown')}") + print(f" Response text: {getattr(e.response, 'text', 'unknown')}") + import traceback + + print(f" Full traceback: {traceback.format_exc()}") return "ERROR", False, None, None, None @@ -352,7 +364,7 @@ def build_extra_body_for_model( - DeepSeek v3.1: {"chat_template_kwargs": {"thinking": true/false}} - GPT-OSS: {"reasoning_effort": "low|medium|high"} when ON; if not provided, then low """ - # reasoning: True -> ON, False -> OFF, None -> base + # reasoning: True -> ON, False -> OFF, None -> base (default behavior) lower = model_name.lower() if (("ds" in lower) or ("deepseek" in lower)) and ( @@ -360,10 +372,11 @@ def build_extra_body_for_model( ): if reasoning is True: return {"chat_template_kwargs": {"thinking": True}} - if reasoning is None or reasoning is False: + elif reasoning is False: return {"chat_template_kwargs": {"thinking": False}} - # Base: do not set thinking for DeepSeek - return None + else: # reasoning is None (base mode) + # Base: do not set thinking for DeepSeek - let it use default behavior + return None # Qwen3 family if "qwen3" in lower: @@ -375,12 +388,13 @@ def build_extra_body_for_model( # GPT OSS family if "gpt-oss" in lower or "openai/gpt-oss" in lower or "gpt_oss" in lower: - # Base -> low effort, On -> provided effort (e.g., high) if reasoning is True: return {"reasoning_effort": "high"} - if reasoning is None or reasoning is False: + elif reasoning is False: return {"reasoning_effort": "low"} - return None + else: # reasoning is None (base mode) + # Base: do not set reasoning_effort - let it use default behavior + return None return None @@ -450,8 +464,17 @@ def evaluate_model_router_transparent( max_tokens: int, temperature: float, ) -> pd.DataFrame: + """ + Evaluate router in transparent mode - send plain prompts and let router decide reasoning. + + This represents the 'auto' mode where the router internally decides whether to use + reasoning or not based on the question complexity. 
+ """ client = OpenAI(base_url=endpoint, api_key=api_key or None) print(f"Using model: {model}, endpoint: {endpoint}") + print( + f"API key provided: {'Yes' if api_key else 'No'} (length: {len(api_key) if api_key else 0})" + ) results: List[Dict[str, Any]] = [] questions_data = df.to_dict("records") @@ -491,37 +514,57 @@ def evaluate_model_vllm_multimode( temperature: float, exec_modes: List[str], ) -> pd.DataFrame: - """Run vLLM with NR/XC prompts and reasoning ON/OFF variants.""" - client = OpenAI(base_url=endpoint, api_key=api_key or None) + """Run vLLM with 3 realistic reasoning scenarios. + + The 3 scenarios represent real-world router decision patterns: + 1. NR - Plain prompt, no reasoning toggle (fast baseline) + 2. XC - CoT prompt, no reasoning toggle (prompt-based reasoning) + 3. NR_REASONING - Plain prompt, reasoning toggle ON (model-based reasoning) + """ + client = OpenAI(base_url=endpoint, api_key=api_key or "dummy-key") print(f"Using vLLM model: {model}, endpoint: {endpoint}") results: List[Dict[str, Any]] = [] questions_data = df.to_dict("records") - # Define mode variants: (label, prompt_mode, reasoning_flag) - mode_variants: List[Tuple[str, str, Optional[bool]]] = [] - for m in exec_modes: - if m.upper() == "NR": - mode_variants.extend( - [ - ("VLLM_NR_base", "NR", None), - ("VLLM_NR_reason_on", "NR", True), - ("VLLM_NR_reason_off", "NR", False), - ] - ) - elif m.upper() == "XC": - mode_variants.extend( - [ - ("VLLM_XC_base", "XC", None), - ("VLLM_XC_reason_on", "XC", True), - ("VLLM_XC_reason_off", "XC", False), - ] - ) + # Define 3 realistic mode variants: (label, prompt_mode, reasoning_flag) + # For DeepSeek and Qwen3 models, explicitly set reasoning flags for all modes + model_lower = model.lower() + is_deepseek_or_qwen = ( + (("ds" in model_lower) or ("deepseek" in model_lower)) + and ("v31" in model_lower or "v3.1" in model_lower or "v3" in model_lower) + ) or ("qwen3" in model_lower) + + if is_deepseek_or_qwen: + mode_variants: List[Tuple[str, str, Optional[bool]]] = [ + ("VLLM_NR", "NR", False), # Plain prompt, reasoning OFF (baseline) + ("VLLM_XC", "XC", False), # CoT prompt, reasoning OFF (prompt reasoning) + ( + "VLLM_NR_REASONING", + "NR", + True, + ), # Plain prompt, reasoning ON (model reasoning) + ] + else: + mode_variants: List[Tuple[str, str, Optional[bool]]] = [ + ("VLLM_NR", "NR", None), # Plain prompt, no toggle (baseline) + ("VLLM_XC", "XC", None), # CoT prompt, no toggle (prompt reasoning) + ( + "VLLM_NR_REASONING", + "NR", + True, + ), # Plain prompt, toggle ON (model reasoning) + ] def run_variants(q: Dict[str, Any]) -> List[Dict[str, Any]]: local_records: List[Dict[str, Any]] = [] for label, prompt_mode, reasoning_flag in mode_variants: extra_body = build_extra_body_for_model(model, reasoning_flag) + # Debug: print extra_body for first question to verify configuration + if q == questions_data[0]: + print( + f" {label}: reasoning_flag={reasoning_flag}, extra_body={extra_body}" + ) rec = process_question_single( client, model, diff --git a/bench/run_bench.sh b/bench/run_bench.sh deleted file mode 100755 index 67877f51..00000000 --- a/bench/run_bench.sh +++ /dev/null @@ -1,88 +0,0 @@ -#!/bin/bash - -# Example usage: -# Quick run: -# SAMPLES_PER_CATEGORY=5 CONCURRENT_REQUESTS=4 VLLM_MODELS="openai/gpt-oss-20b" ROUTER_MODELS="auto" ./run_bench.sh -# Long run: -# SAMPLES_PER_CATEGORY=100 CONCURRENT_REQUESTS=4 VLLM_MODELS="openai/gpt-oss-20b" ROUTER_MODELS="auto" ./run_bench.sh -# To test only router: -# BENCHMARK_ROUTER_ONLY=true ./run_bench.sh - -set -x 
-e - -export ROUTER_API_KEY="${ROUTER_API_KEY:-1234567890}" -export VLLM_API_KEY="${VLLM_API_KEY:-1234567890}" -export ROUTER_ENDPOINT="${ROUTER_ENDPOINT:-http://localhost:8801/v1}" -export VLLM_ENDPOINT="${VLLM_ENDPOINT:-http://localhost:8000/v1}" -export ROUTER_MODELS="${ROUTER_MODELS:-auto}" -export VLLM_MODELS="${VLLM_MODELS:-openai/gpt-oss-20b}" -export SAMPLES_PER_CATEGORY="${SAMPLES_PER_CATEGORY:-5}" -export CONCURRENT_REQUESTS="${CONCURRENT_REQUESTS:-4}" -export BENCHMARK_ROUTER_ONLY="${BENCHMARK_ROUTER_ONLY:-false}" - -# Run the benchmark -if [ "${BENCHMARK_ROUTER_ONLY}" = "true" ]; then - echo "Running router-only benchmark" - python bench/router_reason_bench.py \ - --run-router \ - --router-endpoint "$ROUTER_ENDPOINT" \ - --router-api-key "$ROUTER_API_KEY" \ - --router-models "$ROUTER_MODELS" \ - --samples-per-category "$SAMPLES_PER_CATEGORY" \ - --concurrent-requests "$CONCURRENT_REQUESTS" \ - --output-dir results/reasonbench -else - echo "Running full benchmark (router + vLLM)..." - python bench/router_reason_bench.py \ - --run-router \ - --router-endpoint "$ROUTER_ENDPOINT" \ - --router-api-key "$ROUTER_API_KEY" \ - --router-models "$ROUTER_MODELS" \ - --run-vllm \ - --vllm-endpoint "$VLLM_ENDPOINT" \ - --vllm-api-key "$VLLM_API_KEY" \ - --vllm-models "$VLLM_MODELS" \ - --samples-per-category "$SAMPLES_PER_CATEGORY" \ - --vllm-exec-modes NR XC \ - --concurrent-requests "$CONCURRENT_REQUESTS" \ - --output-dir results/reasonbench -fi - -# Generate plots if summary files exist -echo "Checking for plot generation..." -echo "VLLM_MODELS: $VLLM_MODELS" -echo "ROUTER_MODELS: $ROUTER_MODELS" - -# Get first model name and make it path-safe -VLLM_MODEL_FIRST=$(echo "$VLLM_MODELS" | cut -d' ' -f1) -ROUTER_MODEL_FIRST=$(echo "$ROUTER_MODELS" | cut -d' ' -f1) -echo "First models: VLLM=$VLLM_MODEL_FIRST, Router=$ROUTER_MODEL_FIRST" - -# Replace / with _ for path safety -VLLM_MODELS_SAFE=$(echo "$VLLM_MODEL_FIRST" | tr '/' '_') -ROUTER_MODELS_SAFE=$(echo "$ROUTER_MODEL_FIRST" | tr '/' '_') -echo "Safe paths: VLLM=$VLLM_MODELS_SAFE, Router=$ROUTER_MODELS_SAFE" - -# Construct the full paths -VLLM_SUMMARY="results/reasonbench/vllm::${VLLM_MODELS_SAFE}/summary.json" -ROUTER_SUMMARY="results/reasonbench/router::${ROUTER_MODELS_SAFE}/summary.json" -echo "Looking for summaries at:" -echo "VLLM: $VLLM_SUMMARY" -echo "Router: $ROUTER_SUMMARY" - -# Check if at least one summary file exists and generate plots -if [ -f "$ROUTER_SUMMARY" ]; then - echo "Found router summary, generating plots..." - if [ -f "$VLLM_SUMMARY" ]; then - echo "Found both summaries, generating comparison plots..." - python bench/bench_plot.py \ - --summary "$VLLM_SUMMARY" \ - --router-summary "$ROUTER_SUMMARY" - else - echo "vLLM summary not found, generating router-only plots..." 
- python bench/bench_plot.py \ - --router-summary "$ROUTER_SUMMARY" - fi -else - echo "No router summary found, skipping plot generation" -fi diff --git a/bench/setup.py b/bench/setup.py new file mode 100644 index 00000000..c27e0520 --- /dev/null +++ b/bench/setup.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python3 +"""Setup script for vllm-semantic-router-bench package.""" + +import os + +from setuptools import find_packages, setup + + +# Read the README file +def read_readme(): + readme_path = os.path.join(os.path.dirname(__file__), "README.md") + if os.path.exists(readme_path): + with open(readme_path, "r", encoding="utf-8") as f: + return f.read() + return "A comprehensive benchmark suite for vLLM Semantic Router vs direct vLLM evaluation" + + +# Read requirements +def read_requirements(): + requirements_path = os.path.join(os.path.dirname(__file__), "requirements.txt") + if os.path.exists(requirements_path): + with open(requirements_path, "r", encoding="utf-8") as f: + return [ + line.strip() for line in f if line.strip() and not line.startswith("#") + ] + return [] + + +setup( + name="vllm-semantic-router-bench", + version="1.0.0", + author="vLLM Semantic Router Team", + description="Comprehensive benchmark suite for vLLM Semantic Router vs direct vLLM evaluation across multiple reasoning datasets", + long_description=read_readme(), + long_description_content_type="text/markdown", + url="https://github.com/vllm-project/semantic-router", + project_urls={ + "Bug Tracker": "https://github.com/vllm-project/semantic-router/issues", + "Documentation": "https://vllm-semantic-router.com", + "Source": "https://github.com/vllm-project/semantic-router", + }, + packages=find_packages(), + classifiers=[ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: Software Development :: Testing", + "Topic :: System :: Benchmark", + ], + python_requires=">=3.8", + install_requires=read_requirements(), + extras_require={ + "dev": [ + "pytest>=6.0", + "black>=22.0", + "flake8>=4.0", + "mypy>=0.950", + "pre-commit>=2.15.0", + ], + "plotting": [ + "matplotlib>=3.5.0", + "seaborn>=0.11.0", + ], + }, + entry_points={ + "console_scripts": [ + "vllm-semantic-router-bench=vllm_semantic_router_bench.cli:main", + "router-bench=vllm_semantic_router_bench.router_reason_bench_multi_dataset:main", + "bench-plot=vllm_semantic_router_bench.bench_plot:main", + ], + }, + include_package_data=True, + package_data={ + "vllm_semantic_router_bench": [ + "*.md", + "dataset_implementations/*.py", + ], + }, + keywords=[ + "vllm-semantic-router", + "benchmark", + "vllm", + "llm", + "evaluation", + "reasoning", + "multiple-choice", + "mmlu", + "arc", + "gpqa", + "commonsense", + "hellaswag", + "truthfulqa", + ], + zip_safe=False, +) diff --git a/bench/vllm_semantic_router_bench/__init__.py b/bench/vllm_semantic_router_bench/__init__.py new file mode 100644 index 00000000..f982787f --- /dev/null +++ b/bench/vllm_semantic_router_bench/__init__.py @@ -0,0 +1,39 @@ +""" +vLLM Semantic Router Benchmark Suite + +A comprehensive benchmark suite for evaluating 
vLLM semantic router performance +against direct vLLM across multiple reasoning datasets. + +Supported Datasets: +- MMLU-Pro: Academic knowledge across 57 subjects +- ARC: AI2 Reasoning Challenge for scientific reasoning +- GPQA: Graduate-level Google-proof Q&A +- TruthfulQA: Truthful response evaluation +- CommonsenseQA: Commonsense reasoning evaluation +- HellaSwag: Commonsense natural language inference + +Key Features: +- Dataset-agnostic architecture with factory pattern +- Router vs direct vLLM comparison +- Multiple evaluation modes (NR, XC, NR_REASONING) +- Comprehensive plotting and analysis tools +- Research-ready CSV output +- Configurable token limits per dataset +""" + +__version__ = "1.0.0" +__author__ = "vLLM Semantic Router Team" + +from .dataset_factory import DatasetFactory, list_available_datasets +from .dataset_interface import DatasetInfo, DatasetInterface, PromptFormatter, Question + +# Make key classes available at package level +__all__ = [ + "DatasetInterface", + "Question", + "DatasetInfo", + "PromptFormatter", + "DatasetFactory", + "list_available_datasets", + "__version__", +] diff --git a/bench/bench_plot.py b/bench/vllm_semantic_router_bench/bench_plot.py similarity index 86% rename from bench/bench_plot.py rename to bench/vllm_semantic_router_bench/bench_plot.py index fdab467c..fe72d818 100644 --- a/bench/bench_plot.py +++ b/bench/vllm_semantic_router_bench/bench_plot.py @@ -6,12 +6,18 @@ import pandas as pd from matplotlib import colormaps +# This script plots benchmark results from the 3-case vLLM design: +# - VLLM_NR: Plain prompt, no reasoning toggle (baseline) +# - VLLM_XC: CoT prompt, no reasoning toggle (prompt reasoning) +# - VLLM_NR_REASONING: Plain prompt, reasoning toggle ON (model reasoning) +# - router: Router auto mode for comparison + parser = argparse.ArgumentParser() parser.add_argument( "--summary", type=Path, required=True, - help="Path to summary.json produced by the bench", + help="Path to vLLM summary.json produced by the 3-case benchmark", ) parser.add_argument( "--router-summary", @@ -56,7 +62,7 @@ "--max-modes", type=int, default=None, - help="If set, plot only the top N modes by mean of the current metric", + help="If set, plot only the top N modes by mean of the current metric (default: all 3 modes)", ) parser.add_argument( "--xtick-rotation", @@ -175,7 +181,41 @@ def plot_metric(metric: str, out_path: Path): x = range(len(cats)) - # Determine modes to plot, optionally limiting to top-N by mean of metric + # Plot router per-category metric FIRST (with both line and diamonds) + # This ensures router trend is visible even if vLLM dots overlap + if s_router is not None: + router_cat = s_router.get("category_metrics", {}) + router_vals = [] + router_x = [] + for idx, c in enumerate(cats): + v = router_cat.get(c, {}).get(metric) + if v is not None: + router_x.append(idx) + router_vals.append(v) + if router_vals: + # Connect router points with a line and draw larger diamond markers + ax.plot( + router_x, + router_vals, + color="tab:red", + linestyle="-", + linewidth=2.0 * args.font_scale, + alpha=0.85, + zorder=1, # Lower zorder so it's plotted first + ) + ax.scatter( + router_x, + router_vals, + s=90 * args.font_scale, + color="tab:red", + marker="D", + label="router", + zorder=2, # Lower zorder so it's plotted first + edgecolors="white", + linewidths=0.6 * args.font_scale, + ) + + # Then plot vLLM modes on top all_modes = sorted({m for c in cats for m in cat_by_mode.get(c, {}).keys()}) if len(all_modes) > 0: @@ -213,7 +253,7 @@ def 
_mean(values): linestyle=linestyles[i % len(linestyles)], linewidth=1.4 * args.font_scale, alpha=0.6, - zorder=2, + zorder=3, # Higher zorder so vLLM lines are on top ) if args.style in ("points", "both"): ax.scatter( @@ -225,49 +265,27 @@ def _mean(values): alpha=0.85, edgecolors="white", linewidths=0.5 * args.font_scale, - zorder=3, + zorder=4, # Higher zorder so vLLM points are on top ) - # Overlay router per-category metric as diamonds, if provided - if s_router is not None: - router_cat = s_router.get("category_metrics", {}) - router_vals = [] - router_x = [] - for idx, c in enumerate(cats): - v = router_cat.get(c, {}).get(metric) - if v is not None: - router_x.append(idx) - router_vals.append(v) - if router_vals: - # Connect router points with a line and draw larger diamond markers - ax.plot( - router_x, - router_vals, - color="tab:red", - linestyle="-", - linewidth=2.0 * args.font_scale, - alpha=0.85, - zorder=4, - ) - ax.scatter( - router_x, - router_vals, - s=90 * args.font_scale, - color="tab:red", - marker="D", - label="router", - zorder=5, - edgecolors="white", - linewidths=0.6 * args.font_scale, - ) + # Set x-axis labels with threshold for readability + MAX_CATEGORY_LABELS = 20 # Hide labels if more than this many categories ax.set_xticks(list(x)) - ax.set_xticklabels( - cats, - rotation=args.xtick_rotation, - ha="right", - fontsize=int(14 * args.font_scale), - ) + if len(cats) <= MAX_CATEGORY_LABELS: + ax.set_xticklabels( + cats, + rotation=args.xtick_rotation, + ha="right", + fontsize=int(14 * args.font_scale), + ) + else: + # Too many categories - hide labels to avoid clutter + ax.set_xticklabels([]) + ax.set_xlabel( + f"Categories ({len(cats)} total - labels hidden for readability)", + fontsize=int(16 * args.font_scale), + ) # Control horizontal fit by expanding/shrinking x-limits around the first/last category if len(cats) > 0: n = len(cats) @@ -333,7 +351,13 @@ def _mean(values): plt.close(fig) -args.out_dir.mkdir(parents=True, exist_ok=True) -for metric in args.metrics: - out_path = args.out_dir / f"bench_plot_{metric}.png" - plot_metric(metric, out_path) +def main(): + """Main entry point for the plotting script.""" + args.out_dir.mkdir(parents=True, exist_ok=True) + for metric in args.metrics: + out_path = args.out_dir / f"bench_plot_{metric}.png" + plot_metric(metric, out_path) + + +if __name__ == "__main__": + main() diff --git a/bench/vllm_semantic_router_bench/cli.py b/bench/vllm_semantic_router_bench/cli.py new file mode 100644 index 00000000..b8fdab63 --- /dev/null +++ b/bench/vllm_semantic_router_bench/cli.py @@ -0,0 +1,288 @@ +#!/usr/bin/env python3 +""" +Command Line Interface for Semantic Router Benchmark Suite +""" + +import argparse +import os +import sys +from typing import List, Optional + + +def main(): + """Main CLI entry point for semantic-router-bench.""" + parser = argparse.ArgumentParser( + prog="semantic-router-bench", + description="Comprehensive benchmark suite for semantic router vs direct vLLM evaluation", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Quick dataset test + semantic-router-bench test --dataset mmlu --samples 5 + + # Full benchmark comparison + semantic-router-bench compare --dataset arc --samples 10 + + # List available datasets + semantic-router-bench list-datasets + + # Generate plots from existing results + semantic-router-bench plot --router-dir results/router_mmlu --vllm-dir results/vllm_mmlu + +For more detailed usage, see: https://vllm-semantic-router.com/docs/benchmarking + """, + ) 
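+
+    # Subcommands wired up below: test (quick single-dataset check), compare
+    # (router vs direct vLLM), list-datasets, plot, and comprehensive
+    # (multi-dataset run). Parsers only collect arguments here; heavier imports
+    # are deferred to the run_* helpers to keep CLI startup fast.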
+ + subparsers = parser.add_subparsers(dest="command", help="Available commands") + + # Test command - quick single dataset evaluation + test_parser = subparsers.add_parser("test", help="Quick test on a single dataset") + test_parser.add_argument( + "--dataset", + required=True, + choices=["mmlu", "arc", "gpqa", "truthfulqa", "commonsenseqa", "hellaswag"], + help="Dataset to test", + ) + test_parser.add_argument( + "--samples", + type=int, + default=5, + help="Number of samples per category (default: 5)", + ) + test_parser.add_argument( + "--mode", + choices=["router", "vllm", "both"], + default="both", + help="Evaluation mode (default: both)", + ) + test_parser.add_argument( + "--output-dir", + default="results/quick_test", + help="Output directory for results", + ) + + # Compare command - full router vs vLLM comparison + compare_parser = subparsers.add_parser( + "compare", help="Full router vs vLLM comparison" + ) + compare_parser.add_argument( + "--dataset", + required=True, + choices=["mmlu", "arc", "gpqa", "truthfulqa", "commonsenseqa", "hellaswag"], + help="Dataset to benchmark", + ) + compare_parser.add_argument( + "--samples", + type=int, + default=10, + help="Number of samples per category (default: 10)", + ) + compare_parser.add_argument( + "--router-endpoint", + default="http://127.0.0.1:8801/v1", + help="Router endpoint URL", + ) + compare_parser.add_argument( + "--vllm-endpoint", default="http://127.0.0.1:8000/v1", help="vLLM endpoint URL" + ) + compare_parser.add_argument( + "--vllm-model", default="openai/gpt-oss-20b", help="vLLM model name" + ) + compare_parser.add_argument( + "--output-dir", + default="results/comparison", + help="Output directory for results", + ) + + # List datasets command + list_parser = subparsers.add_parser("list-datasets", help="List available datasets") + + # Plot command - generate plots from existing results + plot_parser = subparsers.add_parser( + "plot", help="Generate plots from benchmark results" + ) + plot_parser.add_argument( + "--router-dir", required=True, help="Directory containing router results" + ) + plot_parser.add_argument( + "--vllm-dir", required=True, help="Directory containing vLLM results" + ) + plot_parser.add_argument( + "--output-dir", default="results/plots", help="Output directory for plots" + ) + plot_parser.add_argument("--dataset-name", help="Dataset name for plot titles") + + # Comprehensive command - run full research benchmark + comprehensive_parser = subparsers.add_parser( + "comprehensive", help="Run comprehensive multi-dataset benchmark" + ) + comprehensive_parser.add_argument( + "--datasets", + nargs="+", + default=["mmlu", "arc", "gpqa", "truthfulqa", "commonsenseqa", "hellaswag"], + help="Datasets to benchmark", + ) + comprehensive_parser.add_argument( + "--router-endpoint", default="http://127.0.0.1:8801/v1" + ) + comprehensive_parser.add_argument( + "--vllm-endpoint", default="http://127.0.0.1:8000/v1" + ) + comprehensive_parser.add_argument("--vllm-model", default="openai/gpt-oss-20b") + + args = parser.parse_args() + + if not args.command: + parser.print_help() + return 1 + + # Import modules only when needed to speed up CLI startup + if args.command == "test": + return run_test(args) + elif args.command == "compare": + return run_compare(args) + elif args.command == "list-datasets": + return list_datasets() + elif args.command == "plot": + return run_plot(args) + elif args.command == "comprehensive": + return run_comprehensive(args) + else: + parser.print_help() + return 1 + + +def run_test(args): + """Run 
quick test command.""" + print(f"๐Ÿงช Quick test: {args.dataset} dataset ({args.samples} samples)") + + # Import and run the benchmark script + import os + import subprocess + + cmd = [ + sys.executable, + "-m", + "vllm_semantic_router_bench.router_reason_bench_multi_dataset", + "--dataset", + args.dataset, + "--samples-per-category", + str(args.samples), + "--output-dir", + args.output_dir, + "--seed", + "42", + ] + + if args.mode in ["router", "both"]: + cmd.extend(["--run-router", "--router-models", "auto"]) + + if args.mode in ["vllm", "both"]: + cmd.extend( + [ + "--run-vllm", + "--vllm-models", + "openai/gpt-oss-20b", + "--vllm-exec-modes", + "NR", + "NR_REASONING", + ] + ) + + return subprocess.call(cmd) + + +def run_compare(args): + """Run comparison command.""" + print(f"โšก Comparison: {args.dataset} dataset ({args.samples} samples)") + + import os + import subprocess + + script_path = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "benchmark_comparison.sh" + ) + cmd = [script_path, args.dataset, str(args.samples)] + + env = os.environ.copy() + env.update( + { + "ROUTER_ENDPOINT": args.router_endpoint, + "VLLM_ENDPOINT": args.vllm_endpoint, + "VLLM_MODEL": args.vllm_model, + "OUTPUT_DIR": args.output_dir, + } + ) + + return subprocess.call(cmd, env=env) + + +def list_datasets(): + """List available datasets.""" + try: + from .dataset_factory import list_available_datasets + + # This function prints the datasets and returns None + list_available_datasets() + + print("\nUsage examples:") + print(" semantic-router-bench test --dataset mmlu --samples 5") + print(" semantic-router-bench compare --dataset arc --samples 10") + + return 0 + except ImportError as e: + print(f"Error importing dataset factory: {e}") + return 1 + + +def run_plot(args): + """Run plotting command.""" + print(f"๐Ÿ“ˆ Generating plots from {args.router_dir} and {args.vllm_dir}") + + import os + import subprocess + + cmd = [ + sys.executable, + "-m", + "vllm_semantic_router_bench.bench_plot", + "--router-dir", + args.router_dir, + "--vllm-dir", + args.vllm_dir, + "--output-dir", + args.output_dir, + ] + + if args.dataset_name: + cmd.extend(["--dataset-name", args.dataset_name]) + + return subprocess.call(cmd) + + +def run_comprehensive(args): + """Run comprehensive benchmark.""" + print(f"๐Ÿ”ฌ Comprehensive benchmark: {', '.join(args.datasets)}") + + import os + import subprocess + + script_path = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "comprehensive_bench.sh" + ) + + env = os.environ.copy() + env.update( + { + "ROUTER_ENDPOINT": args.router_endpoint, + "VLLM_ENDPOINT": args.vllm_endpoint, + "VLLM_MODEL": args.vllm_model, + "DATASETS": " ".join(args.datasets), + } + ) + + return subprocess.call([script_path], env=env) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/bench/vllm_semantic_router_bench/dataset_factory.py b/bench/vllm_semantic_router_bench/dataset_factory.py new file mode 100644 index 00000000..429faf9e --- /dev/null +++ b/bench/vllm_semantic_router_bench/dataset_factory.py @@ -0,0 +1,137 @@ +""" +Dataset factory for loading different evaluation datasets. + +This module provides a factory pattern for instantiating different dataset +implementations in a unified way. 
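+
+A minimal usage sketch (illustrative only; the names below mirror the factory
+and dataset interface defined in this package):
+
+    from vllm_semantic_router_bench.dataset_factory import DatasetFactory
+
+    dataset = DatasetFactory.create_dataset("mmlu")
+    questions, info = dataset.load_dataset(samples_per_category=5, seed=42)
+    print(info.name, info.total_questions)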
+""" + +from typing import Dict, List, Optional, Type + +from .dataset_implementations.arc_dataset import ( + ARCChallengeDataset, + ARCDataset, + ARCEasyDataset, +) +from .dataset_implementations.commonsenseqa_dataset import CommonsenseQADataset +from .dataset_implementations.gpqa_dataset import ( + GPQADataset, + GPQADiamondDataset, + GPQAExtendedDataset, + GPQAMainDataset, +) +from .dataset_implementations.hellaswag_dataset import HellaSwagDataset +from .dataset_implementations.mmlu_dataset import MMLUDataset +from .dataset_implementations.truthfulqa_dataset import TruthfulQADataset +from .dataset_interface import DatasetInterface + + +class DatasetFactory: + """Factory for creating dataset instances.""" + + _registered_datasets: Dict[str, Type[DatasetInterface]] = {} + + @classmethod + def register_dataset(cls, name: str, dataset_class: Type[DatasetInterface]) -> None: + """Register a new dataset class. + + Args: + name: Name to register the dataset under + dataset_class: Class implementing DatasetInterface + """ + cls._registered_datasets[name.lower()] = dataset_class + + @classmethod + def get_available_datasets(cls) -> List[str]: + """Get list of all registered dataset names.""" + return list(cls._registered_datasets.keys()) + + @classmethod + def create_dataset(cls, name: str) -> DatasetInterface: + """Create a dataset instance by name. + + Args: + name: Name of the dataset to create + + Returns: + Dataset instance implementing DatasetInterface + + Raises: + ValueError: If dataset name is not registered + """ + name_lower = name.lower() + if name_lower not in cls._registered_datasets: + available = ", ".join(cls.get_available_datasets()) + raise ValueError( + f"Unknown dataset: {name}. Available datasets: {available}" + ) + + dataset_class = cls._registered_datasets[name_lower] + return dataset_class() + + @classmethod + def get_dataset_info(cls, name: str) -> Dict[str, str]: + """Get basic info about a dataset without loading it. 
+ + Args: + name: Name of the dataset + + Returns: + Dictionary with dataset information + """ + dataset = cls.create_dataset(name) + return { + "name": dataset.dataset_name, + "supports_cot": str(dataset.supports_cot), + "categories_count": str(len(dataset.get_available_categories())), + } + + +# Register built-in datasets +DatasetFactory.register_dataset("mmlu", MMLUDataset) +DatasetFactory.register_dataset("mmlu-pro", MMLUDataset) + +# Register ARC datasets +DatasetFactory.register_dataset("arc", ARCDataset) +DatasetFactory.register_dataset("arc-easy", ARCEasyDataset) +DatasetFactory.register_dataset("arc-challenge", ARCChallengeDataset) + +# Register GPQA datasets +DatasetFactory.register_dataset("gpqa", GPQAMainDataset) +DatasetFactory.register_dataset("gpqa-main", GPQAMainDataset) +DatasetFactory.register_dataset("gpqa-extended", GPQAExtendedDataset) +DatasetFactory.register_dataset("gpqa-diamond", GPQADiamondDataset) + +# Register hard reasoning datasets +DatasetFactory.register_dataset("truthfulqa", TruthfulQADataset) +DatasetFactory.register_dataset("commonsenseqa", CommonsenseQADataset) +DatasetFactory.register_dataset("hellaswag", HellaSwagDataset) + + +def list_available_datasets() -> None: + """Print information about all available datasets.""" + print("Available datasets:") + print("-" * 50) + + for name in DatasetFactory.get_available_datasets(): + try: + info = DatasetFactory.get_dataset_info(name) + print(f"โ€ข {name}") + print(f" Name: {info['name']}") + print(f" Supports CoT: {info['supports_cot']}") + print(f" Categories: {info['categories_count']}") + print() + except Exception as e: + print(f"โ€ข {name} (error loading info: {e})") + print() + + +def create_dataset(name: str) -> DatasetInterface: + """Convenience function to create a dataset instance. + + Args: + name: Name of the dataset to create + + Returns: + Dataset instance + """ + return DatasetFactory.create_dataset(name) diff --git a/bench/vllm_semantic_router_bench/dataset_implementations/__init__.py b/bench/vllm_semantic_router_bench/dataset_implementations/__init__.py new file mode 100644 index 00000000..00804dc7 --- /dev/null +++ b/bench/vllm_semantic_router_bench/dataset_implementations/__init__.py @@ -0,0 +1,28 @@ +"""Dataset implementations for the benchmark.""" + +from .arc_dataset import ARCChallengeDataset, ARCDataset, ARCEasyDataset +from .commonsenseqa_dataset import CommonsenseQADataset +from .gpqa_dataset import ( + GPQADataset, + GPQADiamondDataset, + GPQAExtendedDataset, + GPQAMainDataset, +) +from .hellaswag_dataset import HellaSwagDataset +from .mmlu_dataset import MMLUDataset, load_mmlu_pro_dataset +from .truthfulqa_dataset import TruthfulQADataset + +__all__ = [ + "MMLUDataset", + "load_mmlu_pro_dataset", + "ARCDataset", + "ARCEasyDataset", + "ARCChallengeDataset", + "CommonsenseQADataset", + "GPQADataset", + "GPQAMainDataset", + "GPQAExtendedDataset", + "GPQADiamondDataset", + "HellaSwagDataset", + "TruthfulQADataset", +] diff --git a/bench/vllm_semantic_router_bench/dataset_implementations/arc_dataset.py b/bench/vllm_semantic_router_bench/dataset_implementations/arc_dataset.py new file mode 100644 index 00000000..b0e9a8c8 --- /dev/null +++ b/bench/vllm_semantic_router_bench/dataset_implementations/arc_dataset.py @@ -0,0 +1,227 @@ +""" +ARC Dataset Implementation + +AI2 Reasoning Challenge for elementary and middle school science questions +with automatic subject categorization across Biology, Chemistry, Physics, +Earth Science, and General Science. 
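+
+Note that the current implementation groups every question under a single
+"Science" category (see _get_category below).
+
+A minimal usage sketch (illustrative only):
+
+    dataset = ARCDataset(variant="challenge")
+    questions, info = dataset.load_dataset(samples_per_category=5, seed=42)
+    prompt = dataset.format_prompt(questions[0], prompt_style="cot")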
+""" + +import os +import random +import sys +from typing import List, Optional, Tuple + +import numpy as np +import pandas as pd +from datasets import load_dataset + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from ..dataset_interface import DatasetInfo, DatasetInterface, PromptFormatter, Question + + +class ARCDataset(DatasetInterface): + """ARC (AI2 Reasoning Challenge) dataset implementation.""" + + def __init__(self, variant: str = "both"): + """Initialize ARC dataset. + + Args: + variant: Which ARC variant to use ("easy", "challenge", or "both") + """ + self.variant = variant.lower() + if self.variant not in ["easy", "challenge", "both"]: + raise ValueError("variant must be 'easy', 'challenge', or 'both'") + + self._dataset_cache = None + self._categories_cache = None + + @property + def dataset_name(self) -> str: + if self.variant == "both": + return "ARC" + return f"ARC-{self.variant.title()}" + + @property + def supports_cot(self) -> bool: + return False # ARC doesn't have built-in CoT content + + def _load_raw_dataset(self): + """Load raw ARC dataset from Hugging Face.""" + if self._dataset_cache is not None: + return self._dataset_cache + + datasets_to_load = [] + + if self.variant in ["easy", "both"]: + easy_dataset = load_dataset("allenai/ai2_arc", "ARC-Easy", split="test") + easy_df = pd.DataFrame(easy_dataset) + easy_df["difficulty"] = "Easy" + easy_df["arc_variant"] = "ARC-Easy" + datasets_to_load.append(easy_df) + + if self.variant in ["challenge", "both"]: + challenge_dataset = load_dataset( + "allenai/ai2_arc", "ARC-Challenge", split="test" + ) + challenge_df = pd.DataFrame(challenge_dataset) + challenge_df["difficulty"] = "Challenge" + challenge_df["arc_variant"] = "ARC-Challenge" + datasets_to_load.append(challenge_df) + + if len(datasets_to_load) == 1: + self._dataset_cache = datasets_to_load[0] + else: + self._dataset_cache = pd.concat(datasets_to_load, ignore_index=True) + + return self._dataset_cache + + def _get_category(self) -> str: + """ + ARC dataset doesn't have explicit subject categories. + Use a single 'Science' category since all questions are science-related. 
+ """ + return "Science" + + def load_dataset( + self, + categories: Optional[List[str]] = None, + samples_per_category: Optional[int] = None, + seed: int = 42, + ) -> Tuple[List[Question], DatasetInfo]: + """Load ARC dataset.""" + df = self._load_raw_dataset() + + # Convert to Question objects and infer categories + questions = [] + for _, row in df.iterrows(): + # Extract choices - ARC format has choices as dict with labels + choices_dict = row["choices"] + if isinstance(choices_dict, dict): + # Extract text choices in order + labels = choices_dict.get("label", []) + texts = choices_dict.get("text", []) + options = [text for text in texts if text] # Filter out empty choices + else: + options = [] + + # Convert answer key from letter to index + answer_key = str(row["answerKey"]) + if len(options) > 0 and answer_key in "ABCDEFGHIJ": + correct_answer_index = ord(answer_key) - ord("A") + # Ensure the index is within bounds + if correct_answer_index >= len(options): + correct_answer_index = None + else: + correct_answer_index = None + + # Skip questions with invalid answer keys + if correct_answer_index is None: + continue + + # Use single category since ARC doesn't have explicit subjects + category = self._get_category() + + question = Question( + question_id=str(row.get("id", f"arc_{len(questions)}")), + category=category, + question=str(row["question"]), + options=options, + correct_answer=correct_answer_index, # Now an integer index + cot_content=None, # ARC doesn't have CoT + metadata={ + "source": "ARC", + "difficulty": row["difficulty"], + "arc_variant": row["arc_variant"], + }, + ) + questions.append(question) + + # Get all unique categories + all_categories = sorted(list(set(q.category for q in questions))) + self._categories_cache = all_categories + + # Filter by categories if specified + if categories: + questions = [q for q in questions if q.category in categories] + if not questions: + valid_categories = ", ".join(all_categories) + raise ValueError( + f"No data found for specified categories. 
" + f"Valid categories are: {valid_categories}" + ) + + # Sample if requested + if samples_per_category: + random.seed(seed) + np.random.seed(seed) + + # Group by category + category_questions = {} + for q in questions: + if q.category not in category_questions: + category_questions[q.category] = [] + category_questions[q.category].append(q) + + # Sample from each category + sampled_questions = [] + for category, cat_questions in category_questions.items(): + if len(cat_questions) > samples_per_category: + sampled = random.sample(cat_questions, samples_per_category) + sampled_questions.extend(sampled) + else: + sampled_questions.extend(cat_questions) + + questions = sampled_questions + + # Create dataset info + dataset_info = DatasetInfo( + name=self.dataset_name, + description=f"AI2 Reasoning Challenge ({self.variant})", + categories=list(set(q.category for q in questions)), + total_questions=len(questions), + format_type="multiple_choice", + difficulty_level="elementary" if self.variant == "easy" else "mixed", + ) + + return questions, dataset_info + + def get_available_categories(self) -> List[str]: + """Get all available ARC categories.""" + if self._categories_cache is None: + # Load dataset to get categories + self.load_dataset() + return self._categories_cache or [] + + def format_prompt(self, question: Question, prompt_style: str = "plain") -> str: + """Format ARC question into prompt.""" + if prompt_style == "plain": + return PromptFormatter.format_enhanced_prompt( + question.question, question.options, "ARC", "mixed", "plain" + ) + elif prompt_style == "cot": + return PromptFormatter.format_enhanced_prompt( + question.question, question.options, "ARC", "mixed", "cot" + ) + elif prompt_style == "explicit_cot": + # ARC doesn't have CoT content, so fall back to regular CoT + return PromptFormatter.format_cot_prompt( + question.question, question.options + ) + else: + raise ValueError(f"Unknown prompt style: {prompt_style}") + + +# Convenience classes for specific variants +class ARCEasyDataset(ARCDataset): + """ARC-Easy dataset.""" + + def __init__(self): + super().__init__(variant="easy") + + +class ARCChallengeDataset(ARCDataset): + """ARC-Challenge dataset.""" + + def __init__(self): + super().__init__(variant="challenge") diff --git a/bench/vllm_semantic_router_bench/dataset_implementations/commonsenseqa_dataset.py b/bench/vllm_semantic_router_bench/dataset_implementations/commonsenseqa_dataset.py new file mode 100644 index 00000000..1735c4ef --- /dev/null +++ b/bench/vllm_semantic_router_bench/dataset_implementations/commonsenseqa_dataset.py @@ -0,0 +1,190 @@ +""" +CommonsenseQA dataset implementation. + +This module implements the DatasetInterface for CommonsenseQA dataset which +tests commonsense reasoning across various conceptual domains. 
+""" + +import os +import random +import sys +from typing import List, Optional, Tuple + +import numpy as np +import pandas as pd +from datasets import load_dataset + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from ..dataset_interface import DatasetInfo, DatasetInterface, PromptFormatter, Question + + +class CommonsenseQADataset(DatasetInterface): + """CommonsenseQA dataset implementation.""" + + def __init__(self): + """Initialize CommonsenseQA dataset.""" + self._dataset_cache = None + self._categories_cache = None + + @property + def dataset_name(self) -> str: + return "CommonsenseQA" + + @property + def supports_cot(self) -> bool: + return True # CommonsenseQA benefits from reasoning + + def _load_raw_dataset(self): + """Load raw CommonsenseQA dataset from Hugging Face.""" + if self._dataset_cache is not None: + return self._dataset_cache + + try: + # Load train and validation splits + train_dataset = load_dataset("commonsense_qa", split="train") + val_dataset = load_dataset("commonsense_qa", split="validation") + + # Combine both splits for more data + train_df = pd.DataFrame(train_dataset) + val_df = pd.DataFrame(val_dataset) + self._dataset_cache = pd.concat([train_df, val_df], ignore_index=True) + + except Exception as e: + print(f"Warning: Could not load CommonsenseQA dataset: {e}") + print("You may need to check your internet connection or dataset access.") + # Create empty dataframe as fallback + self._dataset_cache = pd.DataFrame() + + return self._dataset_cache + + def _get_category(self) -> str: + """ + CommonsenseQA doesn't have explicit subject categories. + All questions test commonsense reasoning. + """ + return "Common Sense" + + def get_available_categories(self) -> List[str]: + """Get all available categories in the dataset.""" + return [self._get_category()] + + def load_dataset( + self, + categories: Optional[List[str]] = None, + samples_per_category: Optional[int] = None, + seed: int = 42, + ) -> Tuple[List[Question], DatasetInfo]: + """Load CommonsenseQA dataset with filtering and sampling.""" + df = self._load_raw_dataset() + + if df.empty: + return [], DatasetInfo( + name=self.dataset_name, + categories=[], + total_questions=0, + ) + + # Use single category for all questions + single_category = self._get_category() + + # Sample questions if specified (treat all questions as single category) + if samples_per_category: + random.seed(seed) + np.random.seed(seed) + if len(df) > samples_per_category: + df = df.sample(samples_per_category, random_state=seed) + + # Convert to Question objects + questions = [] + for _, row in df.iterrows(): + # Extract multiple choice options + choices = row["choices"] + choice_texts = choices["text"] + choice_labels = choices["label"] # ['A', 'B', 'C', 'D', 'E'] + + # Find correct answer index + answer_key = row["answerKey"] + correct_idx = choice_labels.index(answer_key) + + question = Question( + question_id=row["id"], + question=row["question"], + options=choice_texts, + correct_answer=correct_idx, # 0-indexed + category=single_category, # Use single category for all questions + cot_content=None, # CommonsenseQA doesn't provide CoT + ) + questions.append(question) + + dataset_info = DatasetInfo( + name=self.dataset_name, + description="CommonsenseQA tests commonsense reasoning across various conceptual domains", + categories=[single_category], # Single category for all questions + total_questions=len(questions), + format_type="multiple_choice", + difficulty_level="hard", + ) + + return questions, 
dataset_info + + def format_prompt(self, question: Question, style: str = "plain") -> str: + """Format a question into a prompt.""" + formatter = PromptFormatter() + + if style == "plain": + return formatter.format_enhanced_prompt( + question.question, question.options, "CommonsenseQA", "hard", "plain" + ) + elif style == "cot": + return formatter.format_enhanced_prompt( + question.question, question.options, "CommonsenseQA", "hard", "cot" + ) + elif style == "explicit_cot": + return formatter.format_explicit_cot_prompt( + question.question, question.options, question.cot_content + ) + else: + raise ValueError(f"Unknown prompt style: {style}") + + +class CommonsenseQAPromptFormatter(PromptFormatter): + """Prompt formatter for CommonsenseQA questions.""" + + def format_plain_prompt(self, question: str, options: List[str]) -> str: + """Format a plain prompt for CommonsenseQA.""" + formatted_options = "" + for i, option in enumerate(options): + letter = chr(ord("A") + i) + formatted_options += f"{letter}) {option}\n" + + prompt = ( + f"Question: {question}\n\n" + f"Options:\n{formatted_options}\n" + f"Please choose the answer that demonstrates the best commonsense reasoning. " + f"Provide your answer in the format 'Answer: [letter]'." + ) + return prompt + + def format_cot_prompt(self, question: str, options: List[str]) -> str: + """Format a chain-of-thought prompt for CommonsenseQA.""" + formatted_options = "" + for i, option in enumerate(options): + letter = chr(ord("A") + i) + formatted_options += f"{letter}) {option}\n" + + prompt = ( + f"Question: {question}\n\n" + f"Options:\n{formatted_options}\n" + f"Please think step-by-step about this question using commonsense reasoning. " + f"Consider what you know about the world and how things typically work. " + f"Then provide your final answer in the format 'Answer: [letter]'." + ) + return prompt + + def format_explicit_cot_prompt( + self, question: str, options: List[str], cot_content: Optional[str] + ) -> str: + """Format an explicit chain-of-thought prompt for CommonsenseQA.""" + # CommonsenseQA doesn't provide CoT content, so fall back to regular CoT + return self.format_cot_prompt(question, options) diff --git a/bench/vllm_semantic_router_bench/dataset_implementations/gpqa_dataset.py b/bench/vllm_semantic_router_bench/dataset_implementations/gpqa_dataset.py new file mode 100644 index 00000000..04abccca --- /dev/null +++ b/bench/vllm_semantic_router_bench/dataset_implementations/gpqa_dataset.py @@ -0,0 +1,280 @@ +""" +GPQA Dataset Implementation + +Graduate-level Google-proof Q&A dataset for advanced scientific reasoning +evaluation. Supports Main, Extended, and Diamond variants with Chain-of-Thought +reasoning content. +""" + +import os +import random +import sys +from typing import List, Optional, Tuple + +import numpy as np +import pandas as pd +from datasets import load_dataset + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from ..dataset_interface import DatasetInfo, DatasetInterface, PromptFormatter, Question + + +class GPQADataset(DatasetInterface): + """GPQA (Graduate-level Google-proof Q&A) dataset implementation.""" + + def __init__(self, subset: str = "gpqa_main"): + """Initialize GPQA dataset. 
+ + Args: + subset: Which GPQA subset to use ("gpqa_main", "gpqa_extended", or "gpqa_diamond") + """ + self.subset = subset + valid_subsets = ["gpqa_main", "gpqa_extended", "gpqa_diamond"] + if self.subset not in valid_subsets: + raise ValueError(f"subset must be one of {valid_subsets}") + + self._dataset_cache = None + self._categories_cache = None + + @property + def dataset_name(self) -> str: + return f"GPQA-{self.subset.replace('gpqa_', '').title()}" + + @property + def supports_cot(self) -> bool: + return True # GPQA has reasoning explanations + + def _load_raw_dataset(self): + """Load raw GPQA dataset from Hugging Face.""" + if self._dataset_cache is not None: + return self._dataset_cache + + try: + # Try loading from the official GPQA dataset + dataset = load_dataset("Idavidrein/gpqa", self.subset, split="train") + self._dataset_cache = pd.DataFrame(dataset) + except Exception as e: + # Fallback: try alternative dataset names or warn user + print(f"Warning: Could not load GPQA dataset {self.subset}: {e}") + print( + "You may need to install the dataset manually or check the dataset name." + ) + # Create empty dataframe as fallback + self._dataset_cache = pd.DataFrame() + + return self._dataset_cache + + def _standardize_subject_category(self, subject: str) -> str: + """Standardize subject names to consistent categories.""" + subject_lower = subject.lower() if subject else "" + + # Map various subject names to standard categories + if any(word in subject_lower for word in ["physics", "phys"]): + return "Physics" + elif any(word in subject_lower for word in ["chemistry", "chem"]): + return "Chemistry" + elif any(word in subject_lower for word in ["biology", "bio"]): + return "Biology" + elif any(word in subject_lower for word in ["math", "mathematics"]): + return "Mathematics" + else: + return "Other" + + def load_dataset( + self, + categories: Optional[List[str]] = None, + samples_per_category: Optional[int] = None, + seed: int = 42, + ) -> Tuple[List[Question], DatasetInfo]: + """Load GPQA dataset.""" + df = self._load_raw_dataset() + + if df.empty: + # Return empty dataset if loading failed + return [], DatasetInfo( + name=self.dataset_name, + description="GPQA dataset (failed to load)", + categories=[], + total_questions=0, + format_type="multiple_choice", + difficulty_level="graduate", + ) + + # Convert to Question objects + questions = [] + for _, row in df.iterrows(): + # Handle different possible column names for GPQA + question_text = str(row.get("Question", row.get("question", ""))) + + # Extract multiple choice options + options = [] + correct_answer = None + + # GPQA has correct answer and incorrect answers as separate columns + correct_answer_text = None + if "Correct Answer" in row and pd.notna(row["Correct Answer"]): + correct_answer_text = str(row["Correct Answer"]) + elif "Answer" in row and pd.notna(row["Answer"]): + correct_answer_text = str(row["Answer"]) + elif "answer" in row and pd.notna(row["answer"]): + correct_answer_text = str(row["answer"]) + + # Collect all answer options + incorrect_answers = [] + for i in [1, 2, 3]: + col_name = f"Incorrect Answer {i}" + if col_name in row and pd.notna(row[col_name]): + incorrect_answers.append(str(row[col_name])) + + # Create options list with correct answer in random position + if correct_answer_text and incorrect_answers: + options = incorrect_answers + [correct_answer_text] + random.shuffle(options) # Randomize order + correct_answer = options.index( + correct_answer_text + ) # Find index after shuffle + else: + # 
Fallback: try other formats + options = [] + correct_answer = None + + # Try to extract from individual option columns (A, B, C, D) + for letter in ["A", "B", "C", "D"]: + if letter in row and pd.notna(row[letter]): + options.append(str(row[letter])) + + if options and correct_answer_text: + # Try to find correct answer in options + try: + correct_answer = options.index(correct_answer_text) + except ValueError: + correct_answer = 0 # Default to first option if not found + + # Get subject/category + subject = row.get( + "Subject", row.get("subject", row.get("Category", "Other")) + ) + category = self._standardize_subject_category(str(subject)) + + # Get explanation/reasoning if available + explanation = None + for col in ["Explanation", "explanation", "reasoning", "Reasoning"]: + if col in row and pd.notna(row[col]): + explanation = str(row[col]) + break + + # Skip questions without proper multiple choice format + if not options or correct_answer is None: + continue + + question = Question( + question_id=str(row.get("Record ID", f"gpqa_{len(questions)}")), + category=category, + question=question_text, + options=options, + correct_answer=correct_answer, + cot_content=explanation, + metadata={ + "source": "GPQA", + "subset": self.subset, + "difficulty": "graduate", + "subject": str(subject), + }, + ) + questions.append(question) + + # Get all unique categories + all_categories = sorted(list(set(q.category for q in questions))) + self._categories_cache = all_categories + + # Filter by categories if specified + if categories: + questions = [q for q in questions if q.category in categories] + if not questions: + valid_categories = ", ".join(all_categories) + raise ValueError( + f"No data found for specified categories. " + f"Valid categories are: {valid_categories}" + ) + + # Sample if requested + if samples_per_category: + random.seed(seed) + np.random.seed(seed) + + # Group by category + category_questions = {} + for q in questions: + if q.category not in category_questions: + category_questions[q.category] = [] + category_questions[q.category].append(q) + + # Sample from each category + sampled_questions = [] + for category, cat_questions in category_questions.items(): + if len(cat_questions) > samples_per_category: + sampled = random.sample(cat_questions, samples_per_category) + sampled_questions.extend(sampled) + else: + sampled_questions.extend(cat_questions) + + questions = sampled_questions + + # Create dataset info + dataset_info = DatasetInfo( + name=self.dataset_name, + description="Graduate-level Google-proof Q&A benchmark", + categories=list(set(q.category for q in questions)), + total_questions=len(questions), + format_type="multiple_choice", + difficulty_level="graduate", + ) + + return questions, dataset_info + + def get_available_categories(self) -> List[str]: + """Get all available GPQA categories.""" + if self._categories_cache is None: + # Load dataset to get categories + self.load_dataset() + return self._categories_cache or [] + + def format_prompt(self, question: Question, prompt_style: str = "plain") -> str: + """Format GPQA question into prompt.""" + if prompt_style == "plain": + return PromptFormatter.format_enhanced_prompt( + question.question, question.options, "GPQA", "graduate", "plain" + ) + elif prompt_style == "cot": + return PromptFormatter.format_enhanced_prompt( + question.question, question.options, "GPQA", "graduate", "cot" + ) + elif prompt_style == "explicit_cot": + return PromptFormatter.format_explicit_cot_prompt( + question.question, question.options, 
question.cot_content + ) + else: + raise ValueError(f"Unknown prompt style: {prompt_style}") + + +# Convenience classes for specific subsets +class GPQAMainDataset(GPQADataset): + """GPQA Main dataset.""" + + def __init__(self): + super().__init__(subset="gpqa_main") + + +class GPQAExtendedDataset(GPQADataset): + """GPQA Extended dataset.""" + + def __init__(self): + super().__init__(subset="gpqa_extended") + + +class GPQADiamondDataset(GPQADataset): + """GPQA Diamond dataset (highest quality subset).""" + + def __init__(self): + super().__init__(subset="gpqa_diamond") diff --git a/bench/vllm_semantic_router_bench/dataset_implementations/hellaswag_dataset.py b/bench/vllm_semantic_router_bench/dataset_implementations/hellaswag_dataset.py new file mode 100644 index 00000000..3665dcf2 --- /dev/null +++ b/bench/vllm_semantic_router_bench/dataset_implementations/hellaswag_dataset.py @@ -0,0 +1,232 @@ +""" +HellaSwag dataset implementation. + +This module implements the DatasetInterface for HellaSwag dataset which +tests commonsense reasoning about everyday activities and situations. +""" + +import os +import random +import sys +from typing import List, Optional, Tuple + +import numpy as np +import pandas as pd +from datasets import load_dataset + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from ..dataset_interface import DatasetInfo, DatasetInterface, PromptFormatter, Question + + +class HellaSwagDataset(DatasetInterface): + """HellaSwag dataset implementation.""" + + def __init__(self): + """Initialize HellaSwag dataset.""" + self._dataset_cache = None + self._categories_cache = None + + @property + def dataset_name(self) -> str: + return "HellaSwag" + + @property + def supports_cot(self) -> bool: + return True # HellaSwag benefits from reasoning about context + + def _load_raw_dataset(self): + """Load raw HellaSwag dataset from Hugging Face.""" + if self._dataset_cache is not None: + return self._dataset_cache + + try: + # Load train and validation splits + train_dataset = load_dataset("hellaswag", split="train") + val_dataset = load_dataset("hellaswag", split="validation") + + # Combine both splits for more data + train_df = pd.DataFrame(train_dataset) + val_df = pd.DataFrame(val_dataset) + self._dataset_cache = pd.concat([train_df, val_df], ignore_index=True) + + except Exception as e: + print(f"Warning: Could not load HellaSwag dataset: {e}") + print("You may need to check your internet connection or dataset access.") + # Create empty dataframe as fallback + self._dataset_cache = pd.DataFrame() + + return self._dataset_cache + + def _extract_categories(self, df: pd.DataFrame) -> List[str]: + """Extract categories from HellaSwag dataset using activity labels.""" + if df.empty: + return [] + + # Use activity_label as categories, but clean them up + def clean_activity_label(label: str) -> str: + """Clean up activity labels to make them more readable.""" + # Remove underscores and capitalize properly + cleaned = label.replace("_", " ").title() + + # Handle some common cases + replacements = { + "Tv": "TV", + "Diy": "DIY", + "Atv": "ATV", + "Bmx": "BMX", + "Sumo": "Sumo Wrestling", + "Mma": "MMA", + } + + for old, new in replacements.items(): + cleaned = cleaned.replace(old, new) + + return cleaned + + # Add cleaned category column + if "category" not in df.columns: + df["category"] = df["activity_label"].apply(clean_activity_label) + + return sorted(df["category"].unique().tolist()) + + def get_available_categories(self) -> List[str]: + """Get all available 
categories in the dataset.""" + if self._categories_cache is None: + df = self._load_raw_dataset() + self._categories_cache = self._extract_categories(df) + return self._categories_cache + + def load_dataset( + self, + categories: Optional[List[str]] = None, + samples_per_category: Optional[int] = None, + seed: int = 42, + ) -> Tuple[List[Question], DatasetInfo]: + """Load HellaSwag dataset with filtering and sampling.""" + df = self._load_raw_dataset() + + if df.empty: + return [], DatasetInfo( + name=self.dataset_name, + categories=[], + total_questions=0, + ) + + # Extract categories + all_categories = self._extract_categories(df) + + # Filter by categories if specified + if categories: + df = df[df["category"].isin(categories)] + if df.empty: + valid_categories = ", ".join(all_categories) + raise ValueError( + f"No data found for specified categories. Valid categories are: {valid_categories}" + ) + + # Sample questions per category if specified + if samples_per_category: + random.seed(seed) + np.random.seed(seed) + sampled_dfs = [] + for category in df["category"].unique(): + category_df = df[df["category"] == category] + if len(category_df) > samples_per_category: + sampled_df = category_df.sample( + samples_per_category, random_state=seed + ) + sampled_dfs.append(sampled_df) + else: + sampled_dfs.append(category_df) + df = pd.concat(sampled_dfs) if sampled_dfs else pd.DataFrame() + + # Convert to Question objects + questions = [] + for _, row in df.iterrows(): + # Construct the full context + context = row["ctx"] # This is the full context (ctx_a + ctx_b combined) + endings = row["endings"] # List of 4 possible endings + correct_idx = int(str(row["label"])) # Convert string label to int (0-3) + + question = Question( + question_id=f"hellaswag_{row['ind']}", + question=f"Context: {context}\n\nWhat happens next?", + options=endings, + correct_answer=correct_idx, # 0-indexed + category=row["category"], + cot_content=None, # HellaSwag doesn't provide CoT + ) + questions.append(question) + + dataset_info = DatasetInfo( + name=self.dataset_name, + description="HellaSwag tests commonsense reasoning about everyday activities and situations", + categories=sorted(df["category"].unique().tolist()) if not df.empty else [], + total_questions=len(questions), + format_type="multiple_choice", + difficulty_level="moderate", + ) + + return questions, dataset_info + + def format_prompt(self, question: Question, style: str = "plain") -> str: + """Format a question into a prompt.""" + formatter = PromptFormatter() + + if style == "plain": + return formatter.format_enhanced_prompt( + question.question, question.options, "HellaSwag", "moderate", "plain" + ) + elif style == "cot": + return formatter.format_enhanced_prompt( + question.question, question.options, "HellaSwag", "moderate", "cot" + ) + elif style == "explicit_cot": + return formatter.format_explicit_cot_prompt( + question.question, question.options, question.cot_content + ) + else: + raise ValueError(f"Unknown prompt style: {style}") + + +class HellaSwagPromptFormatter(PromptFormatter): + """Prompt formatter for HellaSwag questions.""" + + def format_plain_prompt(self, question: str, options: List[str]) -> str: + """Format a plain prompt for HellaSwag.""" + formatted_options = "" + for i, option in enumerate(options): + letter = chr(ord("A") + i) + formatted_options += f"{letter}) {option}\n" + + prompt = ( + f"{question}\n\n" + f"Options:\n{formatted_options}\n" + f"Please choose the most logical and natural continuation. 
" + f"Provide your answer in the format 'Answer: [letter]'." + ) + return prompt + + def format_cot_prompt(self, question: str, options: List[str]) -> str: + """Format a chain-of-thought prompt for HellaSwag.""" + formatted_options = "" + for i, option in enumerate(options): + letter = chr(ord("A") + i) + formatted_options += f"{letter}) {option}\n" + + prompt = ( + f"{question}\n\n" + f"Options:\n{formatted_options}\n" + f"Please think step-by-step about what would most likely happen next in this situation. " + f"Consider the context, the activity being performed, and what would be the most natural continuation. " + f"Then provide your final answer in the format 'Answer: [letter]'." + ) + return prompt + + def format_explicit_cot_prompt( + self, question: str, options: List[str], cot_content: Optional[str] + ) -> str: + """Format an explicit chain-of-thought prompt for HellaSwag.""" + # HellaSwag doesn't provide CoT content, so fall back to regular CoT + return self.format_cot_prompt(question, options) diff --git a/bench/vllm_semantic_router_bench/dataset_implementations/mmlu_dataset.py b/bench/vllm_semantic_router_bench/dataset_implementations/mmlu_dataset.py new file mode 100644 index 00000000..bf4f64b3 --- /dev/null +++ b/bench/vllm_semantic_router_bench/dataset_implementations/mmlu_dataset.py @@ -0,0 +1,159 @@ +""" +MMLU-Pro Dataset Implementation + +Academic knowledge evaluation across 14 subject categories with +Chain-of-Thought reasoning support. +""" + +import os +import random +import sys +from typing import List, Optional, Tuple + +import numpy as np +import pandas as pd +from datasets import load_dataset + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from ..dataset_interface import DatasetInfo, DatasetInterface, PromptFormatter, Question + + +class MMLUDataset(DatasetInterface): + """MMLU-Pro dataset implementation.""" + + def __init__(self): + self._dataset_cache = None + self._categories_cache = None + + @property + def dataset_name(self) -> str: + return "MMLU-Pro" + + @property + def supports_cot(self) -> bool: + return True + + def load_dataset( + self, + categories: Optional[List[str]] = None, + samples_per_category: Optional[int] = None, + seed: int = 42, + ) -> Tuple[List[Question], DatasetInfo]: + """Load MMLU-Pro dataset.""" + # Load raw dataset + if self._dataset_cache is None: + dataset = load_dataset("TIGER-Lab/MMLU-Pro", split="test") + self._dataset_cache = pd.DataFrame(dataset) + + df = self._dataset_cache.copy() + all_categories = sorted(df["category"].unique().tolist()) + self._categories_cache = all_categories + + # Filter by categories if specified + if categories: + df = df[df["category"].isin(categories)] + if df.empty: + valid_categories = ", ".join(all_categories) + raise ValueError( + f"No data found for specified categories. 
" + f"Valid categories are: {valid_categories}" + ) + + # Sample if requested + if samples_per_category: + random.seed(seed) + np.random.seed(seed) + sampled_dfs = [] + for category in df["category"].unique(): + category_df = df[df["category"] == category] + if len(category_df) > samples_per_category: + sampled_df = category_df.sample( + samples_per_category, random_state=seed + ) + sampled_dfs.append(sampled_df) + else: + sampled_dfs.append(category_df) + df = pd.concat(sampled_dfs) + + # Convert to Question objects + questions = [] + for _, row in df.iterrows(): + question = Question( + question_id=str(row.get("question_id", f"mmlu_{len(questions)}")), + category=str(row["category"]), + question=str(row["question"]), + options=row["options"] if isinstance(row["options"], list) else [], + correct_answer=str(row["answer"]), + cot_content=( + row.get("cot_content") if pd.notna(row.get("cot_content")) else None + ), + metadata={ + "source": "MMLU-Pro", + "difficulty": row.get("difficulty", "unknown"), + }, + ) + questions.append(question) + + # Create dataset info + dataset_info = DatasetInfo( + name="MMLU-Pro", + description="Massive Multitask Language Understanding - Professional", + categories=list(df["category"].unique()), + total_questions=len(questions), + format_type="multiple_choice", + difficulty_level="undergraduate", + ) + + return questions, dataset_info + + def get_available_categories(self) -> List[str]: + """Get all available MMLU categories.""" + if self._categories_cache is None: + # Load dataset to get categories + self.load_dataset() + return self._categories_cache or [] + + def format_prompt(self, question: Question, prompt_style: str = "plain") -> str: + """Format MMLU question into prompt.""" + if prompt_style == "plain": + return PromptFormatter.format_plain_prompt( + question.question, question.options + ) + elif prompt_style == "cot": + return PromptFormatter.format_cot_prompt( + question.question, question.options + ) + elif prompt_style == "explicit_cot": + return PromptFormatter.format_explicit_cot_prompt( + question.question, question.options, question.cot_content + ) + else: + raise ValueError(f"Unknown prompt style: {prompt_style}") + + +# Legacy compatibility function +def load_mmlu_pro_dataset( + categories: Optional[List[str]] = None, + samples_per_category: Optional[int] = None, + seed: int = 42, +) -> Tuple[pd.DataFrame, List[str]]: + """Legacy function for backward compatibility.""" + mmlu = MMLUDataset() + questions, dataset_info = mmlu.load_dataset(categories, samples_per_category, seed) + + # Convert back to DataFrame format for compatibility + records = [] + for q in questions: + record = { + "question_id": q.question_id, + "category": q.category, + "question": q.question, + "options": q.options, + "answer": q.correct_answer, + "cot_content": q.cot_content, + } + records.append(record) + + df = pd.DataFrame(records) + return df, dataset_info.categories diff --git a/bench/vllm_semantic_router_bench/dataset_implementations/truthfulqa_dataset.py b/bench/vllm_semantic_router_bench/dataset_implementations/truthfulqa_dataset.py new file mode 100644 index 00000000..dfa6e989 --- /dev/null +++ b/bench/vllm_semantic_router_bench/dataset_implementations/truthfulqa_dataset.py @@ -0,0 +1,226 @@ +""" +TruthfulQA dataset implementation. + +This module implements the DatasetInterface for TruthfulQA dataset which +tests whether language models are truthful in generating answers to questions. 
+""" + +import os +import random +import sys +from typing import List, Optional, Tuple + +import numpy as np +import pandas as pd +from datasets import load_dataset + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from ..dataset_interface import DatasetInfo, DatasetInterface, PromptFormatter, Question + + +class TruthfulQADataset(DatasetInterface): + """TruthfulQA dataset implementation.""" + + def __init__(self): + """Initialize TruthfulQA dataset.""" + self._dataset_cache = None + self._categories_cache = None + + @property + def dataset_name(self) -> str: + return "TruthfulQA" + + @property + def supports_cot(self) -> bool: + return True # TruthfulQA benefits from reasoning + + def _load_raw_dataset(self): + """Load raw TruthfulQA dataset from Hugging Face.""" + if self._dataset_cache is not None: + return self._dataset_cache + + try: + # Load the multiple choice version + dataset = load_dataset("truthful_qa", "multiple_choice", split="validation") + self._dataset_cache = pd.DataFrame(dataset) + except Exception as e: + print(f"Warning: Could not load TruthfulQA dataset: {e}") + print("You may need to check your internet connection or dataset access.") + # Create empty dataframe as fallback + self._dataset_cache = pd.DataFrame() + + return self._dataset_cache + + def _extract_categories(self, df: pd.DataFrame) -> List[str]: + """Extract categories from TruthfulQA dataset. + + TruthfulQA doesn't have explicit categories, so we'll create them + based on question topics/themes. + """ + if df.empty: + return [] + + # For now, we'll use a single "Truthfulness" category + # In the future, we could implement topic classification + def get_category() -> str: + """ + TruthfulQA doesn't have explicit categories. + All questions test truthfulness and misconception detection. + """ + return "Truthfulness" + + # Add single category since TruthfulQA doesn't have explicit subjects + if "category" not in df.columns: + df["category"] = get_category() + + return sorted(df["category"].unique().tolist()) + + def get_available_categories(self) -> List[str]: + """Get all available categories in the dataset.""" + if self._categories_cache is None: + df = self._load_raw_dataset() + self._categories_cache = self._extract_categories(df) + return self._categories_cache + + def load_dataset( + self, + categories: Optional[List[str]] = None, + samples_per_category: Optional[int] = None, + seed: int = 42, + ) -> Tuple[List[Question], DatasetInfo]: + """Load TruthfulQA dataset with filtering and sampling.""" + df = self._load_raw_dataset() + + if df.empty: + return [], DatasetInfo( + name=self.dataset_name, + categories=[], + total_questions=0, + ) + + # Extract categories + all_categories = self._extract_categories(df) + + # Filter by categories if specified + if categories: + df = df[df["category"].isin(categories)] + if df.empty: + valid_categories = ", ".join(all_categories) + raise ValueError( + f"No data found for specified categories. 
Valid categories are: {valid_categories}" + ) + + # Sample questions per category if specified + if samples_per_category: + random.seed(seed) + np.random.seed(seed) + sampled_dfs = [] + for category in df["category"].unique(): + category_df = df[df["category"] == category] + if len(category_df) > samples_per_category: + sampled_df = category_df.sample( + samples_per_category, random_state=seed + ) + sampled_dfs.append(sampled_df) + else: + sampled_dfs.append(category_df) + df = pd.concat(sampled_dfs) if sampled_dfs else pd.DataFrame() + + # Convert to Question objects + questions = [] + for _, row in df.iterrows(): + # Extract multiple choice options + mc1_targets = row["mc1_targets"] + choices = mc1_targets["choices"] + labels = mc1_targets["labels"] + + # Find the correct answer (label = 1) + correct_idx = None + for i, label in enumerate(labels): + if label == 1: + correct_idx = i + break + + if correct_idx is not None: + question = Question( + question_id=f"truthfulqa_{len(questions)}", + question=row["question"], + options=choices, + correct_answer=correct_idx, # 0-indexed + category=row["category"], + cot_content=None, # TruthfulQA doesn't provide CoT + ) + questions.append(question) + + dataset_info = DatasetInfo( + name=self.dataset_name, + description="TruthfulQA tests whether language models are truthful in generating answers to questions", + categories=sorted(df["category"].unique().tolist()) if not df.empty else [], + total_questions=len(questions), + format_type="multiple_choice", + difficulty_level="hard", + ) + + return questions, dataset_info + + def format_prompt(self, question: Question, style: str = "plain") -> str: + """Format a question into a prompt.""" + formatter = PromptFormatter() + + if style == "plain": + return formatter.format_enhanced_prompt( + question.question, question.options, "TruthfulQA", "hard", "plain" + ) + elif style == "cot": + return formatter.format_enhanced_prompt( + question.question, question.options, "TruthfulQA", "hard", "cot" + ) + elif style == "explicit_cot": + return formatter.format_explicit_cot_prompt( + question.question, question.options, question.cot_content + ) + else: + raise ValueError(f"Unknown prompt style: {style}") + + +class TruthfulQAPromptFormatter(PromptFormatter): + """Prompt formatter for TruthfulQA questions.""" + + def format_plain_prompt(self, question: str, options: List[str]) -> str: + """Format a plain prompt for TruthfulQA.""" + formatted_options = "" + for i, option in enumerate(options): + letter = chr(ord("A") + i) + formatted_options += f"{letter}) {option}\n" + + prompt = ( + f"Question: {question}\n\n" + f"Options:\n{formatted_options}\n" + f"Please choose the most truthful and accurate answer. " + f"Provide your answer in the format 'Answer: [letter]'." + ) + return prompt + + def format_cot_prompt(self, question: str, options: List[str]) -> str: + """Format a chain-of-thought prompt for TruthfulQA.""" + formatted_options = "" + for i, option in enumerate(options): + letter = chr(ord("A") + i) + formatted_options += f"{letter}) {option}\n" + + prompt = ( + f"Question: {question}\n\n" + f"Options:\n{formatted_options}\n" + f"Please think step-by-step about which answer is most truthful and accurate. " + f"Consider whether each option represents a fact or a common misconception. " + f"Then provide your final answer in the format 'Answer: [letter]'." 
+        )
+        return prompt
+
+    def format_explicit_cot_prompt(
+        self, question: str, options: List[str], cot_content: Optional[str]
+    ) -> str:
+        """Format an explicit chain-of-thought prompt for TruthfulQA."""
+        # TruthfulQA doesn't provide CoT content, so fall back to regular CoT
+        return self.format_cot_prompt(question, options)
diff --git a/bench/vllm_semantic_router_bench/dataset_interface.py b/bench/vllm_semantic_router_bench/dataset_interface.py
new file mode 100644
index 00000000..d4c3c1fe
--- /dev/null
+++ b/bench/vllm_semantic_router_bench/dataset_interface.py
@@ -0,0 +1,356 @@
+"""
+Multi-Dataset Evaluation Interface
+
+Provides abstract base classes and standardized interfaces for reasoning
+dataset evaluation across MMLU, ARC, GPQA, TruthfulQA, CommonsenseQA, and HellaSwag.
+
+Key Features:
+- Unified Question and DatasetInfo data structures
+- Abstract DatasetInterface for consistent implementations
+- Enhanced PromptFormatter with dataset-specific optimizations
+- Support for Chain-of-Thought (CoT) reasoning modes
+"""
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import pandas as pd
+
+
+@dataclass
+class Question:
+    """
+    Standardized question representation for multiple-choice reasoning tasks.
+
+    Attributes:
+        question_id: Unique identifier for the question
+        category: Subject or topic category
+        question: The question text
+        options: List of answer choices
+        correct_answer: Correct option, as a letter (e.g., "A") or a 0-based index
+        cot_content: Optional chain-of-thought reasoning
+        metadata: Additional dataset-specific information
+    """
+
+    question_id: str
+    category: str
+    question: str
+    options: List[str]
+    correct_answer: Union[str, int]
+    cot_content: Optional[str] = None
+    metadata: Optional[Dict[str, Any]] = None
+
+
+@dataclass
+class DatasetInfo:
+    """
+    Dataset metadata and configuration information.
+
+    Attributes:
+        name: Dataset name (e.g., "GPQA-Main", "ARC-Challenge")
+        description: Brief description of the dataset
+        categories: List of available subject categories
+        total_questions: Total number of questions loaded
+        format_type: Question format (typically "multiple_choice")
+        difficulty_level: Complexity level (e.g., "graduate", "undergraduate")
+    """
+
+    name: str
+    description: str
+    categories: List[str]
+    total_questions: int
+    format_type: str
+    difficulty_level: str
+
+
+class DatasetInterface(ABC):
+    """Abstract base class for all dataset implementations."""
+
+    @abstractmethod
+    def load_dataset(
+        self,
+        categories: Optional[List[str]] = None,
+        samples_per_category: Optional[int] = None,
+        seed: int = 42,
+    ) -> Tuple[List[Question], DatasetInfo]:
+        """Load and return questions from the dataset.
+
+        Args:
+            categories: List of categories to filter by. If None, load all.
+            samples_per_category: Max samples per category. If None, load all.
+            seed: Random seed for reproducible sampling.
+
+        Returns:
+            Tuple of (questions_list, dataset_info)
+        """
+        pass
+
+    @abstractmethod
+    def get_available_categories(self) -> List[str]:
+        """Get list of all available categories in the dataset."""
+        pass
+
+    @abstractmethod
+    def format_prompt(self, question: Question, prompt_style: str = "plain") -> str:
+        """Format a question into a prompt string.
+ + Args: + question: Question object to format + prompt_style: Style of prompt ("plain", "cot", "explicit_cot") + + Returns: + Formatted prompt string + """ + pass + + @property + @abstractmethod + def dataset_name(self) -> str: + """Return the name of this dataset.""" + pass + + @property + @abstractmethod + def supports_cot(self) -> bool: + """Return True if dataset has chain-of-thought content.""" + pass + + +class PromptFormatter: + """Utility class for formatting prompts consistently across datasets.""" + + @staticmethod + def get_dataset_specific_instructions(dataset_name: str, difficulty: str) -> str: + """Get dataset-specific instructions to improve accuracy.""" + dataset_name = dataset_name.lower() + difficulty = difficulty.lower() + + if "gpqa" in dataset_name: + return ( + "- This is a graduate-level scientific question\n" + "- Consider the underlying scientific principles\n" + "- Eliminate obviously incorrect options first\n" + ) + elif "truthfulqa" in dataset_name: + return ( + "- This question may contain common misconceptions\n" + "- Be wary of answers that sound plausible but are incorrect\n" + "- Choose the most factually accurate option\n" + ) + elif "hellaswag" in dataset_name: + return ( + "- Choose the most natural and logical continuation\n" + "- Consider common sense and typical sequences of events\n" + "- Think about what would realistically happen next\n" + ) + elif "commonsenseqa" in dataset_name: + return ( + "- Apply common sense reasoning\n" + "- Consider everyday knowledge and experiences\n" + "- Think about typical cause-and-effect relationships\n" + ) + elif "arc" in dataset_name: + return ( + "- This is a science question requiring logical reasoning\n" + "- Apply scientific knowledge and principles\n" + "- Consider the most scientifically accurate answer\n" + ) + elif "mmlu" in dataset_name: + return ( + "- This requires specific domain knowledge\n" + "- Choose the most accurate and complete answer\n" + "- Consider technical precision and accuracy\n" + ) + else: + return "" + + @staticmethod + def get_letter_mapping() -> Dict[int, str]: + """Get A-Z letter mapping for options (supports up to 26 options).""" + return { + 0: "A", + 1: "B", + 2: "C", + 3: "D", + 4: "E", + 5: "F", + 6: "G", + 7: "H", + 8: "I", + 9: "J", + 10: "K", + 11: "L", + 12: "M", + 13: "N", + 14: "O", + 15: "P", + 16: "Q", + 17: "R", + 18: "S", + 19: "T", + 20: "U", + 21: "V", + 22: "W", + 23: "X", + 24: "Y", + 25: "Z", + } + + @staticmethod + def format_options(options: List[str]) -> str: + """Format options list into lettered format.""" + letter_mapping = PromptFormatter.get_letter_mapping() + formatted = "" + for i, option in enumerate(options): + if option.lower() != "n/a": + if i in letter_mapping: + formatted += f"{letter_mapping[i]}) {option}\n" + else: + # Fallback for options beyond Z (unlikely but safe) + formatted += f"{i+1}.) 
{option}\n" + return formatted.rstrip() + + @staticmethod + def format_plain_prompt(question: str, options: List[str]) -> str: + """Format a basic multiple choice prompt.""" + formatted_options = PromptFormatter.format_options(options) + return ( + f"Question: {question}\n\nOptions:\n{formatted_options}\n\n" + "Instructions:\n" + "- Read the question carefully\n" + "- Consider each option thoroughly\n" + "- Choose the single best answer\n" + "- Respond with ONLY the format: Answer: [letter]\n" + "- Do not include any other text after your answer\n\n" + "Your response:" + ) + + @staticmethod + def format_cot_prompt(question: str, options: List[str]) -> str: + """Format a chain-of-thought prompt.""" + formatted_options = PromptFormatter.format_options(options) + return ( + f"Question: {question}\n\nOptions:\n{formatted_options}\n\n" + "Instructions:\n" + "- Think through this step-by-step\n" + "- Analyze each option carefully\n" + "- Explain your reasoning briefly\n" + "- End with your final answer in the exact format: Answer: [letter]\n\n" + "Your response:" + ) + + @staticmethod + def format_explicit_cot_prompt( + question: str, options: List[str], cot_content: Optional[str] + ) -> str: + """Format a prompt with explicit CoT content.""" + formatted_options = PromptFormatter.format_options(options) + cot_section = f"\nExplanation: {cot_content}\n" if cot_content else "\n" + return ( + f"Question: {question}\n\nOptions:\n{formatted_options}" + f"{cot_section}\n" + "Instructions:\n" + "- Use the provided explanation as guidance\n" + "- Consider how it applies to each option\n" + "- Choose the best answer based on the reasoning\n" + "- Provide your final answer in the exact format: Answer: [letter]\n\n" + "Your response:" + ) + + @staticmethod + def format_enhanced_prompt( + question: str, + options: List[str], + dataset_name: str, + difficulty: str, + prompt_style: str = "plain", + ) -> str: + """Format an enhanced prompt with dataset-specific guidance.""" + formatted_options = PromptFormatter.format_options(options) + dataset_instructions = PromptFormatter.get_dataset_specific_instructions( + dataset_name, difficulty + ) + + if prompt_style == "cot": + base_instructions = ( + "Instructions:\n" + "- Think through this step-by-step\n" + "- Analyze each option carefully\n" + ) + if dataset_instructions: + base_instructions += dataset_instructions + base_instructions += ( + "- Explain your reasoning briefly\n" + "- End with your final answer in the exact format: Answer: [letter]\n\n" + ) + else: # plain + base_instructions = ( + "Instructions:\n" + "- Read the question carefully\n" + "- Consider each option thoroughly\n" + ) + if dataset_instructions: + base_instructions += dataset_instructions + base_instructions += ( + "- Choose the single best answer\n" + "- Respond with ONLY the format: Answer: [letter]\n" + "- Do not include any other text after your answer\n\n" + ) + + return ( + f"Question: {question}\n\nOptions:\n{formatted_options}\n\n" + f"{base_instructions}" + "Your response:" + ) + + +def questions_to_dataframe(questions: List[Question]) -> pd.DataFrame: + """Convert list of Question objects to pandas DataFrame for compatibility.""" + records = [] + for q in questions: + record = { + "question_id": q.question_id, + "category": q.category, + "question": q.question, + "options": q.options, + "answer": q.correct_answer, + "cot_content": q.cot_content, + } + # Add metadata fields if present + if q.metadata: + record.update(q.metadata) + records.append(record) + return 
pd.DataFrame(records) + + +def dataframe_to_questions(df: pd.DataFrame) -> List[Question]: + """Convert pandas DataFrame back to list of Question objects.""" + questions = [] + for _, row in df.iterrows(): + # Extract metadata (any columns not in the standard Question fields) + standard_fields = { + "question_id", + "category", + "question", + "options", + "answer", + "cot_content", + } + metadata = { + k: v for k, v in row.items() if k not in standard_fields and pd.notna(v) + } + + question = Question( + question_id=str(row["question_id"]), + category=str(row["category"]), + question=str(row["question"]), + options=row["options"] if isinstance(row["options"], list) else [], + correct_answer=str(row["answer"]), + cot_content=( + row.get("cot_content") if pd.notna(row.get("cot_content")) else None + ), + metadata=metadata if metadata else None, + ) + questions.append(question) + return questions diff --git a/bench/vllm_semantic_router_bench/router_reason_bench_multi_dataset.py b/bench/vllm_semantic_router_bench/router_reason_bench_multi_dataset.py new file mode 100644 index 00000000..6ad9d746 --- /dev/null +++ b/bench/vllm_semantic_router_bench/router_reason_bench_multi_dataset.py @@ -0,0 +1,851 @@ +""" +Multi-Dataset Reasoning Benchmark + +A comprehensive evaluation framework for comparing semantic router performance +against direct vLLM inference across various reasoning datasets. + +Features: +- Dataset-agnostic architecture supporting MMLU, ARC, GPQA, TruthfulQA, CommonsenseQA, HellaSwag +- Optimized token limits per dataset complexity +- Multiple reasoning modes (NR, XC, NR_REASONING) +- Structured response parsing with robust answer extraction +- Comprehensive metrics and visualization +""" + +import argparse +import json +import os +import random +import re +import time +from concurrent.futures import ThreadPoolExecutor +from typing import Any, Dict, List, Optional, Tuple + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import seaborn as sns +from openai import OpenAI +from tqdm import tqdm + +from .dataset_factory import DatasetFactory, list_available_datasets +from .dataset_interface import DatasetInfo, Question, questions_to_dataframe + +# Robust answer extraction patterns for structured response parsing +ANSWER_PATTERN_PRIMARY = re.compile(r"(?:answer\s*:?\s*)([A-Z])", re.IGNORECASE) +ANSWER_PATTERN_FINAL = re.compile(r"(?:final\s*answer\s*:?\s*)([A-Z])", re.IGNORECASE) +ANSWER_PATTERN_CONCLUSION = re.compile( + r"(?:therefore|thus|so).*?([A-Z])", re.IGNORECASE +) + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Multi-Dataset Reasoning Benchmark: Comprehensive evaluation framework for semantic router vs direct vLLM" + ) + + # Dataset selection + parser.add_argument( + "--dataset", + type=str, + default="mmlu", + help="Dataset to evaluate on. 
Use --list-datasets to see available options.", + ) + parser.add_argument( + "--list-datasets", + action="store_true", + help="List all available datasets and exit", + ) + + # Semantic router configuration + parser.add_argument( + "--router-endpoint", + type=str, + default=os.environ.get("ROUTER_ENDPOINT", "http://127.0.0.1:8801/v1"), + help="Semantic router endpoint URL", + ) + parser.add_argument( + "--router-api-key", + type=str, + default=os.environ.get( + "ROUTER_API_KEY", os.environ.get("OPENAI_API_KEY", "1234") + ), + help="API key for router endpoint", + ) + parser.add_argument( + "--router-models", + type=str, + nargs="+", + default=["auto"], + help="Router models to evaluate (default: auto).", + ) + + # Direct vLLM configuration + parser.add_argument( + "--vllm-endpoint", + type=str, + default=os.environ.get("VLLM_ENDPOINT", ""), + help="Direct vLLM endpoint URL", + ) + parser.add_argument( + "--vllm-api-key", + type=str, + default=os.environ.get("VLLM_API_KEY", os.environ.get("OPENAI_API_KEY", "")), + help="API key for vLLM endpoint", + ) + parser.add_argument( + "--vllm-models", + type=str, + nargs="+", + default=[], + help="Direct vLLM models to evaluate (leave empty to fetch from endpoint).", + ) + + # vLLM reasoning modes + parser.add_argument( + "--vllm-exec-modes", + type=str, + nargs="+", + default=["NR", "XC"], + help="vLLM reasoning modes: NR (neutral), XC (chain-of-thought), NR_REASONING (reasoning-enabled)", + ) + parser.add_argument( + "--run-router", + action="store_true", + help="Evaluate semantic router performance", + ) + parser.add_argument( + "--run-vllm", + action="store_true", + help="Evaluate direct vLLM performance across multiple reasoning modes", + ) + + # Dataset filtering options + parser.add_argument( + "--categories", + type=str, + nargs="+", + default=None, + help="List of categories to evaluate. If not provided, all available categories will be used.", + ) + parser.add_argument( + "--samples-per-category", + type=int, + default=5, + help="Number of questions to sample per category. If not provided, all questions will be used.", + ) + + # Execution options + parser.add_argument( + "--concurrent-requests", + type=int, + default=1, + help="Number of concurrent requests to make", + ) + parser.add_argument( + "--output-dir", + type=str, + default="results/reasonbench", + help="Directory to save results", + ) + parser.add_argument( + "--max-tokens", + type=int, + default=None, + help="Maximum number of tokens to generate (default: dataset-optimal)", + ) + parser.add_argument( + "--temperature", + type=float, + default=0.0, + help="Temperature for text generation", + ) + parser.add_argument( + "--seed", + type=int, + default=42, + help="Random seed for reproducibility", + ) + parser.add_argument( + "--ar-extra-body", + type=str, + default="", + help=( + 'JSON string passed as extra_body for AR mode (e.g., \'{"reasoning":{"effort":"medium"}}\'). ' + "If empty, AR modes are disabled." + ), + ) + return parser.parse_args() + + +def get_dataset_optimal_tokens(dataset_info): + """ + Determine optimal token limit based on dataset complexity and reasoning requirements. + + Token limits are optimized for structured response generation while maintaining + efficiency across different reasoning complexity levels. 
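+
+    For example, a DatasetInfo whose name contains "gpqa" (such as "GPQA-Main")
+    resolves to the "gpqa" entry below, while unrecognized datasets fall back to
+    a limit keyed on their difficulty level.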
+ """ + dataset_name = dataset_info.name.lower() + difficulty = dataset_info.difficulty_level.lower() + + # Optimized token limits per dataset + dataset_tokens = { + "gpqa": 500, # Graduate-level scientific reasoning + "truthfulqa": 250, # Misconception analysis + "hellaswag": 250, # Natural continuation reasoning + "arc": 220, # Elementary/middle school science + "commonsenseqa": 300, # Common sense reasoning + "mmlu": 150 if difficulty == "undergraduate" else 200, # Academic knowledge + } + + # Find matching dataset + for dataset_key, tokens in dataset_tokens.items(): + if dataset_key in dataset_name: + return tokens + + # Default based on difficulty level + difficulty_tokens = {"graduate": 300, "hard": 300, "moderate": 200, "easy": 150} + + return difficulty_tokens.get(difficulty, 200) + + +def get_available_models(endpoint: str, api_key: str = "") -> List[str]: + """Get available models from an endpoint.""" + client = OpenAI(base_url=endpoint, api_key=api_key or None) + try: + models = client.models.list() + return [m.id for m in models.data] + except Exception as e: + print(f"Error communicating with endpoint to list models: {e}") + return [] + + +def extract_answer(response: Any) -> Optional[str]: + """Extract answer from model response.""" + # Normalize non-string responses into a string to be robust to providers + # that return structured content (e.g., lists of parts or dicts). + if response is None: + return None + + if not isinstance(response, str): + try: + # Handle list-of-parts shapes + if isinstance(response, list): + parts: List[str] = [] + for part in response: + if isinstance(part, dict): + if "text" in part and isinstance(part["text"], str): + parts.append(part["text"]) + elif "content" in part and isinstance(part["content"], str): + parts.append(part["content"]) + else: + parts.append(str(part)) + else: + parts.append(str(part)) + response = "\n".join(parts) + # Handle dict shapes + elif isinstance(response, dict): + for key in ("content", "text", "reasoning_content"): + val = response.get(key) if isinstance(response, dict) else None + if isinstance(val, str) and val: + response = val + break + else: + # Fallback to JSON stringification + response = json.dumps(response, ensure_ascii=False) + else: + response = str(response) + except Exception: + response = str(response) + + # Try multiple extraction patterns in order of preference + patterns = [ANSWER_PATTERN_PRIMARY, ANSWER_PATTERN_FINAL, ANSWER_PATTERN_CONCLUSION] + + for pattern in patterns: + match = pattern.search(response) + if match: + return match.group(1).upper() + + # Fallback 1: Look for standalone letters at end of response + lines = response.strip().split("\n") + for line in reversed(lines[-3:]): # Check last 3 lines + line = line.strip() + if len(line) == 1 and line.upper() in "ABCDEFGHIJKLMNOPQRSTUVWXYZ": + return line.upper() + + # Fallback 2: Find last letter in entire response + for char in reversed(response): + if char.upper() in "ABCDEFGHIJKLMNOPQRSTUVWXYZ": + return char.upper() + + return None + + +def call_model( + client: OpenAI, + model: str, + prompt: str, + max_tokens: int, + temperature: float, + extra_body: Optional[Dict[str, Any]] = None, +) -> Tuple[str, bool, Optional[int], Optional[int], Optional[int]]: + """Call model with given parameters.""" + try: + response = client.chat.completions.create( + model=model, + messages=[{"role": "user", "content": prompt}], + max_tokens=max_tokens, + temperature=temperature, + extra_body=extra_body if extra_body else None, + ) + # For reasoning models, 
content might be in reasoning_content instead of content + message = response.choices[0].message + text = message.content or getattr(message, "reasoning_content", None) or "" + usage = getattr(response, "usage", None) + prompt_tokens = getattr(usage, "prompt_tokens", None) if usage else None + completion_tokens = getattr(usage, "completion_tokens", None) if usage else None + total_tokens = getattr(usage, "total_tokens", None) if usage else None + return text, True, prompt_tokens, completion_tokens, total_tokens + except Exception as e: + print(f"Model call failed: {e}") + return "ERROR", False, None, None, None + + +def build_extra_body_for_model( + model_name: str, reasoning: Optional[bool] +) -> Optional[Dict[str, Any]]: + """Return an extra_body dict to toggle reasoning for a given model. + + - DeepSeek v3.1: {"chat_template_kwargs": {"thinking": true/false}} + - GPT-OSS: {"reasoning_effort": "low|medium|high"} when ON; if not provided, then low + """ + # reasoning: True -> ON, False -> OFF, None -> base (default behavior) + + lower = model_name.lower() + if (("ds" in lower) or ("deepseek" in lower)) and ( + "v31" in lower or "v3.1" in lower or "v3" in lower + ): + if reasoning is True: + return {"chat_template_kwargs": {"thinking": True}} + elif reasoning is False: + return {"chat_template_kwargs": {"thinking": False}} + else: # reasoning is None (base mode) + # Base: do not set thinking for DeepSeek - let it use default behavior + return None + + # Qwen3 family + if "qwen3" in lower: + if reasoning is True: + return {"chat_template_kwargs": {"enable_thinking": True}} + if reasoning is False: + return {"chat_template_kwargs": {"enable_thinking": False}} + return None + + # GPT OSS family + if "gpt-oss" in lower or "openai/gpt-oss" in lower or "gpt_oss" in lower: + if reasoning is True: + return {"reasoning_effort": "high"} + elif reasoning is False: + return {"reasoning_effort": "low"} + else: # reasoning is None (base mode) + # Base: do not set reasoning_effort - let it use default behavior + return None + + return None + + +def process_question_single( + client: OpenAI, + model: str, + question: Question, + dataset: Any, # DatasetInterface + prompt_mode: str, + max_tokens: int, + temperature: float, + ar_extra_body: Optional[Dict[str, Any]] = None, + mode_label: Optional[str] = None, +) -> Dict[str, Any]: + """Process a single question with the model.""" + # Format prompt based on mode + if prompt_mode == "XC": + prompt = dataset.format_prompt(question, "explicit_cot") + extra_body = None + elif prompt_mode == "AR": + prompt = dataset.format_prompt(question, "plain") + extra_body = ar_extra_body + else: # NR or Router-Transparent + prompt = dataset.format_prompt(question, "plain") + extra_body = None + + start_time = time.time() + response_text, success, prompt_tokens, completion_tokens, total_tokens = call_model( + client, model, prompt, max_tokens, temperature, extra_body=extra_body + ) + end_time = time.time() + + predicted_answer = extract_answer(response_text) if success else None + + # Compare predicted answer with correct answer (handle both letter and index formats) + if predicted_answer and predicted_answer in "ABCDEFGHIJKLMNOPQRSTUVWXYZ": + if isinstance(question.correct_answer, str): + # Dataset stores answer as letter (e.g., MMLU: "F") + is_correct = predicted_answer == question.correct_answer + elif isinstance(question.correct_answer, int): + # Dataset stores answer as index (e.g., CommonsenseQA: 1, ARC: 0) + predicted_idx = ord(predicted_answer) - ord("A") + is_correct 
= predicted_idx == question.correct_answer + else: + is_correct = False + else: + is_correct = False + + return { + "mode": prompt_mode, + "mode_label": mode_label or prompt_mode, + "question_id": question.question_id, + "category": question.category, + "question": question.question, + "options": question.options, + "correct_answer": question.correct_answer, + "model_response": response_text, + "predicted_answer": predicted_answer, + "is_correct": is_correct, + "response_time": end_time - start_time, + "success": success, + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": total_tokens, + } + + +def evaluate_model_router_transparent( + questions: List[Question], + dataset: Any, # DatasetInterface + model: str, + endpoint: str, + api_key: str, + concurrent_requests: int, + max_tokens: int, + temperature: float, +) -> pd.DataFrame: + """Evaluate model in router-transparent mode.""" + client = OpenAI(base_url=endpoint, api_key=api_key or None) + print(f"Using model: {model}, endpoint: {endpoint}") + + results: List[Dict[str, Any]] = [] + + with ThreadPoolExecutor(max_workers=concurrent_requests) as executor: + futures = [] + for question in questions: + futures.append( + executor.submit( + process_question_single, + client, + model, + question, + dataset, + "NR", + max_tokens, + temperature, + None, + mode_label="Router_NR", + ) + ) + + for future in tqdm( + futures, total=len(futures), desc=f"Evaluating {model} (Router-Transparent)" + ): + results.append(future.result()) + + return pd.DataFrame(results) + + +def evaluate_model_vllm_multimode( + questions: List[Question], + dataset: Any, # DatasetInterface + model: str, + endpoint: str, + api_key: str, + concurrent_requests: int, + max_tokens: int, + temperature: float, + exec_modes: List[str], +) -> pd.DataFrame: + """Run vLLM with 2-3 realistic reasoning scenarios. + + The scenarios represent real-world router decision patterns: + 1. NR - Plain prompt, no reasoning toggle (fast baseline) - ALWAYS included + 2. XC - CoT prompt, no reasoning toggle (prompt-based reasoning) - ONLY if dataset has CoT + 3. 
NR_REASONING - Plain prompt, reasoning toggle ON (model-based reasoning) - ALWAYS included + """ + client = OpenAI(base_url=endpoint, api_key=api_key or "dummy-key") + print(f"Using vLLM model: {model}, endpoint: {endpoint}") + + # Check if dataset has actual CoT content by examining sample questions + has_cot_content = any( + q.cot_content is not None and q.cot_content.strip() for q in questions[:10] + ) + + if has_cot_content: + print(f" Dataset has CoT content - using 3 modes: NR, XC, NR_REASONING") + else: + print( + f" Dataset lacks CoT content - using 2 modes: NR, NR_REASONING (skipping XC)" + ) + + results: List[Dict[str, Any]] = [] + + # Define mode variants based on model type and CoT availability + model_lower = model.lower() + is_deepseek_or_qwen = ( + (("ds" in model_lower) or ("deepseek" in model_lower)) + and ("v31" in model_lower or "v3.1" in model_lower or "v3" in model_lower) + ) or ("qwen3" in model_lower) + + # Base modes (always included) + if is_deepseek_or_qwen: + mode_variants: List[Tuple[str, str, Optional[bool]]] = [ + ("VLLM_NR", "NR", False), # Plain prompt, reasoning OFF (baseline) + ( + "VLLM_NR_REASONING", + "NR", + True, + ), # Plain prompt, reasoning ON (model reasoning) + ] + else: + mode_variants: List[Tuple[str, str, Optional[bool]]] = [ + ("VLLM_NR", "NR", None), # Plain prompt, no toggle (baseline) + ( + "VLLM_NR_REASONING", + "NR", + True, + ), # Plain prompt, reasoning toggle ON (model reasoning) + ] + + # Add XC mode only if dataset has CoT content + if has_cot_content: + if is_deepseek_or_qwen: + mode_variants.insert( + 1, ("VLLM_XC", "XC", False) + ) # Insert between NR and NR_REASONING + else: + mode_variants.insert( + 1, ("VLLM_XC", "XC", None) + ) # Insert between NR and NR_REASONING + + def run_variants(q: Question) -> List[Dict[str, Any]]: + local_records: List[Dict[str, Any]] = [] + for label, prompt_mode, reasoning_flag in mode_variants: + extra_body = build_extra_body_for_model(model, reasoning_flag) + # Debug: print extra_body for first question to verify configuration + if q == questions[0]: + print( + f" {label}: reasoning_flag={reasoning_flag}, extra_body={extra_body}" + ) + rec = process_question_single( + client, + model, + q, + dataset, + prompt_mode, + max_tokens, + temperature, + ar_extra_body=extra_body, + mode_label=label, + ) + local_records.append(rec) + return local_records + + with ThreadPoolExecutor(max_workers=concurrent_requests) as executor: + futures = [executor.submit(run_variants, q) for q in questions] + for future in tqdm( + futures, total=len(futures), desc=f"Evaluating {model} (vLLM modes)" + ): + results.extend(future.result()) + + return pd.DataFrame(results) + + +def analyze_results(results_df: pd.DataFrame) -> Dict[str, Any]: + """Analyze results and compute metrics.""" + valid = results_df[results_df["success"]] + overall_acc = valid["is_correct"].mean() if not valid.empty else 0.0 + + category_metrics: Dict[str, Dict[str, Any]] = {} + for category in valid["category"].unique(): + sub = valid[valid["category"] == category] + category_metrics[category] = { + "accuracy": float(sub["is_correct"].mean()) if not sub.empty else 0.0, + "avg_response_time": ( + float(sub["response_time"].mean()) if not sub.empty else 0.0 + ), + "avg_prompt_tokens": ( + float(sub["prompt_tokens"].dropna().mean()) + if not sub["prompt_tokens"].dropna().empty + else None + ), + "avg_completion_tokens": ( + float(sub["completion_tokens"].dropna().mean()) + if not sub["completion_tokens"].dropna().empty + else None + ), + 
"avg_total_tokens": ( + float(sub["total_tokens"].dropna().mean()) + if not sub["total_tokens"].dropna().empty + else None + ), + } + + avg_latency = valid["response_time"].mean() if not valid.empty else 0.0 + avg_prompt_tokens = ( + valid["prompt_tokens"].dropna().mean() if not valid.empty else None + ) + avg_completion_tokens = ( + valid["completion_tokens"].dropna().mean() if not valid.empty else None + ) + avg_total_tokens = ( + valid["total_tokens"].dropna().mean() if not valid.empty else None + ) + + # Optional: metrics by mode_label + by_mode: Dict[str, Dict[str, Any]] = {} + if "mode_label" in valid.columns: + for label in valid["mode_label"].unique(): + sub = valid[valid["mode_label"] == label] + by_mode[label] = { + "accuracy": float(sub["is_correct"].mean()) if not sub.empty else 0.0, + "avg_response_time": ( + float(sub["response_time"].mean()) if not sub.empty else 0.0 + ), + "avg_prompt_tokens": ( + float(sub["prompt_tokens"].dropna().mean()) + if not sub["prompt_tokens"].dropna().empty + else None + ), + "avg_completion_tokens": ( + float(sub["completion_tokens"].dropna().mean()) + if not sub["completion_tokens"].dropna().empty + else None + ), + "avg_total_tokens": ( + float(sub["total_tokens"].dropna().mean()) + if not sub["total_tokens"].dropna().empty + else None + ), + } + + return { + "overall_accuracy": float(overall_acc), + "category_metrics": category_metrics, + "avg_response_time": float(avg_latency) if avg_latency is not None else 0.0, + "avg_prompt_tokens": ( + float(avg_prompt_tokens) if avg_prompt_tokens is not None else None + ), + "avg_completion_tokens": ( + float(avg_completion_tokens) if avg_completion_tokens is not None else None + ), + "avg_total_tokens": ( + float(avg_total_tokens) if avg_total_tokens is not None else None + ), + "total_questions": int(len(results_df)), + "successful_queries": int(len(valid)), + "failed_queries": int(len(results_df) - len(valid)), + "by_mode": by_mode, + } + + +def save_results( + results_df: pd.DataFrame, + analysis: Dict[str, Any], + model: str, + dataset_name: str, + output_dir: str, +): + """Save results to files.""" + model_name = model.replace("/", "_") + model_dir = os.path.join(output_dir, f"{dataset_name}_{model_name}") + os.makedirs(model_dir, exist_ok=True) + + results_df.to_csv(os.path.join(model_dir, "detailed_results.csv"), index=False) + + with open(os.path.join(model_dir, "summary.json"), "w") as f: + json.dump( + { + "model": model, + "dataset": dataset_name, + **analysis, + }, + f, + indent=2, + ) + + print("\n" + "=" * 50) + print(f"Model: {model} | Dataset: {dataset_name}") + print(f"Overall Accuracy: {analysis['overall_accuracy']:.4f}") + print(f"Total Questions: {analysis['total_questions']}") + print(f"Successful Queries: {analysis['successful_queries']}") + print(f"Failed Queries: {analysis['failed_queries']}") + print( + f"Avg Latency: {analysis['avg_response_time']:.2f}s | Avg Total Tokens: {analysis['avg_total_tokens']}" + ) + print("=" * 50 + "\n") + + if "category_metrics" in analysis: + print("Category Metrics (acc | latency | total_tokens):") + printable = [] + for category, met in analysis["category_metrics"].items(): + printable.append((category, met.get("accuracy", 0.0))) + for category, acc in sorted(printable, key=lambda x: x[1], reverse=True): + m = analysis["category_metrics"][category] + print( + f" {category}: acc={m['accuracy']:.4f}, latency={m['avg_response_time']:.2f}s, tokens={m['avg_total_tokens']}" + ) + print() + + +def main(): + args = parse_args() + + # Handle dataset 
listing + if args.list_datasets: + list_available_datasets() + return + + # Set random seeds + random.seed(args.seed) + np.random.seed(args.seed) + + # Load dataset + print(f"Loading dataset: {args.dataset}") + try: + dataset = DatasetFactory.create_dataset(args.dataset) + questions, dataset_info = dataset.load_dataset( + categories=args.categories, + samples_per_category=args.samples_per_category, + seed=args.seed, + ) + print( + f"Dataset loaded: {len(questions)} questions across {len(dataset_info.categories)} categories" + ) + print(f"Categories: {', '.join(dataset_info.categories)}") + + # Check for empty dataset + if len(questions) == 0: + print(f"โŒ No questions loaded from dataset '{args.dataset}'") + print("This could be due to:") + print(" - Dataset requiring authentication (gated dataset)") + print(" - Network connectivity issues") + print(" - Invalid dataset name or configuration") + print("\nTry a different dataset:") + list_available_datasets() + return + + except Exception as e: + print(f"Error loading dataset '{args.dataset}': {e}") + print("\nAvailable datasets:") + list_available_datasets() + return + + # Resolve endpoints and models + router_endpoint = ( + args.router_endpoint + or os.environ.get("ROUTER_ENDPOINT") + or "http://127.0.0.1:8801/v1" + ) + router_api_key = ( + args.router_api_key + or os.environ.get("ROUTER_API_KEY") + or os.environ.get("OPENAI_API_KEY") + or "1234" + ) + + vllm_endpoint = args.vllm_endpoint or os.environ.get("VLLM_ENDPOINT", "") + vllm_api_key = ( + args.vllm_api_key + or os.environ.get("VLLM_API_KEY") + or os.environ.get("OPENAI_API_KEY") + or "" + ) + + router_models = args.router_models + if router_models and len(router_models) == 1 and "," in router_models[0]: + router_models = router_models[0].split(",") + if not router_models or (len(router_models) == 1 and router_models[0] == "auto"): + print("Fetching available models from router endpoint...") + fetched_models = get_available_models(router_endpoint, router_api_key) + if fetched_models: + router_models = fetched_models + else: + print("No models returned from endpoint, using 'auto' as fallback") + router_models = ["auto"] + + vllm_models = args.vllm_models + if vllm_models and len(vllm_models) == 1 and "," in vllm_models[0]: + vllm_models = vllm_models[0].split(",") + if not vllm_models and vllm_endpoint: + print("Fetching available models from vLLM endpoint...") + vllm_models = get_available_models(vllm_endpoint, vllm_api_key) + + print(f"Router models: {router_models}") + print(f"vLLM models: {vllm_models}") + + # Determine optimal token limit for this dataset + if args.max_tokens: + optimal_tokens = args.max_tokens + print(f"Using user-specified max_tokens: {optimal_tokens}") + else: + optimal_tokens = get_dataset_optimal_tokens(dataset_info) + print( + f"Using dataset-optimal max_tokens: {optimal_tokens} (for {dataset_info.name})" + ) + + # Router evaluation (NR-only) + if args.run_router and router_endpoint and router_models: + for model in router_models: + print(f"\nEvaluating router model: {model}") + rt_df = evaluate_model_router_transparent( + questions=questions, + dataset=dataset, + model=model, + endpoint=router_endpoint, + api_key=router_api_key, + concurrent_requests=args.concurrent_requests, + max_tokens=optimal_tokens, + temperature=args.temperature, + ) + analysis = analyze_results(rt_df) + save_results( + results_df=rt_df, + analysis=analysis, + model=f"router::{model}", + dataset_name=dataset_info.name, + output_dir=args.output_dir, + ) + + # Direct vLLM evaluation 
(NR/XC with reasoning ON/OFF) + if args.run_vllm and vllm_endpoint and vllm_models: + for model in vllm_models: + print(f"\nEvaluating vLLM model: {model}") + vdf = evaluate_model_vllm_multimode( + questions=questions, + dataset=dataset, + model=model, + endpoint=vllm_endpoint, + api_key=vllm_api_key, + concurrent_requests=args.concurrent_requests, + max_tokens=optimal_tokens, + temperature=args.temperature, + exec_modes=args.vllm_exec_modes, + ) + analysis = analyze_results(vdf) + save_results( + results_df=vdf, + analysis=analysis, + model=f"vllm::{model}", + dataset_name=dataset_info.name, + output_dir=args.output_dir, + ) + + +if __name__ == "__main__": + main() From c707e8394b10e310fa962920e5dd7e23a4ca206d Mon Sep 17 00:00:00 2001 From: Huamin Chen Date: Sat, 13 Sep 2025 13:21:25 +0000 Subject: [PATCH 2/4] fix plot issue Signed-off-by: Huamin Chen --- bench/comprehensive_bench.sh | 27 +++++++--- .../router_reason_bench_multi_dataset.py | 54 ++++++++++++++++--- 2 files changed, 65 insertions(+), 16 deletions(-) diff --git a/bench/comprehensive_bench.sh b/bench/comprehensive_bench.sh index bd262798..b2f00703 100755 --- a/bench/comprehensive_bench.sh +++ b/bench/comprehensive_bench.sh @@ -215,10 +215,10 @@ run_dataset_benchmark() { --router-models "$ROUTER_MODEL" \ --output-dir "$OUTPUT_BASE/router_$dataset" \ --seed 42 - + # Extract and save router metrics immediately extract_and_save_metrics "$dataset" "Router" "$OUTPUT_BASE/router_$dataset" - + # vLLM benchmark echo -e "${YELLOW} โšก Running vLLM evaluation...${NC}" python3 -m vllm_semantic_router_bench.router_reason_bench_multi_dataset \ @@ -245,13 +245,24 @@ generate_plots() { for dataset in "${!DATASET_CONFIGS[@]}"; do echo -e "${YELLOW} ๐Ÿ“Š Plotting $dataset results...${NC}" - python3 -m vllm_semantic_router_bench.bench_plot \ - --router-dir "$OUTPUT_BASE/router_$dataset" \ - --vllm-dir "$OUTPUT_BASE/vllm_$dataset" \ - --output-dir "$OUTPUT_BASE/plots_$dataset" \ - --dataset-name "$dataset" + # Find the summary.json files + ROUTER_SUMMARY=$(find "$OUTPUT_BASE/router_$dataset" -name "summary.json" -type f | head -1) + VLLM_SUMMARY=$(find "$OUTPUT_BASE/vllm_$dataset" -name "summary.json" -type f | head -1) + + if [[ -f "$VLLM_SUMMARY" ]]; then + PLOT_CMD="python3 -m vllm_semantic_router_bench.bench_plot --summary \"$VLLM_SUMMARY\" --out-dir \"$OUTPUT_BASE/plots_$dataset\"" + + if [[ -f "$ROUTER_SUMMARY" ]]; then + PLOT_CMD="$PLOT_CMD --router-summary \"$ROUTER_SUMMARY\"" + fi + + echo -e "${BLUE} Running: $PLOT_CMD${NC}" + eval $PLOT_CMD + else + echo -e "${RED} โš ๏ธ No vLLM summary.json found for $dataset, skipping plots${NC}" + fi done - + echo -e "${GREEN} โœ… All plots generated${NC}" echo "" } diff --git a/bench/vllm_semantic_router_bench/router_reason_bench_multi_dataset.py b/bench/vllm_semantic_router_bench/router_reason_bench_multi_dataset.py index 6ad9d746..710f5ae5 100644 --- a/bench/vllm_semantic_router_bench/router_reason_bench_multi_dataset.py +++ b/bench/vllm_semantic_router_bench/router_reason_bench_multi_dataset.py @@ -452,10 +452,31 @@ def evaluate_model_router_transparent( ) ) - for future in tqdm( - futures, total=len(futures), desc=f"Evaluating {model} (Router-Transparent)" - ): - results.append(future.result()) + try: + for future in tqdm( + futures, + total=len(futures), + desc=f"Evaluating {model} (Router-Transparent)", + ): + results.append(future.result()) + except KeyboardInterrupt: + print( + "\nโš ๏ธ Router evaluation interrupted by user. Saving partial results..." 
+ ) + # Cancel remaining futures + for future in futures: + future.cancel() + # Collect results from completed futures + for future in futures: + if future.done() and not future.cancelled(): + try: + results.append(future.result()) + except Exception: + pass # Skip failed results + if not results: + print("โŒ No router results to save.") + raise + print(f"โœ… Saved {len(results)} partial router results.") return pd.DataFrame(results) @@ -558,10 +579,27 @@ def run_variants(q: Question) -> List[Dict[str, Any]]: with ThreadPoolExecutor(max_workers=concurrent_requests) as executor: futures = [executor.submit(run_variants, q) for q in questions] - for future in tqdm( - futures, total=len(futures), desc=f"Evaluating {model} (vLLM modes)" - ): - results.extend(future.result()) + try: + for future in tqdm( + futures, total=len(futures), desc=f"Evaluating {model} (vLLM modes)" + ): + results.extend(future.result()) + except KeyboardInterrupt: + print("\nโš ๏ธ Benchmark interrupted by user. Saving partial results...") + # Cancel remaining futures + for future in futures: + future.cancel() + # Collect results from completed futures + for future in futures: + if future.done() and not future.cancelled(): + try: + results.extend(future.result()) + except Exception: + pass # Skip failed results + if not results: + print("โŒ No results to save.") + raise + print(f"โœ… Saved {len(results)} partial results.") return pd.DataFrame(results) From ea44d10b27988b88878f3a233872ae0dc88969a3 Mon Sep 17 00:00:00 2001 From: Huamin Chen Date: Sat, 13 Sep 2025 21:30:45 +0000 Subject: [PATCH 3/4] larger max_token for reasoning support Signed-off-by: Huamin Chen --- .../router_reason_bench_multi_dataset.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/bench/vllm_semantic_router_bench/router_reason_bench_multi_dataset.py b/bench/vllm_semantic_router_bench/router_reason_bench_multi_dataset.py index 710f5ae5..270fe8ea 100644 --- a/bench/vllm_semantic_router_bench/router_reason_bench_multi_dataset.py +++ b/bench/vllm_semantic_router_bench/router_reason_bench_multi_dataset.py @@ -188,14 +188,14 @@ def get_dataset_optimal_tokens(dataset_info): dataset_name = dataset_info.name.lower() difficulty = dataset_info.difficulty_level.lower() - # Optimized token limits per dataset + # Optimized token limits per dataset (increased for reasoning mode support) dataset_tokens = { - "gpqa": 500, # Graduate-level scientific reasoning - "truthfulqa": 250, # Misconception analysis - "hellaswag": 250, # Natural continuation reasoning - "arc": 220, # Elementary/middle school science - "commonsenseqa": 300, # Common sense reasoning - "mmlu": 150 if difficulty == "undergraduate" else 200, # Academic knowledge + "gpqa": 1500, # Graduate-level scientific reasoning + "truthfulqa": 800, # Misconception analysis + "hellaswag": 800, # Natural continuation reasoning + "arc": 800, # Elementary/middle school science + "commonsenseqa": 1000, # Common sense reasoning + "mmlu": 600 if difficulty == "undergraduate" else 800, # Academic knowledge } # Find matching dataset From 372de01ba66ca18cd57e804bea53f6020b5e4f5c Mon Sep 17 00:00:00 2001 From: Huamin Chen Date: Sun, 14 Sep 2025 01:59:31 +0000 Subject: [PATCH 4/4] use the models on vllm, not hardcoded Signed-off-by: Huamin Chen --- bench/comprehensive_bench.sh | 100 +++++++++++++++++++++++++++++++---- 1 file changed, 89 insertions(+), 11 deletions(-) diff --git a/bench/comprehensive_bench.sh b/bench/comprehensive_bench.sh index b2f00703..5054d537 100755 --- 
a/bench/comprehensive_bench.sh +++ b/bench/comprehensive_bench.sh @@ -6,14 +6,99 @@ set -e -# Configuration +# Default Configuration VENV_PATH="../.venv" ROUTER_ENDPOINT="http://127.0.0.1:8801/v1" VLLM_ENDPOINT="http://127.0.0.1:8000/v1" -VLLM_MODEL="openai/gpt-oss-20b" +VLLM_MODEL="" # Will be auto-detected from endpoint if not specified ROUTER_MODEL="auto" OUTPUT_BASE="results/comprehensive_research_$(date +%Y%m%d_%H%M%S)" +# Parse command line arguments +while [[ $# -gt 0 ]]; do + case $1 in + --vllm-model) + VLLM_MODEL="$2" + shift 2 + ;; + --vllm-endpoint) + VLLM_ENDPOINT="$2" + shift 2 + ;; + --router-endpoint) + ROUTER_ENDPOINT="$2" + shift 2 + ;; + --router-model) + ROUTER_MODEL="$2" + shift 2 + ;; + --output-base) + OUTPUT_BASE="$2" + shift 2 + ;; + --help|-h) + echo "Usage: $0 [OPTIONS]" + echo "Options:" + echo " --vllm-model MODEL Specify vLLM model (auto-detected if not provided)" + echo " --vllm-endpoint URL vLLM endpoint URL (default: http://127.0.0.1:8000/v1)" + echo " --router-endpoint URL Router endpoint URL (default: http://127.0.0.1:8801/v1)" + echo " --router-model MODEL Router model (default: auto)" + echo " --output-base DIR Output directory base (default: results/comprehensive_research_TIMESTAMP)" + echo " --help, -h Show this help message" + exit 0 + ;; + *) + echo "Unknown option: $1" + echo "Use --help for usage information" + exit 1 + ;; + esac +done + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +BLUE='\033[0;34m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +# Auto-detect vLLM model if not specified +if [[ -z "$VLLM_MODEL" ]]; then + echo -e "${BLUE}๐Ÿ” Auto-detecting vLLM model from endpoint...${NC}" + + # Try to fetch models from the vLLM endpoint + VLLM_MODELS_JSON=$(curl -s "$VLLM_ENDPOINT/models" 2>/dev/null || echo "") + + if [[ -n "$VLLM_MODELS_JSON" ]]; then + # Extract the first model ID from the JSON response + VLLM_MODEL=$(echo "$VLLM_MODELS_JSON" | python3 -c " +import json +import sys +try: + data = json.load(sys.stdin) + if 'data' in data and len(data['data']) > 0: + print(data['data'][0]['id']) + else: + print('') +except: + print('') +" 2>/dev/null) + + if [[ -n "$VLLM_MODEL" ]]; then + echo -e "${GREEN}โœ… Auto-detected vLLM model: $VLLM_MODEL${NC}" + else + echo -e "${RED}โŒ Failed to parse models from endpoint response${NC}" + echo -e "${YELLOW}โš ๏ธ Using fallback model: openai/gpt-oss-20b${NC}" + VLLM_MODEL="openai/gpt-oss-20b" + fi + else + echo -e "${RED}โŒ Failed to fetch models from vLLM endpoint: $VLLM_ENDPOINT${NC}" + echo -e "${YELLOW}โš ๏ธ Using fallback model: openai/gpt-oss-20b${NC}" + VLLM_MODEL="openai/gpt-oss-20b" + fi +fi + # Single persistent CSV file for all research results PERSISTENT_RESEARCH_CSV="results/research_results_master.csv" @@ -28,13 +113,6 @@ declare -A DATASET_CONFIGS=( ["hellaswag"]=8 # ~50 activities ร— 8 = ~400 samples ) -# Colors for output -RED='\033[0;31m' -GREEN='\033[0;32m' -BLUE='\033[0;34m' -YELLOW='\033[1;33m' -NC='\033[0m' # No Color - echo -e "${BLUE}๐Ÿ”ฌ COMPREHENSIVE MULTI-DATASET BENCHMARK FOR RESEARCH${NC}" echo -e "${BLUE}====================================================${NC}" echo "" @@ -142,9 +220,9 @@ try: # Determine model name if '$mode' == 'router': - model_name = 'auto' + model_name = '$ROUTER_MODEL' else: - model_name = 'openai/gpt-oss-20b' + model_name = '$VLLM_MODEL' # For vLLM, we might have multiple modes (NR, NR_REASONING) if '$mode' == 'vllm' and 'mode' in df.columns: