Merged
Changes from 9 commits
164 changes: 164 additions & 0 deletions config/config-mcp-classifier-example.yaml
@@ -0,0 +1,164 @@
# Example Configuration for MCP-Based Category Classifier (HTTP Transport)
#
# This configuration demonstrates how to use an external MCP (Model Context Protocol)
# service via HTTP for category classification instead of the built-in Candle/ModernBERT models.
#
# Use cases:
# - Offload classification to a remote HTTP service
# - Use custom classification models not supported in-tree
# - Scale classification independently from the router
# - Integrate with existing ML infrastructure via REST API
#
# Note: This example uses HTTP transport. The MCP server should expose an HTTP endpoint
# that implements the MCP protocol (e.g., http://localhost:8080/mcp)

# BERT model for semantic caching and tool selection
bert_model:
model_id: "sentence-transformers/all-MiniLM-L6-v2"
threshold: 0.85
use_cpu: true

# Classifier configuration
classifier:
  # Disable in-tree category classifier (leave model_id empty)
  category_model:
    model_id: ""  # Empty = disabled

  # Enable MCP-based category classifier (HTTP transport only)
  mcp_category_model:
    enabled: true                     # Enable MCP classifier
    transport_type: "http"            # HTTP transport
    url: "http://localhost:8090/mcp"  # MCP server endpoint

    tool_name: "classify_text"        # MCP tool name to call
> **Member:** why should we specify the tool_name in client?
>
> **Collaborator (Author):** good point! the tool name is replaced by auto tool discovery now.

    threshold: 0.6                    # Confidence threshold
    timeout_seconds: 30               # Request timeout

# Categories for routing queries
#
# Categories are automatically loaded from the MCP server via the 'list_categories' tool.
# The MCP server controls BOTH classification AND routing decisions.
#
# How it works:
# 1. Router connects to MCP server at startup
# 2. Calls 'list_categories' tool: MCP returns {"categories": ["business", "law", ...]}
# 3. For each request, calls 'classify_text' tool which returns:
# {
# "class": 3,
# "confidence": 0.85,
# "model": "openai/gpt-oss-20b", # MCP decides which model to use
# "use_reasoning": true # MCP decides whether to use reasoning
# }
# 4. Router uses the model and reasoning settings from MCP response
#
# BENEFITS:
# - MCP server makes intelligent routing decisions per query
# - No hardcoded routing rules needed in config
# - MCP can adapt routing based on query complexity, content, etc.
# - Centralized routing logic in MCP server
#
# FALLBACK:
#   - If the MCP response omits model/use_reasoning, the router falls back to default_model below
#   - Category-specific overrides can also be added here if needed
#
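# A hypothetical sketch of such a category-specific override, kept commented
# out; the field names follow the in-tree classifier's category schema and
# may differ in your version:
#
#   categories:
#     - name: math
#       model_scores:
#         - model: openai/gpt-oss-20b
#           score: 1.0
#           use_reasoning: false
#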
categories: []

# Default model to use when category can't be determined
default_model: openai/gpt-oss-20b

# vLLM endpoints configuration
vllm_endpoints:
  - name: endpoint1
    address: 127.0.0.1
    port: 8000
    models:
      - openai/gpt-oss-20b
    weight: 1
    health_check_path: /health

# Model-specific configuration
model_config:
  openai/gpt-oss-20b:
    reasoning_family: gpt-oss
    preferred_endpoints:
      - endpoint1
    pii_policy:
      allow_by_default: true

# Reasoning family configurations
reasoning_families:
  deepseek:
    type: chat_template_kwargs
    parameter: thinking
  qwen3:
    type: chat_template_kwargs
    parameter: enable_thinking
  gpt-oss:
    type: reasoning_effort
    parameter: reasoning_effort
  gpt:
    type: reasoning_effort
    parameter: reasoning_effort

# Default reasoning effort level
default_reasoning_effort: high

# Tools configuration (optional)
tools:
  enabled: false
  top_k: 5
  similarity_threshold: 0.7
  tools_db_path: "config/tools_db.json"
  fallback_to_empty: true

# API configuration
api:
  batch_classification:
    max_batch_size: 100
    concurrency_threshold: 5
    max_concurrency: 8
    metrics:
      enabled: true
      detailed_goroutine_tracking: true
      high_resolution_timing: false
      sample_rate: 1.0
      duration_buckets:
        - 0.001
        - 0.005
        - 0.01
        - 0.025
        - 0.05
        - 0.1
        - 0.25
        - 0.5
        - 1
        - 2.5
        - 5
        - 10
        - 30
      size_buckets:
        - 1
        - 2
        - 5
        - 10
        - 20
        - 50
        - 100
        - 200

# Observability configuration
observability:
  tracing:
    enabled: false
    provider: "opentelemetry"
    exporter:
      type: "otlp"
      endpoint: "localhost:4317"
      insecure: true
    sampling:
      type: "always_on"
    resource:
      service_name: "semantic-router"
      service_version: "1.0.0"
      deployment_environment: "production"

108 changes: 108 additions & 0 deletions examples/mcp-classifier-server/README.md
@@ -0,0 +1,108 @@
# MCP Classification Server

Example MCP server that provides text classification with intelligent routing for the semantic router.

## Features

- **Dynamic Categories**: Loaded from MCP server at runtime via `list_categories`
- **Intelligent Routing**: Returns `model` and `use_reasoning` in classification response
- **Regex-Based**: Simple pattern matching (replace with ML models for production)
- **Dual Transport**: Supports both HTTP and stdio

## Categories

| Index | Category | Example Keywords |
|-------|----------|------------------|
| 0 | math | calculate, equation, formula, integral |
| 1 | science | physics, chemistry, biology, atom, DNA |
| 2 | technology | computer, programming, AI, cloud |
| 3 | history | ancient, war, empire, civilization |
| 4 | general | Catch-all for other queries |
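
A minimal sketch of the regex matching behind this table; the pattern lists here are illustrative stand-ins for whatever the shipped `server.py` defines:

```python
import re

# Hypothetical pattern table mirroring the keywords above
PATTERNS = {
    "math":       [r"\b(calculate|equation|formula|integral)\b"],
    "science":    [r"\b(physics|chemistry|biology|atom|dna)\b"],
    "technology": [r"\b(computer|programming|ai|cloud)\b"],
    "history":    [r"\b(ancient|war|empire|civilization)\b"],
}
ORDER = ["math", "science", "technology", "history", "general"]

def classify(text: str) -> tuple[int, float]:
    """Return (class_index, confidence) by counting pattern hits."""
    hits = {name: sum(bool(re.search(p, text, re.IGNORECASE)) for p in pats)
            for name, pats in PATTERNS.items()}
    best = max(hits, key=hits.get)
    if hits[best] == 0:
        return ORDER.index("general"), 0.5  # no match -> catch-all
    return ORDER.index(best), min(0.95, 0.6 + 0.1 * hits[best])
```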

## Quick Start

```bash
# Install dependencies
pip install -r requirements.txt

# HTTP mode (for semantic router)
python server.py --http --port 8090

# Stdio mode (for MCP clients)
python server.py
```

**Test the server:**

```bash
curl http://localhost:8090/health
# → {"status": "ok", "categories": ["math", "science", "technology", "history", "general"]}
```

## Configuration

**Router config (`config-mcp-classifier-example.yaml`):**

```yaml
classifier:
  category_model:
    model_id: ""  # Empty = use MCP

  mcp_category_model:
    enabled: true
    transport_type: "http"
    url: "http://localhost:8090/mcp"
    tool_name: "classify_text"
    threshold: 0.6
    timeout_seconds: 30

categories: [] # Loaded dynamically from MCP
default_model: openai/gpt-oss-20b
```

## How It Works

**Intelligent Routing Rules:**

- Long query (>20 words) + complex words (`why`, `how`, `explain`) → `use_reasoning: true`
- Math + short query → `use_reasoning: false`
- High confidence (>0.9) → `use_reasoning: false`
- Low confidence (<0.6) → `use_reasoning: true`
- Default → `use_reasoning: true`
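
These rules might be encoded roughly as follows; this is a sketch, and the shipped `server.py` may implement them differently:

```python
def decide_routing(text: str, category: str, confidence: float):
    """Map the rules above to a (model, use_reasoning) pair."""
    words = text.lower().split()
    complex_markers = {"why", "how", "explain"}
    if len(words) > 20 and complex_markers & set(words):
        return "openai/gpt-oss-20b", True   # long + complex query
    if category == "math" and len(words) <= 20:
        return "openai/gpt-oss-20b", False  # short math query
    if confidence > 0.9:
        return "openai/gpt-oss-20b", False  # high confidence
    if confidence < 0.6:
        return "openai/gpt-oss-20b", True   # low confidence
    return "openai/gpt-oss-20b", True       # default
```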

**Response Format:**

```json
{
"class": 1,
"confidence": 0.85,
"model": "openai/gpt-oss-20b",
"use_reasoning": true
}
```
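
A tool handler might assemble this payload as sketched below, combining the `classify` and `decide_routing` sketches above; registration with the `mcp` SDK is omitted, and the handler name is illustrative:

```python
import json

def handle_classify_text(text: str) -> str:
    """Build the JSON payload returned by the classify_text tool."""
    class_idx, confidence = classify(text)  # regex classifier sketch above
    model, use_reasoning = decide_routing(text, ORDER[class_idx], confidence)
    return json.dumps({
        "class": class_idx,
        "confidence": confidence,
        "model": model,
        "use_reasoning": use_reasoning,
    })
```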

## Customization

Edit `CATEGORIES` to add categories:

```python
CATEGORIES = {
"your_category": {
"patterns": [r"\b(keyword1|keyword2)\b"],
"description": "Your description"
}
}
```

Edit `decide_routing()` for custom routing logic:

```python
def decide_routing(text, category, confidence):
    if category == "math":
        return "deepseek/deepseek-math", False
    return "openai/gpt-oss-20b", True
```

## License

MIT
2 changes: 2 additions & 0 deletions examples/mcp-classifier-server/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
mcp>=1.0.0
aiohttp>=3.9.0