3 changes: 2 additions & 1 deletion .github/workflows/test-and-build.yml
@@ -110,7 +110,7 @@ jobs:
        docker ps --filter "name=milvus-semantic-cache"

    - name: Run semantic router tests
-     run: make test
+     run: make test --debug=v
      env:
        CI: true
        CI_MINIMAL_MODELS: ${{ github.event_name == 'pull_request' }}
@@ -126,6 +126,7 @@
        docker stop milvus-semantic-cache || true
        docker rm milvus-semantic-cache || true
        echo "Milvus container cleaned up"
+       SKIP_TOOL_CALL_TESTS: true

- name: Upload test artifacts on failure
if: failure()
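The new SKIP_TOOL_CALL_TESTS variable presumably rides along with the other environment variables of the test step rather than inside the cleanup script; a minimal sketch under that assumption (the exact placement is not recoverable from this diff):

    - name: Run semantic router tests
      run: make test --debug=v
      env:
        CI: true
        CI_MINIMAL_MODELS: ${{ github.event_name == 'pull_request' }}
        SKIP_TOOL_CALL_TESTS: true  # assumed placement; skips tool-call tests in CI
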
108 changes: 108 additions & 0 deletions config/config.development.yaml
@@ -0,0 +1,108 @@
# Development Configuration Example with Stdout Tracing
# This configuration enables distributed tracing with stdout exporter
# for local development and debugging.

bert_model:
  model_id: models/all-MiniLM-L12-v2
  threshold: 0.6
  use_cpu: true

semantic_cache:
  enabled: true
  backend_type: "memory"
  similarity_threshold: 0.8
  max_entries: 100
  ttl_seconds: 600
  eviction_policy: "fifo"
  use_hnsw: true # Enable HNSW for faster search
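  # HNSW tuning (standard meanings): hnsw_m is the maximum number of neighbors
  # per graph node; hnsw_ef_construction is the candidate-list size used while
  # building the index. Higher values trade build time and memory for recall.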
  hnsw_m: 16
  hnsw_ef_construction: 200

tools:
  enabled: false
  top_k: 3
  similarity_threshold: 0.2
  tools_db_path: "config/tools_db.json"
  fallback_to_empty: true

prompt_guard:
  enabled: false

vllm_endpoints:
  - name: "local-endpoint"
    address: "127.0.0.1"
    port: 8000
    weight: 1

model_config:
  "test-model":
    pii_policy:
      allow_by_default: true

classifier:
  category_model:
    model_id: "models/category_classifier_modernbert-base_model"
    use_modernbert: true
    threshold: 0.6
    use_cpu: true
    category_mapping_path: "models/category_classifier_modernbert-base_model/category_mapping.json"

categories:
  - name: test
    system_prompt: "You are a test assistant."
    # Example: Category-level cache settings
    # semantic_cache_enabled: true
    # semantic_cache_similarity_threshold: 0.85
    model_scores:
      - model: test-model
        score: 1.0
        use_reasoning: false

default_model: test-model

# Enable OpenAI Responses API adapter (experimental)
enable_responses_adapter: true
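# (Assumption, not confirmed by this PR: the adapter presumably accepts
# OpenAI Responses API requests at /v1/responses and maps them onto the
# existing chat-completions path.)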

# Auto model name for automatic model selection (optional)
# Uncomment and set to customize the model name for automatic routing
# auto_model_name: "MoM"

api:
  batch_classification:
    max_batch_size: 10
    metrics:
      enabled: true

# Observability Configuration - Development with Stdout
observability:
  tracing:
    # Enable tracing for development/debugging
    enabled: true

    # OpenTelemetry provider
    provider: "opentelemetry"

    exporter:
      # Stdout exporter prints traces to console (great for debugging)
      type: "stdout"

      # No endpoint needed for stdout
      # endpoint: ""
      # insecure: true

    sampling:
      # Always sample in development to see all traces
      type: "always_on"

      # Rate not used for always_on
      # rate: 1.0

    resource:
      # Service name for trace identification
      service_name: "vllm-semantic-router-dev"

      # Version for development
      service_version: "dev"

      # Environment identifier
      deployment_environment: "development"
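
For non-development environments, a minimal sketch of the same exporter block pointed at a collector instead of stdout — assuming an "otlp" exporter type alongside "stdout" and the endpoint/insecure fields hinted at in the comments above (both assumptions, not confirmed by this PR):

observability:
  tracing:
    exporter:
      type: "otlp"                # assumed alternative to "stdout"
      endpoint: "localhost:4317"  # hypothetical collector address (default OTLP gRPC port)
      insecure: true              # plaintext transport, for local collectors only
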
5 changes: 4 additions & 1 deletion config/config.yaml
@@ -24,7 +24,7 @@ semantic_cache:
  # Options: "bert" (fast, 384-dim), "qwen3" (high quality, 1024-dim, 32K context), "gemma" (balanced, 768-dim, 8K context)
  # Default: "bert" (fastest, lowest memory)
  embedding_model: "bert"

tools:
  enabled: true
  top_k: 3
@@ -480,6 +480,9 @@ reasoning_families:
# Global default reasoning effort level
default_reasoning_effort: high

+# Enable OpenAI Responses API adapter (experimental)
+enable_responses_adapter: true
+
# API Configuration
api:
  batch_classification:
Empty file removed dashboard/backend/.gitkeep