
Commit c4a7046

Merge branch 'feat/reduce_ci_duration' of https://github.com/Aias00/semantic-router into feat/reduce_ci_duration

2 parents: 8e61565 + d3d91a1

31 files changed: +6302 −100 lines

.github/workflows/k8s-integration-test.yml

Lines changed: 616 additions & 0 deletions
Large diffs are not rendered by default.

README.md

Lines changed: 13 additions & 0 deletions

@@ -62,6 +62,18 @@
 Cache the semantic representation of the prompt so as to reduce the number of prompt tokens and improve the overall inference latency.
+
+### Distributed Tracing 🔍
+
+Comprehensive observability with OpenTelemetry distributed tracing provides fine-grained visibility into the request processing pipeline:
+
+- **Request Flow Tracing**: Track requests through classification, security checks, caching, and routing
+- **Performance Analysis**: Identify bottlenecks with detailed timing for each operation
+- **Security Monitoring**: Trace PII detection and jailbreak prevention operations
+- **Routing Decisions**: Understand why specific models were selected
+- **OpenTelemetry Standard**: Industry-standard tracing with support for Jaeger, Tempo, and other OTLP backends
+
+See [Distributed Tracing Guide](https://vllm-semantic-router.com/docs/tutorials/observability/distributed-tracing/) for complete setup instructions.

 ## Documentation 📖

 For comprehensive documentation including detailed setup instructions, architecture guides, and API references, visit:

@@ -74,6 +86,7 @@ The documentation includes:
 - **[System Architecture](https://vllm-semantic-router.com/docs/overview/architecture/system-architecture/)** - Technical deep dive
 - **[Model Training](https://vllm-semantic-router.com/docs/training/training-overview/)** - How classification models work
 - **[API Reference](https://vllm-semantic-router.com/docs/api/router/)** - Complete API documentation
+- **[Distributed Tracing](https://vllm-semantic-router.com/docs/tutorials/observability/distributed-tracing/)** - Observability and debugging guide

 ## Community 👋
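Reviewer note: to make the request-flow bullets above concrete, here is a minimal Go sketch of per-stage spans using the OpenTelemetry SDK. The span names, attribute keys, and the handleRequest function are illustrative assumptions, not the router's actual instrumentation; see the linked guide for the real setup.

package tracing

import (
	"context"

	"go.opentelemetry.io/otel"
	"go.opentelemetry.io/otel/attribute"
)

// handleRequest sketches one way to trace the pipeline stages the README
// lists: classification, security checks, caching, and routing. All span
// names and attributes here are hypothetical.
func handleRequest(ctx context.Context, prompt string) {
	tracer := otel.Tracer("semantic-router")

	// Root span covering the whole request.
	ctx, span := tracer.Start(ctx, "route_request")
	defer span.End()

	// One child span per stage gives the per-operation timing that the
	// "Performance Analysis" bullet describes.
	_, classSpan := tracer.Start(ctx, "classify_prompt")
	classSpan.SetAttributes(attribute.String("category", "math")) // classifier output
	classSpan.End()

	// Recording the routing decision as attributes answers the
	// "Routing Decisions" bullet: why was this model selected?
	_, routeSpan := tracer.Start(ctx, "select_model")
	routeSpan.SetAttributes(
		attribute.String("selected_model", "openai/gpt-oss-20b"),
		attribute.Float64("category_score", 1.0),
	)
	routeSpan.End()
}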

config/config.development.yaml

Lines changed: 97 additions & 0 deletions

# Development Configuration Example with Stdout Tracing
# This configuration enables distributed tracing with stdout exporter
# for local development and debugging.

bert_model:
  model_id: sentence-transformers/all-MiniLM-L12-v2
  threshold: 0.6
  use_cpu: true

semantic_cache:
  enabled: true
  backend_type: "memory"
  similarity_threshold: 0.8
  max_entries: 100
  ttl_seconds: 600
  eviction_policy: "fifo"

tools:
  enabled: false
  top_k: 3
  similarity_threshold: 0.2
  tools_db_path: "config/tools_db.json"
  fallback_to_empty: true

prompt_guard:
  enabled: false

vllm_endpoints:
  - name: "local-endpoint"
    address: "127.0.0.1"
    port: 8000
    models:
      - "test-model"
    weight: 1

model_config:
  "test-model":
    pii_policy:
      allow_by_default: true

classifier:
  category_model:
    model_id: "models/category_classifier_modernbert-base_model"
    use_modernbert: true
    threshold: 0.6
    use_cpu: true
    category_mapping_path: "models/category_classifier_modernbert-base_model/category_mapping.json"

categories:
  - name: test
    system_prompt: "You are a test assistant."
    model_scores:
      - model: test-model
        score: 1.0
        use_reasoning: false

default_model: test-model

api:
  batch_classification:
    max_batch_size: 10
    metrics:
      enabled: true

# Observability Configuration - Development with Stdout
observability:
  tracing:
    # Enable tracing for development/debugging
    enabled: true

    # OpenTelemetry provider
    provider: "opentelemetry"

    exporter:
      # Stdout exporter prints traces to console (great for debugging)
      type: "stdout"

      # No endpoint needed for stdout
      # endpoint: ""
      # insecure: true

    sampling:
      # Always sample in development to see all traces
      type: "always_on"

      # Rate not used for always_on
      # rate: 1.0

    resource:
      # Service name for trace identification
      service_name: "vllm-semantic-router-dev"

      # Version for development
      service_version: "dev"

      # Environment identifier
      deployment_environment: "development"
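Reviewer note: a minimal sketch of how this stdout/always-on configuration could translate to the OpenTelemetry Go SDK. The initDevTracing helper is an assumption for illustration, not part of this commit.

package tracing

import (
	"go.opentelemetry.io/otel"
	"go.opentelemetry.io/otel/exporters/stdout/stdouttrace"
	sdktrace "go.opentelemetry.io/otel/sdk/trace"
)

// initDevTracing mirrors the development config above: a stdout exporter
// with always-on sampling, so every trace is printed to the console.
func initDevTracing() (*sdktrace.TracerProvider, error) {
	// Pretty-printed spans on stdout, matching exporter.type: "stdout".
	exp, err := stdouttrace.New(stdouttrace.WithPrettyPrint())
	if err != nil {
		return nil, err
	}

	tp := sdktrace.NewTracerProvider(
		sdktrace.WithBatcher(exp),
		// sampling.type: "always_on" — record every request in development.
		sdktrace.WithSampler(sdktrace.AlwaysSample()),
	)
	otel.SetTracerProvider(tp)
	return tp, nil
}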

config/config.e2e.yaml

Lines changed: 4 additions & 4 deletions

@@ -61,14 +61,14 @@ model_config:
     reasoning_family: "qwen3" # This model uses Qwen reasoning syntax
     preferred_endpoints: ["qwen-endpoint"]
     pii_policy:
-      allow_by_default: true
-      pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"]
+      allow_by_default: false # Strict PII blocking model
+      pii_types_allowed: ["EMAIL_ADDRESS"] # Only allow emails
   "Model-B":
     use_reasoning: false
     preferred_endpoints: ["tinyllama-endpoint"]
     pii_policy:
-      allow_by_default: true
-      pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"]
+      allow_by_default: true # Permissive PII model for safe routing
+      pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER", "US_SSN", "CREDIT_CARD"]

 # Classifier configuration for text classification
 classifier:
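Reviewer note: the change makes the two models diverge — Model-A now blocks PII by default and only lets emails through, while Model-B stays permissive. One plausible reading of how allow_by_default and pii_types_allowed combine, sketched in Go; the Policy type and Allowed method are hypothetical, not the router's implementation.

package pii

// Policy mirrors the pii_policy block in model_config. Field names follow
// the YAML keys; this checker is a sketch under assumed semantics.
type Policy struct {
	AllowByDefault  bool     `yaml:"allow_by_default"`
	PIITypesAllowed []string `yaml:"pii_types_allowed"`
}

// Allowed reports whether a detected PII type may be routed to the model.
// With allow_by_default: false, only explicitly listed types pass — the
// strict Model-A behavior above.
func (p Policy) Allowed(piiType string) bool {
	if p.AllowByDefault {
		return true
	}
	for _, t := range p.PIITypesAllowed {
		if t == piiType {
			return true
		}
	}
	return false
}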

config/config.production.yaml

Lines changed: 132 additions & 0 deletions

# Production Configuration Example with OTLP Tracing
# This configuration enables distributed tracing with OpenTelemetry OTLP exporter
# for production deployment with Jaeger or other OTLP-compatible backends.

bert_model:
  model_id: sentence-transformers/all-MiniLM-L12-v2
  threshold: 0.6
  use_cpu: true

semantic_cache:
  enabled: true
  backend_type: "memory"
  similarity_threshold: 0.8
  max_entries: 1000
  ttl_seconds: 3600
  eviction_policy: "fifo"

tools:
  enabled: true
  top_k: 3
  similarity_threshold: 0.2
  tools_db_path: "config/tools_db.json"
  fallback_to_empty: true

prompt_guard:
  enabled: true
  use_modernbert: true
  model_id: "models/jailbreak_classifier_modernbert-base_model"
  threshold: 0.7
  use_cpu: true
  jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json"

vllm_endpoints:
  - name: "endpoint1"
    address: "127.0.0.1"
    port: 8000
    models:
      - "openai/gpt-oss-20b"
    weight: 1

model_config:
  "openai/gpt-oss-20b":
    reasoning_family: "gpt-oss"
    preferred_endpoints: ["endpoint1"]
    pii_policy:
      allow_by_default: true

classifier:
  category_model:
    model_id: "models/category_classifier_modernbert-base_model"
    use_modernbert: true
    threshold: 0.6
    use_cpu: true
    category_mapping_path: "models/category_classifier_modernbert-base_model/category_mapping.json"
  pii_model:
    model_id: "models/pii_classifier_modernbert-base_presidio_token_model"
    use_modernbert: true
    threshold: 0.7
    use_cpu: true
    pii_mapping_path: "models/pii_classifier_modernbert-base_presidio_token_model/pii_type_mapping.json"

categories:
  - name: math
    system_prompt: "You are a mathematics expert. Provide step-by-step solutions."
    model_scores:
      - model: openai/gpt-oss-20b
        score: 1.0
        use_reasoning: true
  - name: other
    system_prompt: "You are a helpful assistant."
    model_scores:
      - model: openai/gpt-oss-20b
        score: 0.7
        use_reasoning: false

default_model: openai/gpt-oss-20b

reasoning_families:
  gpt-oss:
    type: "reasoning_effort"
    parameter: "reasoning_effort"

default_reasoning_effort: high

api:
  batch_classification:
    max_batch_size: 100
    concurrency_threshold: 5
    max_concurrency: 8
    metrics:
      enabled: true

# Observability Configuration - Production with OTLP
observability:
  tracing:
    # Enable distributed tracing for production monitoring
    enabled: true

    # OpenTelemetry provider (standard implementation)
    provider: "opentelemetry"

    exporter:
      # OTLP exporter for Jaeger, Tempo, or other OTLP backends
      type: "otlp"

      # Jaeger OTLP endpoint (default: 4317 for gRPC)
      # For Jaeger: localhost:4317
      # For Grafana Tempo: tempo:4317
      # For Datadog: trace-agent:4317
      endpoint: "jaeger:4317"

      # Use insecure connection (set to false in production with TLS)
      insecure: true

    sampling:
      # Probabilistic sampling for production (reduces overhead)
      type: "probabilistic"

      # Sample 10% of requests (adjust based on traffic volume)
      # Higher rates (0.5-1.0) for low traffic
      # Lower rates (0.01-0.1) for high traffic
      rate: 0.1

    resource:
      # Service name for trace identification
      service_name: "vllm-semantic-router"

      # Version for tracking deployments
      service_version: "v0.1.0"

      # Environment identifier
      deployment_environment: "production"
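Reviewer note: for comparison with the stdout sketch above, here is how this OTLP/probabilistic configuration could map onto the OpenTelemetry Go SDK. The initProdTracing helper is an illustrative assumption, not the router's code.

package tracing

import (
	"context"

	"go.opentelemetry.io/otel"
	"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
	"go.opentelemetry.io/otel/sdk/resource"
	sdktrace "go.opentelemetry.io/otel/sdk/trace"
	semconv "go.opentelemetry.io/otel/semconv/v1.17.0"
)

// initProdTracing mirrors the production config above: OTLP over gRPC to
// Jaeger with 10% probabilistic sampling and service metadata on every span.
func initProdTracing(ctx context.Context) (*sdktrace.TracerProvider, error) {
	// exporter.type: "otlp", endpoint: "jaeger:4317", insecure: true.
	exp, err := otlptracegrpc.New(ctx,
		otlptracegrpc.WithEndpoint("jaeger:4317"),
		otlptracegrpc.WithInsecure(), // use TLS credentials in real deployments
	)
	if err != nil {
		return nil, err
	}

	// resource.* keys map onto standard OTel semantic conventions.
	res := resource.NewWithAttributes(semconv.SchemaURL,
		semconv.ServiceNameKey.String("vllm-semantic-router"),
		semconv.ServiceVersionKey.String("v0.1.0"),
		semconv.DeploymentEnvironmentKey.String("production"),
	)

	tp := sdktrace.NewTracerProvider(
		sdktrace.WithBatcher(exp),
		// sampling.type: "probabilistic", rate: 0.1 — ParentBased keeps
		// child spans consistent with the root's sampling decision.
		sdktrace.WithSampler(sdktrace.ParentBased(sdktrace.TraceIDRatioBased(0.1))),
		sdktrace.WithResource(res),
	)
	otel.SetTracerProvider(tp)
	return tp, nil
}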

config/config.yaml

Lines changed: 18 additions & 0 deletions

@@ -182,3 +182,21 @@ api:
       sample_rate: 1.0
       duration_buckets: [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30]
       size_buckets: [1, 2, 5, 10, 20, 50, 100, 200]
+
+# Observability Configuration
+observability:
+  tracing:
+    enabled: false # Enable distributed tracing (default: false)
+    provider: "opentelemetry" # Provider: opentelemetry, openinference, openllmetry
+    exporter:
+      type: "stdout" # Exporter: otlp, jaeger, zipkin, stdout
+      endpoint: "localhost:4317" # OTLP endpoint (when type: otlp)
+      insecure: true # Use insecure connection (no TLS)
+    sampling:
+      type: "always_on" # Sampling: always_on, always_off, probabilistic
+      rate: 1.0 # Sampling rate for probabilistic (0.0-1.0)
+    resource:
+      service_name: "vllm-semantic-router"
+      service_version: "v0.1.0"
+      deployment_environment: "development"
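Reviewer note: the sampling.type values documented above map naturally onto the OpenTelemetry SDK's built-in samplers. A hypothetical Go helper sketching that mapping; the router's actual config loader may differ.

package tracing

import (
	sdktrace "go.opentelemetry.io/otel/sdk/trace"
)

// samplerFromConfig maps the sampling.type / sampling.rate fields above to
// OpenTelemetry SDK samplers.
func samplerFromConfig(samplingType string, rate float64) sdktrace.Sampler {
	switch samplingType {
	case "always_off":
		return sdktrace.NeverSample()
	case "probabilistic":
		// The SDK clamps the fraction: values <= 0 never sample,
		// values >= 1 always sample.
		return sdktrace.TraceIDRatioBased(rate)
	default: // "always_on"
		return sdktrace.AlwaysSample()
	}
}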

deploy/docker-compose.tracing.yaml

Lines changed: 55 additions & 0 deletions

version: '3.8'

services:
  # Jaeger all-in-one for distributed tracing
  jaeger:
    image: jaegertracing/all-in-one:latest
    container_name: jaeger
    ports:
      - "4317:4317" # OTLP gRPC
      - "4318:4318" # OTLP HTTP
      - "16686:16686" # Jaeger UI
      - "14268:14268" # Jaeger collector
    environment:
      - COLLECTOR_OTLP_ENABLED=true
    networks:
      - router-network

  # Semantic Router with tracing enabled
  semantic-router:
    image: vllm-semantic-router:latest
    container_name: semantic-router
    depends_on:
      - jaeger
    ports:
      - "50051:50051" # gRPC ExtProc
      - "8080:8080" # Classification API
      - "9190:9190" # Metrics
    volumes:
      - ./config:/config
    environment:
      - CONFIG_PATH=/config/config.tracing.yaml
    networks:
      - router-network

  # Grafana for visualization
  grafana:
    image: grafana/grafana:latest
    container_name: grafana
    ports:
      - "3000:3000"
    environment:
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
    volumes:
      - ./grafana/provisioning:/etc/grafana/provisioning
      - grafana-storage:/var/lib/grafana
    networks:
      - router-network

networks:
  router-network:
    driver: bridge

volumes:
  grafana-storage:
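Reviewer note: assuming the vllm-semantic-router:latest image has been built locally, running `docker compose -f deploy/docker-compose.tracing.yaml up -d` should bring up the stack; traces then appear in the Jaeger UI at http://localhost:16686 and dashboards in Grafana at http://localhost:3000. Note that CONFIG_PATH points at /config/config.tracing.yaml, which is expected to exist under ./config but is not added by this commit.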
