13 changes: 13 additions & 0 deletions README.md
@@ -62,6 +62,18 @@ Detect if the prompt is a jailbreak prompt, avoiding sending jailbreak prompts t

Cache the semantic representation of prompts to reduce prompt token usage and improve overall inference latency.

### Distributed Tracing 🔍

Distributed tracing with OpenTelemetry provides fine-grained visibility into every stage of the request processing pipeline:

- **Request Flow Tracing**: Track requests through classification, security checks, caching, and routing
- **Performance Analysis**: Identify bottlenecks with detailed timing for each operation
- **Security Monitoring**: Trace PII detection and jailbreak prevention operations
- **Routing Decisions**: Understand why specific models were selected
- **OpenTelemetry Standard**: Industry-standard tracing with support for Jaeger, Tempo, and other OTLP backends

See [Distributed Tracing Guide](https://vllm-semantic-router.com/docs/tutorials/observability/distributed-tracing/) for complete setup instructions.
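
Tracing is driven entirely by configuration. The sketch below mirrors the defaults added to `config/config.yaml` and the production example in this change; the values shown are illustrative, not required:

```yaml
# Minimal tracing block (mirrors the defaults in config/config.yaml)
observability:
  tracing:
    enabled: true
    provider: "opentelemetry"
    exporter:
      type: "otlp"            # or "stdout" for local debugging
      endpoint: "jaeger:4317" # OTLP gRPC endpoint
      insecure: true
    sampling:
      type: "probabilistic"
      rate: 0.1
    resource:
      service_name: "vllm-semantic-router"
```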

## Documentation 📖

For comprehensive documentation including detailed setup instructions, architecture guides, and API references, visit:
@@ -74,6 +86,7 @@ The documentation includes:
- **[System Architecture](https://vllm-semantic-router.com/docs/overview/architecture/system-architecture/)** - Technical deep dive
- **[Model Training](https://vllm-semantic-router.com/docs/training/training-overview/)** - How classification models work
- **[API Reference](https://vllm-semantic-router.com/docs/api/router/)** - Complete API documentation
- **[Distributed Tracing](https://vllm-semantic-router.com/docs/tutorials/observability/distributed-tracing/)** - Observability and debugging guide

## Community 👋

97 changes: 97 additions & 0 deletions config/config.development.yaml
@@ -0,0 +1,97 @@
# Development Configuration Example with Stdout Tracing
# This configuration enables distributed tracing with stdout exporter
# for local development and debugging.

bert_model:
  model_id: sentence-transformers/all-MiniLM-L12-v2
  threshold: 0.6
  use_cpu: true

semantic_cache:
  enabled: true
  backend_type: "memory"
  similarity_threshold: 0.8
  max_entries: 100
  ttl_seconds: 600
  eviction_policy: "fifo"

tools:
  enabled: false
  top_k: 3
  similarity_threshold: 0.2
  tools_db_path: "config/tools_db.json"
  fallback_to_empty: true

prompt_guard:
  enabled: false

vllm_endpoints:
  - name: "local-endpoint"
    address: "127.0.0.1"
    port: 8000
    models:
      - "test-model"
    weight: 1

model_config:
  "test-model":
    pii_policy:
      allow_by_default: true

classifier:
  category_model:
    model_id: "models/category_classifier_modernbert-base_model"
    use_modernbert: true
    threshold: 0.6
    use_cpu: true
    category_mapping_path: "models/category_classifier_modernbert-base_model/category_mapping.json"

categories:
  - name: test
    system_prompt: "You are a test assistant."
    model_scores:
      - model: test-model
        score: 1.0
        use_reasoning: false

default_model: test-model

api:
  batch_classification:
    max_batch_size: 10
    metrics:
      enabled: true

# Observability Configuration - Development with Stdout
observability:
  tracing:
    # Enable tracing for development/debugging
    enabled: true

    # OpenTelemetry provider
    provider: "opentelemetry"

    exporter:
      # Stdout exporter prints traces to console (great for debugging)
      type: "stdout"

      # No endpoint needed for stdout
      # endpoint: ""
      # insecure: true

    sampling:
      # Always sample in development to see all traces
      type: "always_on"

      # Rate not used for always_on
      # rate: 1.0

    resource:
      # Service name for trace identification
      service_name: "vllm-semantic-router-dev"

      # Version for development
      service_version: "dev"

      # Environment identifier
      deployment_environment: "development"
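
If the Jaeger container from `deploy/docker-compose.tracing.yaml` is already running locally, this development file can export real spans instead of printing them. A minimal variant of the `exporter` block above, assuming Jaeger's OTLP gRPC port is published on `localhost:4317` as in that compose file:

```yaml
# Variant of the exporter block above: send spans to a local Jaeger instead of stdout.
# Assumes the Jaeger container publishes its OTLP gRPC port on localhost:4317.
exporter:
  type: "otlp"
  endpoint: "localhost:4317"
  insecure: true
```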
132 changes: 132 additions & 0 deletions config/config.production.yaml
@@ -0,0 +1,132 @@
# Production Configuration Example with OTLP Tracing
# This configuration enables distributed tracing with OpenTelemetry OTLP exporter
# for production deployment with Jaeger or other OTLP-compatible backends.

bert_model:
  model_id: sentence-transformers/all-MiniLM-L12-v2
  threshold: 0.6
  use_cpu: true

semantic_cache:
  enabled: true
  backend_type: "memory"
  similarity_threshold: 0.8
  max_entries: 1000
  ttl_seconds: 3600
  eviction_policy: "fifo"

tools:
  enabled: true
  top_k: 3
  similarity_threshold: 0.2
  tools_db_path: "config/tools_db.json"
  fallback_to_empty: true

prompt_guard:
  enabled: true
  use_modernbert: true
  model_id: "models/jailbreak_classifier_modernbert-base_model"
  threshold: 0.7
  use_cpu: true
  jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json"

vllm_endpoints:
  - name: "endpoint1"
    address: "127.0.0.1"
    port: 8000
    models:
      - "openai/gpt-oss-20b"
    weight: 1

model_config:
  "openai/gpt-oss-20b":
    reasoning_family: "gpt-oss"
    preferred_endpoints: ["endpoint1"]
    pii_policy:
      allow_by_default: true

classifier:
  category_model:
    model_id: "models/category_classifier_modernbert-base_model"
    use_modernbert: true
    threshold: 0.6
    use_cpu: true
    category_mapping_path: "models/category_classifier_modernbert-base_model/category_mapping.json"
  pii_model:
    model_id: "models/pii_classifier_modernbert-base_presidio_token_model"
    use_modernbert: true
    threshold: 0.7
    use_cpu: true
    pii_mapping_path: "models/pii_classifier_modernbert-base_presidio_token_model/pii_type_mapping.json"

categories:
  - name: math
    system_prompt: "You are a mathematics expert. Provide step-by-step solutions."
    model_scores:
      - model: openai/gpt-oss-20b
        score: 1.0
        use_reasoning: true
  - name: other
    system_prompt: "You are a helpful assistant."
    model_scores:
      - model: openai/gpt-oss-20b
        score: 0.7
        use_reasoning: false

default_model: openai/gpt-oss-20b

reasoning_families:
  gpt-oss:
    type: "reasoning_effort"
    parameter: "reasoning_effort"

default_reasoning_effort: high

api:
  batch_classification:
    max_batch_size: 100
    concurrency_threshold: 5
    max_concurrency: 8
    metrics:
      enabled: true

# Observability Configuration - Production with OTLP
observability:
  tracing:
    # Enable distributed tracing for production monitoring
    enabled: true

    # OpenTelemetry provider (standard implementation)
    provider: "opentelemetry"

    exporter:
      # OTLP exporter for Jaeger, Tempo, or other OTLP backends
      type: "otlp"

      # Jaeger OTLP endpoint (default: 4317 for gRPC)
      # For Jaeger: localhost:4317
      # For Grafana Tempo: tempo:4317
      # For Datadog: trace-agent:4317
      endpoint: "jaeger:4317"

      # Use insecure connection (set to false in production with TLS)
      insecure: true

    sampling:
      # Probabilistic sampling for production (reduces overhead)
      type: "probabilistic"

      # Sample 10% of requests (adjust based on traffic volume)
      # Higher rates (0.5-1.0) for low traffic
      # Lower rates (0.01-0.1) for high traffic
      rate: 0.1

    resource:
      # Service name for trace identification
      service_name: "vllm-semantic-router"

      # Version for tracking deployments
      service_version: "v0.1.0"

      # Environment identifier
      deployment_environment: "production"
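
As a rough guide to choosing `rate` (illustrative arithmetic, not measured figures): probabilistic sampling exports approximately `rate × requests/s` traces per second, so 1,000 req/s at `rate: 0.1` yields about 100 traces/s, while `rate: 0.01` yields about 10.

```yaml
# Illustrative sampling choices (hypothetical traffic levels):
#   low traffic   (~10 req/s):     rate: 1.0   -> ~10 traces/s
#   medium        (~1,000 req/s):  rate: 0.1   -> ~100 traces/s
#   high          (~10,000 req/s): rate: 0.01  -> ~100 traces/s
sampling:
  type: "probabilistic"
  rate: 0.1
```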
18 changes: 18 additions & 0 deletions config/config.yaml
@@ -182,3 +182,21 @@ api:
      sample_rate: 1.0
      duration_buckets: [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30]
      size_buckets: [1, 2, 5, 10, 20, 50, 100, 200]

# Observability Configuration
observability:
  tracing:
    enabled: false # Enable distributed tracing (default: false)
    provider: "opentelemetry" # Provider: opentelemetry, openinference, openllmetry
    exporter:
      type: "stdout" # Exporter: otlp, jaeger, zipkin, stdout
      endpoint: "localhost:4317" # OTLP endpoint (when type: otlp)
      insecure: true # Use insecure connection (no TLS)
    sampling:
      type: "always_on" # Sampling: always_on, always_off, probabilistic
      rate: 1.0 # Sampling rate for probabilistic (0.0-1.0)
    resource:
      service_name: "vllm-semantic-router"
      service_version: "v0.1.0"
      deployment_environment: "development"

55 changes: 55 additions & 0 deletions deploy/docker-compose.tracing.yaml
@@ -0,0 +1,55 @@
version: '3.8'

services:
  # Jaeger all-in-one for distributed tracing
  jaeger:
    image: jaegertracing/all-in-one:latest
    container_name: jaeger
    ports:
      - "4317:4317"   # OTLP gRPC
      - "4318:4318"   # OTLP HTTP
      - "16686:16686" # Jaeger UI
      - "14268:14268" # Jaeger collector
    environment:
      - COLLECTOR_OTLP_ENABLED=true
    networks:
      - router-network

  # Semantic Router with tracing enabled
  semantic-router:
    image: vllm-semantic-router:latest
    container_name: semantic-router
    depends_on:
      - jaeger
    ports:
      - "50051:50051" # gRPC ExtProc
      - "8080:8080"   # Classification API
      - "9190:9190"   # Metrics
    volumes:
      - ./config:/config
    environment:
      - CONFIG_PATH=/config/config.tracing.yaml
    networks:
      - router-network

  # Grafana for visualization
  grafana:
    image: grafana/grafana:latest
    container_name: grafana
    ports:
      - "3000:3000"
    environment:
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
    volumes:
      - ./grafana/provisioning:/etc/grafana/provisioning
      - grafana-storage:/var/lib/grafana
    networks:
      - router-network

networks:
  router-network:
    driver: bridge

volumes:
  grafana-storage:
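
To try the stack end to end (assuming a locally built `vllm-semantic-router:latest` image and a matching `config/config.tracing.yaml`), run `docker compose -f deploy/docker-compose.tracing.yaml up -d`, then open the Jaeger UI at http://localhost:16686 and select the router's service name to browse traces.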