
Commit c4a7046

Merge branch 'feat/reduce_ci_duration' of https://github.com/Aias00/semantic-router into feat/reduce_ci_duration

2 parents: 8e61565 + d3d91a1

31 files changed: +6302 −100 lines

.github/workflows/k8s-integration-test.yml

Lines changed: 616 additions & 0 deletions
Large diffs are not rendered by default.

README.md

Lines changed: 13 additions & 0 deletions

@@ -62,6 +62,18 @@
 Cache the semantic representation of the prompt so as to reduce the number of prompt tokens and improve the overall inference latency.
+
+### Distributed Tracing 🔍
+
+Comprehensive observability with OpenTelemetry distributed tracing provides fine-grained visibility into the request processing pipeline:
+
+- **Request Flow Tracing**: Track requests through classification, security checks, caching, and routing
+- **Performance Analysis**: Identify bottlenecks with detailed timing for each operation
+- **Security Monitoring**: Trace PII detection and jailbreak prevention operations
+- **Routing Decisions**: Understand why specific models were selected
+- **OpenTelemetry Standard**: Industry-standard tracing with support for Jaeger, Tempo, and other OTLP backends
+
+See [Distributed Tracing Guide](https://vllm-semantic-router.com/docs/tutorials/observability/distributed-tracing/) for complete setup instructions.

 ## Documentation 📖

 For comprehensive documentation including detailed setup instructions, architecture guides, and API references, visit:

@@ -74,6 +86,7 @@ The documentation includes:
 - **[System Architecture](https://vllm-semantic-router.com/docs/overview/architecture/system-architecture/)** - Technical deep dive
 - **[Model Training](https://vllm-semantic-router.com/docs/training/training-overview/)** - How classification models work
 - **[API Reference](https://vllm-semantic-router.com/docs/api/router/)** - Complete API documentation
+- **[Distributed Tracing](https://vllm-semantic-router.com/docs/tutorials/observability/distributed-tracing/)** - Observability and debugging guide

 ## Community 👋
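Reviewer note: to make the request-flow bullets above concrete, here is a minimal Go sketch of per-stage spans using the OpenTelemetry SDK. The span names, attribute keys, and the handleRequest function are illustrative assumptions, not the router's actual instrumentation; see the linked guide for the real setup.

package tracing

import (
	"context"

	"go.opentelemetry.io/otel"
	"go.opentelemetry.io/otel/attribute"
)

// handleRequest sketches one way to trace the pipeline stages the README
// lists: classification, security checks, caching, and routing. All span
// names and attributes here are hypothetical.
func handleRequest(ctx context.Context, prompt string) {
	tracer := otel.Tracer("semantic-router")

	// Root span covering the whole request.
	ctx, span := tracer.Start(ctx, "route_request")
	defer span.End()

	// One child span per stage gives the per-operation timing that the
	// "Performance Analysis" bullet describes.
	_, classSpan := tracer.Start(ctx, "classify_prompt")
	classSpan.SetAttributes(attribute.String("category", "math")) // classifier output
	classSpan.End()

	// Recording the routing decision as attributes answers the
	// "Routing Decisions" bullet: why was this model selected?
	_, routeSpan := tracer.Start(ctx, "select_model")
	routeSpan.SetAttributes(
		attribute.String("selected_model", "openai/gpt-oss-20b"),
		attribute.Float64("category_score", 1.0),
	)
	routeSpan.End()
}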

config/config.development.yaml

Lines changed: 97 additions & 0 deletions

# Development Configuration Example with Stdout Tracing
# This configuration enables distributed tracing with stdout exporter
# for local development and debugging.

bert_model:
  model_id: sentence-transformers/all-MiniLM-L12-v2
  threshold: 0.6
  use_cpu: true

semantic_cache:
  enabled: true
  backend_type: "memory"
  similarity_threshold: 0.8
  max_entries: 100
  ttl_seconds: 600
  eviction_policy: "fifo"

tools:
  enabled: false
  top_k: 3
  similarity_threshold: 0.2
  tools_db_path: "config/tools_db.json"
  fallback_to_empty: true

prompt_guard:
  enabled: false

vllm_endpoints:
  - name: "local-endpoint"
    address: "127.0.0.1"
    port: 8000
    models:
      - "test-model"
    weight: 1

model_config:
  "test-model":
    pii_policy:
      allow_by_default: true

classifier:
  category_model:
    model_id: "models/category_classifier_modernbert-base_model"
    use_modernbert: true
    threshold: 0.6
    use_cpu: true
    category_mapping_path: "models/category_classifier_modernbert-base_model/category_mapping.json"

categories:
  - name: test
    system_prompt: "You are a test assistant."
    model_scores:
      - model: test-model
        score: 1.0
        use_reasoning: false

default_model: test-model

api:
  batch_classification:
    max_batch_size: 10
    metrics:
      enabled: true

# Observability Configuration - Development with Stdout
observability:
  tracing:
    # Enable tracing for development/debugging
    enabled: true

    # OpenTelemetry provider
    provider: "opentelemetry"

    exporter:
      # Stdout exporter prints traces to console (great for debugging)
      type: "stdout"

      # No endpoint needed for stdout
      # endpoint: ""
      # insecure: true

    sampling:
      # Always sample in development to see all traces
      type: "always_on"

      # Rate not used for always_on
      # rate: 1.0

    resource:
      # Service name for trace identification
      service_name: "vllm-semantic-router-dev"

      # Version for development
      service_version: "dev"

      # Environment identifier
      deployment_environment: "development"
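Reviewer note: a minimal sketch of how this stdout/always-on configuration could translate to the OpenTelemetry Go SDK. The initDevTracing helper is an assumption for illustration, not part of this commit.

package tracing

import (
	"go.opentelemetry.io/otel"
	"go.opentelemetry.io/otel/exporters/stdout/stdouttrace"
	sdktrace "go.opentelemetry.io/otel/sdk/trace"
)

// initDevTracing mirrors the development config above: a stdout exporter
// with always-on sampling, so every trace is printed to the console.
func initDevTracing() (*sdktrace.TracerProvider, error) {
	// Pretty-printed spans on stdout, matching exporter.type: "stdout".
	exp, err := stdouttrace.New(stdouttrace.WithPrettyPrint())
	if err != nil {
		return nil, err
	}

	tp := sdktrace.NewTracerProvider(
		sdktrace.WithBatcher(exp),
		// sampling.type: "always_on" — record every request in development.
		sdktrace.WithSampler(sdktrace.AlwaysSample()),
	)
	otel.SetTracerProvider(tp)
	return tp, nil
}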

config/config.e2e.yaml

Lines changed: 4 additions & 4 deletions

@@ -61,14 +61,14 @@ model_config:
     reasoning_family: "qwen3" # This model uses Qwen reasoning syntax
     preferred_endpoints: ["qwen-endpoint"]
     pii_policy:
-      allow_by_default: true
-      pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"]
+      allow_by_default: false # Strict PII blocking model
+      pii_types_allowed: ["EMAIL_ADDRESS"] # Only allow emails
   "Model-B":
     use_reasoning: false
     preferred_endpoints: ["tinyllama-endpoint"]
     pii_policy:
-      allow_by_default: true
-      pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"]
+      allow_by_default: true # Permissive PII model for safe routing
+      pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER", "US_SSN", "CREDIT_CARD"]

 # Classifier configuration for text classification
 classifier:
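Reviewer note: the change makes the two models diverge — Model-A now blocks PII by default and only lets emails through, while Model-B stays permissive. One plausible reading of how allow_by_default and pii_types_allowed combine, sketched in Go; the Policy type and Allowed method are hypothetical, not the router's implementation.

package pii

// Policy mirrors the pii_policy block in model_config. Field names follow
// the YAML keys; this checker is a sketch under assumed semantics.
type Policy struct {
	AllowByDefault  bool     `yaml:"allow_by_default"`
	PIITypesAllowed []string `yaml:"pii_types_allowed"`
}

// Allowed reports whether a detected PII type may be routed to the model.
// With allow_by_default: false, only explicitly listed types pass — the
// strict Model-A behavior above.
func (p Policy) Allowed(piiType string) bool {
	if p.AllowByDefault {
		return true
	}
	for _, t := range p.PIITypesAllowed {
		if t == piiType {
			return true
		}
	}
	return false
}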

config/config.production.yaml

Lines changed: 132 additions & 0 deletions

# Production Configuration Example with OTLP Tracing
# This configuration enables distributed tracing with OpenTelemetry OTLP exporter
# for production deployment with Jaeger or other OTLP-compatible backends.

bert_model:
  model_id: sentence-transformers/all-MiniLM-L12-v2
  threshold: 0.6
  use_cpu: true

semantic_cache:
  enabled: true
  backend_type: "memory"
  similarity_threshold: 0.8
  max_entries: 1000
  ttl_seconds: 3600
  eviction_policy: "fifo"

tools:
  enabled: true
  top_k: 3
  similarity_threshold: 0.2
  tools_db_path: "config/tools_db.json"
  fallback_to_empty: true

prompt_guard:
  enabled: true
  use_modernbert: true
  model_id: "models/jailbreak_classifier_modernbert-base_model"
  threshold: 0.7
  use_cpu: true
  jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json"

vllm_endpoints:
  - name: "endpoint1"
    address: "127.0.0.1"
    port: 8000
    models:
      - "openai/gpt-oss-20b"
    weight: 1

model_config:
  "openai/gpt-oss-20b":
    reasoning_family: "gpt-oss"
    preferred_endpoints: ["endpoint1"]
    pii_policy:
      allow_by_default: true

classifier:
  category_model:
    model_id: "models/category_classifier_modernbert-base_model"
    use_modernbert: true
    threshold: 0.6
    use_cpu: true
    category_mapping_path: "models/category_classifier_modernbert-base_model/category_mapping.json"
  pii_model:
    model_id: "models/pii_classifier_modernbert-base_presidio_token_model"
    use_modernbert: true
    threshold: 0.7
    use_cpu: true
    pii_mapping_path: "models/pii_classifier_modernbert-base_presidio_token_model/pii_type_mapping.json"

categories:
  - name: math
    system_prompt: "You are a mathematics expert. Provide step-by-step solutions."
    model_scores:
      - model: openai/gpt-oss-20b
        score: 1.0
        use_reasoning: true
  - name: other
    system_prompt: "You are a helpful assistant."
    model_scores:
      - model: openai/gpt-oss-20b
        score: 0.7
        use_reasoning: false

default_model: openai/gpt-oss-20b

reasoning_families:
  gpt-oss:
    type: "reasoning_effort"
    parameter: "reasoning_effort"

default_reasoning_effort: high

api:
  batch_classification:
    max_batch_size: 100
    concurrency_threshold: 5
    max_concurrency: 8
    metrics:
      enabled: true

# Observability Configuration - Production with OTLP
observability:
  tracing:
    # Enable distributed tracing for production monitoring
    enabled: true

    # OpenTelemetry provider (standard implementation)
    provider: "opentelemetry"

    exporter:
      # OTLP exporter for Jaeger, Tempo, or other OTLP backends
      type: "otlp"

      # Jaeger OTLP endpoint (default: 4317 for gRPC)
      # For Jaeger: localhost:4317
      # For Grafana Tempo: tempo:4317
      # For Datadog: trace-agent:4317
      endpoint: "jaeger:4317"

      # Use insecure connection (set to false in production with TLS)
      insecure: true

    sampling:
      # Probabilistic sampling for production (reduces overhead)
      type: "probabilistic"

      # Sample 10% of requests (adjust based on traffic volume)
      # Higher rates (0.5-1.0) for low traffic
      # Lower rates (0.01-0.1) for high traffic
      rate: 0.1

    resource:
      # Service name for trace identification
      service_name: "vllm-semantic-router"

      # Version for tracking deployments
      service_version: "v0.1.0"

      # Environment identifier
      deployment_environment: "production"
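Reviewer note: for comparison with the stdout sketch above, here is how this OTLP/probabilistic configuration could map onto the OpenTelemetry Go SDK. The initProdTracing helper is an illustrative assumption, not the router's code.

package tracing

import (
	"context"

	"go.opentelemetry.io/otel"
	"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
	"go.opentelemetry.io/otel/sdk/resource"
	sdktrace "go.opentelemetry.io/otel/sdk/trace"
	semconv "go.opentelemetry.io/otel/semconv/v1.17.0"
)

// initProdTracing mirrors the production config above: OTLP over gRPC to
// Jaeger with 10% probabilistic sampling and service metadata on every span.
func initProdTracing(ctx context.Context) (*sdktrace.TracerProvider, error) {
	// exporter.type: "otlp", endpoint: "jaeger:4317", insecure: true.
	exp, err := otlptracegrpc.New(ctx,
		otlptracegrpc.WithEndpoint("jaeger:4317"),
		otlptracegrpc.WithInsecure(), // use TLS credentials in real deployments
	)
	if err != nil {
		return nil, err
	}

	// resource.* keys map onto standard OTel semantic conventions.
	res := resource.NewWithAttributes(semconv.SchemaURL,
		semconv.ServiceNameKey.String("vllm-semantic-router"),
		semconv.ServiceVersionKey.String("v0.1.0"),
		semconv.DeploymentEnvironmentKey.String("production"),
	)

	tp := sdktrace.NewTracerProvider(
		sdktrace.WithBatcher(exp),
		// sampling.type: "probabilistic", rate: 0.1 — ParentBased keeps
		// child spans consistent with the root's sampling decision.
		sdktrace.WithSampler(sdktrace.ParentBased(sdktrace.TraceIDRatioBased(0.1))),
		sdktrace.WithResource(res),
	)
	otel.SetTracerProvider(tp)
	return tp, nil
}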

config/config.yaml

Lines changed: 18 additions & 0 deletions

@@ -182,3 +182,21 @@ api:
       sample_rate: 1.0
       duration_buckets: [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30]
       size_buckets: [1, 2, 5, 10, 20, 50, 100, 200]
+
+# Observability Configuration
+observability:
+  tracing:
+    enabled: false # Enable distributed tracing (default: false)
+    provider: "opentelemetry" # Provider: opentelemetry, openinference, openllmetry
+    exporter:
+      type: "stdout" # Exporter: otlp, jaeger, zipkin, stdout
+      endpoint: "localhost:4317" # OTLP endpoint (when type: otlp)
+      insecure: true # Use insecure connection (no TLS)
+    sampling:
+      type: "always_on" # Sampling: always_on, always_off, probabilistic
+      rate: 1.0 # Sampling rate for probabilistic (0.0-1.0)
+    resource:
+      service_name: "vllm-semantic-router"
+      service_version: "v0.1.0"
+      deployment_environment: "development"
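Reviewer note: the sampling.type values documented above map naturally onto the OpenTelemetry SDK's built-in samplers. A hypothetical Go helper sketching that mapping; the router's actual config loader may differ.

package tracing

import (
	sdktrace "go.opentelemetry.io/otel/sdk/trace"
)

// samplerFromConfig maps the sampling.type / sampling.rate fields above to
// OpenTelemetry SDK samplers.
func samplerFromConfig(samplingType string, rate float64) sdktrace.Sampler {
	switch samplingType {
	case "always_off":
		return sdktrace.NeverSample()
	case "probabilistic":
		// The SDK clamps the fraction: values <= 0 never sample,
		// values >= 1 always sample.
		return sdktrace.TraceIDRatioBased(rate)
	default: // "always_on"
		return sdktrace.AlwaysSample()
	}
}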

deploy/docker-compose.tracing.yaml

Lines changed: 55 additions & 0 deletions

version: '3.8'

services:
  # Jaeger all-in-one for distributed tracing
  jaeger:
    image: jaegertracing/all-in-one:latest
    container_name: jaeger
    ports:
      - "4317:4317" # OTLP gRPC
      - "4318:4318" # OTLP HTTP
      - "16686:16686" # Jaeger UI
      - "14268:14268" # Jaeger collector
    environment:
      - COLLECTOR_OTLP_ENABLED=true
    networks:
      - router-network

  # Semantic Router with tracing enabled
  semantic-router:
    image: vllm-semantic-router:latest
    container_name: semantic-router
    depends_on:
      - jaeger
    ports:
      - "50051:50051" # gRPC ExtProc
      - "8080:8080" # Classification API
      - "9190:9190" # Metrics
    volumes:
      - ./config:/config
    environment:
      - CONFIG_PATH=/config/config.tracing.yaml
    networks:
      - router-network

  # Grafana for visualization
  grafana:
    image: grafana/grafana:latest
    container_name: grafana
    ports:
      - "3000:3000"
    environment:
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
    volumes:
      - ./grafana/provisioning:/etc/grafana/provisioning
      - grafana-storage:/var/lib/grafana
    networks:
      - router-network

networks:
  router-network:
    driver: bridge

volumes:
  grafana-storage:
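Reviewer note: assuming the vllm-semantic-router:latest image has been built locally, running `docker compose -f deploy/docker-compose.tracing.yaml up -d` should bring up the stack; traces then appear in the Jaeger UI at http://localhost:16686 and dashboards in Grafana at http://localhost:3000. Note that CONFIG_PATH points at /config/config.tracing.yaml, which is expected to exist under ./config but is not added by this commit.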
