Skip to content

Commit e54f55a

Browse files
committed
docs: update Kubernetes installation guides to use Helm with online values files
docs: update Kubernetes installation guides to use Helm with online values files

- Add Helm-based deployment for semantic router across all K8s guides
- Replace relative paths with GitHub raw URLs for values and resource files
- Update ai-gateway, aibrix, and production-stack documentation
- Add cleanup sections with proper helm uninstall commands
- Create semantic-router-values directories with configuration files
- Remove Kustomize deployment options in favor of Helm-only approach

Signed-off-by: bitliu <[email protected]>
1 parent 90ad936 commit e54f55a

File tree

5 files changed

+693
-27
lines changed

5 files changed

+693
-27
lines changed
Lines changed: 314 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,314 @@
# Semantic Router Configuration for AI Gateway
config:
  model_config:
    "base-model":
      reasoning_family: "qwen3"  # This model uses Qwen-3 reasoning syntax
      # preferred_endpoints omitted - let upstream handle endpoint selection
      pii_policy:
        allow_by_default: false
      # Define available LoRA adapters for this base model
      # These names must match the LoRA modules registered with vLLM at startup
      loras:
        - name: "science-expert"
          description: "Specialized for science domains: biology, chemistry, physics, health, engineering"
        - name: "social-expert"
          description: "Optimized for social sciences: business, economics"
        - name: "math-expert"
          description: "Fine-tuned for mathematics and quantitative reasoning"
        - name: "law-expert"
          description: "Specialized for legal questions and law-related topics"
        - name: "humanities-expert"
          description: "Optimized for humanities: psychology, history, philosophy"
        - name: "general-expert"
          description: "General-purpose adapter for diverse topics"

  # Categories with LoRA routing
  # Each category uses the base-model model with a specific LoRA adapter
  categories:
    - name: business
      system_prompt: "You are a senior business consultant and strategic advisor with expertise in corporate strategy, operations management, financial analysis, marketing, and organizational development. Provide practical, actionable business advice backed by proven methodologies and industry best practices. Consider market dynamics, competitive landscape, and stakeholder interests in your recommendations."
      # jailbreak_enabled: true # Optional: Override global jailbreak detection per category
      # jailbreak_threshold: 0.8 # Optional: Override global jailbreak threshold per category
      model_scores:
        - model: base-model  # Base model name (for endpoint selection and PII policy)
          lora_name: social-expert  # LoRA adapter name (used as final model name in request)
          score: 0.7
          use_reasoning: false  # Business performs better without reasoning
    - name: law
      system_prompt: "You are a knowledgeable legal expert with comprehensive understanding of legal principles, case law, statutory interpretation, and legal procedures across multiple jurisdictions. Provide accurate legal information and analysis while clearly stating that your responses are for informational purposes only and do not constitute legal advice. Always recommend consulting with qualified legal professionals for specific legal matters."
      model_scores:
        - model: base-model
          lora_name: law-expert
          score: 0.4
          use_reasoning: false
    - name: psychology
      system_prompt: "You are a psychology expert with deep knowledge of cognitive processes, behavioral patterns, mental health, developmental psychology, social psychology, and therapeutic approaches. Provide evidence-based insights grounded in psychological research and theory. When discussing mental health topics, emphasize the importance of professional consultation and avoid providing diagnostic or therapeutic advice."
      semantic_cache_enabled: true
      semantic_cache_similarity_threshold: 0.92  # High threshold for psychology - sensitive to nuances
      model_scores:
        - model: base-model
          lora_name: humanities-expert
          score: 0.6
          use_reasoning: false
    - name: biology
      system_prompt: "You are a biology expert with comprehensive knowledge spanning molecular biology, genetics, cell biology, ecology, evolution, anatomy, physiology, and biotechnology. Explain biological concepts with scientific accuracy, use appropriate terminology, and provide examples from current research. Connect biological principles to real-world applications and emphasize the interconnectedness of biological systems."
      model_scores:
        - model: base-model
          lora_name: science-expert
          score: 0.9
          use_reasoning: false
    - name: chemistry
      system_prompt: "You are a chemistry expert specializing in chemical reactions, molecular structures, and laboratory techniques. Provide detailed, step-by-step explanations."
      model_scores:
        - model: base-model
          lora_name: science-expert
          score: 0.6
          use_reasoning: true  # Enable reasoning for complex chemistry
    - name: history
      system_prompt: "You are a historian with expertise across different time periods and cultures. Provide accurate historical context and analysis."
      model_scores:
        - model: base-model
          lora_name: humanities-expert
          score: 0.7
          use_reasoning: false
    - name: other
      system_prompt: "You are a helpful and knowledgeable assistant. Provide accurate, helpful responses across a wide range of topics."
      semantic_cache_enabled: true
      semantic_cache_similarity_threshold: 0.75  # Lower threshold for general chat - less sensitive
      model_scores:
        - model: base-model
          lora_name: general-expert
          score: 0.7
          use_reasoning: false
    - name: health
      system_prompt: "You are a health and medical information expert with knowledge of anatomy, physiology, diseases, treatments, preventive care, nutrition, and wellness. Provide accurate, evidence-based health information while emphasizing that your responses are for educational purposes only and should never replace professional medical advice, diagnosis, or treatment. Always encourage users to consult healthcare professionals for medical concerns and emergencies."
      semantic_cache_enabled: true
      semantic_cache_similarity_threshold: 0.95  # High threshold for health - very sensitive to word changes
      model_scores:
        - model: base-model
          lora_name: science-expert
          score: 0.5
          use_reasoning: false
    - name: economics
      system_prompt: "You are an economics expert with deep understanding of microeconomics, macroeconomics, econometrics, financial markets, monetary policy, fiscal policy, international trade, and economic theory. Analyze economic phenomena using established economic principles, provide data-driven insights, and explain complex economic concepts in accessible terms. Consider both theoretical frameworks and real-world applications in your responses."
      model_scores:
        - model: base-model
          lora_name: social-expert
          score: 1.0
          use_reasoning: false
    - name: math
      system_prompt: "You are a mathematics expert. Provide step-by-step solutions, show your work clearly, and explain mathematical concepts in an understandable way."
      model_scores:
        - model: base-model
          lora_name: math-expert
          score: 1.0
          use_reasoning: true  # Enable reasoning for complex math
    - name: physics
      system_prompt: "You are a physics expert with deep understanding of physical laws and phenomena. Provide clear explanations with mathematical derivations when appropriate."
      model_scores:
        - model: base-model
          lora_name: science-expert
          score: 0.7
          use_reasoning: true  # Enable reasoning for physics
    - name: computer science
      system_prompt: "You are a computer science expert with knowledge of algorithms, data structures, programming languages, and software engineering. Provide clear, practical solutions with code examples when helpful."
      model_scores:
        - model: base-model
          lora_name: science-expert
          score: 0.6
          use_reasoning: false
    - name: philosophy
      system_prompt: "You are a philosophy expert with comprehensive knowledge of philosophical traditions, ethical theories, logic, metaphysics, epistemology, political philosophy, and the history of philosophical thought. Engage with complex philosophical questions by presenting multiple perspectives, analyzing arguments rigorously, and encouraging critical thinking. Draw connections between philosophical concepts and contemporary issues while maintaining intellectual honesty about the complexity and ongoing nature of philosophical debates."
      model_scores:
        - model: base-model
          lora_name: humanities-expert
          score: 0.5
          use_reasoning: false
    - name: engineering
      system_prompt: "You are an engineering expert with knowledge across multiple engineering disciplines including mechanical, electrical, civil, chemical, software, and systems engineering. Apply engineering principles, design methodologies, and problem-solving approaches to provide practical solutions. Consider safety, efficiency, sustainability, and cost-effectiveness in your recommendations. Use technical precision while explaining concepts clearly, and emphasize the importance of proper engineering practices and standards."
      model_scores:
        - model: base-model
          lora_name: science-expert
          score: 0.7
          use_reasoning: false
    - name: thinking
      system_prompt: "You are a thinking expert, should think multiple steps before answering. Please answer the question step by step."
      model_scores:
        # NOTE(review): unlike every other category, this entry targets
        # "general-expert" directly with no lora_name - confirm this is
        # intentional and that "general-expert" resolves as a model here.
        - model: general-expert
          score: 0.7
          use_reasoning: true

  default_model: base-model

  bert_model:
    model_id: models/all-MiniLM-L12-v2
    threshold: 0.6
    use_cpu: true

  semantic_cache:
    enabled: true
    backend_type: "memory"  # Options: "memory", "milvus", or "hybrid"
    similarity_threshold: 0.8
    max_entries: 1000  # Only applies to memory backend
    ttl_seconds: 3600
    eviction_policy: "fifo"
    # HNSW index configuration (for memory backend only)
    use_hnsw: true  # Enable HNSW index for faster similarity search
    hnsw_m: 16  # Number of bi-directional links (higher = better recall, more memory)
    hnsw_ef_construction: 200  # Construction parameter (higher = better quality, slower build)

    # Hybrid cache configuration (when backend_type: "hybrid")
    # Combines in-memory HNSW for fast search with Milvus for scalable storage
    # max_memory_entries: 100000 # Max entries in HNSW index (default: 100,000)
    # backend_config_path: "config/milvus.yaml" # Path to Milvus config

    # Embedding model for semantic similarity matching
    # Options: "bert" (fast, 384-dim), "qwen3" (high quality, 1024-dim, 32K context), "gemma" (balanced, 768-dim, 8K context)
    # Default: "bert" (fastest, lowest memory)
    embedding_model: "bert"

  tools:
    enabled: true
    top_k: 3
    similarity_threshold: 0.2
    tools_db_path: "config/tools_db.json"
    fallback_to_empty: true

  prompt_guard:
    enabled: true  # Global default - can be overridden per category with jailbreak_enabled
    use_modernbert: true
    model_id: "models/jailbreak_classifier_modernbert-base_model"
    threshold: 0.7
    use_cpu: true
    jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json"

  # Classifier configuration
  classifier:
    category_model:
      model_id: "models/category_classifier_modernbert-base_model"
      use_modernbert: true
      threshold: 0.6
      use_cpu: true
      category_mapping_path: "models/category_classifier_modernbert-base_model/category_mapping.json"
    pii_model:
      model_id: "models/pii_classifier_modernbert-base_presidio_token_model"
      use_modernbert: true
      threshold: 0.7
      use_cpu: true
      pii_mapping_path: "models/pii_classifier_modernbert-base_presidio_token_model/pii_type_mapping.json"

  keyword_rules:
    - category: "thinking"
      operator: "OR"
      keywords: ["urgent", "immediate", "asap", "think", "careful"]
      case_sensitive: false

  # Router Configuration for Dual-Path Selection
  router:
    # High confidence threshold for automatic LoRA selection
    high_confidence_threshold: 0.99
    # Low latency threshold in milliseconds for LoRA path selection
    low_latency_threshold_ms: 2000
    # Baseline scores for path evaluation
    lora_baseline_score: 0.8
    traditional_baseline_score: 0.7
    embedding_baseline_score: 0.75
    # Success rate calculation threshold
    success_confidence_threshold: 0.8
    # Large batch size threshold for parallel processing
    large_batch_threshold: 4
    # Default performance metrics (milliseconds)
    lora_default_execution_time_ms: 1345
    traditional_default_execution_time_ms: 4567
    # Default processing requirements
    default_confidence_threshold: 0.95
    default_max_latency_ms: 5000
    default_batch_size: 4
    default_avg_execution_time_ms: 3000
    # Default confidence and success rates
    lora_default_confidence: 0.99
    traditional_default_confidence: 0.95
    lora_default_success_rate: 0.98
    traditional_default_success_rate: 0.95
    # Scoring weights for intelligent path selection (balanced approach)
    multi_task_lora_weight: 0.30  # LoRA advantage for multi-task processing
    single_task_traditional_weight: 0.30  # Traditional advantage for single tasks
    large_batch_lora_weight: 0.25  # LoRA advantage for large batches (>=4)
    small_batch_traditional_weight: 0.25  # Traditional advantage for single items
    medium_batch_weight: 0.10  # Neutral weight for medium batches (2-3)
    high_confidence_lora_weight: 0.25  # LoRA advantage for high confidence (>=0.99)
    low_confidence_traditional_weight: 0.25  # Traditional for lower confidence (<=0.9)
    low_latency_lora_weight: 0.30  # LoRA advantage for low latency (<=2000ms)
    high_latency_traditional_weight: 0.10  # Traditional acceptable for relaxed timing
    performance_history_weight: 0.20  # Historical performance comparison factor
    # Traditional model specific configurations
    traditional_bert_confidence_threshold: 0.95  # Traditional BERT confidence threshold
    traditional_modernbert_confidence_threshold: 0.8  # Traditional ModernBERT confidence threshold
    traditional_pii_detection_threshold: 0.5  # Traditional PII detection confidence threshold
    traditional_token_classification_threshold: 0.9  # Traditional token classification threshold
    traditional_dropout_prob: 0.1  # Traditional model dropout probability
    traditional_attention_dropout_prob: 0.1  # Traditional model attention dropout probability
    tie_break_confidence: 0.5  # Confidence value for tie-breaking situations

  # Reasoning family configurations
  reasoning_families:
    deepseek:
      type: "chat_template_kwargs"
      parameter: "thinking"

    qwen3:
      type: "chat_template_kwargs"
      parameter: "enable_thinking"

    gpt-oss:
      type: "reasoning_effort"
      parameter: "reasoning_effort"
    gpt:
      type: "reasoning_effort"
      parameter: "reasoning_effort"

  # Global default reasoning effort level
  default_reasoning_effort: high

  # API Configuration
  api:
    batch_classification:
      max_batch_size: 100
      concurrency_threshold: 5
      max_concurrency: 8
      metrics:
        enabled: true
        detailed_goroutine_tracking: true
        high_resolution_timing: false
        sample_rate: 1.0
        duration_buckets:
          [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30]
        size_buckets: [1, 2, 5, 10, 20, 50, 100, 200]

  # Embedding Models Configuration
  # These models provide intelligent embedding generation with automatic routing:
  # - Qwen3-Embedding-0.6B: Up to 32K context, high quality,
  # - EmbeddingGemma-300M: Up to 8K context, fast inference, Matryoshka support (768/512/256/128)
  embedding_models:
    qwen3_model_path: "models/Qwen3-Embedding-0.6B"
    gemma_model_path: "models/embeddinggemma-300m"
    use_cpu: true  # Set to false for GPU acceleration (requires CUDA)

  # Observability Configuration
  observability:
    tracing:
      enabled: false  # Enable distributed tracing for docker-compose stack
      provider: "opentelemetry"  # Provider: opentelemetry, openinference, openllmetry
      exporter:
        type: "otlp"  # Export spans to Jaeger (via OTLP gRPC)
        endpoint: "jaeger:4317"  # Jaeger collector inside compose network
        insecure: true  # Use insecure connection (no TLS)
      sampling:
        type: "always_on"  # Sampling: always_on, always_off, probabilistic
        rate: 1.0  # Sampling rate for probabilistic (0.0-1.0)
      resource:
        service_name: "vllm-semantic-router"
        service_version: "v0.1.0"
        deployment_environment: "development"

0 commit comments

Comments
 (0)