Skip to content

Commit c3ce62e

Browse files
authored
Adjust Istio config to latest architecture (#711)
templating logic of the OPENAI_API_KEY Signed-off-by: Sanjeev Rampal <[email protected]>
1 parent 7957612 commit c3ce62e

File tree

6 files changed

+894
-402
lines changed

6 files changed

+894
-402
lines changed

deploy/kubernetes/istio/config.yaml

Lines changed: 138 additions & 99 deletions
Original file line numberDiff line numberDiff line change
@@ -1,72 +1,10 @@
1-
bert_model:
2-
model_id: models/all-MiniLM-L12-v2
3-
threshold: 0.6
4-
use_cpu: true
5-
6-
semantic_cache:
7-
enabled: false
8-
backend_type: "memory" # Options: "memory" or "milvus"
9-
similarity_threshold: 0.8
10-
max_entries: 1000 # Only applies to memory backend
11-
ttl_seconds: 3600
12-
eviction_policy: "fifo"
13-
# Embedding model for semantic similarity matching
14-
# Options: "bert" (fast, 384-dim), "qwen3" (high quality, 1024-dim, 32K context), "gemma" (balanced, 768-dim, 8K context)
15-
embedding_model: "bert" # Default: BERT (fastest, lowest memory for Kubernetes)
16-
17-
tools:
18-
enabled: false
19-
top_k: 3
20-
similarity_threshold: 0.2
21-
tools_db_path: "config/tools_db.json"
22-
fallback_to_empty: true
23-
24-
prompt_guard:
25-
enabled: false # Global default - can be overridden per category with jailbreak_enabled
26-
use_modernbert: true
27-
model_id: "models/jailbreak_classifier_modernbert-base_model"
28-
threshold: 0.7
29-
use_cpu: true
30-
jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json"
31-
32-
# vLLM Endpoints Configuration
33-
# IMPORTANT: 'address' field must be a valid IP address (IPv4 or IPv6)
34-
# Supported formats: 127.0.0.1, 192.168.1.1, ::1, 2001:db8::1
35-
# NOT supported: domain names (example.com), protocol prefixes (http://), paths (/api), ports in address (use 'port' field)
36-
vllm_endpoints:
37-
- name: "endpoint1"
38-
address: "10.98.150.102" # Static IPv4 of llama3-8b k8s service
39-
port: 80
40-
weight: 1
41-
- name: "endpoint2"
42-
address: "10.98.118.242" # Static IPv4 of phi4-mini k8s service
43-
port: 80
44-
weight: 1
45-
461
model_config:
472
"llama3-8b":
48-
# reasoning_family: "" # This model uses Qwen-3 reasoning syntax
49-
preferred_endpoints: ["endpoint1"]
503
allow_by_default: true
514
"phi4-mini":
52-
# reasoning_family: "" # This model uses Qwen-3 reasoning syntax
53-
preferred_endpoints: ["endpoint2"]
545
allow_by_default: true
556

56-
# Classifier configuration
57-
classifier:
58-
category_model:
59-
model_id: "models/category_classifier_modernbert-base_model"
60-
use_modernbert: true
61-
threshold: 0.6
62-
use_cpu: true
63-
category_mapping_path: "models/category_classifier_modernbert-base_model/category_mapping.json"
64-
pii_model:
65-
model_id: "models/pii_classifier_modernbert-base_presidio_token_model"
66-
use_modernbert: true
67-
threshold: 0.7
68-
use_cpu: true
69-
pii_mapping_path: "models/pii_classifier_modernbert-base_presidio_token_model/pii_type_mapping.json"
7+
default_model: "llama3-8b"
708

719
# Categories - now only contain metadata for domain classification
7210
categories:
@@ -101,7 +39,7 @@ decisions:
10139
plugins:
10240
- type: "system_prompt"
10341
configuration:
104-
enabled: true
42+
enabled: false
10543
system_prompt: "You are a senior business consultant and strategic advisor with expertise in corporate strategy, operations management, financial analysis, marketing, and organizational development. Provide practical, actionable business advice backed by proven methodologies and industry best practices. Consider market dynamics, competitive landscape, and stakeholder interests in your recommendations."
10644
mode: "replace"
10745
- name: law
@@ -118,7 +56,7 @@ decisions:
11856
plugins:
11957
- type: "system_prompt"
12058
configuration:
121-
enabled: true
59+
enabled: false
12260
system_prompt: "You are a knowledgeable legal expert with comprehensive understanding of legal principles, case law, statutory interpretation, and legal procedures across multiple jurisdictions. Provide accurate legal information and analysis while clearly stating that your responses are for informational purposes only and do not constitute legal advice. Always recommend consulting with qualified legal professionals for specific legal matters."
12361
mode: "replace"
12462
- name: psychology
@@ -135,12 +73,12 @@ decisions:
13573
plugins:
13674
- type: "system_prompt"
13775
configuration:
138-
enabled: true
76+
enabled: false
13977
system_prompt: "You are a psychology expert with deep knowledge of cognitive processes, behavioral patterns, mental health, developmental psychology, social psychology, and therapeutic approaches. Provide evidence-based insights grounded in psychological research and theory. When discussing mental health topics, emphasize the importance of professional consultation and avoid providing diagnostic or therapeutic advice."
14078
mode: "replace"
14179
- type: "semantic-cache"
14280
configuration:
143-
enabled: true
81+
enabled: false
14482
similarity_threshold: 0.92
14583
- name: biology
14684
description: "Route biology queries"
@@ -156,7 +94,7 @@ decisions:
15694
plugins:
15795
- type: "system_prompt"
15896
configuration:
159-
enabled: true
97+
enabled: false
16098
system_prompt: "You are a biology expert with comprehensive knowledge spanning molecular biology, genetics, cell biology, ecology, evolution, anatomy, physiology, and biotechnology. Explain biological concepts with scientific accuracy, use appropriate terminology, and provide examples from current research. Connect biological principles to real-world applications and emphasize the interconnectedness of biological systems."
16199
mode: "replace"
162100
- name: chemistry
@@ -169,11 +107,11 @@ decisions:
169107
name: "chemistry"
170108
modelRefs:
171109
- model: llama3-8b
172-
use_reasoning: false
110+
use_reasoning: true
173111
plugins:
174112
- type: "system_prompt"
175113
configuration:
176-
enabled: true
114+
enabled: false
177115
system_prompt: "You are a chemistry expert specializing in chemical reactions, molecular structures, and laboratory techniques. Provide detailed, step-by-step explanations."
178116
mode: "replace"
179117
- name: history
@@ -190,7 +128,7 @@ decisions:
190128
plugins:
191129
- type: "system_prompt"
192130
configuration:
193-
enabled: true
131+
enabled: false
194132
system_prompt: "You are a historian with expertise across different time periods and cultures. Provide accurate historical context and analysis."
195133
mode: "replace"
196134
- name: other
@@ -207,12 +145,12 @@ decisions:
207145
plugins:
208146
- type: "system_prompt"
209147
configuration:
210-
enabled: true
148+
enabled: false
211149
system_prompt: "You are a helpful and knowledgeable assistant. Provide accurate, helpful responses across a wide range of topics."
212150
mode: "replace"
213151
- type: "semantic-cache"
214152
configuration:
215-
enabled: true
153+
enabled: false
216154
similarity_threshold: 0.75
217155
- name: health
218156
description: "Route health and medical queries"
@@ -228,12 +166,12 @@ decisions:
228166
plugins:
229167
- type: "system_prompt"
230168
configuration:
231-
enabled: true
169+
enabled: false
232170
system_prompt: "You are a health and medical information expert with knowledge of anatomy, physiology, diseases, treatments, preventive care, nutrition, and wellness. Provide accurate, evidence-based health information while emphasizing that your responses are for educational purposes only and should never replace professional medical advice, diagnosis, or treatment. Always encourage users to consult healthcare professionals for medical concerns and emergencies."
233171
mode: "replace"
234172
- type: "semantic-cache"
235173
configuration:
236-
enabled: true
174+
enabled: false
237175
similarity_threshold: 0.95
238176
- name: economics
239177
description: "Route economics queries"
@@ -249,7 +187,7 @@ decisions:
249187
plugins:
250188
- type: "system_prompt"
251189
configuration:
252-
enabled: true
190+
enabled: false
253191
system_prompt: "You are an economics expert with deep understanding of microeconomics, macroeconomics, econometrics, financial markets, monetary policy, fiscal policy, international trade, and economic theory. Analyze economic phenomena using established economic principles, provide data-driven insights, and explain complex economic concepts in accessible terms. Consider both theoretical frameworks and real-world applications in your responses."
254192
mode: "replace"
255193
- name: math
@@ -266,7 +204,7 @@ decisions:
266204
plugins:
267205
- type: "system_prompt"
268206
configuration:
269-
enabled: true
207+
enabled: false
270208
system_prompt: "You are a mathematics expert. Provide step-by-step solutions, show your work clearly, and explain mathematical concepts in an understandable way."
271209
mode: "replace"
272210
- name: physics
@@ -279,11 +217,11 @@ decisions:
279217
name: "physics"
280218
modelRefs:
281219
- model: llama3-8b
282-
use_reasoning: false
220+
use_reasoning: true
283221
plugins:
284222
- type: "system_prompt"
285223
configuration:
286-
enabled: true
224+
enabled: false
287225
system_prompt: "You are a physics expert with deep understanding of physical laws and phenomena. Provide clear explanations with mathematical derivations when appropriate."
288226
mode: "replace"
289227
- name: computer_science
@@ -300,8 +238,9 @@ decisions:
300238
plugins:
301239
- type: "system_prompt"
302240
configuration:
303-
enabled: true
241+
enabled: false
304242
system_prompt: "You are a computer science expert with knowledge of algorithms, data structures, programming languages, and software engineering. Provide clear, practical solutions with code examples when helpful."
243+
305244
mode: "replace"
306245
- name: philosophy
307246
description: "Route philosophy queries"
@@ -317,11 +256,12 @@ decisions:
317256
plugins:
318257
- type: "system_prompt"
319258
configuration:
320-
enabled: true
259+
enabled: false
321260
system_prompt: "You are a philosophy expert with comprehensive knowledge of philosophical traditions, ethical theories, logic, metaphysics, epistemology, political philosophy, and the history of philosophical thought. Engage with complex philosophical questions by presenting multiple perspectives, analyzing arguments rigorously, and encouraging critical thinking. Draw connections between philosophical concepts and contemporary issues while maintaining intellectual honesty about the complexity and ongoing nature of philosophical debates."
322261
mode: "replace"
323262
- name: engineering
324263
description: "Route engineering queries"
264+
325265
priority: 10
326266
rules:
327267
operator: "OR"
@@ -334,25 +274,114 @@ decisions:
334274
plugins:
335275
- type: "system_prompt"
336276
configuration:
337-
enabled: true
277+
enabled: false
338278
system_prompt: "You are an engineering expert with knowledge across multiple engineering disciplines including mechanical, electrical, civil, chemical, software, and systems engineering. Apply engineering principles, design methodologies, and problem-solving approaches to provide practical solutions. Consider safety, efficiency, sustainability, and cost-effectiveness in your recommendations. Use technical precision while explaining concepts clearly, and emphasize the importance of proper engineering practices and standards."
339279
mode: "replace"
340280

341-
default_model: "llama3-8b"
281+
bert_model:
282+
model_id: models/all-MiniLM-L12-v2
283+
threshold: 0.6
284+
use_cpu: true
285+
286+
semantic_cache:
287+
enabled: false
288+
backend_type: "memory" # Options: "memory", "milvus", or "hybrid"
289+
similarity_threshold: 0.8
290+
max_entries: 1000 # Only applies to memory backend
291+
ttl_seconds: 3600
292+
eviction_policy: "fifo"
293+
# HNSW index configuration (for memory backend only)
294+
use_hnsw: true # Enable HNSW index for faster similarity search
295+
hnsw_m: 16 # Number of bi-directional links (higher = better recall, more memory)
296+
hnsw_ef_construction: 200 # Construction parameter (higher = better quality, slower build)
297+
298+
# Hybrid cache configuration (when backend_type: "hybrid")
299+
# Combines in-memory HNSW for fast search with Milvus for scalable storage
300+
# max_memory_entries: 100000 # Max entries in HNSW index (default: 100,000)
301+
# backend_config_path: "config/milvus.yaml" # Path to Milvus config
302+
303+
# Embedding model for semantic similarity matching
304+
# Options: "bert" (fast, 384-dim), "qwen3" (high quality, 1024-dim, 32K context), "gemma" (balanced, 768-dim, 8K context)
305+
# Default: "bert" (fastest, lowest memory)
306+
embedding_model: "bert"
342307

343-
# Auto model name for automatic model selection (optional)
344-
# This is the model name that clients should use to trigger automatic model selection
345-
# If not specified, defaults to "MoM" (Mixture of Models)
346-
# For backward compatibility, "auto" is always accepted as an alias
347-
# Example: auto_model_name: "MoM" # or any other name you prefer
348-
# auto_model_name: "MoM"
308+
tools:
309+
enabled: false
310+
top_k: 3
311+
similarity_threshold: 0.2
312+
tools_db_path: "config/tools_db.json"
313+
fallback_to_empty: true
349314

350-
# Include configured models in /v1/models list endpoint (optional, default: false)
351-
# When false (default): only the auto model name is returned in the /v1/models endpoint
352-
# When true: all models configured in model_config are also included in the /v1/models endpoint
353-
# This is useful for clients that need to discover all available models
354-
# Example: include_config_models_in_list: true
355-
# include_config_models_in_list: false
315+
prompt_guard:
316+
enabled: false # Global default - can be overridden per category with jailbreak_enabled
317+
use_modernbert: true
318+
model_id: "models/jailbreak_classifier_modernbert-base_model"
319+
threshold: 0.7
320+
use_cpu: true
321+
jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json"
322+
323+
# Classifier configuration
324+
classifier:
325+
category_model:
326+
model_id: "models/category_classifier_modernbert-base_model"
327+
use_modernbert: true
328+
threshold: 0.6
329+
use_cpu: true
330+
category_mapping_path: "models/category_classifier_modernbert-base_model/category_mapping.json"
331+
pii_model:
332+
model_id: "models/pii_classifier_modernbert-base_presidio_token_model"
333+
use_modernbert: true
334+
threshold: 0.7
335+
use_cpu: true
336+
pii_mapping_path: "models/pii_classifier_modernbert-base_presidio_token_model/pii_type_mapping.json"
337+
338+
339+
# Router Configuration for Dual-Path Selection
340+
router:
341+
# High confidence threshold for automatic LoRA selection
342+
high_confidence_threshold: 0.99
343+
# Low latency threshold in milliseconds for LoRA path selection
344+
low_latency_threshold_ms: 2000
345+
# Baseline scores for path evaluation
346+
lora_baseline_score: 0.8
347+
traditional_baseline_score: 0.7
348+
embedding_baseline_score: 0.75
349+
# Success rate calculation threshold
350+
success_confidence_threshold: 0.8
351+
# Large batch size threshold for parallel processing
352+
large_batch_threshold: 4
353+
# Default performance metrics (milliseconds)
354+
lora_default_execution_time_ms: 1345
355+
traditional_default_execution_time_ms: 4567
356+
# Default processing requirements
357+
default_confidence_threshold: 0.95
358+
default_max_latency_ms: 5000
359+
default_batch_size: 4
360+
default_avg_execution_time_ms: 3000
361+
# Default confidence and success rates
362+
lora_default_confidence: 0.99
363+
traditional_default_confidence: 0.95
364+
lora_default_success_rate: 0.98
365+
traditional_default_success_rate: 0.95
366+
# Scoring weights for intelligent path selection (balanced approach)
367+
multi_task_lora_weight: 0.30 # LoRA advantage for multi-task processing
368+
single_task_traditional_weight: 0.30 # Traditional advantage for single tasks
369+
large_batch_lora_weight: 0.25 # LoRA advantage for large batches (≥4)
370+
small_batch_traditional_weight: 0.25 # Traditional advantage for single items
371+
medium_batch_weight: 0.10 # Neutral weight for medium batches (2-3)
372+
high_confidence_lora_weight: 0.25 # LoRA advantage for high confidence (≥0.99)
373+
low_confidence_traditional_weight: 0.25 # Traditional for lower confidence (≤0.9)
374+
low_latency_lora_weight: 0.30 # LoRA advantage for low latency (≤2000ms)
375+
high_latency_traditional_weight: 0.10 # Traditional acceptable for relaxed timing
376+
performance_history_weight: 0.20 # Historical performance comparison factor
377+
# Traditional model specific configurations
378+
traditional_bert_confidence_threshold: 0.95 # Traditional BERT confidence threshold
379+
traditional_modernbert_confidence_threshold: 0.8 # Traditional ModernBERT confidence threshold
380+
traditional_pii_detection_threshold: 0.5 # Traditional PII detection confidence threshold
381+
traditional_token_classification_threshold: 0.9 # Traditional token classification threshold
382+
traditional_dropout_prob: 0.1 # Traditional model dropout probability
383+
traditional_attention_dropout_prob: 0.1 # Traditional model attention dropout probability
384+
tie_break_confidence: 0.5 # Confidence value for tie-breaking situations
356385

357386
# Reasoning family configurations
358387
reasoning_families:
@@ -371,12 +400,12 @@ reasoning_families:
371400
type: "reasoning_effort"
372401
parameter: "reasoning_effort"
373402

374-
# Global default reasoning effort level
375-
default_reasoning_effort: high
376-
377403
# Gateway route cache clearing
378404
clear_route_cache: true # Enable for some gateways such as Istio
379405

406+
# Global default reasoning effort level
407+
default_reasoning_effort: high
408+
380409
# API Configuration
381410
api:
382411
batch_classification:
@@ -392,10 +421,19 @@ api:
392421
[0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30]
393422
size_buckets: [1, 2, 5, 10, 20, 50, 100, 200]
394423

424+
# Embedding Models Configuration
425+
# These models provide intelligent embedding generation with automatic routing:
426+
# - Qwen3-Embedding-0.6B: Up to 32K context, high quality
427+
# - EmbeddingGemma-300M: Up to 8K context, fast inference, Matryoshka support (768/512/256/128)
428+
embedding_models:
429+
qwen3_model_path: "models/Qwen3-Embedding-0.6B"
430+
# gemma_model_path: "models/embeddinggemma-300m"
431+
use_cpu: true # Set to false for GPU acceleration (requires CUDA)
432+
395433
# Observability Configuration
396434
observability:
397435
tracing:
398-
enabled: true # Enable distributed tracing for docker-compose stack
436+
enabled: false # Distributed tracing is disabled here; set to true to enable it
399437
provider: "opentelemetry" # Provider: opentelemetry, openinference, openllmetry
400438
exporter:
401439
type: "otlp" # Export spans to Jaeger (via OTLP gRPC)
@@ -408,3 +446,4 @@ observability:
408446
service_name: "vllm-semantic-router"
409447
service_version: "v0.1.0"
410448
deployment_environment: "development"
449+

0 commit comments

Comments
 (0)