diff --git a/deploy/kubernetes/istio/config.yaml b/deploy/kubernetes/istio/config.yaml index 8f0fd1a40..7ff964fc2 100644 --- a/deploy/kubernetes/istio/config.yaml +++ b/deploy/kubernetes/istio/config.yaml @@ -1,72 +1,10 @@ -bert_model: - model_id: models/all-MiniLM-L12-v2 - threshold: 0.6 - use_cpu: true - -semantic_cache: - enabled: false - backend_type: "memory" # Options: "memory" or "milvus" - similarity_threshold: 0.8 - max_entries: 1000 # Only applies to memory backend - ttl_seconds: 3600 - eviction_policy: "fifo" - # Embedding model for semantic similarity matching - # Options: "bert" (fast, 384-dim), "qwen3" (high quality, 1024-dim, 32K context), "gemma" (balanced, 768-dim, 8K context) - embedding_model: "bert" # Default: BERT (fastest, lowest memory for Kubernetes) - -tools: - enabled: false - top_k: 3 - similarity_threshold: 0.2 - tools_db_path: "config/tools_db.json" - fallback_to_empty: true - -prompt_guard: - enabled: false # Global default - can be overridden per category with jailbreak_enabled - use_modernbert: true - model_id: "models/jailbreak_classifier_modernbert-base_model" - threshold: 0.7 - use_cpu: true - jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json" - -# vLLM Endpoints Configuration -# IMPORTANT: 'address' field must be a valid IP address (IPv4 or IPv6) -# Supported formats: 127.0.0.1, 192.168.1.1, ::1, 2001:db8::1 -# NOT supported: domain names (example.com), protocol prefixes (http://), paths (/api), ports in address (use 'port' field) -vllm_endpoints: - - name: "endpoint1" - address: "10.98.150.102" # Static IPv4 of llama3-8b k8s service - port: 80 - weight: 1 - - name: "endpoint2" - address: "10.98.118.242" # Static IPv4 of phi4-mini k8s service - port: 80 - weight: 1 - model_config: "llama3-8b": - # reasoning_family: "" # This model uses Qwen-3 reasoning syntax - preferred_endpoints: ["endpoint1"] allow_by_default: true "phi4-mini": - # reasoning_family: "" # This model uses Qwen-3 reasoning syntax - preferred_endpoints: ["endpoint2"] allow_by_default: true -# Classifier configuration -classifier: - category_model: - model_id: "models/category_classifier_modernbert-base_model" - use_modernbert: true - threshold: 0.6 - use_cpu: true - category_mapping_path: "models/category_classifier_modernbert-base_model/category_mapping.json" - pii_model: - model_id: "models/pii_classifier_modernbert-base_presidio_token_model" - use_modernbert: true - threshold: 0.7 - use_cpu: true - pii_mapping_path: "models/pii_classifier_modernbert-base_presidio_token_model/pii_type_mapping.json" +default_model: "llama3-8b" # Categories - now only contain metadata for domain classification categories: @@ -101,7 +39,7 @@ decisions: plugins: - type: "system_prompt" configuration: - enabled: true + enabled: false system_prompt: "You are a senior business consultant and strategic advisor with expertise in corporate strategy, operations management, financial analysis, marketing, and organizational development. Provide practical, actionable business advice backed by proven methodologies and industry best practices. Consider market dynamics, competitive landscape, and stakeholder interests in your recommendations." 
mode: "replace" - name: law @@ -118,7 +56,7 @@ decisions: plugins: - type: "system_prompt" configuration: - enabled: true + enabled: false system_prompt: "You are a knowledgeable legal expert with comprehensive understanding of legal principles, case law, statutory interpretation, and legal procedures across multiple jurisdictions. Provide accurate legal information and analysis while clearly stating that your responses are for informational purposes only and do not constitute legal advice. Always recommend consulting with qualified legal professionals for specific legal matters." mode: "replace" - name: psychology @@ -135,12 +73,12 @@ decisions: plugins: - type: "system_prompt" configuration: - enabled: true + enabled: false system_prompt: "You are a psychology expert with deep knowledge of cognitive processes, behavioral patterns, mental health, developmental psychology, social psychology, and therapeutic approaches. Provide evidence-based insights grounded in psychological research and theory. When discussing mental health topics, emphasize the importance of professional consultation and avoid providing diagnostic or therapeutic advice." mode: "replace" - type: "semantic-cache" configuration: - enabled: true + enabled: false similarity_threshold: 0.92 - name: biology description: "Route biology queries" @@ -156,7 +94,7 @@ decisions: plugins: - type: "system_prompt" configuration: - enabled: true + enabled: false system_prompt: "You are a biology expert with comprehensive knowledge spanning molecular biology, genetics, cell biology, ecology, evolution, anatomy, physiology, and biotechnology. Explain biological concepts with scientific accuracy, use appropriate terminology, and provide examples from current research. Connect biological principles to real-world applications and emphasize the interconnectedness of biological systems." mode: "replace" - name: chemistry @@ -169,11 +107,11 @@ decisions: name: "chemistry" modelRefs: - model: llama3-8b - use_reasoning: false + use_reasoning: true plugins: - type: "system_prompt" configuration: - enabled: true + enabled: false system_prompt: "You are a chemistry expert specializing in chemical reactions, molecular structures, and laboratory techniques. Provide detailed, step-by-step explanations." mode: "replace" - name: history @@ -190,7 +128,7 @@ decisions: plugins: - type: "system_prompt" configuration: - enabled: true + enabled: false system_prompt: "You are a historian with expertise across different time periods and cultures. Provide accurate historical context and analysis." mode: "replace" - name: other @@ -207,12 +145,12 @@ decisions: plugins: - type: "system_prompt" configuration: - enabled: true + enabled: false system_prompt: "You are a helpful and knowledgeable assistant. Provide accurate, helpful responses across a wide range of topics." mode: "replace" - type: "semantic-cache" configuration: - enabled: true + enabled: false similarity_threshold: 0.75 - name: health description: "Route health and medical queries" @@ -228,12 +166,12 @@ decisions: plugins: - type: "system_prompt" configuration: - enabled: true + enabled: false system_prompt: "You are a health and medical information expert with knowledge of anatomy, physiology, diseases, treatments, preventive care, nutrition, and wellness. Provide accurate, evidence-based health information while emphasizing that your responses are for educational purposes only and should never replace professional medical advice, diagnosis, or treatment. 
Always encourage users to consult healthcare professionals for medical concerns and emergencies." mode: "replace" - type: "semantic-cache" configuration: - enabled: true + enabled: false similarity_threshold: 0.95 - name: economics description: "Route economics queries" @@ -249,7 +187,7 @@ decisions: plugins: - type: "system_prompt" configuration: - enabled: true + enabled: false system_prompt: "You are an economics expert with deep understanding of microeconomics, macroeconomics, econometrics, financial markets, monetary policy, fiscal policy, international trade, and economic theory. Analyze economic phenomena using established economic principles, provide data-driven insights, and explain complex economic concepts in accessible terms. Consider both theoretical frameworks and real-world applications in your responses." mode: "replace" - name: math @@ -266,7 +204,7 @@ decisions: plugins: - type: "system_prompt" configuration: - enabled: true + enabled: false system_prompt: "You are a mathematics expert. Provide step-by-step solutions, show your work clearly, and explain mathematical concepts in an understandable way." mode: "replace" - name: physics @@ -279,11 +217,11 @@ decisions: name: "physics" modelRefs: - model: llama3-8b - use_reasoning: false + use_reasoning: true plugins: - type: "system_prompt" configuration: - enabled: true + enabled: false system_prompt: "You are a physics expert with deep understanding of physical laws and phenomena. Provide clear explanations with mathematical derivations when appropriate." mode: "replace" - name: computer_science @@ -300,8 +238,9 @@ decisions: plugins: - type: "system_prompt" configuration: - enabled: true + enabled: false system_prompt: "You are a computer science expert with knowledge of algorithms, data structures, programming languages, and software engineering. Provide clear, practical solutions with code examples when helpful." + mode: "replace" - name: philosophy description: "Route philosophy queries" @@ -317,11 +256,12 @@ decisions: plugins: - type: "system_prompt" configuration: - enabled: true + enabled: false system_prompt: "You are a philosophy expert with comprehensive knowledge of philosophical traditions, ethical theories, logic, metaphysics, epistemology, political philosophy, and the history of philosophical thought. Engage with complex philosophical questions by presenting multiple perspectives, analyzing arguments rigorously, and encouraging critical thinking. Draw connections between philosophical concepts and contemporary issues while maintaining intellectual honesty about the complexity and ongoing nature of philosophical debates." mode: "replace" - name: engineering description: "Route engineering queries" + priority: 10 rules: operator: "OR" @@ -334,25 +274,114 @@ decisions: plugins: - type: "system_prompt" configuration: - enabled: true + enabled: false system_prompt: "You are an engineering expert with knowledge across multiple engineering disciplines including mechanical, electrical, civil, chemical, software, and systems engineering. Apply engineering principles, design methodologies, and problem-solving approaches to provide practical solutions. Consider safety, efficiency, sustainability, and cost-effectiveness in your recommendations. Use technical precision while explaining concepts clearly, and emphasize the importance of proper engineering practices and standards." 
mode: "replace" -default_model: "llama3-8b" +bert_model: + model_id: models/all-MiniLM-L12-v2 + threshold: 0.6 + use_cpu: true + +semantic_cache: + enabled: false + backend_type: "memory" # Options: "memory", "milvus", or "hybrid" + similarity_threshold: 0.8 + max_entries: 1000 # Only applies to memory backend + ttl_seconds: 3600 + eviction_policy: "fifo" + # HNSW index configuration (for memory backend only) + use_hnsw: true # Enable HNSW index for faster similarity search + hnsw_m: 16 # Number of bi-directional links (higher = better recall, more memory) + hnsw_ef_construction: 200 # Construction parameter (higher = better quality, slower build) + + # Hybrid cache configuration (when backend_type: "hybrid") + # Combines in-memory HNSW for fast search with Milvus for scalable storage + # max_memory_entries: 100000 # Max entries in HNSW index (default: 100,000) + # backend_config_path: "config/milvus.yaml" # Path to Milvus config + + # Embedding model for semantic similarity matching + # Options: "bert" (fast, 384-dim), "qwen3" (high quality, 1024-dim, 32K context), "gemma" (balanced, 768-dim, 8K context) + # Default: "bert" (fastest, lowest memory) + embedding_model: "bert" -# Auto model name for automatic model selection (optional) -# This is the model name that clients should use to trigger automatic model selection -# If not specified, defaults to "MoM" (Mixture of Models) -# For backward compatibility, "auto" is always accepted as an alias -# Example: auto_model_name: "MoM" # or any other name you prefer -# auto_model_name: "MoM" +tools: + enabled: false + top_k: 3 + similarity_threshold: 0.2 + tools_db_path: "config/tools_db.json" + fallback_to_empty: true -# Include configured models in /v1/models list endpoint (optional, default: false) -# When false (default): only the auto model name is returned in the /v1/models endpoint -# When true: all models configured in model_config are also included in the /v1/models endpoint -# This is useful for clients that need to discover all available models -# Example: include_config_models_in_list: true -# include_config_models_in_list: false +prompt_guard: + enabled: false # Global default - can be overridden per category with jailbreak_enabled + use_modernbert: true + model_id: "models/jailbreak_classifier_modernbert-base_model" + threshold: 0.7 + use_cpu: true + jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json" + +# Classifier configuration +classifier: + category_model: + model_id: "models/category_classifier_modernbert-base_model" + use_modernbert: true + threshold: 0.6 + use_cpu: true + category_mapping_path: "models/category_classifier_modernbert-base_model/category_mapping.json" + pii_model: + model_id: "models/pii_classifier_modernbert-base_presidio_token_model" + use_modernbert: true + threshold: 0.7 + use_cpu: true + pii_mapping_path: "models/pii_classifier_modernbert-base_presidio_token_model/pii_type_mapping.json" + + +# Router Configuration for Dual-Path Selection +router: + # High confidence threshold for automatic LoRA selection + high_confidence_threshold: 0.99 + # Low latency threshold in milliseconds for LoRA path selection + low_latency_threshold_ms: 2000 + # Baseline scores for path evaluation + lora_baseline_score: 0.8 + traditional_baseline_score: 0.7 + embedding_baseline_score: 0.75 + # Success rate calculation threshold + success_confidence_threshold: 0.8 + # Large batch size threshold for parallel processing + large_batch_threshold: 4 + # Default performance metrics 
(milliseconds) + lora_default_execution_time_ms: 1345 + traditional_default_execution_time_ms: 4567 + # Default processing requirements + default_confidence_threshold: 0.95 + default_max_latency_ms: 5000 + default_batch_size: 4 + default_avg_execution_time_ms: 3000 + # Default confidence and success rates + lora_default_confidence: 0.99 + traditional_default_confidence: 0.95 + lora_default_success_rate: 0.98 + traditional_default_success_rate: 0.95 + # Scoring weights for intelligent path selection (balanced approach) + multi_task_lora_weight: 0.30 # LoRA advantage for multi-task processing + single_task_traditional_weight: 0.30 # Traditional advantage for single tasks + large_batch_lora_weight: 0.25 # LoRA advantage for large batches (≥4) + small_batch_traditional_weight: 0.25 # Traditional advantage for single items + medium_batch_weight: 0.10 # Neutral weight for medium batches (2-3) + high_confidence_lora_weight: 0.25 # LoRA advantage for high confidence (≥0.99) + low_confidence_traditional_weight: 0.25 # Traditional for lower confidence (≤0.9) + low_latency_lora_weight: 0.30 # LoRA advantage for low latency (≤2000ms) + high_latency_traditional_weight: 0.10 # Traditional acceptable for relaxed timing + performance_history_weight: 0.20 # Historical performance comparison factor + # Traditional model specific configurations + traditional_bert_confidence_threshold: 0.95 # Traditional BERT confidence threshold + traditional_modernbert_confidence_threshold: 0.8 # Traditional ModernBERT confidence threshold + traditional_pii_detection_threshold: 0.5 # Traditional PII detection confidence threshold + traditional_token_classification_threshold: 0.9 # Traditional token classification threshold + traditional_dropout_prob: 0.1 # Traditional model dropout probability + traditional_attention_dropout_prob: 0.1 # Traditional model attention dropout probability + tie_break_confidence: 0.5 # Confidence value for tie-breaking situations # Reasoning family configurations reasoning_families: @@ -371,12 +400,12 @@ reasoning_families: type: "reasoning_effort" parameter: "reasoning_effort" -# Global default reasoning effort level -default_reasoning_effort: high - # Gateway route cache clearing clear_route_cache: true # Enable for some gateways such as Istio +# Global default reasoning effort level +default_reasoning_effort: high + # API Configuration api: batch_classification: @@ -392,10 +421,19 @@ api: [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30] size_buckets: [1, 2, 5, 10, 20, 50, 100, 200] +# Embedding Models Configuration +# These models provide intelligent embedding generation with automatic routing: +# - Qwen3-Embedding-0.6B: Up to 32K context, high quality, +# - EmbeddingGemma-300M: Up to 8K context, fast inference, Matryoshka support (768/512/256/128) +embedding_models: + qwen3_model_path: "models/Qwen3-Embedding-0.6B" +# gemma_model_path: "models/embeddinggemma-300m" + use_cpu: true # Set to false for GPU acceleration (requires CUDA) + # Observability Configuration observability: tracing: - enabled: true # Enable distributed tracing for docker-compose stack + enabled: false # Enable distributed tracing for docker-compose stack provider: "opentelemetry" # Provider: opentelemetry, openinference, openllmetry exporter: type: "otlp" # Export spans to Jaeger (via OTLP gRPC) @@ -408,3 +446,4 @@ observability: service_name: "vllm-semantic-router" service_version: "v0.1.0" deployment_environment: "development" + diff --git a/deploy/kubernetes/istio/deployment.yaml 
b/deploy/kubernetes/istio/deployment.yaml index 1830aacf0..4152ec993 100644 --- a/deploy/kubernetes/istio/deployment.yaml +++ b/deploy/kubernetes/istio/deployment.yaml @@ -32,6 +32,7 @@ spec: "pii_classifier_modernbert-base_model" "jailbreak_classifier_modernbert-base_model" "pii_classifier_modernbert-base_presidio_token_model" + "Qwen3-Embedding-0.6B" ) mkdir -p /app/models cd /app/models @@ -92,19 +93,30 @@ spec: echo "PII token classifier model already exists, skipping..." fi + # Download Qwen3 Embedding model + if [ ! -f "Qwen3-Embedding-0.6B/model.safetensors" ]; then + echo "Downloading Qwen3-Embedding-0.6B model (missing model weights)..." + rm -rf Qwen3-Embedding-0.6B + hf download Qwen/Qwen3-Embedding-0.6B --local-dir Qwen3-Embedding-0.6B + echo "Downloaded Qwen3-Embedding-0.6B files:" + ls -la Qwen3-Embedding-0.6B/ + else + echo "Qwen3-Embedding-0.6B model already exists with weights, skipping..." + fi + echo "All missing models downloaded successfully!" ls -la /app/models/ env: - name: HF_HUB_CACHE value: /tmp/hf_cache - # Reduced resource requirements for init container + # Increased resource requirements for init container to prevent timeouts during model download resources: requests: - memory: "512Mi" - cpu: "250m" - limits: memory: "1Gi" cpu: "500m" + limits: + memory: "2Gi" + cpu: "1" volumeMounts: - name: models-volume mountPath: /app/models @@ -137,25 +149,25 @@ spec: livenessProbe: tcpSocket: port: 50051 - initialDelaySeconds: 60 + initialDelaySeconds: 300 # Wait 5 minutes for model loading periodSeconds: 30 timeoutSeconds: 10 - failureThreshold: 3 + failureThreshold: 5 # Allow more failures readinessProbe: tcpSocket: port: 50051 - initialDelaySeconds: 90 + initialDelaySeconds: 300 # Wait 5 minutes for model loading periodSeconds: 30 timeoutSeconds: 10 - failureThreshold: 3 - # Significantly reduced resource requirements for kind cluster + failureThreshold: 5 # Allow more failures + # Increased memory for multiple models including Qwen3-Embedding-0.6B resources: requests: - memory: "3Gi" # Reduced from 8Gi - cpu: "1" # Reduced from 2 + memory: "4Gi" # Increased to handle all models + cpu: "1" limits: - memory: "6Gi" # Reduced from 12Gi - cpu: "2" # Reduced from 4 + memory: "8Gi" # Increased to prevent OOMKill + cpu: "2" volumes: - name: config-volume configMap: diff --git a/deploy/kubernetes/llmd-base/llmd+public-llm/README.md b/deploy/kubernetes/llmd-base/llmd+public-llm/README.md index d2485ceaa..534eee9ca 100644 --- a/deploy/kubernetes/llmd-base/llmd+public-llm/README.md +++ b/deploy/kubernetes/llmd-base/llmd+public-llm/README.md @@ -201,9 +201,18 @@ kubectl patch deployment inference-gateway-istio --type='json' -p='[ kubectl exec -it deploy/inference-gateway-istio -- printenv | grep OPENAI_API_KEY ``` -## Step 13: Create HTTPRoutes for Local LLM and for the OpenAI target +## Step 13: Patch the OPENAI_API_KEY into the HTTPRoute for OpenAI -Deploy the HTTPRoute manifest for the openai route destination. In the provided manifest note again that we match on the contents of the x-selected-model and also setup the injection of the OpenAI api key as a bearer token for enabling the access into OpenAI api for this route. For the local LLM we use a route similar to the llm-d guide since we want the prompt query to also get routed via the inferencepool and LLM-D scheduler for the Llama pool which will then pick one of the multiple endpoints in the pool serving the Llama LLM in this example. 
+Patch the OPENAI_API_KEY from your environment into a template file to generate the manifest for the HTTPRoute representing the OpenAI target. Note that this step makes step 12 optional; we keep step 12 listed in case you have other automation for generating the httproute manifest while templating in the value of the OPENAI_API_KEY. + +```bash +## Patch the OPENAI_API_KEY into the template to create the httproute manifest file +sed "s/{{OPENAI_API_KEY}}/$OPENAI_API_KEY/g" deploy/kubernetes/llmd-base/llmd+public-llm/httproute-openai.template > deploy/kubernetes/llmd-base/llmd+public-llm/httproute-openai.yaml +``` + +## Step 14: Create HTTPRoutes for Local LLM and for the OpenAI target + +Now deploy the HTTPRoute manifest for the OpenAI route destination. In the manifest, note again that we match on the contents of the x-selected-model header and also set up injection of the OpenAI API key as a bearer token, enabling access to the OpenAI API for this route. For the local LLM we use a route similar to the llm-d guide, since we want the prompt query to also be routed via the InferencePool and LLM-D scheduler for the Llama pool, which then picks one of the multiple endpoints in the pool serving the Llama LLM in this example. ```bash ## HTTpRoute for OpenAI @@ -216,7 +225,7 @@ kc apply -f deploy/kubernetes/llmd-base/llmd+public-llm/httproute-openai.yaml kubectl apply -f deploy/kubernetes/llmd-base/httproute-llama-pool.yaml ``` -## Step 14: Testing the Deployment +## Step 15: Testing the Deployment To expose the IP on which the Istio gateway listens to client requests from outside the cluster, you can choose any standard kubernetes option for external load balancing. We tested our feature by [deploying and configuring metallb](https://metallb.universe.tf/installation/) into the cluster to be the LoadBalancer provider. Please refer to metallb documentation for installation procedures if needed. Finally, for the minikube case, we get the external url as shown below.
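A minimal sketch for the minikube case, assuming metallb has assigned an external IP to the gateway Service; the Service name `inference-gateway-istio` is an assumption taken from the deployment referenced earlier in this guide, so adjust it to your cluster:

```bash
# Look up the external IP that metallb assigned to the Istio gateway Service.
# The Service name below is an assumption -- adjust it to your deployment.
GATEWAY_IP=$(kubectl get svc inference-gateway-istio \
  -o jsonpath='{.status.loadBalancer.ingress[0].ip}')

# Smoke-test the gateway by listing the models it exposes.
curl -s "http://${GATEWAY_IP}/v1/models"
```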
```bash diff --git a/deploy/kubernetes/llmd-base/llmd+public-llm/config.yaml.local b/deploy/kubernetes/llmd-base/llmd+public-llm/config.yaml.local index 0e4cdbca1..7ff964fc2 100644 --- a/deploy/kubernetes/llmd-base/llmd+public-llm/config.yaml.local +++ b/deploy/kubernetes/llmd-base/llmd+public-llm/config.yaml.local @@ -1,188 +1,387 @@ -bert_model: - model_id: models/all-MiniLM-L12-v2 - threshold: 0.6 - use_cpu: true - -semantic_cache: - enabled: false - backend_type: "memory" # Options: "memory" or "milvus" - similarity_threshold: 0.8 - max_entries: 1000 # Only applies to memory backend - ttl_seconds: 3600 - eviction_policy: "fifo" - # Embedding model for semantic similarity matching - # Options: "bert" (fast, 384-dim), "qwen3" (high quality, 1024-dim, 32K context), "gemma" (balanced, 768-dim, 8K context) - embedding_model: "bert" # Default: BERT (fastest, lowest memory for Kubernetes) - -tools: - enabled: false - top_k: 3 - similarity_threshold: 0.2 - tools_db_path: "config/tools_db.json" - fallback_to_empty: true - -prompt_guard: - enabled: false # Global default - can be overridden per category with jailbreak_enabled - use_modernbert: true - model_id: "models/jailbreak_classifier_modernbert-base_model" - threshold: 0.7 - use_cpu: true - jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json" - -# vLLM Endpoints Configuration -# IMPORTANT: 'address' field must be a valid IP address (IPv4 or IPv6) -# Supported formats: 127.0.0.1, 192.168.1.1, ::1, 2001:db8::1 -# NOT supported: domain names (example.com), protocol prefixes (http://), paths (/api), ports in address (use 'port' field) -vllm_endpoints: - - name: "endpoint1" - address: "10.98.150.102" # Static IPv4 of llama3-8b k8s service - port: 80 - weight: 1 - - name: "endpoint2" - address: "10.98.118.242" # Static IPv4 of phi4-mini k8s service - port: 80 - weight: 1 - model_config: "llama3-8b": - # reasoning_family: "" # This model uses Qwen-3 reasoning syntax - preferred_endpoints: ["endpoint1"] - pii_policy: - allow_by_default: true + allow_by_default: true "phi4-mini": - # reasoning_family: "" # This model uses Qwen-3 reasoning syntax - preferred_endpoints: ["endpoint2"] - pii_policy: - allow_by_default: true + allow_by_default: true -# Classifier configuration -classifier: - category_model: - model_id: "models/category_classifier_modernbert-base_model" - use_modernbert: true - threshold: 0.6 - use_cpu: true - category_mapping_path: "models/category_classifier_modernbert-base_model/category_mapping.json" - pii_model: - model_id: "models/pii_classifier_modernbert-base_presidio_token_model" - use_modernbert: true - threshold: 0.7 - use_cpu: true - pii_mapping_path: "models/pii_classifier_modernbert-base_presidio_token_model/pii_type_mapping.json" +default_model: "llama3-8b" -# Categories with new use_reasoning field structure +# Categories - now only contain metadata for domain classification categories: - name: business - system_prompt: "You are a senior business consultant and strategic advisor with expertise in corporate strategy, operations management, financial analysis, marketing, and organizational development. Provide practical, actionable business advice backed by proven methodologies and industry best practices. Consider market dynamics, competitive landscape, and stakeholder interests in your recommendations." 
- # jailbreak_enabled: true # Optional: Override global jailbreak detection per category - # jailbreak_threshold: 0.8 # Optional: Override global jailbreak threshold per category - model_scores: + - name: law + - name: psychology + - name: biology + - name: chemistry + - name: history + - name: other + - name: health + - name: economics + - name: math + - name: physics + - name: computer science + - name: philosophy + - name: engineering + +# Decisions - define routing logic with rules, model selection, and plugins +decisions: + - name: business + description: "Route business and management queries" + priority: 10 + rules: + operator: "OR" + conditions: + - type: "domain" + name: "business" + modelRefs: - model: llama3-8b - score: 0.8 - use_reasoning: false # Business performs better without reasoning - - model: phi4-mini - score: 0.3 - use_reasoning: false # Business performs better without reasoning + use_reasoning: false + plugins: + - type: "system_prompt" + configuration: + enabled: false + system_prompt: "You are a senior business consultant and strategic advisor with expertise in corporate strategy, operations management, financial analysis, marketing, and organizational development. Provide practical, actionable business advice backed by proven methodologies and industry best practices. Consider market dynamics, competitive landscape, and stakeholder interests in your recommendations." + mode: "replace" - name: law - system_prompt: "You are a knowledgeable legal expert with comprehensive understanding of legal principles, case law, statutory interpretation, and legal procedures across multiple jurisdictions. Provide accurate legal information and analysis while clearly stating that your responses are for informational purposes only and do not constitute legal advice. Always recommend consulting with qualified legal professionals for specific legal matters." - model_scores: + description: "Route legal queries" + priority: 10 + rules: + operator: "OR" + conditions: + - type: "domain" + name: "law" + modelRefs: - model: llama3-8b - score: 0.4 use_reasoning: false + plugins: + - type: "system_prompt" + configuration: + enabled: false + system_prompt: "You are a knowledgeable legal expert with comprehensive understanding of legal principles, case law, statutory interpretation, and legal procedures across multiple jurisdictions. Provide accurate legal information and analysis while clearly stating that your responses are for informational purposes only and do not constitute legal advice. Always recommend consulting with qualified legal professionals for specific legal matters." + mode: "replace" - name: psychology - system_prompt: "You are a psychology expert with deep knowledge of cognitive processes, behavioral patterns, mental health, developmental psychology, social psychology, and therapeutic approaches. Provide evidence-based insights grounded in psychological research and theory. When discussing mental health topics, emphasize the importance of professional consultation and avoid providing diagnostic or therapeutic advice." 
- semantic_cache_enabled: true - semantic_cache_similarity_threshold: 0.92 # High threshold for psychology - sensitive to nuances - model_scores: + description: "Route psychology queries" + priority: 10 + rules: + operator: "OR" + conditions: + - type: "domain" + name: "psychology" + modelRefs: - model: llama3-8b - score: 0.6 use_reasoning: false + plugins: + - type: "system_prompt" + configuration: + enabled: false + system_prompt: "You are a psychology expert with deep knowledge of cognitive processes, behavioral patterns, mental health, developmental psychology, social psychology, and therapeutic approaches. Provide evidence-based insights grounded in psychological research and theory. When discussing mental health topics, emphasize the importance of professional consultation and avoid providing diagnostic or therapeutic advice." + mode: "replace" + - type: "semantic-cache" + configuration: + enabled: false + similarity_threshold: 0.92 - name: biology - system_prompt: "You are a biology expert with comprehensive knowledge spanning molecular biology, genetics, cell biology, ecology, evolution, anatomy, physiology, and biotechnology. Explain biological concepts with scientific accuracy, use appropriate terminology, and provide examples from current research. Connect biological principles to real-world applications and emphasize the interconnectedness of biological systems." - model_scores: + description: "Route biology queries" + priority: 10 + rules: + operator: "OR" + conditions: + - type: "domain" + name: "biology" + modelRefs: - model: llama3-8b - score: 0.9 use_reasoning: false + plugins: + - type: "system_prompt" + configuration: + enabled: false + system_prompt: "You are a biology expert with comprehensive knowledge spanning molecular biology, genetics, cell biology, ecology, evolution, anatomy, physiology, and biotechnology. Explain biological concepts with scientific accuracy, use appropriate terminology, and provide examples from current research. Connect biological principles to real-world applications and emphasize the interconnectedness of biological systems." + mode: "replace" - name: chemistry - system_prompt: "You are a chemistry expert specializing in chemical reactions, molecular structures, and laboratory techniques. Provide detailed, step-by-step explanations." - model_scores: + description: "Route chemistry queries" + priority: 10 + rules: + operator: "OR" + conditions: + - type: "domain" + name: "chemistry" + modelRefs: - model: llama3-8b - score: 0.6 - use_reasoning: false # Enable reasoning for complex chemistry + use_reasoning: true + plugins: + - type: "system_prompt" + configuration: + enabled: false + system_prompt: "You are a chemistry expert specializing in chemical reactions, molecular structures, and laboratory techniques. Provide detailed, step-by-step explanations." + mode: "replace" - name: history - system_prompt: "You are a historian with expertise across different time periods and cultures. Provide accurate historical context and analysis." - model_scores: + description: "Route history queries" + priority: 10 + rules: + operator: "OR" + conditions: + - type: "domain" + name: "history" + modelRefs: - model: llama3-8b - score: 0.7 use_reasoning: false + plugins: + - type: "system_prompt" + configuration: + enabled: false + system_prompt: "You are a historian with expertise across different time periods and cultures. Provide accurate historical context and analysis." 
+ mode: "replace" - name: other - system_prompt: "You are a helpful and knowledgeable assistant. Provide accurate, helpful responses across a wide range of topics." - semantic_cache_enabled: true - semantic_cache_similarity_threshold: 0.75 # Lower threshold for general chat - less sensitive - model_scores: + description: "Route general queries" + priority: 5 + rules: + operator: "OR" + conditions: + - type: "domain" + name: "other" + modelRefs: - model: llama3-8b - score: 0.7 use_reasoning: false + plugins: + - type: "system_prompt" + configuration: + enabled: false + system_prompt: "You are a helpful and knowledgeable assistant. Provide accurate, helpful responses across a wide range of topics." + mode: "replace" + - type: "semantic-cache" + configuration: + enabled: false + similarity_threshold: 0.75 - name: health - system_prompt: "You are a health and medical information expert with knowledge of anatomy, physiology, diseases, treatments, preventive care, nutrition, and wellness. Provide accurate, evidence-based health information while emphasizing that your responses are for educational purposes only and should never replace professional medical advice, diagnosis, or treatment. Always encourage users to consult healthcare professionals for medical concerns and emergencies." - semantic_cache_enabled: true - semantic_cache_similarity_threshold: 0.95 # High threshold for health - very sensitive to word changes - model_scores: + description: "Route health and medical queries" + priority: 10 + rules: + operator: "OR" + conditions: + - type: "domain" + name: "health" + modelRefs: - model: llama3-8b - score: 0.5 use_reasoning: false + plugins: + - type: "system_prompt" + configuration: + enabled: false + system_prompt: "You are a health and medical information expert with knowledge of anatomy, physiology, diseases, treatments, preventive care, nutrition, and wellness. Provide accurate, evidence-based health information while emphasizing that your responses are for educational purposes only and should never replace professional medical advice, diagnosis, or treatment. Always encourage users to consult healthcare professionals for medical concerns and emergencies." + mode: "replace" + - type: "semantic-cache" + configuration: + enabled: false + similarity_threshold: 0.95 - name: economics - system_prompt: "You are an economics expert with deep understanding of microeconomics, macroeconomics, econometrics, financial markets, monetary policy, fiscal policy, international trade, and economic theory. Analyze economic phenomena using established economic principles, provide data-driven insights, and explain complex economic concepts in accessible terms. Consider both theoretical frameworks and real-world applications in your responses." - model_scores: + description: "Route economics queries" + priority: 10 + rules: + operator: "OR" + conditions: + - type: "domain" + name: "economics" + modelRefs: - model: llama3-8b - score: 1.0 use_reasoning: false + plugins: + - type: "system_prompt" + configuration: + enabled: false + system_prompt: "You are an economics expert with deep understanding of microeconomics, macroeconomics, econometrics, financial markets, monetary policy, fiscal policy, international trade, and economic theory. Analyze economic phenomena using established economic principles, provide data-driven insights, and explain complex economic concepts in accessible terms. Consider both theoretical frameworks and real-world applications in your responses." 
+ mode: "replace" - name: math - system_prompt: "You are a mathematics expert. Provide step-by-step solutions, show your work clearly, and explain mathematical concepts in an understandable way." - model_scores: + description: "Route mathematics queries" + priority: 10 + rules: + operator: "OR" + conditions: + - type: "domain" + name: "math" + modelRefs: - model: phi4-mini - score: 1.0 - use_reasoning: false # Enable reasoning for complex math + use_reasoning: false + plugins: + - type: "system_prompt" + configuration: + enabled: false + system_prompt: "You are a mathematics expert. Provide step-by-step solutions, show your work clearly, and explain mathematical concepts in an understandable way." + mode: "replace" - name: physics - system_prompt: "You are a physics expert with deep understanding of physical laws and phenomena. Provide clear explanations with mathematical derivations when appropriate." - model_scores: + description: "Route physics queries" + priority: 10 + rules: + operator: "OR" + conditions: + - type: "domain" + name: "physics" + modelRefs: - model: llama3-8b - score: 0.7 - use_reasoning: false # Enable reasoning for physics - - name: computer science - system_prompt: "You are a computer science expert with knowledge of algorithms, data structures, programming languages, and software engineering. Provide clear, practical solutions with code examples when helpful." - model_scores: + use_reasoning: true + plugins: + - type: "system_prompt" + configuration: + enabled: false + system_prompt: "You are a physics expert with deep understanding of physical laws and phenomena. Provide clear explanations with mathematical derivations when appropriate." + mode: "replace" + - name: computer_science + description: "Route computer science queries" + priority: 10 + rules: + operator: "OR" + conditions: + - type: "domain" + name: "computer science" + modelRefs: - model: llama3-8b - score: 0.6 use_reasoning: false + plugins: + - type: "system_prompt" + configuration: + enabled: false + system_prompt: "You are a computer science expert with knowledge of algorithms, data structures, programming languages, and software engineering. Provide clear, practical solutions with code examples when helpful." + + mode: "replace" - name: philosophy - system_prompt: "You are a philosophy expert with comprehensive knowledge of philosophical traditions, ethical theories, logic, metaphysics, epistemology, political philosophy, and the history of philosophical thought. Engage with complex philosophical questions by presenting multiple perspectives, analyzing arguments rigorously, and encouraging critical thinking. Draw connections between philosophical concepts and contemporary issues while maintaining intellectual honesty about the complexity and ongoing nature of philosophical debates." - model_scores: + description: "Route philosophy queries" + priority: 10 + rules: + operator: "OR" + conditions: + - type: "domain" + name: "philosophy" + modelRefs: - model: llama3-8b - score: 0.5 use_reasoning: false + plugins: + - type: "system_prompt" + configuration: + enabled: false + system_prompt: "You are a philosophy expert with comprehensive knowledge of philosophical traditions, ethical theories, logic, metaphysics, epistemology, political philosophy, and the history of philosophical thought. Engage with complex philosophical questions by presenting multiple perspectives, analyzing arguments rigorously, and encouraging critical thinking. 
Draw connections between philosophical concepts and contemporary issues while maintaining intellectual honesty about the complexity and ongoing nature of philosophical debates." + mode: "replace" - name: engineering - system_prompt: "You are an engineering expert with knowledge across multiple engineering disciplines including mechanical, electrical, civil, chemical, software, and systems engineering. Apply engineering principles, design methodologies, and problem-solving approaches to provide practical solutions. Consider safety, efficiency, sustainability, and cost-effectiveness in your recommendations. Use technical precision while explaining concepts clearly, and emphasize the importance of proper engineering practices and standards." - model_scores: + description: "Route engineering queries" + + priority: 10 + rules: + operator: "OR" + conditions: + - type: "domain" + name: "engineering" + modelRefs: - model: llama3-8b - score: 0.7 use_reasoning: false + plugins: + - type: "system_prompt" + configuration: + enabled: false + system_prompt: "You are an engineering expert with knowledge across multiple engineering disciplines including mechanical, electrical, civil, chemical, software, and systems engineering. Apply engineering principles, design methodologies, and problem-solving approaches to provide practical solutions. Consider safety, efficiency, sustainability, and cost-effectiveness in your recommendations. Use technical precision while explaining concepts clearly, and emphasize the importance of proper engineering practices and standards." + mode: "replace" -default_model: "llama3-8b" +bert_model: + model_id: models/all-MiniLM-L12-v2 + threshold: 0.6 + use_cpu: true + +semantic_cache: + enabled: false + backend_type: "memory" # Options: "memory", "milvus", or "hybrid" + similarity_threshold: 0.8 + max_entries: 1000 # Only applies to memory backend + ttl_seconds: 3600 + eviction_policy: "fifo" + # HNSW index configuration (for memory backend only) + use_hnsw: true # Enable HNSW index for faster similarity search + hnsw_m: 16 # Number of bi-directional links (higher = better recall, more memory) + hnsw_ef_construction: 200 # Construction parameter (higher = better quality, slower build) + + # Hybrid cache configuration (when backend_type: "hybrid") + # Combines in-memory HNSW for fast search with Milvus for scalable storage + # max_memory_entries: 100000 # Max entries in HNSW index (default: 100,000) + # backend_config_path: "config/milvus.yaml" # Path to Milvus config + + # Embedding model for semantic similarity matching + # Options: "bert" (fast, 384-dim), "qwen3" (high quality, 1024-dim, 32K context), "gemma" (balanced, 768-dim, 8K context) + # Default: "bert" (fastest, lowest memory) + embedding_model: "bert" + +tools: + enabled: false + top_k: 3 + similarity_threshold: 0.2 + tools_db_path: "config/tools_db.json" + fallback_to_empty: true + +prompt_guard: + enabled: false # Global default - can be overridden per category with jailbreak_enabled + use_modernbert: true + model_id: "models/jailbreak_classifier_modernbert-base_model" + threshold: 0.7 + use_cpu: true + jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json" + +# Classifier configuration +classifier: + category_model: + model_id: "models/category_classifier_modernbert-base_model" + use_modernbert: true + threshold: 0.6 + use_cpu: true + category_mapping_path: "models/category_classifier_modernbert-base_model/category_mapping.json" + pii_model: + model_id: 
"models/pii_classifier_modernbert-base_presidio_token_model" + use_modernbert: true + threshold: 0.7 + use_cpu: true + pii_mapping_path: "models/pii_classifier_modernbert-base_presidio_token_model/pii_type_mapping.json" -# Auto model name for automatic model selection (optional) -# This is the model name that clients should use to trigger automatic model selection -# If not specified, defaults to "MoM" (Mixture of Models) -# For backward compatibility, "auto" is always accepted as an alias -# Example: auto_model_name: "MoM" # or any other name you prefer -# auto_model_name: "MoM" -# Include configured models in /v1/models list endpoint (optional, default: false) -# When false (default): only the auto model name is returned in the /v1/models endpoint -# When true: all models configured in model_config are also included in the /v1/models endpoint -# This is useful for clients that need to discover all available models -# Example: include_config_models_in_list: true -# include_config_models_in_list: false +# Router Configuration for Dual-Path Selection +router: + # High confidence threshold for automatic LoRA selection + high_confidence_threshold: 0.99 + # Low latency threshold in milliseconds for LoRA path selection + low_latency_threshold_ms: 2000 + # Baseline scores for path evaluation + lora_baseline_score: 0.8 + traditional_baseline_score: 0.7 + embedding_baseline_score: 0.75 + # Success rate calculation threshold + success_confidence_threshold: 0.8 + # Large batch size threshold for parallel processing + large_batch_threshold: 4 + # Default performance metrics (milliseconds) + lora_default_execution_time_ms: 1345 + traditional_default_execution_time_ms: 4567 + # Default processing requirements + default_confidence_threshold: 0.95 + default_max_latency_ms: 5000 + default_batch_size: 4 + default_avg_execution_time_ms: 3000 + # Default confidence and success rates + lora_default_confidence: 0.99 + traditional_default_confidence: 0.95 + lora_default_success_rate: 0.98 + traditional_default_success_rate: 0.95 + # Scoring weights for intelligent path selection (balanced approach) + multi_task_lora_weight: 0.30 # LoRA advantage for multi-task processing + single_task_traditional_weight: 0.30 # Traditional advantage for single tasks + large_batch_lora_weight: 0.25 # LoRA advantage for large batches (≥4) + small_batch_traditional_weight: 0.25 # Traditional advantage for single items + medium_batch_weight: 0.10 # Neutral weight for medium batches (2-3) + high_confidence_lora_weight: 0.25 # LoRA advantage for high confidence (≥0.99) + low_confidence_traditional_weight: 0.25 # Traditional for lower confidence (≤0.9) + low_latency_lora_weight: 0.30 # LoRA advantage for low latency (≤2000ms) + high_latency_traditional_weight: 0.10 # Traditional acceptable for relaxed timing + performance_history_weight: 0.20 # Historical performance comparison factor + # Traditional model specific configurations + traditional_bert_confidence_threshold: 0.95 # Traditional BERT confidence threshold + traditional_modernbert_confidence_threshold: 0.8 # Traditional ModernBERT confidence threshold + traditional_pii_detection_threshold: 0.5 # Traditional PII detection confidence threshold + traditional_token_classification_threshold: 0.9 # Traditional token classification threshold + traditional_dropout_prob: 0.1 # Traditional model dropout probability + traditional_attention_dropout_prob: 0.1 # Traditional model attention dropout probability + tie_break_confidence: 0.5 # Confidence value for tie-breaking situations # 
Reasoning family configurations reasoning_families: @@ -201,12 +400,12 @@ reasoning_families: type: "reasoning_effort" parameter: "reasoning_effort" -# Global default reasoning effort level -default_reasoning_effort: high - # Gateway route cache clearing clear_route_cache: true # Enable for some gateways such as Istio +# Global default reasoning effort level +default_reasoning_effort: high + # API Configuration api: batch_classification: @@ -222,10 +421,19 @@ api: [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30] size_buckets: [1, 2, 5, 10, 20, 50, 100, 200] +# Embedding Models Configuration +# These models provide intelligent embedding generation with automatic routing: +# - Qwen3-Embedding-0.6B: Up to 32K context, high quality, +# - EmbeddingGemma-300M: Up to 8K context, fast inference, Matryoshka support (768/512/256/128) +embedding_models: + qwen3_model_path: "models/Qwen3-Embedding-0.6B" +# gemma_model_path: "models/embeddinggemma-300m" + use_cpu: true # Set to false for GPU acceleration (requires CUDA) + # Observability Configuration observability: tracing: - enabled: true # Enable distributed tracing for docker-compose stack + enabled: false # Enable distributed tracing for docker-compose stack provider: "opentelemetry" # Provider: opentelemetry, openinference, openllmetry exporter: type: "otlp" # Export spans to Jaeger (via OTLP gRPC) @@ -238,3 +446,4 @@ observability: service_name: "vllm-semantic-router" service_version: "v0.1.0" deployment_environment: "development" + diff --git a/deploy/kubernetes/llmd-base/llmd+public-llm/config.yaml.openai b/deploy/kubernetes/llmd-base/llmd+public-llm/config.yaml.openai index 6a86c53be..c510307b9 100644 --- a/deploy/kubernetes/llmd-base/llmd+public-llm/config.yaml.openai +++ b/deploy/kubernetes/llmd-base/llmd+public-llm/config.yaml.openai @@ -1,188 +1,401 @@ -bert_model: - model_id: models/all-MiniLM-L12-v2 - threshold: 0.6 - use_cpu: true - -semantic_cache: - enabled: false - backend_type: "memory" # Options: "memory" or "milvus" - similarity_threshold: 0.8 - max_entries: 1000 # Only applies to memory backend - ttl_seconds: 3600 - eviction_policy: "fifo" - # Embedding model for semantic similarity matching - # Options: "bert" (fast, 384-dim), "qwen3" (high quality, 1024-dim, 32K context), "gemma" (balanced, 768-dim, 8K context) - embedding_model: "bert" # Default: BERT (fastest, lowest memory for Kubernetes) - -tools: - enabled: false - top_k: 3 - similarity_threshold: 0.2 - tools_db_path: "config/tools_db.json" - fallback_to_empty: true - -prompt_guard: - enabled: false # Global default - can be overridden per category with jailbreak_enabled - use_modernbert: true - model_id: "models/jailbreak_classifier_modernbert-base_model" - threshold: 0.7 - use_cpu: true - jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json" - -# vLLM Endpoints Configuration -# IMPORTANT: 'address' field must be a valid IP address (IPv4 or IPv6) -# Supported formats: 127.0.0.1, 192.168.1.1, ::1, 2001:db8::1 -# NOT supported: domain names (example.com), protocol prefixes (http://), paths (/api), ports in address (use 'port' field) -vllm_endpoints: - - name: "endpoint1" - address: "10.98.150.102" # Static IPv4 of llama3-8b k8s service - port: 80 - weight: 1 - - name: "endpoint2" - address: "10.98.118.242" # Static IPv4 of phi4-mini k8s service - port: 80 - weight: 1 - model_config: "llama3-8b": - # reasoning_family: "" # This model uses Qwen-3 reasoning syntax - preferred_endpoints: ["endpoint1"] - 
pii_policy: - allow_by_default: true + allow_by_default: true "gpt-4o-mini": - # reasoning_family: "" # This model uses Qwen-3 reasoning syntax - preferred_endpoints: ["endpoint2"] - pii_policy: - allow_by_default: true + allow_by_default: true -# Classifier configuration -classifier: - category_model: - model_id: "models/category_classifier_modernbert-base_model" - use_modernbert: true - threshold: 0.6 - use_cpu: true - category_mapping_path: "models/category_classifier_modernbert-base_model/category_mapping.json" - pii_model: - model_id: "models/pii_classifier_modernbert-base_presidio_token_model" - use_modernbert: true - threshold: 0.7 - use_cpu: true - pii_mapping_path: "models/pii_classifier_modernbert-base_presidio_token_model/pii_type_mapping.json" +default_model: "gpt-4o-mini" -# Categories with new use_reasoning field structure +# Categories - now only contain metadata for domain classification categories: - name: business - system_prompt: "You are a senior business consultant and strategic advisor with expertise in corporate strategy, operations management, financial analysis, marketing, and organizational development. Provide practical, actionable business advice backed by proven methodologies and industry best practices. Consider market dynamics, competitive landscape, and stakeholder interests in your recommendations." - # jailbreak_enabled: true # Optional: Override global jailbreak detection per category - # jailbreak_threshold: 0.8 # Optional: Override global jailbreak threshold per category - model_scores: + - name: law + - name: psychology + - name: biology + - name: chemistry + - name: history + - name: other + - name: health + - name: economics + - name: math + - name: physics + - name: computer science + - name: philosophy + - name: engineering + +# Decisions - define routing logic with rules, model selection, and plugins +decisions: + - name: business + description: "Route business and management queries" + priority: 10 + rules: + operator: "OR" + conditions: + - type: "domain" + name: "business" + modelRefs: - model: llama3-8b - score: 0.8 - use_reasoning: false # Business performs better without reasoning - - model: gpt-4o-mini - score: 0.3 - use_reasoning: false # Business performs better without reasoning +# lora_name: llama3-8b + use_reasoning: false + plugins: + - type: "system_prompt" + configuration: + enabled: false + system_prompt: "You are a senior business consultant and strategic advisor with expertise in corporate strategy, operations management, financial analysis, marketing, and organizational development. Provide practical, actionable business advice backed by proven methodologies and industry best practices. Consider market dynamics, competitive landscape, and stakeholder interests in your recommendations." + mode: "replace" - name: law - system_prompt: "You are a knowledgeable legal expert with comprehensive understanding of legal principles, case law, statutory interpretation, and legal procedures across multiple jurisdictions. Provide accurate legal information and analysis while clearly stating that your responses are for informational purposes only and do not constitute legal advice. Always recommend consulting with qualified legal professionals for specific legal matters." 
-    model_scores:
+    description: "Route legal queries"
+    priority: 10
+    rules:
+      operator: "OR"
+      conditions:
+        - type: "domain"
+          name: "law"
+    modelRefs:
       - model: llama3-8b
-        score: 0.4
+# lora_name: llama3-8b
         use_reasoning: false
+    plugins:
+      - type: "system_prompt"
+        configuration:
+          enabled: false
+          system_prompt: "You are a knowledgeable legal expert with comprehensive understanding of legal principles, case law, statutory interpretation, and legal procedures across multiple jurisdictions. Provide accurate legal information and analysis while clearly stating that your responses are for informational purposes only and do not constitute legal advice. Always recommend consulting with qualified legal professionals for specific legal matters."
+          mode: "replace"
   - name: psychology
-    system_prompt: "You are a psychology expert with deep knowledge of cognitive processes, behavioral patterns, mental health, developmental psychology, social psychology, and therapeutic approaches. Provide evidence-based insights grounded in psychological research and theory. When discussing mental health topics, emphasize the importance of professional consultation and avoid providing diagnostic or therapeutic advice."
-    semantic_cache_enabled: true
-    semantic_cache_similarity_threshold: 0.92  # High threshold for psychology - sensitive to nuances
-    model_scores:
+    description: "Route psychology queries"
+    priority: 10
+    rules:
+      operator: "OR"
+      conditions:
+        - type: "domain"
+          name: "psychology"
+    modelRefs:
       - model: llama3-8b
-        score: 0.6
+# lora_name: llama3-8b
         use_reasoning: false
+    plugins:
+      - type: "system_prompt"
+        configuration:
+          enabled: false
+          system_prompt: "You are a psychology expert with deep knowledge of cognitive processes, behavioral patterns, mental health, developmental psychology, social psychology, and therapeutic approaches. Provide evidence-based insights grounded in psychological research and theory. When discussing mental health topics, emphasize the importance of professional consultation and avoid providing diagnostic or therapeutic advice."
+          mode: "replace"
+      - type: "semantic-cache"
+        configuration:
+          enabled: false
+          similarity_threshold: 0.92
   - name: biology
-    system_prompt: "You are a biology expert with comprehensive knowledge spanning molecular biology, genetics, cell biology, ecology, evolution, anatomy, physiology, and biotechnology. Explain biological concepts with scientific accuracy, use appropriate terminology, and provide examples from current research. Connect biological principles to real-world applications and emphasize the interconnectedness of biological systems."
-    model_scores:
+    description: "Route biology queries"
+    priority: 10
+    rules:
+      operator: "OR"
+      conditions:
+        - type: "domain"
+          name: "biology"
+    modelRefs:
       - model: llama3-8b
-        score: 0.9
+# lora_name: llama3-8b
         use_reasoning: false
+    plugins:
+      - type: "system_prompt"
+        configuration:
+          enabled: false
+          system_prompt: "You are a biology expert with comprehensive knowledge spanning molecular biology, genetics, cell biology, ecology, evolution, anatomy, physiology, and biotechnology. Explain biological concepts with scientific accuracy, use appropriate terminology, and provide examples from current research. Connect biological principles to real-world applications and emphasize the interconnectedness of biological systems."
+          mode: "replace"
   - name: chemistry
-    system_prompt: "You are a chemistry expert specializing in chemical reactions, molecular structures, and laboratory techniques. Provide detailed, step-by-step explanations."
-    model_scores:
+    description: "Route chemistry queries"
+    priority: 10
+    rules:
+      operator: "OR"
+      conditions:
+        - type: "domain"
+          name: "chemistry"
+    modelRefs:
       - model: llama3-8b
-        score: 0.6
-        use_reasoning: false  # Enable reasoning for complex chemistry
+# lora_name: llama3-8b
+        use_reasoning: true
+    plugins:
+      - type: "system_prompt"
+        configuration:
+          enabled: false
+          system_prompt: "You are a chemistry expert specializing in chemical reactions, molecular structures, and laboratory techniques. Provide detailed, step-by-step explanations."
+          mode: "replace"
   - name: history
-    system_prompt: "You are a historian with expertise across different time periods and cultures. Provide accurate historical context and analysis."
-    model_scores:
+    description: "Route history queries"
+    priority: 10
+    rules:
+      operator: "OR"
+      conditions:
+        - type: "domain"
+          name: "history"
+    modelRefs:
       - model: llama3-8b
-        score: 0.7
+# lora_name: llama3-8b
         use_reasoning: false
+    plugins:
+      - type: "system_prompt"
+        configuration:
+          enabled: false
+          system_prompt: "You are a historian with expertise across different time periods and cultures. Provide accurate historical context and analysis."
+          mode: "replace"
   - name: other
-    system_prompt: "You are a helpful and knowledgeable assistant. Provide accurate, helpful responses across a wide range of topics."
-    semantic_cache_enabled: true
-    semantic_cache_similarity_threshold: 0.75  # Lower threshold for general chat - less sensitive
-    model_scores:
+    description: "Route general queries"
+    priority: 5
+    rules:
+      operator: "OR"
+      conditions:
+        - type: "domain"
+          name: "other"
+    modelRefs:
       - model: llama3-8b
-        score: 0.7
+# lora_name: llama3-8b
         use_reasoning: false
+    plugins:
+      - type: "system_prompt"
+        configuration:
+          enabled: false
+          system_prompt: "You are a helpful and knowledgeable assistant. Provide accurate, helpful responses across a wide range of topics."
+          mode: "replace"
+      - type: "semantic-cache"
+        configuration:
+          enabled: false
+          similarity_threshold: 0.75
   - name: health
-    system_prompt: "You are a health and medical information expert with knowledge of anatomy, physiology, diseases, treatments, preventive care, nutrition, and wellness. Provide accurate, evidence-based health information while emphasizing that your responses are for educational purposes only and should never replace professional medical advice, diagnosis, or treatment. Always encourage users to consult healthcare professionals for medical concerns and emergencies."
-    semantic_cache_enabled: true
-    semantic_cache_similarity_threshold: 0.95  # High threshold for health - very sensitive to word changes
-    model_scores:
+    description: "Route health and medical queries"
+    priority: 10
+    rules:
+      operator: "OR"
+      conditions:
+        - type: "domain"
+          name: "health"
+    modelRefs:
       - model: llama3-8b
-        score: 0.5
+# lora_name: llama3-8b
         use_reasoning: false
+    plugins:
+      - type: "system_prompt"
+        configuration:
+          enabled: false
+          system_prompt: "You are a health and medical information expert with knowledge of anatomy, physiology, diseases, treatments, preventive care, nutrition, and wellness. Provide accurate, evidence-based health information while emphasizing that your responses are for educational purposes only and should never replace professional medical advice, diagnosis, or treatment. Always encourage users to consult healthcare professionals for medical concerns and emergencies."
+          mode: "replace"
+      - type: "semantic-cache"
+        configuration:
+          enabled: false
+          similarity_threshold: 0.95
   - name: economics
-    system_prompt: "You are an economics expert with deep understanding of microeconomics, macroeconomics, econometrics, financial markets, monetary policy, fiscal policy, international trade, and economic theory. Analyze economic phenomena using established economic principles, provide data-driven insights, and explain complex economic concepts in accessible terms. Consider both theoretical frameworks and real-world applications in your responses."
-    model_scores:
+    description: "Route economics queries"
+    priority: 10
+    rules:
+      operator: "OR"
+      conditions:
+        - type: "domain"
+          name: "economics"
+    modelRefs:
       - model: llama3-8b
-        score: 1.0
+# lora_name: llama3-8b
         use_reasoning: false
+    plugins:
+      - type: "system_prompt"
+        configuration:
+          enabled: false
+          system_prompt: "You are an economics expert with deep understanding of microeconomics, macroeconomics, econometrics, financial markets, monetary policy, fiscal policy, international trade, and economic theory. Analyze economic phenomena using established economic principles, provide data-driven insights, and explain complex economic concepts in accessible terms. Consider both theoretical frameworks and real-world applications in your responses."
+          mode: "replace"
   - name: math
-    system_prompt: "You are a mathematics expert. Provide step-by-step solutions, show your work clearly, and explain mathematical concepts in an understandable way."
-    model_scores:
+    description: "Route mathematics queries"
+    priority: 10
+    rules:
+      operator: "OR"
+      conditions:
+        - type: "domain"
+          name: "math"
+    modelRefs:
       - model: gpt-4o-mini
-        score: 1.0
-        use_reasoning: false  # Enable reasoning for complex math
+# lora_name: gpt-4o-mini
+        use_reasoning: false
+    plugins:
+      - type: "system_prompt"
+        configuration:
+          enabled: false
+          system_prompt: "You are a mathematics expert. Provide step-by-step solutions, show your work clearly, and explain mathematical concepts in an understandable way."
+          mode: "replace"
   - name: physics
-    system_prompt: "You are a physics expert with deep understanding of physical laws and phenomena. Provide clear explanations with mathematical derivations when appropriate."
-    model_scores:
+    description: "Route physics queries"
+    priority: 10
+    rules:
+      operator: "OR"
+      conditions:
+        - type: "domain"
+          name: "physics"
+    modelRefs:
       - model: llama3-8b
-        score: 0.7
-        use_reasoning: false  # Enable reasoning for physics
-  - name: computer science
-    system_prompt: "You are a computer science expert with knowledge of algorithms, data structures, programming languages, and software engineering. Provide clear, practical solutions with code examples when helpful."
-    model_scores:
+# lora_name: llama3-8b
+        use_reasoning: true
+    plugins:
+      - type: "system_prompt"
+        configuration:
+          enabled: false
+          system_prompt: "You are a physics expert with deep understanding of physical laws and phenomena. Provide clear explanations with mathematical derivations when appropriate."
+          mode: "replace"
+  - name: computer_science
+    description: "Route computer science queries"
+    priority: 10
+    rules:
+      operator: "OR"
+      conditions:
+        - type: "domain"
+          name: "computer science"
+    modelRefs:
       - model: llama3-8b
-        score: 0.6
+# lora_name: llama3-8b
         use_reasoning: false
+    plugins:
+      - type: "system_prompt"
+        configuration:
+          enabled: false
+          system_prompt: "You are a computer science expert with knowledge of algorithms, data structures, programming languages, and software engineering. Provide clear, practical solutions with code examples when helpful."
+          mode: "replace"
   - name: philosophy
-    system_prompt: "You are a philosophy expert with comprehensive knowledge of philosophical traditions, ethical theories, logic, metaphysics, epistemology, political philosophy, and the history of philosophical thought. Engage with complex philosophical questions by presenting multiple perspectives, analyzing arguments rigorously, and encouraging critical thinking. Draw connections between philosophical concepts and contemporary issues while maintaining intellectual honesty about the complexity and ongoing nature of philosophical debates."
-    model_scores:
+    description: "Route philosophy queries"
+    priority: 10
+    rules:
+      operator: "OR"
+      conditions:
+        - type: "domain"
+          name: "philosophy"
+    modelRefs:
       - model: llama3-8b
-        score: 0.5
+# lora_name: llama3-8b
         use_reasoning: false
+    plugins:
+      - type: "system_prompt"
+        configuration:
+          enabled: false
+          system_prompt: "You are a philosophy expert with comprehensive knowledge of philosophical traditions, ethical theories, logic, metaphysics, epistemology, political philosophy, and the history of philosophical thought. Engage with complex philosophical questions by presenting multiple perspectives, analyzing arguments rigorously, and encouraging critical thinking. Draw connections between philosophical concepts and contemporary issues while maintaining intellectual honesty about the complexity and ongoing nature of philosophical debates."
+          mode: "replace"
   - name: engineering
-    system_prompt: "You are an engineering expert with knowledge across multiple engineering disciplines including mechanical, electrical, civil, chemical, software, and systems engineering. Apply engineering principles, design methodologies, and problem-solving approaches to provide practical solutions. Consider safety, efficiency, sustainability, and cost-effectiveness in your recommendations. Use technical precision while explaining concepts clearly, and emphasize the importance of proper engineering practices and standards."
-    model_scores:
+    description: "Route engineering queries"
+    priority: 10
+    rules:
+      operator: "OR"
+      conditions:
+        - type: "domain"
+          name: "engineering"
+    modelRefs:
       - model: llama3-8b
-        score: 0.7
+# lora_name: llama3-8b
         use_reasoning: false
+    plugins:
+      - type: "system_prompt"
+        configuration:
+          enabled: false
+          system_prompt: "You are an engineering expert with knowledge across multiple engineering disciplines including mechanical, electrical, civil, chemical, software, and systems engineering. Apply engineering principles, design methodologies, and problem-solving approaches to provide practical solutions. Consider safety, efficiency, sustainability, and cost-effectiveness in your recommendations. Use technical precision while explaining concepts clearly, and emphasize the importance of proper engineering practices and standards."
+          mode: "replace"
+
+bert_model:
+  model_id: models/all-MiniLM-L12-v2
+  threshold: 0.6
+  use_cpu: true
+
+semantic_cache:
+  enabled: false
+  backend_type: "memory"  # Options: "memory", "milvus", or "hybrid"
+  similarity_threshold: 0.8
+  max_entries: 1000  # Only applies to memory backend
+  ttl_seconds: 3600
+  eviction_policy: "fifo"
+  # HNSW index configuration (for memory backend only)
+  use_hnsw: true  # Enable HNSW index for faster similarity search
+  hnsw_m: 16  # Number of bi-directional links (higher = better recall, more memory)
+  hnsw_ef_construction: 200  # Construction parameter (higher = better quality, slower build)

-default_model: "llama3-8b"

+  # Hybrid cache configuration (when backend_type: "hybrid")
+  # Combines in-memory HNSW for fast search with Milvus for scalable storage
+  # max_memory_entries: 100000  # Max entries in HNSW index (default: 100,000)
+  # backend_config_path: "config/milvus.yaml"  # Path to Milvus config

-# Auto model name for automatic model selection (optional)
-# This is the model name that clients should use to trigger automatic model selection
-# If not specified, defaults to "MoM" (Mixture of Models)
-# For backward compatibility, "auto" is always accepted as an alias
-# Example: auto_model_name: "MoM"  # or any other name you prefer
-# auto_model_name: "MoM"

+  # Embedding model for semantic similarity matching
+  # Options: "bert" (fast, 384-dim), "qwen3" (high quality, 1024-dim, 32K context), "gemma" (balanced, 768-dim, 8K context)
+  # Default: "bert" (fastest, lowest memory)
+  embedding_model: "bert"

-# Include configured models in /v1/models list endpoint (optional, default: false)
-# When false (default): only the auto model name is returned in the /v1/models endpoint
-# When true: all models configured in model_config are also included in the /v1/models endpoint
-# This is useful for clients that need to discover all available models
-# Example: include_config_models_in_list: true
-# include_config_models_in_list: false

+tools:
+  enabled: false
+  top_k: 3
+  similarity_threshold: 0.2
+  tools_db_path: "config/tools_db.json"
+  fallback_to_empty: true
+
+prompt_guard:
+  enabled: false  # Global default - can be overridden per category with jailbreak_enabled
+  use_modernbert: true
+  model_id: "models/jailbreak_classifier_modernbert-base_model"
+  threshold: 0.7
+  use_cpu: true
+  jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json"
+
+# Classifier configuration
+classifier:
+  category_model:
+    model_id: "models/category_classifier_modernbert-base_model"
+    use_modernbert: true
+    threshold: 0.6
+    use_cpu: true
+    category_mapping_path: "models/category_classifier_modernbert-base_model/category_mapping.json"
+  pii_model:
+    model_id: "models/pii_classifier_modernbert-base_presidio_token_model"
+    use_modernbert: true
+    threshold: 0.7
+    use_cpu: true
+    pii_mapping_path: "models/pii_classifier_modernbert-base_presidio_token_model/pii_type_mapping.json"
+
+
+# Router Configuration for Dual-Path Selection
+router:
+  # High confidence threshold for automatic LoRA selection
+  high_confidence_threshold: 0.99
+  # Low latency threshold in milliseconds for LoRA path selection
+  low_latency_threshold_ms: 2000
+  # Baseline scores for path evaluation
+  lora_baseline_score: 0.8
+  traditional_baseline_score: 0.7
+  embedding_baseline_score: 0.75
+  # Success rate calculation threshold
+  success_confidence_threshold: 0.8
+  # Large batch size threshold for parallel processing
+  large_batch_threshold: 4
+  # Default performance metrics (milliseconds)
+  lora_default_execution_time_ms: 1345
+  traditional_default_execution_time_ms: 4567
+  # Default processing requirements
+  default_confidence_threshold: 0.95
+  default_max_latency_ms: 5000
+  default_batch_size: 4
+  default_avg_execution_time_ms: 3000
+  # Default confidence and success rates
+  lora_default_confidence: 0.99
+  traditional_default_confidence: 0.95
+  lora_default_success_rate: 0.98
+  traditional_default_success_rate: 0.95
+  # Scoring weights for intelligent path selection (balanced approach)
+  multi_task_lora_weight: 0.30  # LoRA advantage for multi-task processing
+  single_task_traditional_weight: 0.30  # Traditional advantage for single tasks
+  large_batch_lora_weight: 0.25  # LoRA advantage for large batches (≥4)
+  small_batch_traditional_weight: 0.25  # Traditional advantage for single items
+  medium_batch_weight: 0.10  # Neutral weight for medium batches (2-3)
+  high_confidence_lora_weight: 0.25  # LoRA advantage for high confidence (≥0.99)
+  low_confidence_traditional_weight: 0.25  # Traditional for lower confidence (≤0.9)
+  low_latency_lora_weight: 0.30  # LoRA advantage for low latency (≤2000ms)
+  high_latency_traditional_weight: 0.10  # Traditional acceptable for relaxed timing
+  performance_history_weight: 0.20  # Historical performance comparison factor
+  # Traditional model specific configurations
+  traditional_bert_confidence_threshold: 0.95  # Traditional BERT confidence threshold
+  traditional_modernbert_confidence_threshold: 0.8  # Traditional ModernBERT confidence threshold
+  traditional_pii_detection_threshold: 0.5  # Traditional PII detection confidence threshold
+  traditional_token_classification_threshold: 0.9  # Traditional token classification threshold
+  traditional_dropout_prob: 0.1  # Traditional model dropout probability
+  traditional_attention_dropout_prob: 0.1  # Traditional model attention dropout probability
+  tie_break_confidence: 0.5  # Confidence value for tie-breaking situations

 # Reasoning family configurations
 reasoning_families:
@@ -201,12 +414,12 @@ reasoning_families:
     type: "reasoning_effort"
     parameter: "reasoning_effort"

-# Global default reasoning effort level
-default_reasoning_effort: high
-
 # Gateway route cache clearing
 clear_route_cache: true  # Enable for some gateways such as Istio

+# Global default reasoning effort level
+default_reasoning_effort: high
+
 # API Configuration
 api:
   batch_classification:
@@ -222,10 +435,19 @@ api:
       [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30]
     size_buckets: [1, 2, 5, 10, 20, 50, 100, 200]

+# Embedding Models Configuration
+# These models provide intelligent embedding generation with automatic routing:
+# - Qwen3-Embedding-0.6B: Up to 32K context, high quality, 1024-dim embeddings
+# - EmbeddingGemma-300M: Up to 8K context, fast inference, Matryoshka support (768/512/256/128)
+embedding_models:
+  qwen3_model_path: "models/Qwen3-Embedding-0.6B"
+# gemma_model_path: "models/embeddinggemma-300m"
+  use_cpu: true  # Set to false for GPU acceleration (requires CUDA)
+
 # Observability Configuration
 observability:
   tracing:
-    enabled: true  # Enable distributed tracing for docker-compose stack
+    enabled: false  # Distributed tracing disabled by default; set to true to export spans
     provider: "opentelemetry"  # Provider: opentelemetry, openinference, openllmetry
     exporter:
       type: "otlp"  # Export spans to Jaeger (via OTLP gRPC)
@@ -238,3 +460,4 @@ observability:
     service_name: "vllm-semantic-router"
     service_version: "v0.1.0"
     deployment_environment: "development"
+
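Note on the configuration rewrite above: `categories` now carry only the domain names consumed by the category classifier, while routing behavior lives in `decisions` (match rules, model references, and per-decision plugins). A minimal sketch of one additional decision under the new schema — the `finance` domain, priority, and prompt below are illustrative placeholders, not part of this change:

  - name: finance                # hypothetical example, not in this diff
    description: "Route finance queries"
    priority: 10                 # higher-priority decisions win when several match
    rules:
      operator: "OR"             # match when any condition holds
      conditions:
        - type: "domain"         # compared against the category classifier's output
          name: "finance"        # would also need a matching entry under categories
    modelRefs:
      - model: llama3-8b         # must be defined in model_config
        use_reasoning: false
    plugins:
      - type: "system_prompt"
        configuration:
          enabled: true
          system_prompt: "You are a financial analysis assistant."
          mode: "replace"
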
diff --git a/deploy/kubernetes/llmd-base/llmd+public-llm/httproute-openai.yaml b/deploy/kubernetes/llmd-base/llmd+public-llm/httproute-openai.template
similarity index 93%
rename from deploy/kubernetes/llmd-base/llmd+public-llm/httproute-openai.yaml
rename to deploy/kubernetes/llmd-base/llmd+public-llm/httproute-openai.template
index eec63e730..0c103ab7c 100644
--- a/deploy/kubernetes/llmd-base/llmd+public-llm/httproute-openai.yaml
+++ b/deploy/kubernetes/llmd-base/llmd+public-llm/httproute-openai.template
@@ -27,6 +27,6 @@ spec:
         - name: Host
           value: "api.openai.com"
         - name: Authorization
-          value: "Bearer %(OPENAI_API_KEY)s"
+          value: "Bearer {{OPENAI_API_KEY}}"
       timeouts:
         request: 300s
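Note on the rename above: the HTTPRoute manifest becomes a `.template`, and the Python-style `%(OPENAI_API_KEY)s` placeholder is replaced with a `{{OPENAI_API_KEY}}` token, which implies the file is rendered by a substitution step before being applied (the rendering mechanism itself is not part of this diff). Assuming plain token substitution, the rendered filter would carry the literal key, keeping the secret out of the committed YAML:

        - name: Authorization
          value: "Bearer <OPENAI_API_KEY value>"   # injected at render time; placeholder shown here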