# Example configuration for Intent-Aware LoRA Routing
# This demonstrates how to use the lora_name field to route requests to different
# LoRA adapters based on the classified intent/category.
#
# Prerequisites:
# 1. The vLLM server must be started with the --enable-lora flag
# 2. LoRA adapters must be registered at server startup using --lora-modules
#    Example: vllm serve meta-llama/Llama-2-7b-hf \
#               --enable-lora \
#               --lora-modules technical-lora=/path/to/technical-adapter \
#                              medical-lora=/path/to/medical-adapter \
#                              legal-lora=/path/to/legal-adapter
#
# How it works:
# - A request is classified into a category (e.g., "technical")
# - The router selects the best ModelScore for that category
# - If that ModelScore specifies a lora_name, that name is used as the final model name
# - The request is sent to vLLM with model="technical-lora" instead of model="llama2-7b"
# - vLLM then routes the request to the matching LoRA adapter (see the illustrative
#   request below)
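#
# Illustrative request flow (the router's host/port below are placeholders for
# wherever your router is exposed; everything else follows this config):
#
#   curl http://ROUTER_HOST:ROUTER_PORT/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{
#           "model": "llama2-7b",
#           "messages": [{"role": "user", "content": "How do I reverse a linked list in C?"}]
#         }'
#
# The prompt classifies as "technical", so the router rewrites the model name and
# forwards the request upstream with model="technical-lora".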

bert_model:
  model_id: models/all-MiniLM-L12-v2
  threshold: 0.6
  use_cpu: true

# vLLM Endpoints Configuration
vllm_endpoints:
  - name: "vllm-primary"
    address: "172.28.0.20"
    port: 8002
    weight: 1
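
# To confirm the adapters are registered on this endpoint, list the models vLLM
# is serving; registered LoRA modules appear alongside the base model:
#
#   curl http://172.28.0.20:8002/v1/models
#
# The response should include "technical-lora", "medical-lora", and "legal-lora"
# in addition to the base model.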

# Base model configuration
# IMPORTANT: LoRA adapters must be defined here before they can be referenced in model_scores
model_config:
  "llama2-7b":
    reasoning_family: "llama2"
    preferred_endpoints: ["vllm-primary"]
    pii_policy:
      allow_by_default: true
    # Define the available LoRA adapters for this model
    # These names must match the LoRA modules registered with vLLM at startup
    loras:
      - name: "technical-lora"
        description: "Optimized for programming and technical questions"
      - name: "medical-lora"
        description: "Specialized for the medical and healthcare domain"
      - name: "legal-lora"
        description: "Fine-tuned for legal questions and law-related topics"
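
# vLLM can also register adapters at runtime, which enables the "add/remove
# adapters without restarting" workflow noted at the end of this file. The
# server must be started with VLLM_ALLOW_RUNTIME_LORA_UPDATING=True; the
# adapter name and path below are placeholders:
#
#   curl -X POST http://172.28.0.20:8002/v1/load_lora_adapter \
#     -H "Content-Type: application/json" \
#     -d '{"lora_name": "finance-lora", "lora_path": "/path/to/finance-adapter"}'
#
#   curl -X POST http://172.28.0.20:8002/v1/unload_lora_adapter \
#     -H "Content-Type: application/json" \
#     -d '{"lora_name": "finance-lora"}'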

# Classifier configuration
classifier:
  category_model:
    model_id: "models/category_classifier_modernbert-base_model"
    use_modernbert: true
    threshold: 0.6
    use_cpu: true
    category_mapping_path: "models/category_classifier_modernbert-base_model/category_mapping.json"
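
# For reference, category_mapping.json maps the classifier's output indices to
# the category names defined below. A minimal sketch (the exact schema is an
# assumption and may differ in your setup):
#
#   {"0": "technical", "1": "medical", "2": "legal", "3": "general"}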

# Categories with LoRA routing
categories:
  - name: technical
    description: "Programming, software engineering, and technical questions"
    system_prompt: "You are an expert software engineer with deep knowledge of programming languages, algorithms, system design, and best practices. Provide clear, accurate technical guidance with code examples when appropriate."
    model_scores:
      - model: llama2-7b          # Base model name (for endpoint selection and PII policy)
        lora_name: technical-lora  # LoRA adapter name (used as the final model name in the request)
        score: 1.0
        use_reasoning: true
        reasoning_effort: medium

  - name: medical
    description: "Medical and healthcare questions"
    system_prompt: "You are a medical expert with comprehensive knowledge of anatomy, physiology, diseases, treatments, and healthcare practices. Provide accurate medical information while emphasizing that responses are for educational purposes only and not a substitute for professional medical advice."
    model_scores:
      - model: llama2-7b
        lora_name: medical-lora    # Different LoRA adapter for the medical domain
        score: 1.0
        use_reasoning: true
        reasoning_effort: high

  - name: legal
    description: "Legal questions and law-related topics"
    system_prompt: "You are a legal expert with knowledge of legal principles, case law, and statutory interpretation. Provide accurate legal information while clearly stating that responses are for informational purposes only and do not constitute legal advice."
    model_scores:
      - model: llama2-7b
        lora_name: legal-lora      # Different LoRA adapter for the legal domain
        score: 1.0
        use_reasoning: true
        reasoning_effort: high

  - name: general
    description: "General questions that don't fit specific domains"
    system_prompt: "You are a helpful AI assistant with broad knowledge across many topics. Provide clear, accurate, and helpful responses."
    model_scores:
      - model: llama2-7b           # No lora_name specified, so the base model is used
        score: 0.8
        use_reasoning: false

# Default model for fallback
default_model: llama2-7b

# Benefits of LoRA Routing:
# 1. Domain-Specific Expertise: Each LoRA adapter is fine-tuned for a specific domain
# 2. Cost Efficiency: Adapters share the base model's weights, reducing memory footprint
# 3. Easy A/B Testing: Gradually roll out new adapters by adjusting scores
# 4. Flexible Deployment: Add/remove adapters without restarting the router
# 5. Performance: vLLM efficiently serves multiple LoRA adapters with minimal overhead
#
# Use Cases:
# - Multi-domain chatbots (technical support, medical advice, legal information)
# - Task-specific optimization (code generation, summarization, translation)
# - Language-specific adapters for multilingual systems
# - Customer-specific adapters for personalized experiences
# - Version testing (compare different adapter versions)