Skip to content

Commit 6ee7e04

Browse files
committed
feat: implement decision-based routing with plugin architecture
Signed-off-by: bitliu <[email protected]>
1 parent c7113c7 commit 6ee7e04

32 files changed

+567
-255
lines changed

src/semantic-router/pkg/k8s/testdata/input/01-basic.yaml

Lines changed: 22 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5,16 +5,25 @@ metadata:
55
name: test-pool
66
namespace: default
77
spec:
8-
defaultModel: "qwen3-8b"
8+
defaultModel: "qwen-2.5-7b"
99
models:
10-
- name: "qwen3-8b"
10+
- name: "qwen-2.5-7b"
1111
reasoningFamily: "qwen3"
1212
pricing:
13-
inputTokenPrice: 0.000001
14-
outputTokenPrice: 0.000002
13+
inputTokenPrice: 0.0000005 # $0.5 per 1M input tokens
14+
outputTokenPrice: 0.000001 # $1 per 1M output tokens
1515
loras:
16-
- name: "tech-expert"
16+
- name: "tech-support"
1717
description: "Technical support specialist"
18+
19+
- name: "qwen-2.5-72b"
20+
reasoningFamily: "qwen3"
21+
pricing:
22+
inputTokenPrice: 0.000003 # $3 per 1M input tokens
23+
outputTokenPrice: 0.000006 # $6 per 1M output tokens
24+
loras:
25+
- name: "advanced-reasoning"
26+
description: "Advanced reasoning and problem solving"
1827
---
1928
apiVersion: vllm.ai/v1alpha1
2029
kind: IntelligentRoute
@@ -43,7 +52,7 @@ spec:
4352
decisions:
4453
- name: "urgent_tech"
4554
priority: 100
46-
description: "Urgent technical support requests"
55+
description: "Urgent technical support requests - use large model with reasoning"
4756
signals:
4857
operator: "AND"
4958
conditions:
@@ -52,18 +61,19 @@ spec:
5261
- type: "embedding"
5362
name: "tech_support"
5463
modelRefs:
55-
- model: "qwen3-8b"
64+
- model: "qwen-2.5-72b"
65+
loraName: "advanced-reasoning"
5666
useReasoning: true
57-
reasoningEffort: "medium"
67+
reasoningEffort: "high"
5868
plugins:
5969
- type: "semantic-cache"
6070
configuration:
6171
enabled: true
6272
threshold: 0.9
63-
73+
6474
- name: "general_tech"
6575
priority: 50
66-
description: "General technical queries"
76+
description: "General technical queries - use small model for efficiency"
6777
signals:
6878
operator: "OR"
6979
conditions:
@@ -72,6 +82,7 @@ spec:
7282
- type: "domain"
7383
name: "computer_science"
7484
modelRefs:
75-
- model: "qwen3-8b"
85+
- model: "qwen-2.5-7b"
86+
loraName: "tech-support"
7687
useReasoning: false
7788

src/semantic-router/pkg/k8s/testdata/input/02-keyword-only.yaml

Lines changed: 24 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,23 @@ metadata:
55
name: keyword-pool
66
namespace: default
77
spec:
8-
defaultModel: "base-model"
8+
defaultModel: "gemma-2-9b"
99
models:
10-
- name: "base-model"
11-
reasoningFamily: "qwen3"
10+
- name: "gemma-2-9b"
1211
pricing:
13-
inputTokenPrice: 0.000001
14-
outputTokenPrice: 0.000002
12+
inputTokenPrice: 0.0000004 # $0.4 per 1M input tokens - fast for simple keyword matching
13+
outputTokenPrice: 0.0000008 # $0.8 per 1M output tokens
14+
loras:
15+
- name: "greeting-handler"
16+
description: "Optimized for greeting responses"
17+
18+
- name: "gemma-2-27b"
19+
pricing:
20+
inputTokenPrice: 0.000002 # $2 per 1M input tokens - better for complex urgent requests
21+
outputTokenPrice: 0.000004 # $4 per 1M output tokens
22+
loras:
23+
- name: "urgent-specialist"
24+
description: "Specialized in handling urgent requests"
1525

1626
---
1727
apiVersion: vllm.ai/v1alpha1
@@ -33,27 +43,28 @@ spec:
3343

3444
decisions:
3545
- name: "urgent_request"
36-
description: "Handle urgent requests"
46+
description: "Handle urgent requests with larger model"
3747
priority: 100
3848
signals:
3949
operator: "AND"
4050
conditions:
4151
- type: "keyword"
4252
name: "urgent"
4353
modelRefs:
44-
- model: "base-model"
45-
use_reasoning: true
46-
reasoning_effort: "high"
47-
54+
- model: "gemma-2-27b"
55+
loraName: "urgent-specialist"
56+
useReasoning: false
57+
4858
- name: "greeting_response"
49-
description: "Handle greetings"
59+
description: "Handle greetings with fast small model"
5060
priority: 50
5161
signals:
5262
operator: "AND"
5363
conditions:
5464
- type: "keyword"
5565
name: "greeting"
5666
modelRefs:
57-
- model: "base-model"
58-
use_reasoning: false
67+
- model: "gemma-2-9b"
68+
loraName: "greeting-handler"
69+
useReasoning: false
5970

src/semantic-router/pkg/k8s/testdata/input/03-embedding-only.yaml

Lines changed: 31 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,25 @@ metadata:
55
name: embedding-pool
66
namespace: default
77
spec:
8-
defaultModel: "semantic-model"
8+
defaultModel: "deepseek-v3"
99
models:
10-
- name: "semantic-model"
10+
- name: "deepseek-v3"
1111
reasoningFamily: "deepseek"
1212
pricing:
13-
inputTokenPrice: 0.000002
14-
outputTokenPrice: 0.000003
13+
inputTokenPrice: 0.00000027 # $0.27 per 1M input tokens - efficient for semantic matching
14+
outputTokenPrice: 0.0000011 # $1.1 per 1M output tokens
15+
loras:
16+
- name: "customer-support"
17+
description: "Customer support specialist"
18+
19+
- name: "deepseek-r1"
20+
reasoningFamily: "deepseek"
21+
pricing:
22+
inputTokenPrice: 0.00000055 # $0.55 per 1M input tokens - better reasoning
23+
outputTokenPrice: 0.0000022 # $2.2 per 1M output tokens
24+
loras:
25+
- name: "technical-expert"
26+
description: "Technical problem solving expert"
1527

1628
---
1729
apiVersion: vllm.ai/v1alpha1
@@ -38,29 +50,30 @@ spec:
3850
aggregationMethod: "mean"
3951

4052
decisions:
41-
- name: "support_ticket"
42-
description: "Customer support requests"
53+
- name: "tech_troubleshoot"
54+
description: "Technical troubleshooting - use reasoning model"
4355
priority: 100
4456
signals:
4557
operator: "AND"
4658
conditions:
4759
- type: "embedding"
48-
name: "customer_support"
60+
name: "technical_issue"
4961
modelRefs:
50-
- model: "semantic-model"
51-
use_reasoning: true
52-
reasoning_effort: "medium"
53-
54-
- name: "tech_troubleshoot"
55-
description: "Technical troubleshooting"
56-
priority: 90
62+
- model: "deepseek-r1"
63+
loraName: "technical-expert"
64+
useReasoning: true
65+
reasoningEffort: "high"
66+
67+
- name: "support_ticket"
68+
description: "Customer support requests - use fast model"
69+
priority: 80
5770
signals:
5871
operator: "AND"
5972
conditions:
6073
- type: "embedding"
61-
name: "technical_issue"
74+
name: "customer_support"
6275
modelRefs:
63-
- model: "semantic-model"
64-
use_reasoning: true
65-
reasoning_effort: "high"
76+
- model: "deepseek-v3"
77+
loraName: "customer-support"
78+
useReasoning: false
6679

src/semantic-router/pkg/k8s/testdata/input/04-domain-only.yaml

Lines changed: 21 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,20 @@ metadata:
55
name: domain-pool
66
namespace: default
77
spec:
8-
defaultModel: "specialist-model"
8+
defaultModel: "mistral-7b"
99
models:
10-
- name: "specialist-model"
11-
reasoningFamily: "gpt"
10+
- name: "mistral-7b"
1211
pricing:
13-
inputTokenPrice: 0.000003
14-
outputTokenPrice: 0.000005
12+
inputTokenPrice: 0.0000007 # $0.7 per 1M - fast for simple STEM queries
13+
outputTokenPrice: 0.0000014 # $1.4 per 1M
14+
loras:
15+
- name: "stem-tutor"
16+
description: "STEM education tutor"
17+
18+
- name: "mistral-large"
19+
pricing:
20+
inputTokenPrice: 0.000003 # $3 per 1M - better for complex STEM problems
21+
outputTokenPrice: 0.000009 # $9 per 1M
1522
loras:
1623
- name: "math-expert"
1724
description: "Mathematics specialist"
@@ -34,7 +41,7 @@ spec:
3441

3542
decisions:
3643
- name: "stem_query"
37-
description: "STEM domain queries"
44+
description: "Complex STEM domain queries - use large model"
3845
priority: 100
3946
signals:
4047
operator: "OR"
@@ -46,20 +53,20 @@ spec:
4653
- type: "domain"
4754
name: "computer_science"
4855
modelRefs:
49-
- model: "specialist-model"
50-
use_reasoning: true
51-
reasoning_effort: "high"
52-
56+
- model: "mistral-large"
57+
loraName: "math-expert"
58+
useReasoning: false
59+
5360
- name: "chemistry_query"
54-
description: "Chemistry domain queries"
61+
description: "Chemistry domain queries - use small model"
5562
priority: 80
5663
signals:
5764
operator: "AND"
5865
conditions:
5966
- type: "domain"
6067
name: "chemistry"
6168
modelRefs:
62-
- model: "specialist-model"
63-
use_reasoning: true
64-
reasoning_effort: "medium"
69+
- model: "mistral-7b"
70+
loraName: "stem-tutor"
71+
useReasoning: false
6572

src/semantic-router/pkg/k8s/testdata/input/05-keyword-embedding.yaml

Lines changed: 26 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,25 @@ metadata:
55
name: hybrid-pool
66
namespace: default
77
spec:
8-
defaultModel: "hybrid-model"
8+
defaultModel: "qwen-2.5-14b"
99
models:
10-
- name: "hybrid-model"
10+
- name: "qwen-2.5-14b"
1111
reasoningFamily: "qwen3"
1212
pricing:
13-
inputTokenPrice: 0.000001
14-
outputTokenPrice: 0.000002
13+
inputTokenPrice: 0.000001 # $1 per 1M - balanced performance
14+
outputTokenPrice: 0.000002 # $2 per 1M
15+
loras:
16+
- name: "support-agent"
17+
description: "Customer support agent"
18+
19+
- name: "qwen-2.5-72b"
20+
reasoningFamily: "qwen3"
21+
pricing:
22+
inputTokenPrice: 0.000003 # $3 per 1M - for urgent/complex issues
23+
outputTokenPrice: 0.000006 # $6 per 1M
24+
loras:
25+
- name: "emergency-specialist"
26+
description: "Emergency support specialist"
1527

1628
---
1729
apiVersion: vllm.ai/v1alpha1
@@ -38,7 +50,7 @@ spec:
3850

3951
decisions:
4052
- name: "urgent_support"
41-
description: "Urgent support requests combining keyword and semantic matching"
53+
description: "Urgent support requests - use large model with reasoning"
4254
priority: 100
4355
signals:
4456
operator: "AND"
@@ -48,12 +60,13 @@ spec:
4860
- type: "embedding"
4961
name: "support_request"
5062
modelRefs:
51-
- model: "hybrid-model"
52-
use_reasoning: true
53-
reasoning_effort: "high"
54-
63+
- model: "qwen-2.5-72b"
64+
loraName: "emergency-specialist"
65+
useReasoning: true
66+
reasoningEffort: "high"
67+
5568
- name: "general_support"
56-
description: "General support requests"
69+
description: "General support requests - use small model"
5770
priority: 50
5871
signals:
5972
operator: "OR"
@@ -63,6 +76,7 @@ spec:
6376
- type: "embedding"
6477
name: "support_request"
6578
modelRefs:
66-
- model: "hybrid-model"
67-
use_reasoning: false
79+
- model: "qwen-2.5-14b"
80+
loraName: "support-agent"
81+
useReasoning: false
6882

0 commit comments

Comments (0)