Skip to content

Commit 6ee7e04

Browse files
committed
feat: implement decision-based routing with plugin architecture
Signed-off-by: bitliu <[email protected]>
1 parent c7113c7 commit 6ee7e04

32 files changed

+567
-255
lines changed

src/semantic-router/pkg/k8s/testdata/input/01-basic.yaml

Lines changed: 22 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5,16 +5,25 @@ metadata:
55
name: test-pool
66
namespace: default
77
spec:
8-
defaultModel: "qwen3-8b"
8+
defaultModel: "qwen-2.5-7b"
99
models:
10-
- name: "qwen3-8b"
10+
- name: "qwen-2.5-7b"
1111
reasoningFamily: "qwen3"
1212
pricing:
13-
inputTokenPrice: 0.000001
14-
outputTokenPrice: 0.000002
13+
inputTokenPrice: 0.0000005 # $0.5 per 1M input tokens
14+
outputTokenPrice: 0.000001 # $1 per 1M output tokens
1515
loras:
16-
- name: "tech-expert"
16+
- name: "tech-support"
1717
description: "Technical support specialist"
18+
19+
- name: "qwen-2.5-72b"
20+
reasoningFamily: "qwen3"
21+
pricing:
22+
inputTokenPrice: 0.000003 # $3 per 1M input tokens
23+
outputTokenPrice: 0.000006 # $6 per 1M output tokens
24+
loras:
25+
- name: "advanced-reasoning"
26+
description: "Advanced reasoning and problem solving"
1827
---
1928
apiVersion: vllm.ai/v1alpha1
2029
kind: IntelligentRoute
@@ -43,7 +52,7 @@ spec:
4352
decisions:
4453
- name: "urgent_tech"
4554
priority: 100
46-
description: "Urgent technical support requests"
55+
description: "Urgent technical support requests - use large model with reasoning"
4756
signals:
4857
operator: "AND"
4958
conditions:
@@ -52,18 +61,19 @@ spec:
5261
- type: "embedding"
5362
name: "tech_support"
5463
modelRefs:
55-
- model: "qwen3-8b"
64+
- model: "qwen-2.5-72b"
65+
loraName: "advanced-reasoning"
5666
useReasoning: true
57-
reasoningEffort: "medium"
67+
reasoningEffort: "high"
5868
plugins:
5969
- type: "semantic-cache"
6070
configuration:
6171
enabled: true
6272
threshold: 0.9
63-
73+
6474
- name: "general_tech"
6575
priority: 50
66-
description: "General technical queries"
76+
description: "General technical queries - use small model for efficiency"
6777
signals:
6878
operator: "OR"
6979
conditions:
@@ -72,6 +82,7 @@ spec:
7282
- type: "domain"
7383
name: "computer_science"
7484
modelRefs:
75-
- model: "qwen3-8b"
85+
- model: "qwen-2.5-7b"
86+
loraName: "tech-support"
7687
useReasoning: false
7788

src/semantic-router/pkg/k8s/testdata/input/02-keyword-only.yaml

Lines changed: 24 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,23 @@ metadata:
55
name: keyword-pool
66
namespace: default
77
spec:
8-
defaultModel: "base-model"
8+
defaultModel: "gemma-2-9b"
99
models:
10-
- name: "base-model"
11-
reasoningFamily: "qwen3"
10+
- name: "gemma-2-9b"
1211
pricing:
13-
inputTokenPrice: 0.000001
14-
outputTokenPrice: 0.000002
12+
inputTokenPrice: 0.0000004 # $0.4 per 1M input tokens - fast for simple keyword matching
13+
outputTokenPrice: 0.0000008 # $0.8 per 1M output tokens
14+
loras:
15+
- name: "greeting-handler"
16+
description: "Optimized for greeting responses"
17+
18+
- name: "gemma-2-27b"
19+
pricing:
20+
inputTokenPrice: 0.000002 # $2 per 1M input tokens - better for complex urgent requests
21+
outputTokenPrice: 0.000004 # $4 per 1M output tokens
22+
loras:
23+
- name: "urgent-specialist"
24+
description: "Specialized in handling urgent requests"
1525

1626
---
1727
apiVersion: vllm.ai/v1alpha1
@@ -33,27 +43,28 @@ spec:
3343

3444
decisions:
3545
- name: "urgent_request"
36-
description: "Handle urgent requests"
46+
description: "Handle urgent requests with larger model"
3747
priority: 100
3848
signals:
3949
operator: "AND"
4050
conditions:
4151
- type: "keyword"
4252
name: "urgent"
4353
modelRefs:
44-
- model: "base-model"
45-
use_reasoning: true
46-
reasoning_effort: "high"
47-
54+
- model: "gemma-2-27b"
55+
loraName: "urgent-specialist"
56+
useReasoning: false
57+
4858
- name: "greeting_response"
49-
description: "Handle greetings"
59+
description: "Handle greetings with fast small model"
5060
priority: 50
5161
signals:
5262
operator: "AND"
5363
conditions:
5464
- type: "keyword"
5565
name: "greeting"
5666
modelRefs:
57-
- model: "base-model"
58-
use_reasoning: false
67+
- model: "gemma-2-9b"
68+
loraName: "greeting-handler"
69+
useReasoning: false
5970

src/semantic-router/pkg/k8s/testdata/input/03-embedding-only.yaml

Lines changed: 31 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,25 @@ metadata:
55
name: embedding-pool
66
namespace: default
77
spec:
8-
defaultModel: "semantic-model"
8+
defaultModel: "deepseek-v3"
99
models:
10-
- name: "semantic-model"
10+
- name: "deepseek-v3"
1111
reasoningFamily: "deepseek"
1212
pricing:
13-
inputTokenPrice: 0.000002
14-
outputTokenPrice: 0.000003
13+
inputTokenPrice: 0.00000027 # $0.27 per 1M input tokens - efficient for semantic matching
14+
outputTokenPrice: 0.0000011 # $1.1 per 1M output tokens
15+
loras:
16+
- name: "customer-support"
17+
description: "Customer support specialist"
18+
19+
- name: "deepseek-r1"
20+
reasoningFamily: "deepseek"
21+
pricing:
22+
inputTokenPrice: 0.00000055 # $0.55 per 1M input tokens - better reasoning
23+
outputTokenPrice: 0.0000022 # $2.2 per 1M output tokens
24+
loras:
25+
- name: "technical-expert"
26+
description: "Technical problem solving expert"
1527

1628
---
1729
apiVersion: vllm.ai/v1alpha1
@@ -38,29 +50,30 @@ spec:
3850
aggregationMethod: "mean"
3951

4052
decisions:
41-
- name: "support_ticket"
42-
description: "Customer support requests"
53+
- name: "tech_troubleshoot"
54+
description: "Technical troubleshooting - use reasoning model"
4355
priority: 100
4456
signals:
4557
operator: "AND"
4658
conditions:
4759
- type: "embedding"
48-
name: "customer_support"
60+
name: "technical_issue"
4961
modelRefs:
50-
- model: "semantic-model"
51-
use_reasoning: true
52-
reasoning_effort: "medium"
53-
54-
- name: "tech_troubleshoot"
55-
description: "Technical troubleshooting"
56-
priority: 90
62+
- model: "deepseek-r1"
63+
loraName: "technical-expert"
64+
useReasoning: true
65+
reasoningEffort: "high"
66+
67+
- name: "support_ticket"
68+
description: "Customer support requests - use fast model"
69+
priority: 80
5770
signals:
5871
operator: "AND"
5972
conditions:
6073
- type: "embedding"
61-
name: "technical_issue"
74+
name: "customer_support"
6275
modelRefs:
63-
- model: "semantic-model"
64-
use_reasoning: true
65-
reasoning_effort: "high"
76+
- model: "deepseek-v3"
77+
loraName: "customer-support"
78+
useReasoning: false
6679

src/semantic-router/pkg/k8s/testdata/input/04-domain-only.yaml

Lines changed: 21 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,20 @@ metadata:
55
name: domain-pool
66
namespace: default
77
spec:
8-
defaultModel: "specialist-model"
8+
defaultModel: "mistral-7b"
99
models:
10-
- name: "specialist-model"
11-
reasoningFamily: "gpt"
10+
- name: "mistral-7b"
1211
pricing:
13-
inputTokenPrice: 0.000003
14-
outputTokenPrice: 0.000005
12+
inputTokenPrice: 0.0000007 # $0.7 per 1M - fast for simple STEM queries
13+
outputTokenPrice: 0.0000014 # $1.4 per 1M
14+
loras:
15+
- name: "stem-tutor"
16+
description: "STEM education tutor"
17+
18+
- name: "mistral-large"
19+
pricing:
20+
inputTokenPrice: 0.000003 # $3 per 1M - better for complex STEM problems
21+
outputTokenPrice: 0.000009 # $9 per 1M
1522
loras:
1623
- name: "math-expert"
1724
description: "Mathematics specialist"
@@ -34,7 +41,7 @@ spec:
3441

3542
decisions:
3643
- name: "stem_query"
37-
description: "STEM domain queries"
44+
description: "Complex STEM domain queries - use large model"
3845
priority: 100
3946
signals:
4047
operator: "OR"
@@ -46,20 +53,20 @@ spec:
4653
- type: "domain"
4754
name: "computer_science"
4855
modelRefs:
49-
- model: "specialist-model"
50-
use_reasoning: true
51-
reasoning_effort: "high"
52-
56+
- model: "mistral-large"
57+
loraName: "math-expert"
58+
useReasoning: false
59+
5360
- name: "chemistry_query"
54-
description: "Chemistry domain queries"
61+
description: "Chemistry domain queries - use small model"
5562
priority: 80
5663
signals:
5764
operator: "AND"
5865
conditions:
5966
- type: "domain"
6067
name: "chemistry"
6168
modelRefs:
62-
- model: "specialist-model"
63-
use_reasoning: true
64-
reasoning_effort: "medium"
69+
- model: "mistral-7b"
70+
loraName: "stem-tutor"
71+
useReasoning: false
6572

src/semantic-router/pkg/k8s/testdata/input/05-keyword-embedding.yaml

Lines changed: 26 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,25 @@ metadata:
55
name: hybrid-pool
66
namespace: default
77
spec:
8-
defaultModel: "hybrid-model"
8+
defaultModel: "qwen-2.5-14b"
99
models:
10-
- name: "hybrid-model"
10+
- name: "qwen-2.5-14b"
1111
reasoningFamily: "qwen3"
1212
pricing:
13-
inputTokenPrice: 0.000001
14-
outputTokenPrice: 0.000002
13+
inputTokenPrice: 0.000001 # $1 per 1M - balanced performance
14+
outputTokenPrice: 0.000002 # $2 per 1M
15+
loras:
16+
- name: "support-agent"
17+
description: "Customer support agent"
18+
19+
- name: "qwen-2.5-72b"
20+
reasoningFamily: "qwen3"
21+
pricing:
22+
inputTokenPrice: 0.000003 # $3 per 1M - for urgent/complex issues
23+
outputTokenPrice: 0.000006 # $6 per 1M
24+
loras:
25+
- name: "emergency-specialist"
26+
description: "Emergency support specialist"
1527

1628
---
1729
apiVersion: vllm.ai/v1alpha1
@@ -38,7 +50,7 @@ spec:
3850

3951
decisions:
4052
- name: "urgent_support"
41-
description: "Urgent support requests combining keyword and semantic matching"
53+
description: "Urgent support requests - use large model with reasoning"
4254
priority: 100
4355
signals:
4456
operator: "AND"
@@ -48,12 +60,13 @@ spec:
4860
- type: "embedding"
4961
name: "support_request"
5062
modelRefs:
51-
- model: "hybrid-model"
52-
use_reasoning: true
53-
reasoning_effort: "high"
54-
63+
- model: "qwen-2.5-72b"
64+
loraName: "emergency-specialist"
65+
useReasoning: true
66+
reasoningEffort: "high"
67+
5568
- name: "general_support"
56-
description: "General support requests"
69+
description: "General support requests - use small model"
5770
priority: 50
5871
signals:
5972
operator: "OR"
@@ -63,6 +76,7 @@ spec:
6376
- type: "embedding"
6477
name: "support_request"
6578
modelRefs:
66-
- model: "hybrid-model"
67-
use_reasoning: false
79+
- model: "qwen-2.5-14b"
80+
loraName: "support-agent"
81+
useReasoning: false
6882

0 commit comments

Comments (0)