Skip to content

Commit cb3e304

Browse files
yossiovadia and claude committed
refactor: simplify model names to Model-A and Model-B for E2E testing
- Update LLM Katan configuration to use simplified model names
- Simplify 00-client-request-test.py to use Model-A as default
- Update documentation to reflect math → Model-B, creative → Model-A routing
- Improve test readability and maintainability

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <[email protected]>
Signed-off-by: Yossi Ovadia <[email protected]>
1 parent 000b1f7 commit cb3e304

File tree

4 files changed

+54
-54
lines changed

4 files changed

+54
-54
lines changed

config/config.e2e.yaml

Lines changed: 48 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -43,27 +43,27 @@ vllm_endpoints:
4343
address: "127.0.0.1"
4444
port: 8000
4545
models:
46-
- "Qwen/Qwen2-0.5B-Instruct"
46+
- "Model-A"
4747
weight: 1
4848
health_check_path: "/health"
4949
- name: "tinyllama-endpoint"
5050
address: "127.0.0.1"
5151
port: 8001
5252
models:
53-
- "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
53+
- "Model-B"
5454
weight: 1
5555
health_check_path: "/health"
5656

5757
model_config:
5858

59-
"Qwen/Qwen2-0.5B-Instruct":
59+
"Model-A":
6060
use_reasoning: false
6161
reasoning_family: "qwen3" # This model uses Qwen reasoning syntax
6262
preferred_endpoints: ["qwen-endpoint"]
6363
pii_policy:
6464
allow_by_default: true
6565
pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"]
66-
"TinyLlama/TinyLlama-1.1B-Chat-v1.0":
66+
"Model-B":
6767
use_reasoning: false
6868
preferred_endpoints: ["tinyllama-endpoint"]
6969
pii_policy:
@@ -90,191 +90,191 @@ categories:
9090
reasoning_description: "Business content is typically conversational"
9191
reasoning_effort: low # Business conversations need low reasoning effort
9292
model_scores:
93-
- model: "Qwen/Qwen2-0.5B-Instruct"
93+
- model: "Model-A"
9494
score: 0.8
9595
use_reasoning: false
96-
- model: "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
96+
- model: "Model-B"
9797
score: 0.4
9898
use_reasoning: false
99-
- model: "Qwen/Qwen2-0.5B-Instruct"
99+
- model: "Model-A"
100100
score: 0.2
101101
use_reasoning: false
102102
- name: law
103103
use_reasoning: false
104104
reasoning_description: "Legal content is typically explanatory"
105105
model_scores:
106-
- model: "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
106+
- model: "Model-B"
107107
score: 0.8
108108
use_reasoning: false
109-
- model: "Qwen/Qwen2-0.5B-Instruct"
109+
- model: "Model-A"
110110
score: 0.6
111111
use_reasoning: false
112-
- model: "Qwen/Qwen2-0.5B-Instruct"
112+
- model: "Model-A"
113113
score: 0.4
114114
use_reasoning: false
115115
- name: psychology
116116
use_reasoning: false
117117
reasoning_description: "Psychology content is usually explanatory"
118118
model_scores:
119-
- model: "Qwen/Qwen2-0.5B-Instruct"
119+
- model: "Model-A"
120120
score: 0.6
121121
use_reasoning: false
122-
- model: "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
122+
- model: "Model-B"
123123
score: 0.4
124124
use_reasoning: false
125-
- model: "Qwen/Qwen2-0.5B-Instruct"
125+
- model: "Model-A"
126126
score: 0.4
127127
use_reasoning: false
128128
- name: biology
129129
use_reasoning: true
130130
reasoning_description: "Biological processes benefit from structured analysis"
131131
model_scores:
132-
- model: "Qwen/Qwen2-0.5B-Instruct"
132+
- model: "Model-A"
133133
score: 0.8
134134
use_reasoning: false
135-
- model: "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
135+
- model: "Model-B"
136136
score: 0.6
137137
use_reasoning: false
138-
- model: "Qwen/Qwen2-0.5B-Instruct"
138+
- model: "Model-A"
139139
score: 0.2
140140
use_reasoning: false
141141
- name: chemistry
142142
use_reasoning: true
143143
reasoning_description: "Chemical reactions and formulas require systematic thinking"
144144
reasoning_effort: high # Chemistry requires high reasoning effort
145145
model_scores:
146-
- model: "Qwen/Qwen2-0.5B-Instruct"
146+
- model: "Model-A"
147147
score: 0.8
148148
use_reasoning: true
149-
- model: "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
149+
- model: "Model-B"
150150
score: 0.6
151151
use_reasoning: false
152-
- model: "Qwen/Qwen2-0.5B-Instruct"
152+
- model: "Model-A"
153153
score: 0.6
154154
use_reasoning: false
155155
- name: history
156156
use_reasoning: false
157157
reasoning_description: "Historical content is narrative-based"
158158
model_scores:
159-
- model: "Qwen/Qwen2-0.5B-Instruct"
159+
- model: "Model-A"
160160
score: 0.8
161161
use_reasoning: false
162-
- model: "Qwen/Qwen2-0.5B-Instruct"
162+
- model: "Model-A"
163163
score: 0.6
164164
use_reasoning: false
165-
- model: "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
165+
- model: "Model-B"
166166
score: 0.4
167167
use_reasoning: false
168168
- name: other
169169
use_reasoning: false
170170
reasoning_description: "General content doesn't require reasoning"
171171
model_scores:
172-
- model: "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
172+
- model: "Model-B"
173173
score: 0.8
174174
use_reasoning: false
175-
- model: "Qwen/Qwen2-0.5B-Instruct"
175+
- model: "Model-A"
176176
score: 0.6
177177
use_reasoning: false
178-
- model: "Qwen/Qwen2-0.5B-Instruct"
178+
- model: "Model-A"
179179
score: 0.6
180180
use_reasoning: false
181181
- name: health
182182
use_reasoning: false
183183
reasoning_description: "Health information is typically informational"
184184
model_scores:
185-
- model: "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
185+
- model: "Model-B"
186186
score: 0.8
187187
use_reasoning: false
188-
- model: "Qwen/Qwen2-0.5B-Instruct"
188+
- model: "Model-A"
189189
score: 0.8
190190
use_reasoning: false
191-
- model: "Qwen/Qwen2-0.5B-Instruct"
191+
- model: "Model-A"
192192
score: 0.6
193193
use_reasoning: false
194194
- name: economics
195195
use_reasoning: false
196196
reasoning_description: "Economic discussions are usually explanatory"
197197
model_scores:
198-
- model: "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
198+
- model: "Model-B"
199199
score: 0.8
200200
use_reasoning: false
201-
- model: "Qwen/Qwen2-0.5B-Instruct"
201+
- model: "Model-A"
202202
score: 0.8
203203
use_reasoning: false
204-
- model: "Qwen/Qwen2-0.5B-Instruct"
204+
- model: "Model-A"
205205
score: 0.1
206206
use_reasoning: false
207207
- name: math
208208
use_reasoning: true
209209
reasoning_description: "Mathematical problems require step-by-step reasoning"
210210
reasoning_effort: high # Math problems need high reasoning effort
211211
model_scores:
212-
- model: "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
212+
- model: "Model-B"
213213
score: 1.0
214214
use_reasoning: true
215-
- model: "Qwen/Qwen2-0.5B-Instruct"
215+
- model: "Model-A"
216216
score: 0.9
217217
use_reasoning: true
218-
- model: "Qwen/Qwen2-0.5B-Instruct"
218+
- model: "Model-A"
219219
score: 0.8
220220
use_reasoning: false
221-
- model: "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
221+
- model: "Model-B"
222222
score: 0.6
223223
use_reasoning: false
224224
- name: physics
225225
use_reasoning: true
226226
reasoning_description: "Physics concepts need logical analysis"
227227
model_scores:
228-
- model: "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
228+
- model: "Model-B"
229229
score: 0.4
230230
use_reasoning: true
231-
- model: "Qwen/Qwen2-0.5B-Instruct"
231+
- model: "Model-A"
232232
score: 0.4
233233
use_reasoning: false
234-
- model: "Qwen/Qwen2-0.5B-Instruct"
234+
- model: "Model-A"
235235
score: 0.4
236236
use_reasoning: false
237237
- name: computer science
238238
use_reasoning: true
239239
reasoning_description: "Programming and algorithms need logical reasoning"
240240
model_scores:
241-
- model: "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
241+
- model: "Model-B"
242242
score: 0.6
243243
use_reasoning: false
244-
- model: "Qwen/Qwen2-0.5B-Instruct"
244+
- model: "Model-A"
245245
score: 0.6
246246
use_reasoning: false
247-
- model: "Qwen/Qwen2-0.5B-Instruct"
247+
- model: "Model-A"
248248
score: 0.1
249249
use_reasoning: false
250250
- name: philosophy
251251
use_reasoning: false
252252
reasoning_description: "Philosophical discussions are conversational"
253253
model_scores:
254-
- model: "Qwen/Qwen2-0.5B-Instruct"
254+
- model: "Model-A"
255255
score: 0.6
256256
use_reasoning: false
257-
- model: "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
257+
- model: "Model-B"
258258
score: 0.2
259259
use_reasoning: false
260-
- model: "Qwen/Qwen2-0.5B-Instruct"
260+
- model: "Model-A"
261261
score: 0.2
262262
use_reasoning: false
263263
- name: engineering
264264
use_reasoning: true
265265
reasoning_description: "Engineering problems require systematic problem-solving"
266266
model_scores:
267-
- model: "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
267+
- model: "Model-B"
268268
score: 0.6
269269
use_reasoning: false
270-
- model: "Qwen/Qwen2-0.5B-Instruct"
270+
- model: "Model-A"
271271
score: 0.6
272272
use_reasoning: false
273-
- model: "Qwen/Qwen2-0.5B-Instruct"
273+
- model: "Model-A"
274274
score: 0.2
275275
use_reasoning: false
276276

277-
default_model: "Qwen/Qwen2-0.5B-Instruct"
277+
default_model: "Model-A"
278278

279279
# API Configuration
280280
api:

e2e-tests/00-client-request-test.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
ENVOY_URL = "http://localhost:8801"
2424
OPENAI_ENDPOINT = "/v1/chat/completions"
2525
DEFAULT_MODEL = (
26-
"Qwen/Qwen2-0.5B-Instruct" # Use configured model that matches router config
26+
"Model-A" # Use configured model that matches router config
2727
)
2828
MAX_RETRIES = 3
2929
RETRY_DELAY = 2

e2e-tests/README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ This test suite provides a progressive approach to testing the Semantic Router,
88
- Tests sending requests to the Envoy proxy
99
- Verifies basic request formatting and endpoint availability
1010
- Tests malformed request validation
11-
- Tests content-based smart routing (math → TinyLlama, creative → Qwen)
11+
- Tests content-based smart routing (math → Model-B, creative → Model-A)
1212

1313
2. **01-envoy-extproc-test.py** - TBD (To Be Developed)
1414
- Tests that Envoy correctly forwards requests to the ExtProc
@@ -48,8 +48,8 @@ For fast development and testing with real tiny models (no GPU required):
4848
./e2e-tests/start-llm-katan.sh
4949

5050
# Or manually start individual servers:
51-
llm-katan --model Qwen/Qwen3-0.6B --port 8000 --served-model-name "Qwen/Qwen2-0.5B-Instruct"
52-
llm-katan --model Qwen/Qwen3-0.6B --port 8001 --served-model-name "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
51+
llm-katan --model Qwen/Qwen3-0.6B --port 8000 --served-model-name "Model-A"
52+
llm-katan --model Qwen/Qwen3-0.6B --port 8001 --served-model-name "Model-B"
5353

5454
# Terminal 2: Start Envoy proxy
5555
make run-envoy

e2e-tests/start-llm-katan.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@ PIDS_FILE="$E2E_DIR/llm_katan_pids.txt"
1616
# Model configurations for LLM Katan servers
1717
# Format: "port:real_model::served_model_name"
1818
LLM_KATAN_MODELS=(
19-
"8000:Qwen/Qwen3-0.6B::Qwen/Qwen2-0.5B-Instruct"
20-
"8001:Qwen/Qwen3-0.6B::TinyLlama/TinyLlama-1.1B-Chat-v1.0"
19+
"8000:Qwen/Qwen3-0.6B::Model-A"
20+
"8001:Qwen/Qwen3-0.6B::Model-B"
2121
)
2222

2323
# Function to check if LLM Katan is available

0 commit comments

Comments
 (0)