
Commit 7532320

test(e2e): expand classification coverage and fix cache test issues (#585)
- Add 6 new category test cases to the classification test (biology, chemistry, physics, law, economics, psychology), expanding coverage from 4 to 10 categories
- Fix cache test model names from gemma3:27b to Model-A to match the router config
- Fix cache metric names from llm_router_cache_* to llm_cache_*
- Add missing unittest import to the cache test
- Increase cache test timeouts from 10s to 120s for CPU inference
- Apply black formatting to the cache test

These improvements enable comprehensive testing of classification accuracy across all major category types and resolve cache test compatibility issues.

Signed-off-by: Yossi Ovadia <[email protected]>
1 parent 3951728 commit 7532320

File tree

2 files changed (+49, -26 lines)

e2e-tests/03-classification-api-test.py (file mode changed from 100755 to 100644)

Lines changed: 30 additions & 0 deletions

@@ -42,6 +42,36 @@
         "text": "Describe the main causes of World War I",
         "expected_category": "history",
     },
+    {
+        "name": "Biology Query",
+        "text": "Explain the process of photosynthesis in plants",
+        "expected_category": "biology",
+    },
+    {
+        "name": "Chemistry Query",
+        "text": "What is the molecular formula for glucose and how does it react with oxygen?",
+        "expected_category": "chemistry",
+    },
+    {
+        "name": "Physics Query",
+        "text": "Calculate the force required to accelerate a 10kg object at 5m/s²",
+        "expected_category": "physics",
+    },
+    {
+        "name": "Law Query",
+        "text": "What are the key differences between civil law and criminal law?",
+        "expected_category": "law",
+    },
+    {
+        "name": "Economics Query",
+        "text": "Explain the concept of supply and demand in market economics",
+        "expected_category": "economics",
+    },
+    {
+        "name": "Psychology Query",
+        "text": "Describe the stages of cognitive development according to Piaget",
+        "expected_category": "psychology",
+    },
 ]
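Each added case follows the same shape as the four existing ones: a display name, the prompt text, and the category the classifier is expected to return. As a rough sketch of how a harness might drive cases of this shape, consider the following; the /classify endpoint URL and the "category" response field are assumptions made for illustration, not taken from this repository:

import requests

# Hypothetical endpoint for this sketch; the real test targets the router's
# classification API, whose path is not shown in this diff.
CLASSIFY_URL = "http://localhost:8080/classify"

TEST_CASES = [
    {
        "name": "Biology Query",
        "text": "Explain the process of photosynthesis in plants",
        "expected_category": "biology",
    },
    # ... remaining cases as in the diff above
]


def run_cases(cases):
    """POST each prompt and compare the returned category to the expected one."""
    failures = 0
    for case in cases:
        resp = requests.post(CLASSIFY_URL, json={"text": case["text"]}, timeout=120)
        resp.raise_for_status()
        got = resp.json().get("category")  # "category" is an assumed field name
        if got == case["expected_category"]:
            print(f"PASS {case['name']}")
        else:
            failures += 1
            print(f"FAIL {case['name']}: expected {case['expected_category']}, got {got}")
    return failures


if __name__ == "__main__":
    raise SystemExit(run_cases(TEST_CASES))

A broad category mix like this guards against a classifier that over-fits to one or two frequent labels: a regression that routed everything to a single default category would now fail six additional cases.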

e2e-tests/04-cache-test.py

Lines changed: 19 additions & 26 deletions

@@ -10,6 +10,7 @@
 import os
 import sys
 import time
+import unittest
 import uuid
 
 import requests
@@ -45,7 +46,7 @@ def setUp(self):
         # Check Envoy
         try:
             payload = {
-                "model": "gemma3:27b",
+                "model": "Model-A",
                 "messages": [{"role": "user", "content": "test"}],
             }
 
@@ -89,7 +90,7 @@ def setUp(self):
         # Check if cache is enabled in metrics
         response = requests.get(ROUTER_METRICS_URL)
         metrics_text = response.text
-        if "llm_router_cache" not in metrics_text:
+        if "llm_cache" not in metrics_text:
             self.skipTest("Cache metrics not found. Semantic cache may be disabled.")
 
     def test_cache_hit_with_identical_query(self):
@@ -105,13 +106,11 @@ def test_cache_hit_with_identical_query(self):
         # Get baseline cache metrics
         response = requests.get(ROUTER_METRICS_URL)
         baseline_metrics = response.text
-        baseline_hits = (
-            extract_metric(baseline_metrics, "llm_router_cache_hits_total") or 0
-        )
+        baseline_hits = extract_metric(baseline_metrics, "llm_cache_hits_total") or 0
 
         self.print_request_info(
             payload={
-                "model": "gemma3:27b",
+                "model": "Model-A",
                 "messages": [
                     {"role": "system", "content": "You are a helpful assistant."},
                     {"role": "user", "content": query},
@@ -123,7 +122,7 @@ def test_cache_hit_with_identical_query(self):
 
         # First request should be a cache miss
         payload = {
-            "model": "gemma3:27b",
+            "model": "Model-A",
             "messages": [
                 {"role": "system", "content": "You are a helpful assistant."},
                 {"role": "user", "content": query},
@@ -136,7 +135,7 @@ def test_cache_hit_with_identical_query(self):
         # First request
         self.print_subtest_header("First Request (Expected Cache Miss)")
         response1 = requests.post(
-            f"{ENVOY_URL}{OPENAI_ENDPOINT}", headers=headers, json=payload, timeout=10
+            f"{ENVOY_URL}{OPENAI_ENDPOINT}", headers=headers, json=payload, timeout=120
         )
 
         response1_json = response1.json()
@@ -157,7 +156,7 @@ def test_cache_hit_with_identical_query(self):
         # Second identical request
         self.print_subtest_header("Second Request (Expected Cache Hit)")
         response2 = requests.post(
-            f"{ENVOY_URL}{OPENAI_ENDPOINT}", headers=headers, json=payload, timeout=10
+            f"{ENVOY_URL}{OPENAI_ENDPOINT}", headers=headers, json=payload, timeout=120
         )
 
         response2_json = response2.json()
@@ -178,9 +177,7 @@ def test_cache_hit_with_identical_query(self):
         # Check if cache hits increased
         response = requests.get(ROUTER_METRICS_URL)
         updated_metrics = response.text
-        updated_hits = (
-            extract_metric(updated_metrics, "llm_router_cache_hits_total") or 0
-        )
+        updated_hits = extract_metric(updated_metrics, "llm_cache_hits_total") or 0
 
         passed = (model1 == model2) and (updated_hits > baseline_hits)
         self.print_test_result(
@@ -215,14 +212,12 @@ def test_cache_hit_with_similar_query(self):
         # Get baseline cache metrics
         response = requests.get(ROUTER_METRICS_URL)
         baseline_metrics = response.text
-        baseline_hits = (
-            extract_metric(baseline_metrics, "llm_router_cache_hits_total") or 0
-        )
+        baseline_hits = extract_metric(baseline_metrics, "llm_cache_hits_total") or 0
 
         # First request with original query
         self.print_subtest_header("Original Query")
         payload1 = {
-            "model": "gemma3:27b",
+            "model": "Model-A",
             "messages": [
                 {"role": "system", "content": "You are a helpful assistant."},
                 {"role": "user", "content": original_query},
@@ -237,7 +232,7 @@ def test_cache_hit_with_similar_query(self):
         headers = {"Content-Type": "application/json", "X-Session-ID": session_id}
 
         response1 = requests.post(
-            f"{ENVOY_URL}{OPENAI_ENDPOINT}", headers=headers, json=payload1, timeout=10
+            f"{ENVOY_URL}{OPENAI_ENDPOINT}", headers=headers, json=payload1, timeout=120
         )
 
         response1_json = response1.json()
@@ -259,7 +254,7 @@ def test_cache_hit_with_similar_query(self):
         # Second request with similar query
         self.print_subtest_header("Similar Query")
         payload2 = {
-            "model": "gemma3:27b",
+            "model": "Model-A",
             "messages": [
                 {"role": "system", "content": "You are a helpful assistant."},
                 {"role": "user", "content": similar_query},
@@ -273,7 +268,7 @@ def test_cache_hit_with_similar_query(self):
         )
 
         response2 = requests.post(
-            f"{ENVOY_URL}{OPENAI_ENDPOINT}", headers=headers, json=payload2, timeout=10
+            f"{ENVOY_URL}{OPENAI_ENDPOINT}", headers=headers, json=payload2, timeout=120
         )
 
         response2_json = response2.json()
@@ -295,9 +290,7 @@ def test_cache_hit_with_similar_query(self):
         # Check cache metrics
         response = requests.get(ROUTER_METRICS_URL)
         updated_metrics = response.text
-        updated_hits = (
-            extract_metric(updated_metrics, "llm_router_cache_hits_total") or 0
-        )
+        updated_hits = extract_metric(updated_metrics, "llm_cache_hits_total") or 0
 
         passed = (model1 == model2) and (updated_hits > baseline_hits)
         self.print_test_result(
@@ -328,10 +321,10 @@ def test_cache_metrics(self):
 
         # Look for specific cache metrics
         cache_metrics = [
-            "llm_router_cache_hits_total",
-            "llm_router_cache_misses_total",
-            "llm_router_cache_size",
-            "llm_router_cache_max_size",
+            "llm_cache_hits_total",
+            "llm_cache_misses_total",
+            "llm_cache_size",
+            "llm_cache_max_size",
         ]
 
         metrics_found = {}
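The extract_metric helper used throughout is not part of this diff. For context, a minimal sketch of such a helper, assuming it scans Prometheus exposition text for the first sample of a named metric, might look like this (illustrative only, not the repository's actual implementation):

import re


def extract_metric(metrics_text, metric_name):
    """Return the first sample value for metric_name from Prometheus
    exposition text, or None if the metric is absent.

    Illustrative sketch; the real helper in e2e-tests may differ.
    """
    for line in metrics_text.splitlines():
        # Skip "# HELP" / "# TYPE" comments and unrelated metric families
        if line.startswith("#") or not line.startswith(metric_name):
            continue
        # Accept both bare and labeled samples, e.g.
        #   llm_cache_hits_total 42
        #   llm_cache_hits_total{backend="memory"} 42
        match = re.match(r"^\S+\s+([0-9.eE+-]+)\s*$", line)
        if match:
            return float(match.group(1))
    return None

Because llm_cache_hits_total is a counter, it only ever increases; that is why both cache-hit tests capture a baseline before issuing requests and assert updated_hits > baseline_hits rather than checking an absolute hit count.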
