@@ -10,6 +10,7 @@
 import os
 import sys
 import time
+import unittest
 import uuid
 
 import requests
@@ -45,7 +46,7 @@ def setUp(self):
         # Check Envoy
         try:
             payload = {
-                "model": "gemma3:27b",
+                "model": "Model-A",
                 "messages": [{"role": "user", "content": "test"}],
             }
 
@@ -89,7 +90,7 @@ def setUp(self):
         # Check if cache is enabled in metrics
         response = requests.get(ROUTER_METRICS_URL)
         metrics_text = response.text
-        if "llm_router_cache" not in metrics_text:
+        if "llm_cache" not in metrics_text:
             self.skipTest("Cache metrics not found. Semantic cache may be disabled.")
 
     def test_cache_hit_with_identical_query(self):
@@ -105,13 +106,11 @@ def test_cache_hit_with_identical_query(self):
         # Get baseline cache metrics
         response = requests.get(ROUTER_METRICS_URL)
         baseline_metrics = response.text
-        baseline_hits = (
-            extract_metric(baseline_metrics, "llm_router_cache_hits_total") or 0
-        )
+        baseline_hits = extract_metric(baseline_metrics, "llm_cache_hits_total") or 0
 
         self.print_request_info(
             payload={
-                "model": "gemma3:27b",
+                "model": "Model-A",
                 "messages": [
                     {"role": "system", "content": "You are a helpful assistant."},
                     {"role": "user", "content": query},
@@ -123,7 +122,7 @@ def test_cache_hit_with_identical_query(self):
 
         # First request should be a cache miss
         payload = {
-            "model": "gemma3:27b",
+            "model": "Model-A",
             "messages": [
                 {"role": "system", "content": "You are a helpful assistant."},
                 {"role": "user", "content": query},
@@ -136,7 +135,7 @@ def test_cache_hit_with_identical_query(self):
         # First request
         self.print_subtest_header("First Request (Expected Cache Miss)")
         response1 = requests.post(
-            f"{ENVOY_URL}{OPENAI_ENDPOINT}", headers=headers, json=payload, timeout=10
+            f"{ENVOY_URL}{OPENAI_ENDPOINT}", headers=headers, json=payload, timeout=120
         )
 
         response1_json = response1.json()
@@ -157,7 +156,7 @@ def test_cache_hit_with_identical_query(self):
         # Second identical request
         self.print_subtest_header("Second Request (Expected Cache Hit)")
         response2 = requests.post(
-            f"{ENVOY_URL}{OPENAI_ENDPOINT}", headers=headers, json=payload, timeout=10
+            f"{ENVOY_URL}{OPENAI_ENDPOINT}", headers=headers, json=payload, timeout=120
         )
 
         response2_json = response2.json()
@@ -178,9 +177,7 @@ def test_cache_hit_with_identical_query(self):
         # Check if cache hits increased
         response = requests.get(ROUTER_METRICS_URL)
         updated_metrics = response.text
-        updated_hits = (
-            extract_metric(updated_metrics, "llm_router_cache_hits_total") or 0
-        )
+        updated_hits = extract_metric(updated_metrics, "llm_cache_hits_total") or 0
 
         passed = (model1 == model2) and (updated_hits > baseline_hits)
         self.print_test_result(
@@ -215,14 +212,12 @@ def test_cache_hit_with_similar_query(self):
         # Get baseline cache metrics
         response = requests.get(ROUTER_METRICS_URL)
         baseline_metrics = response.text
-        baseline_hits = (
-            extract_metric(baseline_metrics, "llm_router_cache_hits_total") or 0
-        )
+        baseline_hits = extract_metric(baseline_metrics, "llm_cache_hits_total") or 0
 
         # First request with original query
         self.print_subtest_header("Original Query")
         payload1 = {
-            "model": "gemma3:27b",
+            "model": "Model-A",
             "messages": [
                 {"role": "system", "content": "You are a helpful assistant."},
                 {"role": "user", "content": original_query},
@@ -237,7 +232,7 @@ def test_cache_hit_with_similar_query(self):
         headers = {"Content-Type": "application/json", "X-Session-ID": session_id}
 
         response1 = requests.post(
-            f"{ENVOY_URL}{OPENAI_ENDPOINT}", headers=headers, json=payload1, timeout=10
+            f"{ENVOY_URL}{OPENAI_ENDPOINT}", headers=headers, json=payload1, timeout=120
         )
 
         response1_json = response1.json()
@@ -259,7 +254,7 @@ def test_cache_hit_with_similar_query(self):
         # Second request with similar query
         self.print_subtest_header("Similar Query")
         payload2 = {
-            "model": "gemma3:27b",
+            "model": "Model-A",
             "messages": [
                 {"role": "system", "content": "You are a helpful assistant."},
                 {"role": "user", "content": similar_query},
@@ -273,7 +268,7 @@ def test_cache_hit_with_similar_query(self):
         )
 
         response2 = requests.post(
-            f"{ENVOY_URL}{OPENAI_ENDPOINT}", headers=headers, json=payload2, timeout=10
+            f"{ENVOY_URL}{OPENAI_ENDPOINT}", headers=headers, json=payload2, timeout=120
         )
 
         response2_json = response2.json()
@@ -295,9 +290,7 @@ def test_cache_hit_with_similar_query(self):
         # Check cache metrics
         response = requests.get(ROUTER_METRICS_URL)
         updated_metrics = response.text
-        updated_hits = (
-            extract_metric(updated_metrics, "llm_router_cache_hits_total") or 0
-        )
+        updated_hits = extract_metric(updated_metrics, "llm_cache_hits_total") or 0
 
         passed = (model1 == model2) and (updated_hits > baseline_hits)
         self.print_test_result(
@@ -328,10 +321,10 @@ def test_cache_metrics(self):
 
         # Look for specific cache metrics
         cache_metrics = [
-            "llm_router_cache_hits_total",
-            "llm_router_cache_misses_total",
-            "llm_router_cache_size",
-            "llm_router_cache_max_size",
+            "llm_cache_hits_total",
+            "llm_cache_misses_total",
+            "llm_cache_size",
+            "llm_cache_max_size",
        ]
 
         metrics_found = {}
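
For reference, the tests in this diff lean on an `extract_metric` helper (defined elsewhere in the file) to pull counter values such as `llm_cache_hits_total` out of the router's Prometheus-style `/metrics` output. Its body is not shown here; the following is a minimal sketch of what such a helper might look like, assuming the standard text exposition format (`name{labels} value`), not the file's actual implementation:

```python
import re
from typing import Optional


def extract_metric(metrics_text: str, metric_name: str) -> Optional[float]:
    """Sum every sample of `metric_name` in Prometheus text output.

    Hypothetical sketch: assumes lines of the form `name{labels} value`
    or `name value`, and returns None when the metric is absent.
    """
    pattern = re.compile(rf"^{re.escape(metric_name)}(?:\{{[^}}]*\}})?\s+(\S+)")
    total = None
    for line in metrics_text.splitlines():
        if line.startswith("#"):  # skip HELP/TYPE comment lines
            continue
        match = pattern.match(line)
        if match:
            total = (total or 0.0) + float(match.group(1))
    return total
```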