Skip to content

Commit c5aef71

Browse files
CopilotrootfsCopilot
committed
Allow semantic cache similarity threshold to be set at the category level (vllm-project#493)
* Initial plan

* Add category-level cache settings: enabled and similarity_threshold

Co-authored-by: rootfs <[email protected]>

* Add comprehensive tests for category-level cache settings

Co-authored-by: rootfs <[email protected]>

* Update config files and documentation for category-level cache settings

- Updated 7 config YAML files (development, production, testing, e2e, and 3 recipes) with commented examples of category-level cache settings
- Added comprehensive documentation section explaining category-level cache configuration
- Updated semantic cache overview and in-memory cache docs with category-level examples
- Added best practices for threshold selection and privacy considerations

Co-authored-by: rootfs <[email protected]>

* Remove duplicate code in FindSimilar functions

Refactored FindSimilar() to delegate to FindSimilarWithThreshold() with default threshold instead of duplicating the entire implementation. This eliminates 226 lines of duplicate code across inmemory_cache.go and milvus_cache.go.

Co-authored-by: rootfs <[email protected]>

* Update src/semantic-router/pkg/extproc/request_handler.go

Co-authored-by: Copilot <[email protected]>

* Revert changes from unsigned commit ae39fe2

Restored the classificationText empty check that was removed in the previous commit.

Co-authored-by: rootfs <[email protected]>

---------

Co-authored-by: copilot-swe-agent[bot] <[email protected]>
Co-authored-by: rootfs <[email protected]>
Co-authored-by: Huamin Chen <[email protected]>
Co-authored-by: Copilot <[email protected]>
Signed-off-by: Huamin Chen <[email protected]>
1 parent e152644 commit c5aef71

18 files changed

+563
-41
lines changed

config/config.development.yaml

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -47,6 +47,9 @@ classifier:
4747
categories:
4848
- name: test
4949
system_prompt: "You are a test assistant."
50+
# Example: Category-level cache settings
51+
# semantic_cache_enabled: true
52+
# semantic_cache_similarity_threshold: 0.85
5053
model_scores:
5154
- model: test-model
5255
score: 1.0

config/config.e2e.yaml

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -107,6 +107,9 @@ categories:
107107
score: 0.4
108108
use_reasoning: false
109109
- name: psychology
110+
# Example: Strict cache threshold for psychology - clinical nuances matter
111+
# semantic_cache_enabled: true
112+
# semantic_cache_similarity_threshold: 0.92
110113
model_scores:
111114
- model: "Model-A"
112115
score: 0.6
@@ -156,6 +159,9 @@ categories:
156159
score: 0.4
157160
use_reasoning: false
158161
- name: other
162+
# Example: Lower threshold for general queries - better cache hit rate
163+
# semantic_cache_enabled: true
164+
# semantic_cache_similarity_threshold: 0.75
159165
model_scores:
160166
- model: "Model-B"
161167
score: 0.8
@@ -168,6 +174,9 @@ categories:
168174
score: 0.6
169175
use_reasoning: false
170176
- name: health
177+
# Example: Very strict cache threshold for health - word changes matter medically
178+
# semantic_cache_enabled: true
179+
# semantic_cache_similarity_threshold: 0.95
171180
model_scores:
172181
- model: "Model-B"
173182
score: 0.8

config/config.production.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,12 +60,18 @@ classifier:
6060
categories:
6161
- name: math
6262
system_prompt: "You are a mathematics expert. Provide step-by-step solutions."
63+
# Example: High threshold for math - precision matters
64+
# semantic_cache_enabled: true
65+
# semantic_cache_similarity_threshold: 0.92
6366
model_scores:
6467
- model: openai/gpt-oss-20b
6568
score: 1.0
6669
use_reasoning: true
6770
- name: other
6871
system_prompt: "You are a helpful assistant."
72+
# Example: Lower threshold for general queries - more cache hits
73+
# semantic_cache_enabled: true
74+
# semantic_cache_similarity_threshold: 0.75
6975
model_scores:
7076
- model: openai/gpt-oss-20b
7177
score: 0.7

config/config.recipe-accuracy.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,9 @@ categories:
8787
use_reasoning: true # Enable reasoning for legal analysis
8888
- name: psychology
8989
system_prompt: "You are a psychology expert with deep knowledge of cognitive processes, behavioral patterns, mental health, developmental psychology, social psychology, and therapeutic approaches. Provide evidence-based insights grounded in psychological research and theory. When discussing mental health topics, emphasize the importance of professional consultation and avoid providing diagnostic or therapeutic advice."
90+
# Category-level cache override (if global cache is enabled)
91+
# semantic_cache_enabled: true
92+
# semantic_cache_similarity_threshold: 0.92 # Strict for clinical nuances
9093
model_scores:
9194
- model: openai/gpt-oss-20b
9295
score: 1.0
@@ -117,6 +120,9 @@ categories:
117120
use_reasoning: false # Default queries don't need reasoning
118121
- name: health
119122
system_prompt: "You are a health and medical information expert with knowledge of anatomy, physiology, diseases, treatments, preventive care, nutrition, and wellness. Provide accurate, evidence-based health information while emphasizing that your responses are for educational purposes only and should never replace professional medical advice, diagnosis, or treatment. Always encourage users to consult healthcare professionals for medical concerns and emergencies."
123+
# Category-level cache override (if global cache is enabled)
124+
# semantic_cache_enabled: true
125+
# semantic_cache_similarity_threshold: 0.95 # Very strict - medical accuracy critical
120126
model_scores:
121127
- model: openai/gpt-oss-20b
122128
score: 1.0

config/config.recipe-latency.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,9 @@ categories:
105105
use_reasoning: false
106106
- name: other
107107
system_prompt: "Provide helpful responses."
108+
# Category-level cache (optional, already enabled globally with low threshold)
109+
# semantic_cache_enabled: true
110+
# semantic_cache_similarity_threshold: 0.65 # Even lower for general queries
108111
model_scores:
109112
- model: openai/gpt-oss-20b
110113
score: 0.7

config/config.recipe-token-efficiency.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,9 @@ categories:
110110
use_reasoning: false
111111
- name: other
112112
system_prompt: "You are a helpful assistant. Provide concise, accurate responses."
113+
# Category-level cache (optional, already enabled globally)
114+
# semantic_cache_enabled: true
115+
# semantic_cache_similarity_threshold: 0.7 # Match global or slightly lower
113116
model_scores:
114117
- model: openai/gpt-oss-20b
115118
score: 0.7

config/config.testing.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,9 @@ model_config:
4242

4343
categories:
4444
- name: other
45+
# Category-level cache settings (optional - falls back to global if not set)
46+
# semantic_cache_enabled: true
47+
# semantic_cache_similarity_threshold: 0.8
4548
model_scores:
4649
- model: openai/gpt-oss-20b
4750
score: 0.7

config/config.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,8 @@ categories:
7878
use_reasoning: false
7979
- name: psychology
8080
system_prompt: "You are a psychology expert with deep knowledge of cognitive processes, behavioral patterns, mental health, developmental psychology, social psychology, and therapeutic approaches. Provide evidence-based insights grounded in psychological research and theory. When discussing mental health topics, emphasize the importance of professional consultation and avoid providing diagnostic or therapeutic advice."
81+
semantic_cache_enabled: true
82+
semantic_cache_similarity_threshold: 0.92 # High threshold for psychology - sensitive to nuances
8183
model_scores:
8284
- model: qwen3
8385
score: 0.6
@@ -102,12 +104,16 @@ categories:
102104
use_reasoning: false
103105
- name: other
104106
system_prompt: "You are a helpful and knowledgeable assistant. Provide accurate, helpful responses across a wide range of topics."
107+
semantic_cache_enabled: true
108+
semantic_cache_similarity_threshold: 0.75 # Lower threshold for general chat - less sensitive
105109
model_scores:
106110
- model: qwen3
107111
score: 0.7
108112
use_reasoning: false
109113
- name: health
110114
system_prompt: "You are a health and medical information expert with knowledge of anatomy, physiology, diseases, treatments, preventive care, nutrition, and wellness. Provide accurate, evidence-based health information while emphasizing that your responses are for educational purposes only and should never replace professional medical advice, diagnosis, or treatment. Always encourage users to consult healthcare professionals for medical concerns and emergencies."
115+
semantic_cache_enabled: true
116+
semantic_cache_similarity_threshold: 0.95 # High threshold for health - very sensitive to word changes
111117
model_scores:
112118
- model: qwen3
113119
score: 0.5

src/semantic-router/pkg/cache/cache_interface.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,11 @@ type CacheBackend interface {
3333
// Returns the cached response, match status, and any error
3434
FindSimilar(model string, query string) ([]byte, bool, error)
3535

36+
// FindSimilarWithThreshold searches for semantically similar cached requests using a specific threshold
37+
// This allows category-specific similarity thresholds
38+
// Returns the cached response, match status, and any error
39+
FindSimilarWithThreshold(model string, query string, threshold float32) ([]byte, bool, error)
40+
3641
// Close releases all resources held by the cache backend
3742
Close() error
3843

src/semantic-router/pkg/cache/inmemory_cache.go

Lines changed: 18 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -240,20 +240,25 @@ func (c *InMemoryCache) AddEntry(requestID string, model string, query string, r
240240
return nil
241241
}
242242

243-
// FindSimilar searches for semantically similar cached requests
243+
// FindSimilar searches for semantically similar cached requests using the default threshold
244244
func (c *InMemoryCache) FindSimilar(model string, query string) ([]byte, bool, error) {
245+
return c.FindSimilarWithThreshold(model, query, c.similarityThreshold)
246+
}
247+
248+
// FindSimilarWithThreshold searches for semantically similar cached requests using a specific threshold
249+
func (c *InMemoryCache) FindSimilarWithThreshold(model string, query string, threshold float32) ([]byte, bool, error) {
245250
start := time.Now()
246251

247252
if !c.enabled {
248-
observability.Debugf("InMemoryCache.FindSimilar: cache disabled")
253+
observability.Debugf("InMemoryCache.FindSimilarWithThreshold: cache disabled")
249254
return nil, false, nil
250255
}
251256
queryPreview := query
252257
if len(query) > 50 {
253258
queryPreview = query[:50] + "..."
254259
}
255-
observability.Debugf("InMemoryCache.FindSimilar: searching for model='%s', query='%s' (len=%d chars)",
256-
model, queryPreview, len(query))
260+
observability.Debugf("InMemoryCache.FindSimilarWithThreshold: searching for model='%s', query='%s' (len=%d chars), threshold=%.4f",
261+
model, queryPreview, len(query), threshold)
257262

258263
// Generate semantic embedding using the configured model
259264
queryEmbedding, err := c.generateEmbedding(query)
@@ -270,7 +275,7 @@ func (c *InMemoryCache) FindSimilar(model string, query string) ([]byte, bool, e
270275
entriesChecked int
271276
expiredCount int
272277
)
273-
// Capture the lookup time after acquiring the read lock so TTL checks arent skewed by embedding work or lock wait
278+
// Capture the lookup time after acquiring the read lock so TTL checks aren't skewed by embedding work or lock wait
274279
now := time.Now()
275280

276281
// Compare with completed entries for the same model, tracking only the best match
@@ -325,26 +330,26 @@ func (c *InMemoryCache) FindSimilar(model string, query string) ([]byte, bool, e
325330
// Handle case where no suitable entries exist
326331
if bestIndex < 0 {
327332
atomic.AddInt64(&c.missCount, 1)
328-
observability.Debugf("InMemoryCache.FindSimilar: no entries found with responses")
333+
observability.Debugf("InMemoryCache.FindSimilarWithThreshold: no entries found with responses")
329334
metrics.RecordCacheOperation("memory", "find_similar", "miss", time.Since(start).Seconds())
330335
metrics.RecordCacheMiss()
331336
return nil, false, nil
332337
}
333338

334339
// Check if the best match meets the similarity threshold
335-
if bestSimilarity >= c.similarityThreshold {
340+
if bestSimilarity >= threshold {
336341
atomic.AddInt64(&c.hitCount, 1)
337342

338343
c.mu.Lock()
339344
c.updateAccessInfo(bestIndex, bestEntry)
340345
c.mu.Unlock()
341346

342-
observability.Debugf("InMemoryCache.FindSimilar: CACHE HIT - similarity=%.4f >= threshold=%.4f, response_size=%d bytes",
343-
bestSimilarity, c.similarityThreshold, len(bestEntry.ResponseBody))
347+
observability.Debugf("InMemoryCache.FindSimilarWithThreshold: CACHE HIT - similarity=%.4f >= threshold=%.4f, response_size=%d bytes",
348+
bestSimilarity, threshold, len(bestEntry.ResponseBody))
344349
observability.LogEvent("cache_hit", map[string]interface{}{
345350
"backend": "memory",
346351
"similarity": bestSimilarity,
347-
"threshold": c.similarityThreshold,
352+
"threshold": threshold,
348353
"model": model,
349354
})
350355
metrics.RecordCacheOperation("memory", "find_similar", "hit", time.Since(start).Seconds())
@@ -353,12 +358,12 @@ func (c *InMemoryCache) FindSimilar(model string, query string) ([]byte, bool, e
353358
}
354359

355360
atomic.AddInt64(&c.missCount, 1)
356-
observability.Debugf("InMemoryCache.FindSimilar: CACHE MISS - best_similarity=%.4f < threshold=%.4f (checked %d entries)",
357-
bestSimilarity, c.similarityThreshold, entriesChecked)
361+
observability.Debugf("InMemoryCache.FindSimilarWithThreshold: CACHE MISS - best_similarity=%.4f < threshold=%.4f (checked %d entries)",
362+
bestSimilarity, threshold, entriesChecked)
358363
observability.LogEvent("cache_miss", map[string]interface{}{
359364
"backend": "memory",
360365
"best_similarity": bestSimilarity,
361-
"threshold": c.similarityThreshold,
366+
"threshold": threshold,
362367
"model": model,
363368
"entries_checked": entriesChecked,
364369
})

0 commit comments

Comments
 (0)