Skip to content

Commit 7b78326

Browse files
Copilot, rootfs, Copilot
authored
Allow semantic cache similarity threshold to be set at the category level (#493)
* Initial plan

* Add category-level cache settings: enabled and similarity_threshold

  Co-authored-by: rootfs <[email protected]>

* Add comprehensive tests for category-level cache settings

  Co-authored-by: rootfs <[email protected]>

* Update config files and documentation for category-level cache settings

  - Updated 7 config YAML files (development, production, testing, e2e, and 3 recipes) with commented examples of category-level cache settings
  - Added comprehensive documentation section explaining category-level cache configuration
  - Updated semantic cache overview and in-memory cache docs with category-level examples
  - Added best practices for threshold selection and privacy considerations

  Co-authored-by: rootfs <[email protected]>

* Remove duplicate code in FindSimilar functions

  Refactored FindSimilar() to delegate to FindSimilarWithThreshold() with default threshold instead of duplicating the entire implementation. This eliminates 226 lines of duplicate code across inmemory_cache.go and milvus_cache.go.

  Co-authored-by: rootfs <[email protected]>

* Update src/semantic-router/pkg/extproc/request_handler.go

  Co-authored-by: Copilot <[email protected]>

* Revert changes from unsigned commit ae39fe2

  Restored the classificationText empty check that was removed in the previous commit.

  Co-authored-by: rootfs <[email protected]>

---------

Co-authored-by: copilot-swe-agent[bot] <[email protected]>
Co-authored-by: rootfs <[email protected]>
Co-authored-by: Huamin Chen <[email protected]>
Co-authored-by: Copilot <[email protected]>
1 parent 2eb7b58 commit 7b78326

18 files changed

+563
-41
lines changed

config/config.development.yaml

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -47,6 +47,9 @@ classifier:
4747
categories:
4848
- name: test
4949
system_prompt: "You are a test assistant."
50+
# Example: Category-level cache settings
51+
# semantic_cache_enabled: true
52+
# semantic_cache_similarity_threshold: 0.85
5053
model_scores:
5154
- model: test-model
5255
score: 1.0

config/config.e2e.yaml

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -107,6 +107,9 @@ categories:
107107
score: 0.4
108108
use_reasoning: false
109109
- name: psychology
110+
# Example: Strict cache threshold for psychology - clinical nuances matter
111+
# semantic_cache_enabled: true
112+
# semantic_cache_similarity_threshold: 0.92
110113
model_scores:
111114
- model: "Model-A"
112115
score: 0.6
@@ -156,6 +159,9 @@ categories:
156159
score: 0.4
157160
use_reasoning: false
158161
- name: other
162+
# Example: Lower threshold for general queries - better cache hit rate
163+
# semantic_cache_enabled: true
164+
# semantic_cache_similarity_threshold: 0.75
159165
model_scores:
160166
- model: "Model-B"
161167
score: 0.8
@@ -168,6 +174,9 @@ categories:
168174
score: 0.6
169175
use_reasoning: false
170176
- name: health
177+
# Example: Very strict cache threshold for health - word changes matter medically
178+
# semantic_cache_enabled: true
179+
# semantic_cache_similarity_threshold: 0.95
171180
model_scores:
172181
- model: "Model-B"
173182
score: 0.8

config/config.production.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,12 +60,18 @@ classifier:
6060
categories:
6161
- name: math
6262
system_prompt: "You are a mathematics expert. Provide step-by-step solutions."
63+
# Example: High threshold for math - precision matters
64+
# semantic_cache_enabled: true
65+
# semantic_cache_similarity_threshold: 0.92
6366
model_scores:
6467
- model: openai/gpt-oss-20b
6568
score: 1.0
6669
use_reasoning: true
6770
- name: other
6871
system_prompt: "You are a helpful assistant."
72+
# Example: Lower threshold for general queries - more cache hits
73+
# semantic_cache_enabled: true
74+
# semantic_cache_similarity_threshold: 0.75
6975
model_scores:
7076
- model: openai/gpt-oss-20b
7177
score: 0.7

config/config.recipe-accuracy.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,9 @@ categories:
8787
use_reasoning: true # Enable reasoning for legal analysis
8888
- name: psychology
8989
system_prompt: "You are a psychology expert with deep knowledge of cognitive processes, behavioral patterns, mental health, developmental psychology, social psychology, and therapeutic approaches. Provide evidence-based insights grounded in psychological research and theory. When discussing mental health topics, emphasize the importance of professional consultation and avoid providing diagnostic or therapeutic advice."
90+
# Category-level cache override (if global cache is enabled)
91+
# semantic_cache_enabled: true
92+
# semantic_cache_similarity_threshold: 0.92 # Strict for clinical nuances
9093
model_scores:
9194
- model: openai/gpt-oss-20b
9295
score: 1.0
@@ -117,6 +120,9 @@ categories:
117120
use_reasoning: false # Default queries don't need reasoning
118121
- name: health
119122
system_prompt: "You are a health and medical information expert with knowledge of anatomy, physiology, diseases, treatments, preventive care, nutrition, and wellness. Provide accurate, evidence-based health information while emphasizing that your responses are for educational purposes only and should never replace professional medical advice, diagnosis, or treatment. Always encourage users to consult healthcare professionals for medical concerns and emergencies."
123+
# Category-level cache override (if global cache is enabled)
124+
# semantic_cache_enabled: true
125+
# semantic_cache_similarity_threshold: 0.95 # Very strict - medical accuracy critical
120126
model_scores:
121127
- model: openai/gpt-oss-20b
122128
score: 1.0

config/config.recipe-latency.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,9 @@ categories:
105105
use_reasoning: false
106106
- name: other
107107
system_prompt: "Provide helpful responses."
108+
# Category-level cache (optional, already enabled globally with low threshold)
109+
# semantic_cache_enabled: true
110+
# semantic_cache_similarity_threshold: 0.65 # Even lower for general queries
108111
model_scores:
109112
- model: openai/gpt-oss-20b
110113
score: 0.7

config/config.recipe-token-efficiency.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,9 @@ categories:
110110
use_reasoning: false
111111
- name: other
112112
system_prompt: "You are a helpful assistant. Provide concise, accurate responses."
113+
# Category-level cache (optional, already enabled globally)
114+
# semantic_cache_enabled: true
115+
# semantic_cache_similarity_threshold: 0.7 # Match global or slightly lower
113116
model_scores:
114117
- model: openai/gpt-oss-20b
115118
score: 0.7

config/config.testing.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,9 @@ model_config:
4242

4343
categories:
4444
- name: other
45+
# Category-level cache settings (optional - falls back to global if not set)
46+
# semantic_cache_enabled: true
47+
# semantic_cache_similarity_threshold: 0.8
4548
model_scores:
4649
- model: openai/gpt-oss-20b
4750
score: 0.7

config/config.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,8 @@ categories:
7474
use_reasoning: false
7575
- name: psychology
7676
system_prompt: "You are a psychology expert with deep knowledge of cognitive processes, behavioral patterns, mental health, developmental psychology, social psychology, and therapeutic approaches. Provide evidence-based insights grounded in psychological research and theory. When discussing mental health topics, emphasize the importance of professional consultation and avoid providing diagnostic or therapeutic advice."
77+
semantic_cache_enabled: true
78+
semantic_cache_similarity_threshold: 0.92 # High threshold for psychology - sensitive to nuances
7779
model_scores:
7880
- model: qwen3
7981
score: 0.6
@@ -98,12 +100,16 @@ categories:
98100
use_reasoning: false
99101
- name: other
100102
system_prompt: "You are a helpful and knowledgeable assistant. Provide accurate, helpful responses across a wide range of topics."
103+
semantic_cache_enabled: true
104+
semantic_cache_similarity_threshold: 0.75 # Lower threshold for general chat - less sensitive
101105
model_scores:
102106
- model: qwen3
103107
score: 0.7
104108
use_reasoning: false
105109
- name: health
106110
system_prompt: "You are a health and medical information expert with knowledge of anatomy, physiology, diseases, treatments, preventive care, nutrition, and wellness. Provide accurate, evidence-based health information while emphasizing that your responses are for educational purposes only and should never replace professional medical advice, diagnosis, or treatment. Always encourage users to consult healthcare professionals for medical concerns and emergencies."
111+
semantic_cache_enabled: true
112+
semantic_cache_similarity_threshold: 0.95 # High threshold for health - very sensitive to word changes
107113
model_scores:
108114
- model: qwen3
109115
score: 0.5

src/semantic-router/pkg/cache/cache_interface.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,11 @@ type CacheBackend interface {
3333
// Returns the cached response, match status, and any error
3434
FindSimilar(model string, query string) ([]byte, bool, error)
3535

36+
// FindSimilarWithThreshold searches for semantically similar cached requests using a specific threshold
37+
// This allows category-specific similarity thresholds
38+
// Returns the cached response, match status, and any error
39+
FindSimilarWithThreshold(model string, query string, threshold float32) ([]byte, bool, error)
40+
3641
// Close releases all resources held by the cache backend
3742
Close() error
3843

src/semantic-router/pkg/cache/inmemory_cache.go

Lines changed: 18 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -207,20 +207,25 @@ func (c *InMemoryCache) AddEntry(requestID string, model string, query string, r
207207
return nil
208208
}
209209

210-
// FindSimilar searches for semantically similar cached requests
210+
// FindSimilar searches for semantically similar cached requests using the default threshold
211211
func (c *InMemoryCache) FindSimilar(model string, query string) ([]byte, bool, error) {
212+
return c.FindSimilarWithThreshold(model, query, c.similarityThreshold)
213+
}
214+
215+
// FindSimilarWithThreshold searches for semantically similar cached requests using a specific threshold
216+
func (c *InMemoryCache) FindSimilarWithThreshold(model string, query string, threshold float32) ([]byte, bool, error) {
212217
start := time.Now()
213218

214219
if !c.enabled {
215-
observability.Debugf("InMemoryCache.FindSimilar: cache disabled")
220+
observability.Debugf("InMemoryCache.FindSimilarWithThreshold: cache disabled")
216221
return nil, false, nil
217222
}
218223
queryPreview := query
219224
if len(query) > 50 {
220225
queryPreview = query[:50] + "..."
221226
}
222-
observability.Debugf("InMemoryCache.FindSimilar: searching for model='%s', query='%s' (len=%d chars)",
223-
model, queryPreview, len(query))
227+
observability.Debugf("InMemoryCache.FindSimilarWithThreshold: searching for model='%s', query='%s' (len=%d chars), threshold=%.4f",
228+
model, queryPreview, len(query), threshold)
224229

225230
// Generate semantic embedding for similarity comparison
226231
queryEmbedding, err := candle_binding.GetEmbedding(query, 0) // Auto-detect dimension
@@ -237,7 +242,7 @@ func (c *InMemoryCache) FindSimilar(model string, query string) ([]byte, bool, e
237242
entriesChecked int
238243
expiredCount int
239244
)
240-
// Capture the lookup time after acquiring the read lock so TTL checks arent skewed by embedding work or lock wait
245+
// Capture the lookup time after acquiring the read lock so TTL checks aren't skewed by embedding work or lock wait
241246
now := time.Now()
242247

243248
// Compare with completed entries for the same model, tracking only the best match
@@ -292,26 +297,26 @@ func (c *InMemoryCache) FindSimilar(model string, query string) ([]byte, bool, e
292297
// Handle case where no suitable entries exist
293298
if bestIndex < 0 {
294299
atomic.AddInt64(&c.missCount, 1)
295-
observability.Debugf("InMemoryCache.FindSimilar: no entries found with responses")
300+
observability.Debugf("InMemoryCache.FindSimilarWithThreshold: no entries found with responses")
296301
metrics.RecordCacheOperation("memory", "find_similar", "miss", time.Since(start).Seconds())
297302
metrics.RecordCacheMiss()
298303
return nil, false, nil
299304
}
300305

301306
// Check if the best match meets the similarity threshold
302-
if bestSimilarity >= c.similarityThreshold {
307+
if bestSimilarity >= threshold {
303308
atomic.AddInt64(&c.hitCount, 1)
304309

305310
c.mu.Lock()
306311
c.updateAccessInfo(bestIndex, bestEntry)
307312
c.mu.Unlock()
308313

309-
observability.Debugf("InMemoryCache.FindSimilar: CACHE HIT - similarity=%.4f >= threshold=%.4f, response_size=%d bytes",
310-
bestSimilarity, c.similarityThreshold, len(bestEntry.ResponseBody))
314+
observability.Debugf("InMemoryCache.FindSimilarWithThreshold: CACHE HIT - similarity=%.4f >= threshold=%.4f, response_size=%d bytes",
315+
bestSimilarity, threshold, len(bestEntry.ResponseBody))
311316
observability.LogEvent("cache_hit", map[string]interface{}{
312317
"backend": "memory",
313318
"similarity": bestSimilarity,
314-
"threshold": c.similarityThreshold,
319+
"threshold": threshold,
315320
"model": model,
316321
})
317322
metrics.RecordCacheOperation("memory", "find_similar", "hit", time.Since(start).Seconds())
@@ -320,12 +325,12 @@ func (c *InMemoryCache) FindSimilar(model string, query string) ([]byte, bool, e
320325
}
321326

322327
atomic.AddInt64(&c.missCount, 1)
323-
observability.Debugf("InMemoryCache.FindSimilar: CACHE MISS - best_similarity=%.4f < threshold=%.4f (checked %d entries)",
324-
bestSimilarity, c.similarityThreshold, entriesChecked)
328+
observability.Debugf("InMemoryCache.FindSimilarWithThreshold: CACHE MISS - best_similarity=%.4f < threshold=%.4f (checked %d entries)",
329+
bestSimilarity, threshold, entriesChecked)
325330
observability.LogEvent("cache_miss", map[string]interface{}{
326331
"backend": "memory",
327332
"best_similarity": bestSimilarity,
328-
"threshold": c.similarityThreshold,
333+
"threshold": threshold,
329334
"model": model,
330335
"entries_checked": entriesChecked,
331336
})

0 commit comments

Comments
 (0)