Allow semantic cache similarity threshold to be set at the category level (vllm-project#493)

Copilot · rootfs · Copilot · rootfs · commit c5aef71d75e0 · 2025-10-23T15:42:28.000Z
* Initial plan

* Add category-level cache settings: enabled and similarity_threshold

Co-authored-by: rootfs <7062400+rootfs@users.noreply.github.com>

* Add comprehensive tests for category-level cache settings

Co-authored-by: rootfs <7062400+rootfs@users.noreply.github.com>

* Update config files and documentation for category-level cache settings

- Updated 7 config YAML files (development, production, testing, e2e, and 3 recipes) with commented examples of category-level cache settings
- Added comprehensive documentation section explaining category-level cache configuration
- Updated semantic cache overview and in-memory cache docs with category-level examples
- Added best practices for threshold selection and privacy considerations

Co-authored-by: rootfs <7062400+rootfs@users.noreply.github.com>

* Remove duplicate code in FindSimilar functions

Refactored FindSimilar() to delegate to FindSimilarWithThreshold() with default threshold instead of duplicating the entire implementation. This eliminates 226 lines of duplicate code across inmemory_cache.go and milvus_cache.go.

Co-authored-by: rootfs <7062400+rootfs@users.noreply.github.com>

* Update src/semantic-router/pkg/extproc/request_handler.go

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Revert changes from unsigned commit ae39fe2

Restored the classificationText empty check that was removed in the previous commit.

Co-authored-by: rootfs <7062400+rootfs@users.noreply.github.com>

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: rootfs <7062400+rootfs@users.noreply.github.com>
Co-authored-by: Huamin Chen <rootfs@users.noreply.github.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Signed-off-by: Huamin Chen <hchen@redhat.com>
diff --git a/config/config.development.yaml b/config/config.development.yaml
@@ -47,6 +47,9 @@ classifier:
 categories:
   - name: test
     system_prompt: "You are a test assistant."
+    # Example: Category-level cache settings
+    # semantic_cache_enabled: true
+    # semantic_cache_similarity_threshold: 0.85
     model_scores:
       - model: test-model
         score: 1.0
diff --git a/config/config.e2e.yaml b/config/config.e2e.yaml
@@ -107,6 +107,9 @@ categories:
         score: 0.4
         use_reasoning: false
   - name: psychology
+    # Example: Strict cache threshold for psychology - clinical nuances matter
+    # semantic_cache_enabled: true
+    # semantic_cache_similarity_threshold: 0.92
     model_scores:
       - model: "Model-A"
         score: 0.6
@@ -156,6 +159,9 @@ categories:
         score: 0.4
         use_reasoning: false
   - name: other
+    # Example: Lower threshold for general queries - better cache hit rate
+    # semantic_cache_enabled: true
+    # semantic_cache_similarity_threshold: 0.75
     model_scores:
       - model: "Model-B"
         score: 0.8
@@ -168,6 +174,9 @@ categories:
         score: 0.6
         use_reasoning: false
   - name: health
+    # Example: Very strict cache threshold for health - word changes matter medically
+    # semantic_cache_enabled: true
+    # semantic_cache_similarity_threshold: 0.95
     model_scores:
       - model: "Model-B"
         score: 0.8
diff --git a/config/config.production.yaml b/config/config.production.yaml
@@ -60,12 +60,18 @@ classifier:
 categories:
   - name: math
     system_prompt: "You are a mathematics expert. Provide step-by-step solutions."
+    # Example: High threshold for math - precision matters
+    # semantic_cache_enabled: true
+    # semantic_cache_similarity_threshold: 0.92
     model_scores:
       - model: openai/gpt-oss-20b
         score: 1.0
         use_reasoning: true
   - name: other
     system_prompt: "You are a helpful assistant."
+    # Example: Lower threshold for general queries - more cache hits
+    # semantic_cache_enabled: true
+    # semantic_cache_similarity_threshold: 0.75
     model_scores:
       - model: openai/gpt-oss-20b
         score: 0.7
diff --git a/config/config.recipe-accuracy.yaml b/config/config.recipe-accuracy.yaml
@@ -87,6 +87,9 @@ categories:
         use_reasoning: true  # Enable reasoning for legal analysis
   - name: psychology
     system_prompt: "You are a psychology expert with deep knowledge of cognitive processes, behavioral patterns, mental health, developmental psychology, social psychology, and therapeutic approaches. Provide evidence-based insights grounded in psychological research and theory. When discussing mental health topics, emphasize the importance of professional consultation and avoid providing diagnostic or therapeutic advice."
+    # Category-level cache override (if global cache is enabled)
+    # semantic_cache_enabled: true
+    # semantic_cache_similarity_threshold: 0.92  # Strict for clinical nuances
     model_scores:
       - model: openai/gpt-oss-20b
         score: 1.0
@@ -117,6 +120,9 @@ categories:
         use_reasoning: false  # Default queries don't need reasoning
   - name: health
     system_prompt: "You are a health and medical information expert with knowledge of anatomy, physiology, diseases, treatments, preventive care, nutrition, and wellness. Provide accurate, evidence-based health information while emphasizing that your responses are for educational purposes only and should never replace professional medical advice, diagnosis, or treatment. Always encourage users to consult healthcare professionals for medical concerns and emergencies."
+    # Category-level cache override (if global cache is enabled)
+    # semantic_cache_enabled: true
+    # semantic_cache_similarity_threshold: 0.95  # Very strict - medical accuracy critical
     model_scores:
       - model: openai/gpt-oss-20b
         score: 1.0
diff --git a/config/config.recipe-latency.yaml b/config/config.recipe-latency.yaml
@@ -105,6 +105,9 @@ categories:
         use_reasoning: false
   - name: other
     system_prompt: "Provide helpful responses."
+    # Category-level cache (optional, already enabled globally with low threshold)
+    # semantic_cache_enabled: true
+    # semantic_cache_similarity_threshold: 0.65  # Even lower for general queries
     model_scores:
       - model: openai/gpt-oss-20b
         score: 0.7
diff --git a/config/config.recipe-token-efficiency.yaml b/config/config.recipe-token-efficiency.yaml
@@ -110,6 +110,9 @@ categories:
         use_reasoning: false
   - name: other
     system_prompt: "You are a helpful assistant. Provide concise, accurate responses."
+    # Category-level cache (optional, already enabled globally)
+    # semantic_cache_enabled: true
+    # semantic_cache_similarity_threshold: 0.7  # Match global or slightly lower
     model_scores:
       - model: openai/gpt-oss-20b
         score: 0.7
diff --git a/config/config.testing.yaml b/config/config.testing.yaml
@@ -42,6 +42,9 @@ model_config:
 
 categories:
   - name: other
+    # Category-level cache settings (optional - falls back to global if not set)
+    # semantic_cache_enabled: true
+    # semantic_cache_similarity_threshold: 0.8
     model_scores:
       - model: openai/gpt-oss-20b
         score: 0.7
diff --git a/config/config.yaml b/config/config.yaml
@@ -78,6 +78,8 @@ categories:
         use_reasoning: false
   - name: psychology
     system_prompt: "You are a psychology expert with deep knowledge of cognitive processes, behavioral patterns, mental health, developmental psychology, social psychology, and therapeutic approaches. Provide evidence-based insights grounded in psychological research and theory. When discussing mental health topics, emphasize the importance of professional consultation and avoid providing diagnostic or therapeutic advice."
+    semantic_cache_enabled: true
+    semantic_cache_similarity_threshold: 0.92 # High threshold for psychology - sensitive to nuances
     model_scores:
       - model: qwen3
         score: 0.6
@@ -102,12 +104,16 @@ categories:
         use_reasoning: false
   - name: other
     system_prompt: "You are a helpful and knowledgeable assistant. Provide accurate, helpful responses across a wide range of topics."
+    semantic_cache_enabled: true
+    semantic_cache_similarity_threshold: 0.75 # Lower threshold for general chat - less sensitive
     model_scores:
       - model: qwen3
         score: 0.7
         use_reasoning: false
   - name: health
     system_prompt: "You are a health and medical information expert with knowledge of anatomy, physiology, diseases, treatments, preventive care, nutrition, and wellness. Provide accurate, evidence-based health information while emphasizing that your responses are for educational purposes only and should never replace professional medical advice, diagnosis, or treatment. Always encourage users to consult healthcare professionals for medical concerns and emergencies."
+    semantic_cache_enabled: true
+    semantic_cache_similarity_threshold: 0.95 # High threshold for health - very sensitive to word changes
     model_scores:
       - model: qwen3
         score: 0.5
diff --git a/src/semantic-router/pkg/cache/cache_interface.go b/src/semantic-router/pkg/cache/cache_interface.go
@@ -33,6 +33,11 @@ type CacheBackend interface {
 	// Returns the cached response, match status, and any error
 	FindSimilar(model string, query string) ([]byte, bool, error)
 
+	// FindSimilarWithThreshold searches for semantically similar cached requests using a specific threshold
+	// This allows category-specific similarity thresholds
+	// Returns the cached response, match status, and any error
+	FindSimilarWithThreshold(model string, query string, threshold float32) ([]byte, bool, error)
+
 	// Close releases all resources held by the cache backend
 	Close() error
 
diff --git a/src/semantic-router/pkg/cache/inmemory_cache.go b/src/semantic-router/pkg/cache/inmemory_cache.go
@@ -240,20 +240,25 @@ func (c *InMemoryCache) AddEntry(requestID string, model string, query string, r
 	return nil
 }
 
-// FindSimilar searches for semantically similar cached requests
+// FindSimilar looks up a semantically similar cached request by delegating to FindSimilarWithThreshold with the cache-wide c.similarityThreshold.
 func (c *InMemoryCache) FindSimilar(model string, query string) ([]byte, bool, error) {
+	return c.FindSimilarWithThreshold(model, query, c.similarityThreshold)
+}
+
+// FindSimilarWithThreshold searches for semantically similar cached requests using a specific threshold
+func (c *InMemoryCache) FindSimilarWithThreshold(model string, query string, threshold float32) ([]byte, bool, error) {
 	start := time.Now()
 
 	if !c.enabled {
-		observability.Debugf("InMemoryCache.FindSimilar: cache disabled")
+		observability.Debugf("InMemoryCache.FindSimilarWithThreshold: cache disabled")
 		return nil, false, nil
 	}
 	queryPreview := query
 	if len(query) > 50 {
 		queryPreview = query[:50] + "..."
 	}
-	observability.Debugf("InMemoryCache.FindSimilar: searching for model='%s', query='%s' (len=%d chars)",
-		model, queryPreview, len(query))
+	observability.Debugf("InMemoryCache.FindSimilarWithThreshold: searching for model='%s', query='%s' (len=%d chars), threshold=%.4f",
+		model, queryPreview, len(query), threshold)
 
 	// Generate semantic embedding using the configured model
 	queryEmbedding, err := c.generateEmbedding(query)
@@ -270,7 +275,7 @@ func (c *InMemoryCache) FindSimilar(model string, query string) ([]byte, bool, e
 		entriesChecked int
 		expiredCount   int
 	)
-	// Capture the lookup time after acquiring the read lock so TTL checks aren’t skewed by embedding work or lock wait
+	// Capture the lookup time after acquiring the read lock so TTL checks aren't skewed by embedding work or lock wait
 	now := time.Now()
 
 	// Compare with completed entries for the same model, tracking only the best match
@@ -325,26 +330,26 @@ func (c *InMemoryCache) FindSimilar(model string, query string) ([]byte, bool, e
 	// Handle case where no suitable entries exist
 	if bestIndex < 0 {
 		atomic.AddInt64(&c.missCount, 1)
-		observability.Debugf("InMemoryCache.FindSimilar: no entries found with responses")
+		observability.Debugf("InMemoryCache.FindSimilarWithThreshold: no entries found with responses")
 		metrics.RecordCacheOperation("memory", "find_similar", "miss", time.Since(start).Seconds())
 		metrics.RecordCacheMiss()
 		return nil, false, nil
 	}
 
 	// Check if the best match meets the similarity threshold
-	if bestSimilarity >= c.similarityThreshold {
+	if bestSimilarity >= threshold {
 		atomic.AddInt64(&c.hitCount, 1)
 
 		c.mu.Lock()
 		c.updateAccessInfo(bestIndex, bestEntry)
 		c.mu.Unlock()
 
-		observability.Debugf("InMemoryCache.FindSimilar: CACHE HIT - similarity=%.4f >= threshold=%.4f, response_size=%d bytes",
-			bestSimilarity, c.similarityThreshold, len(bestEntry.ResponseBody))
+		observability.Debugf("InMemoryCache.FindSimilarWithThreshold: CACHE HIT - similarity=%.4f >= threshold=%.4f, response_size=%d bytes",
+			bestSimilarity, threshold, len(bestEntry.ResponseBody))
 		observability.LogEvent("cache_hit", map[string]interface{}{
 			"backend":    "memory",
 			"similarity": bestSimilarity,
-			"threshold":  c.similarityThreshold,
+			"threshold":  threshold,
 			"model":      model,
 		})
 		metrics.RecordCacheOperation("memory", "find_similar", "hit", time.Since(start).Seconds())
@@ -353,12 +358,12 @@ func (c *InMemoryCache) FindSimilar(model string, query string) ([]byte, bool, e
 	}
 
 	atomic.AddInt64(&c.missCount, 1)
-	observability.Debugf("InMemoryCache.FindSimilar: CACHE MISS - best_similarity=%.4f < threshold=%.4f (checked %d entries)",
-		bestSimilarity, c.similarityThreshold, entriesChecked)
+	observability.Debugf("InMemoryCache.FindSimilarWithThreshold: CACHE MISS - best_similarity=%.4f < threshold=%.4f (checked %d entries)",
+		bestSimilarity, threshold, entriesChecked)
 	observability.LogEvent("cache_miss", map[string]interface{}{
 		"backend":         "memory",
 		"best_similarity": bestSimilarity,
-		"threshold":       c.similarityThreshold,
+		"threshold":       threshold,
 		"model":           model,
 		"entries_checked": entriesChecked,
 	})
diff --git a/src/semantic-router/pkg/cache/milvus_cache.go b/src/semantic-router/pkg/cache/milvus_cache.go
@@ -496,18 +496,23 @@ func (c *MilvusCache) addEntry(id string, requestID string, model string, query
 
-// FindSimilar searches for semantically similar cached requests
+// FindSimilar searches for semantically similar cached requests using the default threshold
 func (c *MilvusCache) FindSimilar(model string, query string) ([]byte, bool, error) {
+	return c.FindSimilarWithThreshold(model, query, c.similarityThreshold)
+}
+
+// FindSimilarWithThreshold searches for semantically similar cached requests using a specific threshold
+func (c *MilvusCache) FindSimilarWithThreshold(model string, query string, threshold float32) ([]byte, bool, error) {
 	start := time.Now()
 
 	if !c.enabled {
-		observability.Debugf("MilvusCache.FindSimilar: cache disabled")
+		observability.Debugf("MilvusCache.FindSimilarWithThreshold: cache disabled")
 		return nil, false, nil
 	}
 	queryPreview := query
 	if len(query) > 50 {
 		queryPreview = query[:50] + "..."
 	}
-	observability.Debugf("MilvusCache.FindSimilar: searching for model='%s', query='%s' (len=%d chars)",
-		model, queryPreview, len(query))
+	observability.Debugf("MilvusCache.FindSimilarWithThreshold: searching for model='%s', query='%s' (len=%d chars), threshold=%.4f",
+		model, queryPreview, len(query), threshold)
 
 	// Generate semantic embedding for similarity comparison
 	queryEmbedding, err := candle_binding.GetEmbedding(query, 0) // Auto-detect dimension
@@ -538,7 +543,7 @@ func (c *MilvusCache) FindSimilar(model string, query string) ([]byte, bool, err
 		searchParam,
 	)
 	if err != nil {
-		observability.Debugf("MilvusCache.FindSimilar: search failed: %v", err)
+		observability.Debugf("MilvusCache.FindSimilarWithThreshold: search failed: %v", err)
 		atomic.AddInt64(&c.missCount, 1)
 		metrics.RecordCacheOperation("milvus", "find_similar", "error", time.Since(start).Seconds())
 		metrics.RecordCacheMiss()
@@ -547,21 +552,21 @@ func (c *MilvusCache) FindSimilar(model string, query string) ([]byte, bool, err
 
 	if len(searchResult) == 0 || searchResult[0].ResultCount == 0 {
 		atomic.AddInt64(&c.missCount, 1)
-		observability.Debugf("MilvusCache.FindSimilar: no entries found")
+		observability.Debugf("MilvusCache.FindSimilarWithThreshold: no entries found")
 		metrics.RecordCacheOperation("milvus", "find_similar", "miss", time.Since(start).Seconds())
 		metrics.RecordCacheMiss()
 		return nil, false, nil
 	}
 
 	bestScore := searchResult[0].Scores[0]
-	if bestScore < c.similarityThreshold {
+	if bestScore < threshold {
 		atomic.AddInt64(&c.missCount, 1)
-		observability.Debugf("MilvusCache.FindSimilar: CACHE MISS - best_similarity=%.4f < threshold=%.4f",
-			bestScore, c.similarityThreshold)
+		observability.Debugf("MilvusCache.FindSimilarWithThreshold: CACHE MISS - best_similarity=%.4f < threshold=%.4f",
+			bestScore, threshold)
 		observability.LogEvent("cache_miss", map[string]interface{}{
 			"backend":         "milvus",
 			"best_similarity": bestScore,
-			"threshold":       c.similarityThreshold,
+			"threshold":       threshold,
 			"model":           model,
 			"collection":      c.collectionName,
 		})
@@ -578,20 +583,20 @@ func (c *MilvusCache) FindSimilar(model string, query string) ([]byte, bool, err
 	}
 
 	if responseBody == nil {
-		observability.Debugf("MilvusCache.FindSimilar: cache hit but response_body is missing or not a string")
+		observability.Debugf("MilvusCache.FindSimilarWithThreshold: cache hit but response_body is missing or not a string")
 		atomic.AddInt64(&c.missCount, 1)
 		metrics.RecordCacheOperation("milvus", "find_similar", "error", time.Since(start).Seconds())
 		metrics.RecordCacheMiss()
 		return nil, false, nil
 	}
 
 	atomic.AddInt64(&c.hitCount, 1)
-	observability.Debugf("MilvusCache.FindSimilar: CACHE HIT - similarity=%.4f >= threshold=%.4f, response_size=%d bytes",
-		bestScore, c.similarityThreshold, len(responseBody))
+	observability.Debugf("MilvusCache.FindSimilarWithThreshold: CACHE HIT - similarity=%.4f >= threshold=%.4f, response_size=%d bytes",
+		bestScore, threshold, len(responseBody))
 	observability.LogEvent("cache_hit", map[string]interface{}{
 		"backend":    "milvus",
 		"similarity": bestScore,
-		"threshold":  c.similarityThreshold,
+		"threshold":  threshold,
 		"model":      model,
 		"collection": c.collectionName,
 	})
diff --git a/src/semantic-router/pkg/config/config.go b/src/semantic-router/pkg/config/config.go
@@ -381,6 +381,12 @@ type Category struct {
 	// "replace": Replace any existing system message with the category-specific prompt
 	// "insert": Prepend the category-specific prompt to the existing system message content
 	SystemPromptMode string `yaml:"system_prompt_mode,omitempty"`
+	// SemanticCacheEnabled controls whether semantic caching is enabled for this category
+	// If nil, inherits from global SemanticCache.Enabled setting
+	SemanticCacheEnabled *bool `yaml:"semantic_cache_enabled,omitempty"`
+	// SemanticCacheSimilarityThreshold defines the minimum similarity score for cache hits (0.0-1.0)
+	// If nil, uses the global threshold from SemanticCache.SimilarityThreshold or BertModel.Threshold
+	SemanticCacheSimilarityThreshold *float32 `yaml:"semantic_cache_similarity_threshold,omitempty"`
 }
 
 // GetModelReasoningFamily returns the reasoning family configuration for a given model name
@@ -436,6 +442,11 @@ func BoolPtr(b bool) *bool {
 	return &b
 }
 
+// Float32Ptr returns a pointer to a copy of f, for filling optional *float32 settings (helper for tests and config).
+func Float32Ptr(f float32) *float32 {
+	return &f
+}
+
 // validateConfigStructure performs additional validation on the parsed config
 func validateConfigStructure(cfg *RouterConfig) error {
 	// Ensure all categories have at least one model with scores
@@ -799,3 +810,25 @@ func (c *RouterConfig) GetCategoryByName(name string) *Category {
 	}
 	return nil
 }
+
+// IsCacheEnabledForCategory reports whether semantic caching applies to the category named categoryName.
+// A non-nil SemanticCacheEnabled on the category wins over the global flag; an unknown category or unset field yields c.SemanticCache.Enabled.
+func (c *RouterConfig) IsCacheEnabledForCategory(categoryName string) bool {
+	category := c.GetCategoryByName(categoryName)
+	if category != nil && category.SemanticCacheEnabled != nil {
+		return *category.SemanticCacheEnabled
+	}
+	// Unknown category or no per-category override: use the global setting.
+	return c.SemanticCache.Enabled
+}
+
+// GetCacheSimilarityThresholdForCategory returns the effective cache similarity threshold for the category named categoryName.
+// Priority: category-specific value (returned as-is, no range check here) > c.GetCacheSimilarityThreshold() (global semantic_cache, then bert_model threshold).
+func (c *RouterConfig) GetCacheSimilarityThresholdForCategory(categoryName string) float32 {
+	category := c.GetCategoryByName(categoryName)
+	if category != nil && category.SemanticCacheSimilarityThreshold != nil {
+		return *category.SemanticCacheSimilarityThreshold
+	}
+	// Unknown category or no per-category override: defer to GetCacheSimilarityThreshold (defined outside this hunk).
+	return c.GetCacheSimilarityThreshold()
+}
diff --git a/src/semantic-router/pkg/config/config_test.go b/src/semantic-router/pkg/config/config_test.go
diff --git a/src/semantic-router/pkg/extproc/caching_test.go b/src/semantic-router/pkg/extproc/caching_test.go
diff --git a/src/semantic-router/pkg/extproc/request_handler.go b/src/semantic-router/pkg/extproc/request_handler.go
diff --git a/website/docs/installation/configuration.md b/website/docs/installation/configuration.md
diff --git a/website/docs/tutorials/semantic-cache/in-memory-cache.md b/website/docs/tutorials/semantic-cache/in-memory-cache.md
diff --git a/website/docs/tutorials/semantic-cache/overview.md b/website/docs/tutorials/semantic-cache/overview.md