Commit 92d2ec9

fix: stricter semantic cache defaults for reliability
- Increase default similarity_threshold from 0.95 to 0.98
- Increase default min_text_length from 50 to 128 chars
- Make min_text_length configurable via CacheConfig

These changes ensure semantic caching only activates for longer, substantive queries with near-identical content.
1 parent 6323a97 commit 92d2ec9

File tree

2 files changed: +13 −6 lines changed


CHANGELOG.md

Lines changed: 5 additions & 2 deletions
@@ -12,13 +12,16 @@ Fixed critical issues with the semantic cache that caused incorrect cache matches
 
 1. **System Prompt Hash Matching**: The semantic cache now includes a hash of the system prompt when matching cached responses. Previously, different LLM operations with similar user messages but different system prompts could incorrectly return cached responses from unrelated operations.
 
-2. **Short Text Exclusion**: Messages shorter than 50 characters are now excluded from semantic matching. Short questions like "what about X?" and "what is Y?" have misleadingly high semantic similarity scores which caused false cache hits. These short messages still benefit from exact hash matching.
+2. **Short Text Exclusion**: Messages shorter than 128 characters are now excluded from semantic matching (configurable via `min_text_length`). Short questions have misleadingly high semantic similarity scores which caused false cache hits. These short messages still benefit from exact hash matching.
+
+3. **Stricter Default Threshold**: Default similarity threshold increased from 0.95 to 0.98 for more reliable matching.
 
 ### Changes
 
 - Added `_extract_system_hash()` method to compute SHA256 hash of system prompt content
 - Modified `_semantic_search()` to require both semantic similarity AND system hash match
-- Added minimum text length check (50 chars) before semantic cache operations
+- Added configurable `min_text_length` parameter (default: 128 chars) before semantic cache operations
+- Changed default `similarity_threshold` from 0.95 to 0.98
 - Added `caching` parameter to `ChatCompletion.create/acreate` for per-call cache bypass
 
---
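The `_extract_system_hash()` change listed above could look roughly like the sketch below. This is an illustration, not the library's actual code: it assumes OpenAI-style message dicts with `role`/`content` keys, and the function name is copied from the changelog while everything else is hypothetical.

```python
import hashlib

def extract_system_hash(messages: list[dict]) -> str:
    # Sketch: concatenate all system-prompt content and hash it with SHA256,
    # so two requests can only share a cache entry if their system prompts
    # are byte-for-byte identical.
    system_text = "".join(
        m.get("content", "") for m in messages if m.get("role") == "system"
    )
    return hashlib.sha256(system_text.encode("utf-8")).hexdigest()
```

Hashing the system prompt separately (rather than mixing it into the embedded text) keeps the semantic index focused on the user message while still fencing off unrelated operations.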

onellm/cache.py

Lines changed: 8 additions & 4 deletions
@@ -40,29 +40,33 @@ class CacheConfig:
     def __init__(
         self,
         max_entries: int = 1000,
-        similarity_threshold: float = 0.95,
+        similarity_threshold: float = 0.98,
         hash_only: bool = False,
         stream_chunk_strategy: str = "words",
         stream_chunk_length: int = 8,
         ttl: int = 86400,
+        min_text_length: int = 128,
     ):
         """
         Initialize cache configuration.
 
         Args:
             max_entries: Maximum number of cache entries before LRU eviction (default: 1000)
-            similarity_threshold: Minimum similarity score for semantic cache hit (default: 0.95)
+            similarity_threshold: Minimum similarity score for semantic cache hit (default: 0.98)
             hash_only: Disable semantic matching, use only hash-based exact matches (default: False)
             stream_chunk_strategy: How to chunk cached streaming responses (default: "words")
             stream_chunk_length: Number of strategy units per chunk (default: 8)
             ttl: Time-to-live in seconds for cache entries (default: 86400, 1 day)
+            min_text_length: Minimum text length for semantic matching (default: 128).
+                Short texts have misleadingly high similarity and skip semantic cache.
         """
         self.max_entries = max_entries
         self.similarity_threshold = similarity_threshold
         self.hash_only = hash_only
         self.stream_chunk_strategy = stream_chunk_strategy
         self.stream_chunk_length = stream_chunk_length
         self.ttl = ttl
+        self.min_text_length = min_text_length
 
         # Validate strategy
         valid_strategies = {"words", "sentences", "paragraphs", "characters"}
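As a usage illustration, the constructor signature above can be mirrored with a compact stand-in. The class name below is hypothetical; only the field names, defaults, and the strategy validation come from the diff.

```python
from dataclasses import dataclass

@dataclass
class CacheConfigSketch:
    """Stand-in mirroring the CacheConfig fields shown above (illustration only)."""
    max_entries: int = 1000
    similarity_threshold: float = 0.98   # new stricter default (was 0.95)
    hash_only: bool = False
    stream_chunk_strategy: str = "words"
    stream_chunk_length: int = 8
    ttl: int = 86400                     # 1 day
    min_text_length: int = 128           # new: minimum length for semantic matching

    def __post_init__(self):
        # Same validation as the diff's valid_strategies check.
        valid = {"words", "sentences", "paragraphs", "characters"}
        if self.stream_chunk_strategy not in valid:
            raise ValueError(f"stream_chunk_strategy must be one of {valid}")
```

A caller wanting the old, looser behavior could still pass `similarity_threshold=0.95, min_text_length=50` explicitly; only the defaults changed.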
@@ -300,7 +304,7 @@ def get(self, model: str, messages: list[dict], **kwargs) -> dict | None:
         system_hash = self._extract_system_hash(messages)
         # Skip semantic search for short texts - they have misleadingly high similarity
         # Short questions like "what about X?" and "what is Y?" can match incorrectly
-        if text and len(text) >= 50:
+        if text and len(text) >= self.config.min_text_length:
             result = self._semantic_search(text, system_hash)
             if result is not None:
                 self.hits += 1
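The guard in `get()` reduces to a tiny predicate. A sketch, with the default mirroring the new `min_text_length` (the function name is illustrative):

```python
def should_try_semantic_search(text: str, min_text_length: int = 128) -> bool:
    # Mirrors the guard above: empty or short texts never reach the
    # semantic index, so they fall back to exact hash matching only.
    return bool(text) and len(text) >= min_text_length
```

For example, `"what about X?"` (13 chars) is skipped, while a 128-character query proceeds to `_semantic_search()`.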
@@ -353,7 +357,7 @@ def set(self, model: str, messages: list[dict], response: dict, **kwargs):
         text = self._extract_text(messages)
         system_hash = self._extract_system_hash(messages)
         # Skip semantic indexing for short texts - they cause false matches
-        if text and len(text) >= 50:
+        if text and len(text) >= self.config.min_text_length:
             try:
                 # Generate embedding
                 import numpy as np
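Putting the changes together, the hit condition that `_semantic_search()` now enforces amounts to something like the following sketch. In the library the similarity comes from embedding comparison; here it is just a float, and the function name is hypothetical.

```python
def is_semantic_hit(
    similarity: float,
    query_system_hash: str,
    entry_system_hash: str,
    threshold: float = 0.98,  # new stricter default
) -> bool:
    # A cached entry qualifies only when BOTH conditions hold:
    # near-identical content AND an identical system prompt.
    return similarity >= threshold and query_system_hash == entry_system_hash
```

Under the old 0.95 threshold, a 0.97-similar query would have been served from cache; under the new default it is a miss, trading some hit rate for reliability.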
