
Commit 2f42c80

[Fix] x-litellm-cache-key header not being returned on cache hit (#15348)
* fix: x-cache-key
* test_cache_key_in_hidden_params_acompletion
* fix: remove_cache_control_flag_from_messages_and_tools

1 parent: 97031dc
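
For context, x-litellm-cache-key is the response header a LiteLLM proxy client can read to see which cache entry served a request. A minimal sketch of checking it from a client, assuming a proxy running locally with caching enabled (the URL and API key below are illustrative placeholders, not part of this commit):

    import requests

    # Assumed local proxy endpoint and virtual key -- placeholders.
    PROXY_URL = "http://localhost:4000/v1/chat/completions"
    API_KEY = "sk-1234"

    payload = {
        "model": "gpt-3.5-turbo",
        "messages": [{"role": "user", "content": "hello"}],
    }

    # Send the same request twice; the second should be served from cache.
    for attempt in range(2):
        resp = requests.post(
            PROXY_URL,
            headers={"Authorization": f"Bearer {API_KEY}"},
            json=payload,
        )
        # After this fix, cache hits should also carry the cache-key header.
        print(attempt, resp.headers.get("x-litellm-cache-key"))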

File tree

3 files changed (+62, −15 lines)


litellm/caching/caching_handler.py

Lines changed: 5 additions & 11 deletions
@@ -14,10 +14,10 @@
 In each method it will call the appropriate method from caching.py
 """

-import time
 import asyncio
 import datetime
 import inspect
+import time
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -62,12 +62,10 @@
 LiteLLMLoggingObj = Any


-from litellm.litellm_core_utils.streaming_handler import CustomStreamWrapper
-
-
 from litellm.litellm_core_utils.core_helpers import (
-    _get_parent_otel_span_from_kwargs,
+    _get_parent_otel_span_from_kwargs,
 )
+from litellm.litellm_core_utils.streaming_handler import CustomStreamWrapper


 class CachingHandlerResponse(BaseModel):
@@ -214,9 +212,7 @@ async def _async_get_cache(
                 end_time=end_time,
                 cache_hit=cache_hit,
             )
-            cache_key = litellm.cache._get_preset_cache_key_from_kwargs(
-                **kwargs
-            )
+            cache_key = litellm.cache.get_cache_key(**kwargs)
             if (
                 isinstance(cached_result, BaseModel)
                 or isinstance(cached_result, CustomStreamWrapper)
@@ -330,9 +326,7 @@ def _sync_get_cache(
                 end_time=end_time,
                 cache_hit=cache_hit
             )
-            cache_key = litellm.cache._get_preset_cache_key_from_kwargs(
-                **kwargs
-            )
+            cache_key = litellm.cache.get_cache_key(**kwargs)
             if (
                 isinstance(cached_result, BaseModel)
                 or isinstance(cached_result, CustomStreamWrapper)
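
The substantive change here swaps the private _get_preset_cache_key_from_kwargs lookup, which returns a key only when an earlier step stashed one in kwargs, for the public get_cache_key helper, which can also derive the key from the request itself. A rough, self-contained sketch of that distinction (the helper bodies below are hypothetical stand-ins, not litellm's implementation):

    import hashlib
    import json

    def _get_preset_cache_key_from_kwargs(**kwargs):
        # Hypothetical stand-in: only yields a key if one was precomputed
        # and stored in kwargs -- on some paths this was None, so the
        # x-litellm-cache-key header was never populated on a hit.
        return kwargs.get("litellm_params", {}).get("preset_cache_key")

    def get_cache_key(**kwargs):
        # Hypothetical stand-in: prefer the preset key, otherwise derive a
        # deterministic key from the request parameters, so a cache hit
        # always has a key to report.
        preset = _get_preset_cache_key_from_kwargs(**kwargs)
        if preset is not None:
            return preset
        payload = json.dumps(kwargs, sort_keys=True, default=str)
        return hashlib.sha256(payload.encode()).hexdigest()

    print(get_cache_key(model="gpt-3.5-turbo",
                        messages=[{"role": "user", "content": "hi"}]))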

litellm/llms/openai/chat/gpt_transformation.py

Lines changed: 4 additions & 4 deletions
@@ -397,13 +397,13 @@ def remove_cache_control_flag_from_messages_and_tools(
         )
         from litellm.types.llms.openai import ChatCompletionToolParam

-        for message in messages:
-            message = cast(
+        for i, message in enumerate(messages):
+            messages[i] = cast(
                 AllMessageValues, filter_value_from_dict(message, "cache_control")  # type: ignore
             )
         if tools is not None:
-            for tool in tools:
-                tool = cast(
+            for i, tool in enumerate(tools):
+                tools[i] = cast(
                     ChatCompletionToolParam,
                     filter_value_from_dict(tool, "cache_control"),  # type: ignore
                 )
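
This hunk fixes a classic Python pitfall: rebinding the loop variable (message = cast(...)) only changes a local name, so the filtered dicts were discarded and the cache_control flags survived in the original lists. Assigning through the index (messages[i] = ...) mutates the list in place. A standalone illustration of the difference:

    items = [{"text": "hi", "cache_control": {"type": "ephemeral"}}]

    # Before the fix: the loop variable is rebound, the list is untouched.
    for item in items:
        item = {k: v for k, v in item.items() if k != "cache_control"}
    print(items)  # [{'text': 'hi', 'cache_control': {'type': 'ephemeral'}}]

    # After the fix: index assignment writes the filtered dict back.
    for i, item in enumerate(items):
        items[i] = {k: v for k, v in item.items() if k != "cache_control"}
    print(items)  # [{'text': 'hi'}]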

tests/local_testing/test_caching.py

Lines changed: 53 additions & 0 deletions
@@ -2765,3 +2765,56 @@ def test_caching_thinking_args_hit():  # test in memory cache
     except Exception as e:
         print(f"error occurred: {traceback.format_exc()}")
         pytest.fail(f"Error occurred: {e}")
+
+
+@pytest.mark.asyncio
+async def test_cache_key_in_hidden_params_acompletion():
+    """
+    Test that cache_key is present in _hidden_params on cache hits for acompletion.
+
+    Validates fix for missing x-litellm-cache-key header on proxy cache hits.
+    """
+    litellm.cache = Cache(
+        type="redis",
+        host=os.environ["REDIS_HOST"],
+        port=os.environ["REDIS_PORT"],
+        password=os.environ["REDIS_PASSWORD"],
+    )
+
+    unique_content = f"test cache key hidden params {uuid.uuid4()}"
+    messages = [{"role": "user", "content": unique_content}]
+
+    # First call - cache miss
+    response1 = await litellm.acompletion(
+        model="gpt-3.5-turbo",
+        messages=messages,
+        mock_response="test response",
+        caching=True,
+    )
+
+    print(f"Response 1 _hidden_params: {response1._hidden_params}")
+    assert response1._hidden_params.get("cache_hit") is not True
+
+    await asyncio.sleep(0.5)
+
+    # Second call - cache hit
+    response2 = await litellm.acompletion(
+        model="gpt-3.5-turbo",
+        messages=messages,
+        mock_response="test response",
+        caching=True,
+    )
+
+    print(f"Response 2 _hidden_params: {response2._hidden_params}")
+
+    # Verify cache hit occurred
+    assert response2._hidden_params.get("cache_hit") is True
+
+    # Verify cache_key is present in _hidden_params
+    assert "cache_key" in response2._hidden_params
+    assert response2._hidden_params["cache_key"] is not None
+
+    # Verify both responses have same ID (cache hit)
+    assert response1.id == response2.id
+
+    litellm.cache = None
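
The new test requires a live Redis instance configured via REDIS_HOST, REDIS_PORT, and REDIS_PASSWORD. For a quick check of the same assertions without Redis, a variant against litellm's in-memory cache might look like the sketch below (assuming the "local" cache type populates _hidden_params the same way):

    import asyncio
    import uuid

    import litellm
    from litellm import Cache

    async def main():
        litellm.cache = Cache(type="local")  # in-memory cache, no Redis needed
        messages = [{"role": "user", "content": f"probe {uuid.uuid4()}"}]

        # First call warms the cache; the second should hit it.
        r1 = await litellm.acompletion(
            model="gpt-3.5-turbo", messages=messages,
            mock_response="ok", caching=True,
        )
        r2 = await litellm.acompletion(
            model="gpt-3.5-turbo", messages=messages,
            mock_response="ok", caching=True,
        )

        assert r2._hidden_params.get("cache_hit") is True
        assert r2._hidden_params.get("cache_key") is not None
        litellm.cache = None

    asyncio.run(main())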
