
Commit e92a73d

Merge pull request #14831 from otaviofbrito/minor/fix-context-caching-vertex
Vertex AI Context Caching: use Vertex AI API v1 instead of v1beta1 and accept 'cachedContent' param
2 parents 6964b5a + 2e7d9d1 commit e92a73d

3 files changed: +83 -16 lines


docs/my-website/docs/providers/vertex.md

Lines changed: 71 additions & 4 deletions
@@ -815,6 +815,77 @@ Use Vertex AI context caching is supported by calling provider api directly. (Un
 
 [**Go straight to provider**](../pass_through/vertex_ai.md#context-caching)
 
+#### 1. Create the Cache
+
+First, create the cache by sending a `POST` request to the `cachedContents` endpoint via the LiteLLM proxy.
+
+<Tabs>
+<TabItem value="proxy" label="PROXY">
+
+```bash
+curl http://0.0.0.0:4000/vertex_ai/v1/projects/{project_id}/locations/{location}/cachedContents \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer $LITELLM_KEY" \
+  -d '{
+    "model": "projects/{project_id}/locations/{location}/publishers/google/models/gemini-2.5-flash",
+    "displayName": "example_cache",
+    "contents": [{
+      "role": "user",
+      "parts": [{
+        "text": ".... a long book to be cached"
+      }]
+    }]
+  }'
+```
+
+</TabItem>
+</Tabs>
+
+#### 2. Get the Cache Name from the Response
+
+Vertex AI will return a response containing the `name` of the cached content. This name is the identifier for your cached data.
+
+```json
+{
+  "name": "projects/12341234/locations/{location}/cachedContents/123123123123123",
+  "model": "projects/{project_id}/locations/{location}/publishers/google/models/gemini-2.5-flash",
+  "createTime": "2025-09-23T19:13:50.674976Z",
+  "updateTime": "2025-09-23T19:13:50.674976Z",
+  "expireTime": "2025-09-23T20:13:50.655988Z",
+  "displayName": "example_cache",
+  "usageMetadata": {
+    "totalTokenCount": 1246,
+    "textCount": 5132
+  }
+}
+```
+
+#### 3. Use the Cached Content
+
+Use the `name` from the response as `cachedContent` or `cached_content` in subsequent API calls to reuse the cached information. This is passed in the body of your request to `/chat/completions`.
+
+<Tabs>
+<TabItem value="proxy" label="PROXY">
+
+```bash
+curl http://0.0.0.0:4000/chat/completions \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer $LITELLM_KEY" \
+  -d '{
+    "cachedContent": "projects/545201925769/locations/us-central1/cachedContents/4511135542628319232",
+    "model": "gemini-2.5-flash",
+    "messages": [
+      {
+        "role": "user",
+        "content": "what is the book about?"
+      }
+    ]
+  }'
+```
+
+</TabItem>
+</Tabs>
 
 ## Pre-requisites
 * `pip install google-cloud-aiplatform` (pre-installed on proxy docker image)
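
For anyone scripting step 1 rather than running curl by hand, here is a minimal Python sketch of the same cache-creation call against the LiteLLM proxy. It assumes the `requests` package, a proxy at `http://0.0.0.0:4000`, and a `LITELLM_KEY` environment variable; the project and location values are placeholders, not values from this PR.

```python
import os

import requests

# Placeholders: substitute your own project, location, and proxy address.
BASE_URL = "http://0.0.0.0:4000"
PROJECT_ID = "my-project"
LOCATION = "us-central1"

resp = requests.post(
    f"{BASE_URL}/vertex_ai/v1/projects/{PROJECT_ID}/locations/{LOCATION}/cachedContents",
    headers={
        "Content-Type": "application/json",
        "Authorization": f"Bearer {os.environ['LITELLM_KEY']}",
    },
    json={
        "model": f"projects/{PROJECT_ID}/locations/{LOCATION}/publishers/google/models/gemini-2.5-flash",
        "displayName": "example_cache",
        "contents": [
            {"role": "user", "parts": [{"text": ".... a long book to be cached"}]}
        ],
    },
)
resp.raise_for_status()

# The returned `name` is what you later pass as cachedContent / cached_content.
cache_name = resp.json()["name"]
print(cache_name)
```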
@@ -2724,7 +2795,3 @@ Once that's done, when you deploy the new container in the Google Cloud Run serv
 
 
 s/o @[Darien Kindlund](https://www.linkedin.com/in/kindlund/) for this tutorial
-
-
-
-
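
To exercise step 3 of the new section from Python, the OpenAI SDK pointed at the LiteLLM proxy can attach the cache reference through `extra_body`, which merges extra fields into the request JSON just like the curl body above. This is a sketch under that assumption; the cache name and API key below are placeholders.

```python
from openai import OpenAI

# Sketch only: assumes an OpenAI-compatible LiteLLM proxy at this address.
client = OpenAI(base_url="http://0.0.0.0:4000", api_key="sk-placeholder")

response = client.chat.completions.create(
    model="gemini-2.5-flash",
    messages=[{"role": "user", "content": "what is the book about?"}],
    # extra_body merges additional fields into the request body,
    # so cachedContent reaches the proxy exactly as in the curl example.
    extra_body={
        "cachedContent": "projects/{project_id}/locations/{location}/cachedContents/{cache_id}"
    },
)
print(response.choices[0].message.content)
```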

litellm/llms/vertex_ai/gemini/transformation.py

Lines changed: 10 additions & 4 deletions
@@ -537,7 +537,11 @@ def sync_transform_request_body(
             logging_obj=logging_obj,
         )
     else:  # [TODO] implement context caching for gemini as well
-        cached_content = optional_params.pop("cached_content", None)
+        cached_content = None
+        if "cached_content" in optional_params:
+            cached_content = optional_params.pop("cached_content")
+        elif "cachedContent" in optional_params:
+            cached_content = optional_params.pop("cachedContent")
 
     return _transform_request_body(
         messages=messages,
@@ -584,7 +588,11 @@ async def async_transform_request_body(
             logging_obj=logging_obj,
         )
     else:  # [TODO] implement context caching for gemini as well
-        cached_content = optional_params.pop("cached_content", None)
+        cached_content = None
+        if "cached_content" in optional_params:
+            cached_content = optional_params.pop("cached_content")
+        elif "cachedContent" in optional_params:
+            cached_content = optional_params.pop("cachedContent")
 
     return _transform_request_body(
         messages=messages,
@@ -649,5 +657,3 @@ def _transform_system_message(
         return SystemInstructions(parts=system_content_blocks), messages
 
     return None, messages
-
-
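
The behaviour introduced above can be illustrated in isolation. `_pop_cached_content` below is a hypothetical standalone helper, not part of the LiteLLM codebase; it simply mirrors the new pop logic so the two accepted spellings are easy to see.

```python
from typing import Optional


def _pop_cached_content(optional_params: dict) -> Optional[str]:
    """Hypothetical helper mirroring the new logic: accept either the
    snake_case or the camelCase spelling and remove it from the params."""
    if "cached_content" in optional_params:
        return optional_params.pop("cached_content")
    if "cachedContent" in optional_params:
        return optional_params.pop("cachedContent")
    return None


# Both spellings resolve to the same cache reference.
params = {"cachedContent": "projects/123/locations/us-central1/cachedContents/456", "temperature": 0.2}
assert _pop_cached_content(params) == "projects/123/locations/us-central1/cachedContents/456"
assert "cachedContent" not in params  # popped, so not forwarded as a model parameter

params = {"cached_content": "projects/123/locations/us-central1/cachedContents/456"}
assert _pop_cached_content(params) == "projects/123/locations/us-central1/cachedContents/456"
```

Popping (rather than just reading) the key matters because whatever remains in `optional_params` is treated as ordinary request parameters downstream.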

litellm/llms/vertex_ai/vertex_llm_base.py

Lines changed: 2 additions & 8 deletions
@@ -271,17 +271,11 @@ def _ensure_access_token(
 
     def is_using_v1beta1_features(self, optional_params: dict) -> bool:
         """
-        VertexAI only supports ContextCaching on v1beta1
-
         use this helper to decide if request should be sent to v1 or v1beta1
 
-        Returns v1beta1 if context caching is enabled
-        Returns v1 in all other cases
+        Returns true if any beta feature is enabled
+        Returns false in all other cases
         """
-        if "cached_content" in optional_params:
-            return True
-        if "CachedContent" in optional_params:
-            return True
         return False
 
     def _check_custom_proxy(
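
Finally, a hypothetical illustration (not LiteLLM source) of what the change to `is_using_v1beta1_features` means for routing: with the `cached_content` / `CachedContent` checks removed, a request carrying a cache reference is no longer pushed onto `v1beta1`.

```python
# Hypothetical illustration only; class and helper names are made up.

class VertexRouterSketch:
    def is_using_v1beta1_features(self, optional_params: dict) -> bool:
        # Mirrors the method after this commit: no beta-only feature is
        # currently detected, so this always returns False.
        return False

    def select_api_version(self, optional_params: dict) -> str:
        # Hypothetical helper for illustration only.
        return "v1beta1" if self.is_using_v1beta1_features(optional_params) else "v1"


router = VertexRouterSketch()
params = {"cached_content": "projects/123/locations/us-central1/cachedContents/456"}
# Before this commit the cached_content key forced v1beta1; now it routes to v1.
print(router.select_api_version(params))  # -> "v1"
```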
