
Commit c5b976b

seanzhougoogle authored and copybara-github committed
chore: Create the context cache based on the token count of previous request
Before this change, we estimated the token count of the contents to cache and compared that estimate with the threshold the user set, but the estimate was not precise. We now use the actual prompt token count of the previous LLM request instead. No cache is created for the very first request, since there is no previous request to read the count from.

PiperOrigin-RevId: 814484840
1 parent 420df25 commit c5b976b
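
The decision logic described in the commit message boils down to a simple gate. Here is a minimal, self-contained sketch; the function name should_create_cache and its parameters are illustrative only and are not part of the ADK API:

from typing import Optional


def should_create_cache(
    previous_prompt_tokens: Optional[int], min_tokens: int
) -> bool:
  """Illustrative gate: decide whether creating a context cache is worthwhile.

  previous_prompt_tokens is the prompt_token_count reported by the previous
  LLM response; None means this is the very first request of the session.
  """
  if previous_prompt_tokens is None:
    # No previous response yet -> skip caching for the initial request.
    return False
  # Only cache when the previous prompt was already large enough.
  return previous_prompt_tokens >= min_tokens


# Example decisions under this hypothetical gate:
assert should_create_cache(None, 1024) is False   # initial request
assert should_create_cache(512, 1024) is False    # below min_tokens
assert should_create_cache(2048, 1024) is True    # large enough to cache

The real check lives in _create_new_cache_with_contents (see the gemini_context_cache_manager.py diff below), which compares llm_request.cacheable_contents_token_count against cache_config.min_tokens.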

File tree

5 files changed: +323 -22 lines changed


src/google/adk/flows/llm_flows/context_cache_processor.py

Lines changed: 46 additions & 17 deletions
@@ -62,9 +62,11 @@ async def run_async(
     # Set cache config to request
     llm_request.cache_config = invocation_context.context_cache_config

-    # Find latest cache metadata from session events
-    latest_cache_metadata = self._find_latest_cache_metadata(
-        invocation_context, agent.name, invocation_context.invocation_id
+    # Find latest cache metadata and previous token count from session events
+    latest_cache_metadata, previous_token_count = (
+        self._find_cache_info_from_events(
+            invocation_context, agent.name, invocation_context.invocation_id
+        )
     )

     if latest_cache_metadata:
@@ -77,51 +79,78 @@ async def run_async(
           latest_cache_metadata.cached_contents_count,
       )

+    if previous_token_count is not None:
+      llm_request.cacheable_contents_token_count = previous_token_count
+      logger.debug(
+          'Found previous prompt token count for agent %s: %d',
+          agent.name,
+          previous_token_count,
+      )
+
     logger.debug('Context caching enabled for agent %s', agent.name)

     # This processor yields no events
     return
     yield  # AsyncGenerator requires a yield in function body

-  def _find_latest_cache_metadata(
+  def _find_cache_info_from_events(
       self,
       invocation_context: 'InvocationContext',
       agent_name: str,
       current_invocation_id: str,
-  ) -> Optional[CacheMetadata]:
-    """Find the latest cache metadata from session events.
+  ) -> tuple[Optional[CacheMetadata], Optional[int]]:
+    """Find cache metadata and previous token count from session events.

     Args:
       invocation_context: Context containing session with events
-      agent_name: Name of agent to find cache metadata for
+      agent_name: Name of agent to find cache info for
       current_invocation_id: Current invocation ID to compare for increment

     Returns:
-      Latest cache metadata for the agent (with updated invocations_used
-      if needed), or None if not found
+      Tuple of (cache_metadata, previous_prompt_token_count)
+        cache_metadata: Latest cache metadata with updated invocations_used if needed
+        previous_prompt_token_count: Most recent prompt token count from LLM response
     """
     if not invocation_context.session or not invocation_context.session.events:
-      return None
+      return None, None
+
+    cache_metadata = None
+    previous_token_count = None

     # Search events from most recent to oldest using index traversal
     events = invocation_context.session.events
     for i in range(len(events) - 1, -1, -1):
       event = events[i]
-      if event.cache_metadata is not None and event.author == agent_name:
-
-        cache_metadata = event.cache_metadata
+      if event.author != agent_name:
+        continue

+      # Look for cache metadata (only in actual LLM response events)
+      if cache_metadata is None and event.cache_metadata is not None:
         # Check if this is a different invocation - increment invocations_used
         if event.invocation_id and event.invocation_id != current_invocation_id:
           # Different invocation - increment invocations_used
-          return cache_metadata.model_copy(
-              update={'invocations_used': cache_metadata.invocations_used + 1}
+          cache_metadata = event.cache_metadata.model_copy(
+              update={
+                  'invocations_used': event.cache_metadata.invocations_used + 1
+              }
           )
         else:
           # Same invocation or no invocation_id - return as-is
-          return cache_metadata
+          cache_metadata = event.cache_metadata
+
+      # Look for previous prompt token count (from actual LLM response events)
+      if (
+          previous_token_count is None
+          and event.usage_metadata
+          and event.usage_metadata.prompt_token_count is not None
+      ):
+        previous_token_count = event.usage_metadata.prompt_token_count
+
+      # Stop early if we found both pieces of information
+      if cache_metadata is not None and previous_token_count is not None:
+        break

-    return None
+    return cache_metadata, previous_token_count


 # Create processor instance for use in flows
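
For illustration, the newest-first scan implemented by _find_cache_info_from_events above behaves roughly like the following self-contained sketch. The FakeEvent dataclass and the sample values are invented for the example (they are not the real ADK Event type), and the invocation_id / invocations_used bookkeeping from the real method is omitted for brevity:

from dataclasses import dataclass
from typing import List, Optional, Tuple


@dataclass
class FakeEvent:  # simplified stand-in for a session event
  author: str
  cache_metadata: Optional[str] = None       # real code stores a CacheMetadata model
  prompt_token_count: Optional[int] = None   # real code reads usage_metadata.prompt_token_count


def find_cache_info(
    events: List[FakeEvent], agent_name: str
) -> Tuple[Optional[str], Optional[int]]:
  """Walk events newest-to-oldest; keep the first match found for each value."""
  cache_metadata = None
  previous_token_count = None
  for event in reversed(events):
    if event.author != agent_name:
      continue
    if cache_metadata is None and event.cache_metadata is not None:
      cache_metadata = event.cache_metadata
    if previous_token_count is None and event.prompt_token_count is not None:
      previous_token_count = event.prompt_token_count
    if cache_metadata is not None and previous_token_count is not None:
      break  # both pieces found, stop early
  return cache_metadata, previous_token_count


events = [
    FakeEvent(author="user"),
    FakeEvent(author="agent_a", cache_metadata="cache-1", prompt_token_count=900),
    FakeEvent(author="agent_a", prompt_token_count=2048),  # most recent LLM response
]
assert find_cache_info(events, "agent_a") == ("cache-1", 2048)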

src/google/adk/models/gemini_context_cache_manager.py

Lines changed: 14 additions & 5 deletions
@@ -257,12 +257,21 @@ async def _create_new_cache_with_contents(
     Returns:
       Cache metadata if successful, None otherwise
     """
-    # Estimate token count for minimum cache size check
-    estimated_tokens = self._estimate_request_tokens(llm_request)
-    if estimated_tokens < llm_request.cache_config.min_tokens:
+    # Check if we have token count from previous response for cache size validation
+    if llm_request.cacheable_contents_token_count is None:
       logger.info(
-          "Request too small for caching (%d < %d tokens)",
-          estimated_tokens,
+          "No previous token count available, skipping cache creation for"
+          " initial request"
+      )
+      return None
+
+    if (
+        llm_request.cacheable_contents_token_count
+        < llm_request.cache_config.min_tokens
+    ):
+      logger.info(
+          "Previous request too small for caching (%d < %d tokens)",
+          llm_request.cacheable_contents_token_count,
           llm_request.cache_config.min_tokens,
       )
       return None

src/google/adk/models/llm_request.py

Lines changed: 3 additions & 0 deletions
@@ -88,6 +88,9 @@ class LlmRequest(BaseModel):
   cache_metadata: Optional[CacheMetadata] = None
   """Cache metadata from previous requests, used for cache management."""

+  cacheable_contents_token_count: Optional[int] = None
+  """Token count from previous request's prompt, used for cache size validation."""
+
   def append_instructions(
       self, instructions: Union[list[str], types.Content]
   ) -> list[types.Content]:
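
As a rough illustration of the new field's semantics (RequestSketch below is a hypothetical stand-in, not the real LlmRequest), an unset value means no previous prompt token count is known yet, which is exactly the "initial request" case the cache manager skips:

from typing import Optional

from pydantic import BaseModel


class RequestSketch(BaseModel):  # hypothetical stand-in for LlmRequest
  cacheable_contents_token_count: Optional[int] = None


req = RequestSketch()
assert req.cacheable_contents_token_count is None  # initial request: nothing recorded yet
req.cacheable_contents_token_count = 2048  # later filled from the previous response's usage metadata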

tests/unittests/agents/test_gemini_context_cache_manager.py

Lines changed: 68 additions & 0 deletions
@@ -121,6 +121,9 @@ async def test_handle_context_caching_new_cache(self):
     )

     llm_request = self.create_llm_request()
+    llm_request.cacheable_contents_token_count = (
+        2048  # Add token count for cache creation
+    )
     start_time = time.time()

     with patch.object(
@@ -194,6 +197,9 @@ async def test_handle_context_caching_invalid_existing_cache(self):
         invocations_used=15
     )  # Exceeds cache_intervals
     llm_request = self.create_llm_request(cache_metadata=existing_cache)
+    llm_request.cacheable_contents_token_count = (
+        2048  # Add token count for cache creation
+    )

     with (
         patch.object(self.manager, "_is_cache_valid", return_value=False),
@@ -521,3 +527,65 @@ def test_parameter_types_enforcement(self):
     assert not hasattr(
         cache_metadata, "usage_metadata"
     )  # CacheMetadata should NOT have this
+
+  def create_llm_request_with_token_count(
+      self, token_count=None, cache_metadata=None
+  ):
+    """Helper to create LlmRequest with cacheable_contents_token_count."""
+    llm_request = self.create_llm_request(cache_metadata=cache_metadata)
+    llm_request.cacheable_contents_token_count = token_count
+    return llm_request
+
+  async def test_cache_creation_with_sufficient_token_count(self):
+    """Test cache creation succeeds when token count meets minimum."""
+    # Setup mocks
+    mock_cached_content = AsyncMock()
+    mock_cached_content.name = (
+        "projects/test/locations/us-central1/cachedContents/token123"
+    )
+    self.manager.genai_client.aio.caches.create = AsyncMock(
+        return_value=mock_cached_content
+    )
+
+    # Create request with sufficient token count
+    llm_request = self.create_llm_request_with_token_count(token_count=2048)
+
+    with patch.object(
+        self.manager, "_generate_cache_fingerprint", return_value="test_fp"
+    ):
+      result = await self.manager.handle_context_caching(llm_request)
+
+    # Should succeed in creating cache
+    assert result is not None
+    assert result.cache_name == mock_cached_content.name
+    self.manager.genai_client.aio.caches.create.assert_called_once()
+
+  async def test_cache_creation_with_insufficient_token_count(self):
+    """Test cache creation fails when token count is below minimum."""
+    # Set higher minimum token requirement
+    self.manager.cache_config = ContextCacheConfig(
+        cache_intervals=10,
+        ttl_seconds=1800,
+        min_tokens=2048,
+    )
+
+    # Create request with insufficient token count
+    llm_request = self.create_llm_request_with_token_count(token_count=1024)
+    llm_request.cache_config = self.manager.cache_config
+
+    result = await self.manager.handle_context_caching(llm_request)

+    # Should not create cache
+    assert result is None
+    self.manager.genai_client.aio.caches.create.assert_not_called()
+
+  async def test_cache_creation_without_token_count(self):
+    """Test cache creation is skipped when no token count is available."""
+    # Create request without token count (initial request)
+    llm_request = self.create_llm_request_with_token_count(token_count=None)
+
+    result = await self.manager.handle_context_caching(llm_request)
+
+    # Should skip cache creation for initial request
+    assert result is None
+    self.manager.genai_client.aio.caches.create.assert_not_called()
