Commit 15fb480

feat: use callback instead patch
1 parent 3603050 commit 15fb480

File tree

3 files changed: +41, -68 lines

veadk/agent.py

Lines changed: 7 additions & 2 deletions
@@ -36,6 +36,7 @@
 from veadk.knowledgebase import KnowledgeBase
 from veadk.memory.long_term_memory import LongTermMemory
 from veadk.memory.short_term_memory import ShortTermMemory
+from veadk.models.ark_llm import add_previous_response_id
 from veadk.processors import BaseRunProcessor, NoOpRunProcessor
 from veadk.prompts.agent_default_prompt import DEFAULT_DESCRIPTION, DEFAULT_INSTRUCTION
 from veadk.tracing.base_tracer import BaseTracer
@@ -200,16 +201,20 @@ def model_post_init(self, __context: Any) -> None:

         if not self.model:
             if self.enable_responses:
-                from veadk.utils.patches import patch_google_adk_call_llm_async
+                # from veadk.utils.patches import patch_google_adk_call_llm_async
                 from veadk.models.ark_llm import ArkLlm

-                patch_google_adk_call_llm_async()
+                # patch_google_adk_call_llm_async()
                 self.model = ArkLlm(
                     model=f"{self.model_provider}/{self.model_name}",
                     api_key=self.model_api_key,
                     api_base=self.model_api_base,
                     **self.model_extra_config,
                 )
+                if not self.before_model_callback:
+                    self.before_model_callback = add_previous_response_id
+                else:
+                    self.before_model_callback.append(add_previous_response_id)
             else:
                 self.model = LiteLlm(
                     model=f"{self.model_provider}/{self.model_name}",

veadk/models/ark_llm.py

Lines changed: 34 additions & 3 deletions
@@ -1,9 +1,11 @@
 import json
 import uuid
 from datetime import datetime
-from typing import Any, Dict, Union, AsyncGenerator
+from typing import Any, Dict, Union, AsyncGenerator, Optional

 import litellm
+from google.adk.agents.callback_context import CallbackContext
+from google.adk.models.cache_metadata import CacheMetadata
 from openai import OpenAI
 from google.adk.models import LlmRequest, LlmResponse
 from google.adk.models.lite_llm import (
@@ -77,13 +79,20 @@
 ]


-def _add_response_id_to_llm_response(
+def _add_response_data_to_llm_response(
     llm_response: LlmResponse, response: ModelResponse
 ) -> LlmResponse:
+    # add responses id
     if not response.id.startswith("chatcmpl"):
         if llm_response.custom_metadata is None:
             llm_response.custom_metadata = {}
         llm_response.custom_metadata["response_id"] = response["id"]
+    # add responses cache data
+    if response.get("usage", {}).get("prompt_tokens_details"):
+        if llm_response.usage_metadata:
+            llm_response.usage_metadata.cached_content_token_count = (
+                response.get("usage", {}).get("prompt_tokens_details").cached_tokens
+            )
     return llm_response


@@ -423,5 +432,27 @@ async def generate_content_async(
         # Transport response id
         # yield _model_response_to_generate_content_response(response)
         llm_response = _model_response_to_generate_content_response(response)
-        yield _add_response_id_to_llm_response(llm_response, response)
+        yield _add_response_data_to_llm_response(llm_response, response)
         # ------------------------------------------------------ #
+
+
+def add_previous_response_id(
+    callback_context: CallbackContext, llm_request: LlmRequest
+) -> Optional[LlmResponse]:
+    invocation_context = callback_context._invocation_context
+    events = invocation_context.session.events
+    if (
+        events
+        and len(events) >= 2
+        and events[-2].custom_metadata
+        and "response_id" in events[-2].custom_metadata
+    ):
+        previous_response_id = events[-2].custom_metadata["response_id"]
+        llm_request.cache_metadata = CacheMetadata(
+            cache_name=previous_response_id,
+            expire_time=0,
+            fingerprint="",
+            invocations_used=0,
+            cached_contents_count=0,
+        )
+    return
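The renamed helper now records two pieces of data on each LlmResponse: the Responses API id (whenever the upstream id is not a "chatcmpl" completion id) and the cached prompt-token count from prompt_tokens_details. On the next turn, add_previous_response_id reads that id back out of the session and carries it toward the model layer in CacheMetadata.cache_name (the other CacheMetadata fields are zeroed placeholders). A self-contained sketch of that event lookup, with SimpleNamespace stand-ins in place of real google-adk session events:

    # Sketch of the lookup inside add_previous_response_id; the namespaces below
    # stand in for google-adk events, and only custom_metadata matters here.
    from types import SimpleNamespace

    events = [
        SimpleNamespace(custom_metadata=None),                         # earlier user turn
        SimpleNamespace(custom_metadata={"response_id": "resp_123"}),  # previous model turn
        SimpleNamespace(custom_metadata=None),                         # current user turn
    ]

    previous_response_id = None
    if (
        events
        and len(events) >= 2
        and events[-2].custom_metadata
        and "response_id" in events[-2].custom_metadata
    ):
        previous_response_id = events[-2].custom_metadata["response_id"]

    print(previous_response_id)  # resp_123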

veadk/utils/patches.py

Lines changed: 0 additions & 63 deletions
@@ -16,9 +16,6 @@
 import sys
 from typing import Callable

-from google.adk.agents import InvocationContext
-from google.adk.models import LlmRequest
-from google.adk.models.cache_metadata import CacheMetadata

 from veadk.tracing.telemetry.telemetry import (
     trace_call_llm,
@@ -82,63 +79,3 @@ def patch_google_adk_telemetry() -> None:
         logger.debug(
             f"Patch {mod_name} {var_name} with {trace_functions[var_name]}"
         )
-
-
-#
-# BaseLlmFlow._call_llm_async patch hook
-#
-def patch_google_adk_call_llm_async() -> None:
-    """Patch google.adk BaseLlmFlow._call_llm_async with a delegating wrapper.
-
-    Current behavior: simply calls the original implementation and yields its results.
-    This provides a stable hook for later custom business logic without changing behavior now.
-    """
-    # Prevent duplicate patches
-    if hasattr(patch_google_adk_call_llm_async, "_patched"):
-        logger.debug("BaseLlmFlow._call_llm_async already patched, skipping")
-        return
-
-    try:
-        from google.adk.flows.llm_flows.base_llm_flow import BaseLlmFlow
-
-        original_call_llm_async = BaseLlmFlow._call_llm_async
-
-        async def patched_call_llm_async(
-            self,
-            invocation_context: InvocationContext,
-            llm_request: LlmRequest,
-            model_response_event,
-        ):
-            logger.debug(
-                "Patched BaseLlmFlow._call_llm_async invoked; delegating to original"
-            )
-            events = invocation_context.session.events
-            if (
-                events
-                and len(events) >= 2
-                and events[-2].custom_metadata
-                and "response_id" in events[-2].custom_metadata
-            ):
-                previous_response_id = events[-2].custom_metadata["response_id"]
-                llm_request.cache_metadata = CacheMetadata(
-                    cache_name=previous_response_id,
-                    expire_time=0,
-                    fingerprint="",
-                    invocations_used=0,
-                    cached_contents_count=0,
-                )
-
-            async for llm_response in original_call_llm_async(
-                self, invocation_context, llm_request, model_response_event
-            ):
-                # Currently, just pass through the original responses
-                yield llm_response
-
-        BaseLlmFlow._call_llm_async = patched_call_llm_async
-
-        # Marked as patched to prevent duplicate application
-        patch_google_adk_call_llm_async._patched = True
-        logger.info("Successfully patched BaseLlmFlow._call_llm_async")
-
-    except ImportError as e:
-        logger.warning(f"Failed to patch BaseLlmFlow._call_llm_async: {e}")
