
Commit 92b170a

feat: use the patch method to pass the previous response id
1 parent 098a973 commit 92b170a

File tree: 3 files changed (+258, -16 lines)


veadk/agent.py

Lines changed: 2 additions & 0 deletions
@@ -200,8 +200,10 @@ def model_post_init(self, __context: Any) -> None:
 
         if not self.model:
             if self.enable_responses:
+                from veadk.utils.patches import patch_google_adk_call_llm_async
                 from veadk.models.ark_llm import ArkLlm
 
+                patch_google_adk_call_llm_async()
                 self.model = ArkLlm(
                     model=f"{self.model_provider}/{self.model_name}",
                     api_key=self.model_api_key,
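
The two added lines install the ADK hook lazily: only when `enable_responses` is set, and before the `ArkLlm` model is built, so the patched `_call_llm_async` is already in place for the first call. A minimal sketch of that ordering outside of `model_post_init` (the model string and API key below are placeholders, not values from this repository):

from veadk.models.ark_llm import ArkLlm
from veadk.utils.patches import patch_google_adk_call_llm_async

# 1. Install the BaseLlmFlow._call_llm_async hook (idempotent, see patches.py below).
patch_google_adk_call_llm_async()

# 2. Build the Responses-API-backed model, mirroring model_post_init above.
model = ArkLlm(
    model="openai/<model-name>",  # placeholder for f"{model_provider}/{model_name}"
    api_key="<ARK_API_KEY>",      # placeholder for model_api_key
)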

veadk/models/ark_llm.py

Lines changed: 192 additions & 16 deletions
@@ -1,22 +1,36 @@
+import json
 import uuid
 from datetime import datetime
-from typing import Any, Dict, Union
+from typing import Any, Dict, Union, AsyncGenerator
 
 import litellm
+from google.adk.models import LlmRequest, LlmResponse
 from google.adk.models.lite_llm import (
     LiteLlm,
     LiteLLMClient,
+    _get_completion_inputs,
+    _model_response_to_chunk,
+    FunctionChunk,
+    TextChunk,
+    _message_to_generate_content_response,
+    UsageMetadataChunk,
+    _model_response_to_generate_content_response,
 )
-from litellm import Logging
-from litellm import aresponses
+from google.genai import types
+from litellm import Logging, aresponses, ChatCompletionAssistantMessage
 from litellm.completion_extras.litellm_responses_transformation.transformation import (
     LiteLLMResponsesTransformationHandler,
 )
 from litellm.litellm_core_utils.get_litellm_params import get_litellm_params
 from litellm.litellm_core_utils.get_llm_provider_logic import get_llm_provider
 from litellm.litellm_core_utils.streaming_handler import CustomStreamWrapper
 from litellm.types.llms.openai import ResponsesAPIResponse
-from litellm.types.utils import ModelResponse, LlmProviders
+from litellm.types.utils import (
+    ModelResponse,
+    LlmProviders,
+    ChatCompletionMessageToolCall,
+    Function,
+)
 from litellm.utils import get_optional_params, ProviderConfigManager
 from pydantic import Field
 
@@ -28,6 +42,16 @@
 logger = get_logger(__name__)
 
 
+def _add_response_id_to_llm_response(
+    llm_response: LlmResponse, response: ModelResponse
+) -> LlmResponse:
+    if not response.id.startswith("chatcmpl"):
+        if llm_response.custom_metadata is None:
+            llm_response.custom_metadata = {}
+        llm_response.custom_metadata["response_id"] = response["id"]
+    return llm_response
+
+
 class ArkLlmClient(LiteLLMClient):
     def __init__(self):
         super().__init__()
@@ -37,7 +61,7 @@ async def acompletion(
         self, model, messages, tools, **kwargs
     ) -> Union[ModelResponse, CustomStreamWrapper]:
         # 1.1. Get optional_params using get_optional_params function
-        optional_params = get_optional_params(model=model, **kwargs)
+        optional_params = get_optional_params(model=model, tools=tools, **kwargs)
 
         # 1.2. Get litellm_params using get_litellm_params function
         litellm_params = get_litellm_params(**kwargs)
@@ -74,8 +98,10 @@ async def acompletion(
         messages = provider_config.translate_developer_role_to_system_role(
             messages=messages
         )
-
-        # 1.6 Transform request to responses api format
+        # 1.6 Trim the conversation history before the Responses API call:
+        #     keep the header system prompt and the user's latest message
+        messages = messages[:1] + messages[-1:]
+        # 1.7 Transform request to responses api format
         request_data = self.transformation_handler.transform_request(
             model=model,
             messages=messages,
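
The trimming in this hunk sends only the first message (the system prompt) and the last message to the Responses API; judging by the `previous_response_id` plumbing elsewhere in this commit, the intent is that earlier turns are recovered from the previous response. A quick worked example with made-up messages (it assumes the list holds at least a system prompt plus one later message, since a single-element list would be duplicated):

# Worked example of `messages[:1] + messages[-1:]` with illustrative messages.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is the capital of France?"},
    {"role": "assistant", "content": "Paris."},
    {"role": "user", "content": "And of Italy?"},
]
trimmed = messages[:1] + messages[-1:]
assert trimmed == [messages[0], messages[-1]]  # system prompt + latest turn only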
@@ -87,7 +113,7 @@ async def acompletion(
         )
 
         # 2. Call litellm.aresponses with the transformed request data
-        result = await aresponses(
+        raw_response = await aresponses(
             **request_data,
         )
 
@@ -96,10 +122,10 @@ async def acompletion(
         setattr(model_response, "usage", litellm.Usage())
 
         # 3.2 Transform ResponsesAPIResponse to ModelResponses
-        if isinstance(result, ResponsesAPIResponse):
-            return self.transformation_handler.transform_response(
+        if isinstance(raw_response, ResponsesAPIResponse):
+            response = self.transformation_handler.transform_response(
                 model=model,
-                raw_response=result,
+                raw_response=raw_response,
                 model_response=model_response,
                 logging_obj=logging_obj,
                 request_data=request_data,
@@ -110,9 +136,14 @@ async def acompletion(
                 api_key=kwargs.get("api_key"),
                 json_mode=kwargs.get("json_mode"),
             )
+            # 3.2.1 Modify ModelResponse id
+            if raw_response and hasattr(raw_response, "id"):
+                response.id = raw_response.id
+            return response
+
         else:
             completion_stream = self.transformation_handler.get_model_response_iterator(
-                streaming_response=result,  # type: ignore
+                streaming_response=raw_response,  # type: ignore
                 sync_stream=True,
                 json_mode=kwargs.get("json_mode"),
             )
@@ -132,7 +163,152 @@ class ArkLlm(LiteLlm):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
-    # async def generate_content_async(
-    #     self, llm_request: LlmRequest, stream: bool = False
-    # ) -> AsyncGenerator[LlmResponse, None]:
-    #     pass
+    async def generate_content_async(
+        self, llm_request: LlmRequest, stream: bool = False
+    ) -> AsyncGenerator[LlmResponse, None]:
+        """Generates content asynchronously.
+
+        Args:
+          llm_request: LlmRequest, the request to send to the LiteLlm model.
+          stream: bool = False, whether to do streaming call.
+
+        Yields:
+          LlmResponse: The model response.
+        """
+
+        self._maybe_append_user_content(llm_request)
+        # logger.debug(_build_request_log(llm_request))
+
+        messages, tools, response_format, generation_params = _get_completion_inputs(
+            llm_request
+        )
+
+        if "functions" in self._additional_args:
+            # LiteLLM does not support both tools and functions together.
+            tools = None
+        # ------------------------------------------------------ #
+        # get previous_response_id
+        previous_response_id = None
+        if llm_request.cache_metadata and llm_request.cache_metadata.cache_name:
+            previous_response_id = llm_request.cache_metadata.cache_name
+        # ------------------------------------------------------ #
+        completion_args = {
+            "model": self.model,
+            "messages": messages,
+            "tools": tools,
+            "response_format": response_format,
+            "previous_response_id": previous_response_id,  # supply previous_response_id
+        }
+        completion_args.update(self._additional_args)
+
+        if generation_params:
+            completion_args.update(generation_params)
+
+        if stream:
+            text = ""
+            # Track function calls by index
+            function_calls = {}  # index -> {name, args, id}
+            completion_args["stream"] = True
+            aggregated_llm_response = None
+            aggregated_llm_response_with_tool_call = None
+            usage_metadata = None
+            fallback_index = 0
+            async for part in await self.llm_client.acompletion(**completion_args):
+                for chunk, finish_reason in _model_response_to_chunk(part):
+                    if isinstance(chunk, FunctionChunk):
+                        index = chunk.index or fallback_index
+                        if index not in function_calls:
+                            function_calls[index] = {"name": "", "args": "", "id": None}
+
+                        if chunk.name:
+                            function_calls[index]["name"] += chunk.name
+                        if chunk.args:
+                            function_calls[index]["args"] += chunk.args
+
+                            # check if args is completed (workaround for improper chunk
+                            # indexing)
+                            try:
+                                json.loads(function_calls[index]["args"])
+                                fallback_index += 1
+                            except json.JSONDecodeError:
+                                pass
+
+                        function_calls[index]["id"] = (
+                            chunk.id or function_calls[index]["id"] or str(index)
+                        )
+                    elif isinstance(chunk, TextChunk):
+                        text += chunk.text
+                        yield _message_to_generate_content_response(
+                            ChatCompletionAssistantMessage(
+                                role="assistant",
+                                content=chunk.text,
+                            ),
+                            is_partial=True,
+                        )
+                    elif isinstance(chunk, UsageMetadataChunk):
+                        usage_metadata = types.GenerateContentResponseUsageMetadata(
+                            prompt_token_count=chunk.prompt_tokens,
+                            candidates_token_count=chunk.completion_tokens,
+                            total_token_count=chunk.total_tokens,
+                        )
+
+                    if (
+                        finish_reason == "tool_calls" or finish_reason == "stop"
+                    ) and function_calls:
+                        tool_calls = []
+                        for index, func_data in function_calls.items():
+                            if func_data["id"]:
+                                tool_calls.append(
+                                    ChatCompletionMessageToolCall(
+                                        type="function",
+                                        id=func_data["id"],
+                                        function=Function(
+                                            name=func_data["name"],
+                                            arguments=func_data["args"],
+                                            index=index,
+                                        ),
+                                    )
+                                )
+                        aggregated_llm_response_with_tool_call = (
+                            _message_to_generate_content_response(
+                                ChatCompletionAssistantMessage(
+                                    role="assistant",
+                                    content=text,
+                                    tool_calls=tool_calls,
+                                )
+                            )
+                        )
+                        text = ""
+                        function_calls.clear()
+                    elif finish_reason == "stop" and text:
+                        aggregated_llm_response = _message_to_generate_content_response(
+                            ChatCompletionAssistantMessage(
+                                role="assistant", content=text
+                            )
+                        )
+                        text = ""
+
+            # waiting until streaming ends to yield the llm_response as litellm tends
+            # to send chunk that contains usage_metadata after the chunk with
+            # finish_reason set to tool_calls or stop.
+            if aggregated_llm_response:
+                if usage_metadata:
+                    aggregated_llm_response.usage_metadata = usage_metadata
+                    usage_metadata = None
+                yield aggregated_llm_response
+
+            if aggregated_llm_response_with_tool_call:
+                if usage_metadata:
+                    aggregated_llm_response_with_tool_call.usage_metadata = (
+                        usage_metadata
+                    )
+                yield aggregated_llm_response_with_tool_call
+
+        else:
+            response = await self.llm_client.acompletion(**completion_args)
+            # ------------------------------------------------------ #
+            # Transport response id
+            # yield _model_response_to_generate_content_response(response)
+            llm_response = _model_response_to_generate_content_response(response)
+            yield _add_response_id_to_llm_response(llm_response, response)
+            # ------------------------------------------------------ #
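
In the non-streaming branch above, `_add_response_id_to_llm_response` is what exposes the Responses API id to the rest of the flow: any id that does not start with `chatcmpl` is stashed in `custom_metadata["response_id"]`. A self-contained sketch of that behaviour using simple stand-ins for `LlmResponse` and litellm's `ModelResponse` (the stub classes are illustrative, not the real types):

# Stand-ins for LlmResponse / ModelResponse, only to exercise the id hand-off.
class StubLlmResponse:
    custom_metadata = None

class StubModelResponse(dict):
    def __init__(self, response_id: str):
        super().__init__(id=response_id)
        self.id = response_id  # readable both as attribute and as dict key

def add_response_id(llm_response, response):
    # Mirrors _add_response_id_to_llm_response from the diff above.
    if not response.id.startswith("chatcmpl"):
        if llm_response.custom_metadata is None:
            llm_response.custom_metadata = {}
        llm_response.custom_metadata["response_id"] = response["id"]
    return llm_response

# A Responses-API style id is recorded ...
out = add_response_id(StubLlmResponse(), StubModelResponse("resp_0123456789"))
assert out.custom_metadata == {"response_id": "resp_0123456789"}

# ... while a regular chat-completion id is left alone.
out = add_response_id(StubLlmResponse(), StubModelResponse("chatcmpl-abc"))
assert out.custom_metadata is None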

veadk/utils/patches.py

Lines changed: 64 additions & 0 deletions
@@ -16,6 +16,10 @@
 import sys
 from typing import Callable
 
+from google.adk.agents import InvocationContext
+from google.adk.models import LlmRequest
+from google.adk.models.cache_metadata import CacheMetadata
+
 from veadk.tracing.telemetry.telemetry import (
     trace_call_llm,
     trace_send_data,
@@ -78,3 +82,63 @@ def patch_google_adk_telemetry() -> None:
             logger.debug(
                 f"Patch {mod_name} {var_name} with {trace_functions[var_name]}"
             )
+
+
+#
+# BaseLlmFlow._call_llm_async patch hook
+#
+def patch_google_adk_call_llm_async() -> None:
+    """Patch google.adk BaseLlmFlow._call_llm_async with a delegating wrapper.
+
+    When the second-to-last session event carries a ``response_id`` in its custom_metadata,
+    it is forwarded as ``previous_response_id`` via llm_request.cache_metadata; otherwise
+    the original behavior is unchanged.
+    """
+    # Prevent duplicate patches
+    if hasattr(patch_google_adk_call_llm_async, "_patched"):
+        logger.debug("BaseLlmFlow._call_llm_async already patched, skipping")
+        return
+
+    try:
+        from google.adk.flows.llm_flows.base_llm_flow import BaseLlmFlow
+
+        original_call_llm_async = BaseLlmFlow._call_llm_async
+
+        async def patched_call_llm_async(
+            self,
+            invocation_context: InvocationContext,
+            llm_request: LlmRequest,
+            model_response_event,
+        ):
+            logger.debug(
+                "Patched BaseLlmFlow._call_llm_async invoked; delegating to original"
+            )
+            events = invocation_context.session.events
+            if (
+                events
+                and len(events) >= 2
+                and events[-2].custom_metadata
+                and "response_id" in events[-2].custom_metadata
+            ):
+                previous_response_id = events[-2].custom_metadata["response_id"]
+                llm_request.cache_metadata = CacheMetadata(
+                    cache_name=previous_response_id,
+                    expire_time=0,
+                    fingerprint="",
+                    invocations_used=0,
+                    cached_contents_count=0,
+                )
+
+            async for llm_response in original_call_llm_async(
+                self, invocation_context, llm_request, model_response_event
+            ):
+                # Pass the original responses through unchanged
+                yield llm_response
+
+        BaseLlmFlow._call_llm_async = patched_call_llm_async
+
+        # Mark as patched to prevent duplicate application
+        patch_google_adk_call_llm_async._patched = True
+        logger.info("Successfully patched BaseLlmFlow._call_llm_async")
+
+    except ImportError as e:
+        logger.warning(f"Failed to patch BaseLlmFlow._call_llm_async: {e}")
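
Since the `_patched` attribute guards against re-application, calling the patch repeatedly is safe, which presumably matters when `model_post_init` runs for more than one agent in the same process. A minimal usage sketch:

from veadk.utils.patches import patch_google_adk_call_llm_async

patch_google_adk_call_llm_async()  # wraps BaseLlmFlow._call_llm_async once
patch_google_adk_call_llm_async()  # no-op: logs "already patched, skipping" and returns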
