     TextChunk,
     _message_to_generate_content_response,
     UsageMetadataChunk,
-    _model_response_to_generate_content_response,
 )
 from google.genai import types
 from litellm import ChatCompletionAssistantMessage
@@ -91,14 +90,14 @@ async def generate_content_async(
     previous_response_id = None
     if llm_request.cache_metadata and llm_request.cache_metadata.cache_name:
       previous_response_id = llm_request.cache_metadata.cache_name
-    # ------------------------------------------------------ #
     completion_args = {
         "model": self.model,
         "messages": messages,
         "tools": tools,
         "response_format": response_format,
         "previous_response_id": previous_response_id,  # supply previous_response_id
     }
+    # ------------------------------------------------------ #
     completion_args.update(self._additional_args)

     if generation_params:
@@ -117,6 +116,7 @@ async def generate_content_async(
       raw_response = await self.llm_client.aresponse(**response_args)
       async for part in raw_response:
         for (
+            model_response,
             chunk,
             finish_reason,
         ) in self.transform_handler.stream_event_to_chunk(
@@ -158,6 +158,14 @@ async def generate_content_async(
                 candidates_token_count=chunk.completion_tokens,
                 total_token_count=chunk.total_tokens,
             )
+            # ------------------------------------------------------ #
+            if model_response.get("usage", {}).get("prompt_tokens_details"):
+              usage_metadata.cached_content_token_count = (
+                  model_response.get("usage", {})
+                  .get("prompt_tokens_details")
+                  .cached_tokens
+              )
+            # ------------------------------------------------------ #

           if (
               finish_reason == "tool_calls" or finish_reason == "stop"
@@ -185,6 +193,11 @@ async def generate_content_async(
                     )
                 )
             )
+            self.transform_handler.adapt_responses_api(
+                model_response,
+                aggregated_llm_response_with_tool_call,
+                stream=True,
+            )
             text = ""
             function_calls.clear()
           elif finish_reason == "stop" and text:
@@ -193,6 +206,9 @@ async def generate_content_async(
                     role="assistant", content=text
                 )
             )
+            self.transform_handler.adapt_responses_api(
+                model_response, aggregated_llm_response, stream=True
+            )
             text = ""

       # waiting until streaming ends to yield the llm_response as litellm tends
@@ -213,32 +229,9 @@ async def generate_content_async(

     else:
       raw_response = await self.llm_client.aresponse(**response_args)
-      yield self._openai_response_to_generate_content_response(raw_response)
-
-  def _openai_response_to_generate_content_response(
-      self, raw_response: OpenAITypeResponse
-  ) -> LlmResponse:
-    """
-    OpenAITypeResponse -> litellm.ModelResponse -> LlmResponse
-    """
-    model_response = self.transform_handler.transform_response(
-        openai_response=raw_response, stream=False
-    )
-    llm_response = _model_response_to_generate_content_response(model_response)
-
-    if not model_response.id.startswith("chatcmpl"):
-      if llm_response.custom_metadata is None:
-        llm_response.custom_metadata = {}
-      llm_response.custom_metadata["response_id"] = model_response["id"]
-    # add responses cache data
-    if model_response.get("usage", {}).get("prompt_tokens_details"):
-      if llm_response.usage_metadata:
-        llm_response.usage_metadata.cached_content_token_count = (
-            model_response.get("usage", {})
-            .get("prompt_tokens_details")
-            .cached_tokens
-        )
-    return llm_response
+      yield self.transform_handler.openai_response_to_generate_content_response(
+          raw_response
+      )


 # before_model_callback
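
Not shown in this commit is the body of transform_handler.openai_response_to_generate_content_response, which the non-streaming branch now yields from. Judging from the helper removed above, a rough sketch of what that handler method presumably does is the following; the method name and arguments come from the diff, but its placement on the transform handler, the exact signature, and the use of self.transform_response are assumptions, not confirmed here:

def openai_response_to_generate_content_response(
    self, raw_response: OpenAITypeResponse
) -> LlmResponse:
  """OpenAITypeResponse -> litellm.ModelResponse -> LlmResponse (sketch only)."""
  # Assumed to mirror the removed _openai_response_to_generate_content_response,
  # relocated from the model class onto the transform handler.
  model_response = self.transform_response(
      openai_response=raw_response, stream=False
  )
  llm_response = _model_response_to_generate_content_response(model_response)
  if not model_response.id.startswith("chatcmpl"):  # Responses API result
    if llm_response.custom_metadata is None:
      llm_response.custom_metadata = {}
    llm_response.custom_metadata["response_id"] = model_response["id"]
  # add responses cache data, as the removed helper did
  if model_response.get("usage", {}).get("prompt_tokens_details"):
    if llm_response.usage_metadata:
      llm_response.usage_metadata.cached_content_token_count = (
          model_response.get("usage", {})
          .get("prompt_tokens_details")
          .cached_tokens
      )
  return llm_response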