     TextChunk,
     _message_to_generate_content_response,
     UsageMetadataChunk,
-    _model_response_to_generate_content_response,
 )
 from google.genai import types
 from litellm import ChatCompletionAssistantMessage
@@ -91,14 +90,14 @@ async def generate_content_async(
     previous_response_id = None
     if llm_request.cache_metadata and llm_request.cache_metadata.cache_name:
       previous_response_id = llm_request.cache_metadata.cache_name
-    # ------------------------------------------------------ #
     completion_args = {
         "model": self.model,
         "messages": messages,
         "tools": tools,
         "response_format": response_format,
         "previous_response_id": previous_response_id,  # supply previous_response_id
     }
+    # ------------------------------------------------------ #
     completion_args.update(self._additional_args)

     if generation_params:
@@ -117,6 +116,7 @@ async def generate_content_async(
       raw_response = await self.llm_client.aresponse(**response_args)
       async for part in raw_response:
         for (
+            model_response,
             chunk,
             finish_reason,
         ) in self.transform_handler.stream_event_to_chunk(
@@ -158,6 +158,14 @@ async def generate_content_async(
                 candidates_token_count=chunk.completion_tokens,
                 total_token_count=chunk.total_tokens,
             )
+            # ------------------------------------------------------ #
+            if model_response.get("usage", {}).get("prompt_tokens_details"):
+              usage_metadata.cached_content_token_count = (
+                  model_response.get("usage", {})
+                  .get("prompt_tokens_details")
+                  .cached_tokens
+              )
+            # ------------------------------------------------------ #

           if (
               finish_reason == "tool_calls" or finish_reason == "stop"
@@ -185,6 +193,11 @@ async def generate_content_async(
                     )
                 )
             )
+            self.transform_handler.adapt_responses_api(
+                model_response,
+                aggregated_llm_response_with_tool_call,
+                stream=True,
+            )
             text = ""
             function_calls.clear()
           elif finish_reason == "stop" and text:
@@ -193,6 +206,9 @@ async def generate_content_async(
                     role="assistant", content=text
                 )
             )
+            self.transform_handler.adapt_responses_api(
+                model_response, aggregated_llm_response, stream=True
+            )
             text = ""

       # waiting until streaming ends to yield the llm_response as litellm tends
@@ -213,32 +229,9 @@ async def generate_content_async(

     else:
       raw_response = await self.llm_client.aresponse(**response_args)
-      yield self._openai_response_to_generate_content_response(raw_response)
-
-  def _openai_response_to_generate_content_response(
-      self, raw_response: OpenAITypeResponse
-  ) -> LlmResponse:
-    """
-    OpenAITypeResponse -> litellm.ModelResponse -> LlmResponse
-    """
-    model_response = self.transform_handler.transform_response(
-        openai_response=raw_response, stream=False
-    )
-    llm_response = _model_response_to_generate_content_response(model_response)
-
-    if not model_response.id.startswith("chatcmpl"):
-      if llm_response.custom_metadata is None:
-        llm_response.custom_metadata = {}
-      llm_response.custom_metadata["response_id"] = model_response["id"]
-    # add responses cache data
-    if model_response.get("usage", {}).get("prompt_tokens_details"):
-      if llm_response.usage_metadata:
-        llm_response.usage_metadata.cached_content_token_count = (
-            model_response.get("usage", {})
-            .get("prompt_tokens_details")
-            .cached_tokens
-        )
-    return llm_response
+      yield self.transform_handler.openai_response_to_generate_content_response(
+          raw_response
+      )


 # before_model_callback
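
Not shown in this commit is the body of transform_handler.openai_response_to_generate_content_response, which the non-streaming branch now yields from. Judging from the helper removed above, a rough sketch of what that handler method presumably does is the following; the method name and arguments come from the diff, but its placement on the transform handler, the exact signature, and the use of self.transform_response are assumptions, not confirmed here:

def openai_response_to_generate_content_response(
    self, raw_response: OpenAITypeResponse
) -> LlmResponse:
  """OpenAITypeResponse -> litellm.ModelResponse -> LlmResponse (sketch only)."""
  # Assumed to mirror the removed _openai_response_to_generate_content_response,
  # relocated from the model class onto the transform handler.
  model_response = self.transform_response(
      openai_response=raw_response, stream=False
  )
  llm_response = _model_response_to_generate_content_response(model_response)
  if not model_response.id.startswith("chatcmpl"):  # Responses API result
    if llm_response.custom_metadata is None:
      llm_response.custom_metadata = {}
    llm_response.custom_metadata["response_id"] = model_response["id"]
  # add responses cache data, as the removed helper did
  if model_response.get("usage", {}).get("prompt_tokens_details"):
    if llm_response.usage_metadata:
      llm_response.usage_metadata.cached_content_token_count = (
          model_response.get("usage", {})
          .get("prompt_tokens_details")
          .cached_tokens
      )
  return llm_response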