@@ -124,6 +124,15 @@ async def get_response(
 
         if hasattr(response, "usage"):
             response_usage = response.usage
+
+            # Extract cost from LiteLLM's hidden params if cost tracking is enabled.
+            cost = None
+            if model_settings.track_cost:
+                if hasattr(response, "_hidden_params") and isinstance(
+                    response._hidden_params, dict
+                ):
+                    cost = response._hidden_params.get("response_cost")
+
             usage = (
                 Usage(
                     requests=1,
@@ -142,6 +151,7 @@ async def get_response(
                         )
                         or 0
                     ),
+                    cost=cost,
                 )
                 if response.usage
                 else Usage()
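
For the non-streaming path in the two hunks above, LiteLLM has already priced the call: it attaches a `_hidden_params` dict to each response object, with the computed dollar amount under the `response_cost` key. A minimal sketch of that extraction, assuming a boolean `track_cost` setting and using a simplified stand-in for the SDK's `Usage` type:

from dataclasses import dataclass

@dataclass
class Usage:
    """Simplified stand-in for the SDK's Usage type."""
    requests: int = 0
    input_tokens: int = 0
    output_tokens: int = 0
    total_tokens: int = 0
    cost: float | None = None  # None when tracking is off or the cost is unknown

def extract_cost(response: object, track_cost: bool) -> float | None:
    """Pull LiteLLM's precomputed cost off a completed response, if present."""
    if not track_cost:
        return None
    hidden = getattr(response, "_hidden_params", None)
    if isinstance(hidden, dict):
        # LiteLLM stores the computed cost (in USD) under this key;
        # it is None for models missing from LiteLLM's pricing table.
        return hidden.get("response_cost")
    return None
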
@@ -201,10 +211,67 @@ async def stream_response(
 
         final_response: Response | None = None
         async for chunk in ChatCmplStreamHandler.handle_stream(response, stream):
-            yield chunk
-
+            # Intercept the response.completed event to calculate and attach cost.
             if chunk.type == "response.completed":
                 final_response = chunk.response
+                # Calculate cost using LiteLLM's completion_cost function if enabled.
+                # Streaming responses don't include cost in _hidden_params, so we
+                # calculate it from the final token counts.
+                if model_settings.track_cost and final_response.usage:
+                    try:
+                        # Create a mock ModelResponse for cost calculation.
+                        # Include token details (cached, reasoning) for accurate pricing.
+                        from litellm.types.utils import (
+                            Choices as LitellmChoices,
+                            CompletionTokensDetailsWrapper,
+                            Message as LitellmMessage,
+                            ModelResponse as LitellmModelResponse,
+                            PromptTokensDetailsWrapper,
+                            Usage as LitellmUsage,
+                        )
+
+                        # Extract token details for accurate cost calculation.
+                        cached_tokens = (
+                            final_response.usage.input_tokens_details.cached_tokens
+                            if final_response.usage.input_tokens_details
+                            else 0
+                        )
+                        reasoning_tokens = (
+                            final_response.usage.output_tokens_details.reasoning_tokens
+                            if final_response.usage.output_tokens_details
+                            else 0
+                        )
+
+                        mock_response = LitellmModelResponse(
+                            choices=[
+                                LitellmChoices(
+                                    index=0,
+                                    message=LitellmMessage(role="assistant", content=""),
+                                )
+                            ],
+                            usage=LitellmUsage(
+                                prompt_tokens=final_response.usage.input_tokens,
+                                completion_tokens=final_response.usage.output_tokens,
+                                total_tokens=final_response.usage.total_tokens,
+                                prompt_tokens_details=PromptTokensDetailsWrapper(
+                                    cached_tokens=cached_tokens
+                                ),
+                                completion_tokens_details=CompletionTokensDetailsWrapper(
+                                    reasoning_tokens=reasoning_tokens
+                                ),
+                            ),
+                            model=self.model,
+                        )
+                        cost = litellm.completion_cost(completion_response=mock_response)
+                        # Attach cost as a custom attribute on the Response object so
+                        # run.py can access it when creating the Usage object.
+                        final_response._litellm_cost = cost
+                    except Exception:
+                        # If cost calculation fails (e.g., unknown model), continue
+                        # without cost.
+                        pass
+
+            yield chunk
 
         if tracing.include_data() and final_response:
             span_generation.span_data.output = [final_response.model_dump()]
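
The streaming hunk above has no `_hidden_params` to read, so it rebuilds a minimal litellm `ModelResponse` from the final token counts and prices it with `litellm.completion_cost`. A standalone sketch of that fallback; the model name and token counts here are illustrative:

import litellm
from litellm.types.utils import (
    Choices,
    Message,
    ModelResponse,
    Usage as LitellmUsage,
)

# Build the smallest response completion_cost will accept: one empty
# assistant message plus the token counts captured from the stream.
mock = ModelResponse(
    choices=[Choices(index=0, message=Message(role="assistant", content=""))],
    usage=LitellmUsage(prompt_tokens=1200, completion_tokens=350, total_tokens=1550),
    model="gpt-4o-mini",  # illustrative; any model in LiteLLM's pricing table
)

try:
    cost = litellm.completion_cost(completion_response=mock)
    print(f"estimated cost: ${cost:.6f}")
except Exception:
    # completion_cost raises for models it cannot price, which is why the
    # diff wraps the calculation in a broad try/except and carries on.
    print("unknown model; continuing without cost")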