@@ -124,6 +124,15 @@ async def get_response(
 
         if hasattr(response, "usage"):
            response_usage = response.usage
+
+            # Extract cost from LiteLLM's hidden params if cost tracking is enabled.
+            cost = None
+            if model_settings.track_cost:
+                if hasattr(response, "_hidden_params") and isinstance(
+                    response._hidden_params, dict
+                ):
+                    cost = response._hidden_params.get("response_cost")
+
            usage = (
                Usage(
                    requests=1,
@@ -142,6 +151,7 @@ async def get_response(
                        )
                        or 0
                    ),
+                    cost=cost,
                )
                if response.usage
                else Usage()
@@ -201,10 +211,67 @@ async def stream_response(
 
            final_response: Response | None = None
            async for chunk in ChatCmplStreamHandler.handle_stream(response, stream):
-                yield chunk
-
+                # Intercept the response.completed event to calculate and attach cost.
                if chunk.type == "response.completed":
                    final_response = chunk.response
+                    # Calculate cost using LiteLLM's completion_cost function if enabled.
+                    # Streaming responses don't include cost in _hidden_params, so we
+                    # calculate it from the final token counts.
+                    if model_settings.track_cost and final_response.usage:
+                        try:
+                            # Create a mock ModelResponse for cost calculation.
+                            # Include token details (cached, reasoning) for accurate pricing.
+                            from litellm.types.utils import (
+                                Choices as LitellmChoices,
+                                CompletionTokensDetailsWrapper,
+                                Message as LitellmMessage,
+                                ModelResponse as LitellmModelResponse,
+                                PromptTokensDetailsWrapper,
+                                Usage as LitellmUsage,
+                            )
+
+                            # Extract token details for accurate cost calculation.
+                            cached_tokens = (
+                                final_response.usage.input_tokens_details.cached_tokens
+                                if final_response.usage.input_tokens_details
+                                else 0
+                            )
+                            reasoning_tokens = (
+                                final_response.usage.output_tokens_details.reasoning_tokens
+                                if final_response.usage.output_tokens_details
+                                else 0
+                            )
+
+                            mock_response = LitellmModelResponse(
+                                choices=[
+                                    LitellmChoices(
+                                        index=0,
+                                        message=LitellmMessage(role="assistant", content=""),
+                                    )
+                                ],
+                                usage=LitellmUsage(
+                                    prompt_tokens=final_response.usage.input_tokens,
+                                    completion_tokens=final_response.usage.output_tokens,
+                                    total_tokens=final_response.usage.total_tokens,
+                                    prompt_tokens_details=PromptTokensDetailsWrapper(
+                                        cached_tokens=cached_tokens
+                                    ),
+                                    completion_tokens_details=CompletionTokensDetailsWrapper(
+                                        reasoning_tokens=reasoning_tokens
+                                    ),
+                                ),
+                                model=self.model,
+                            )
+                            cost = litellm.completion_cost(completion_response=mock_response)
+                            # Attach cost as a custom attribute on the Response object so
+                            # run.py can access it when creating the Usage object.
+                            final_response._litellm_cost = cost
+                        except Exception:
+                            # If cost calculation fails (e.g., unknown model), continue
+                            # without cost.
+                            pass
+
+                yield chunk
 
            if tracing.include_data() and final_response:
                span_generation.span_data.output = [final_response.model_dump()]
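
For reference, a minimal usage sketch of the new flag. It assumes `track_cost` is the `ModelSettings` field this PR references and that the computed cost surfaces on the run's `Usage` object as `usage.cost`; the access path via `result.context_wrapper` is an assumption about the surrounding SDK, not part of this diff.

# Sketch only: `track_cost` and `usage.cost` come from this change; the
# access path via `result.context_wrapper.usage` is an assumption.
import asyncio

from agents import Agent, ModelSettings, Runner
from agents.extensions.models.litellm_model import LitellmModel


async def main() -> None:
    agent = Agent(
        name="assistant",
        model=LitellmModel(model="gpt-4o-mini"),
        model_settings=ModelSettings(track_cost=True),
    )
    result = await Runner.run(agent, "Say hello.")
    # When LiteLLM can price the model, the per-request cost (USD) should be
    # attached to the run's usage; it stays None for unknown models.
    print(result.context_wrapper.usage.cost)


if __name__ == "__main__":
    asyncio.run(main())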