@@ -113,20 +113,30 @@ def _generic_cost_per_character(
     return prompt_cost, completion_cost


-def _get_token_base_cost(model_info: ModelInfo, usage: Usage) -> Tuple[float, float, float, float]:
+def _get_token_base_cost(
+    model_info: ModelInfo, usage: Usage
+) -> Tuple[float, float, float, float]:
     """
     Return prompt cost, completion cost, and cache costs for a given model and usage.

     If input_tokens > threshold and `input_cost_per_token_above_[x]k_tokens` or `input_cost_per_token_above_[x]_tokens` is set,
     then we use the corresponding threshold cost for all token types.
-
+
     Returns:
         Tuple[float, float, float, float] - (prompt_cost, completion_cost, cache_creation_cost, cache_read_cost)
     """
-    prompt_base_cost = cast(float, _get_cost_per_unit(model_info, "input_cost_per_token"))
-    completion_base_cost = cast(float, _get_cost_per_unit(model_info, "output_cost_per_token"))
-    cache_creation_cost = cast(float, _get_cost_per_unit(model_info, "cache_creation_input_token_cost"))
-    cache_read_cost = cast(float, _get_cost_per_unit(model_info, "cache_read_input_token_cost"))
+    prompt_base_cost = cast(
+        float, _get_cost_per_unit(model_info, "input_cost_per_token")
+    )
+    completion_base_cost = cast(
+        float, _get_cost_per_unit(model_info, "output_cost_per_token")
+    )
+    cache_creation_cost = cast(
+        float, _get_cost_per_unit(model_info, "cache_creation_input_token_cost")
+    )
+    cache_read_cost = cast(
+        float, _get_cost_per_unit(model_info, "cache_read_input_token_cost")
+    )

     ## CHECK IF ABOVE THRESHOLD
     threshold: Optional[float] = None
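Note: the docstring above describes threshold ("tiered") pricing: once the prompt crosses the token count encoded in a key such as `input_cost_per_token_above_200k_tokens`, that rate is applied to all token types for the request. A minimal sketch of the idea with made-up prices (none of these numbers come from a real model entry):

# Hypothetical model_info entry; key names mirror the ones in the diff,
# the prices are illustrative only.
model_info = {
    "input_cost_per_token": 1.25e-06,
    "output_cost_per_token": 5.0e-06,
    "input_cost_per_token_above_200k_tokens": 2.5e-06,
    "output_cost_per_token_above_200k_tokens": 1.0e-05,
}

prompt_tokens = 250_000  # above the 200k threshold
threshold = 200_000

# Once the threshold is crossed, every prompt token is billed at the above-threshold rate.
input_rate = (
    model_info["input_cost_per_token_above_200k_tokens"]
    if prompt_tokens > threshold
    else model_info["input_cost_per_token"]
)
print(prompt_tokens * input_rate)  # 0.625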
@@ -140,27 +150,44 @@ def _get_token_base_cost(model_info: ModelInfo, usage: Usage) -> Tuple[float, fl
                 )
                 if usage.prompt_tokens > threshold:

-                    prompt_base_cost = cast(float, _get_cost_per_unit(model_info, key, prompt_base_cost))
-                    completion_base_cost = cast(float, _get_cost_per_unit(
-                        model_info,
-                        f"output_cost_per_token_above_{threshold_str}_tokens",
-                        completion_base_cost,
-                    ))
-
+                    prompt_base_cost = cast(
+                        float, _get_cost_per_unit(model_info, key, prompt_base_cost)
+                    )
+                    completion_base_cost = cast(
+                        float,
+                        _get_cost_per_unit(
+                            model_info,
+                            f"output_cost_per_token_above_{threshold_str}_tokens",
+                            completion_base_cost,
+                        ),
+                    )
+
                     # Apply tiered pricing to cache costs
-                    cache_creation_tiered_key = f"cache_creation_input_token_cost_above_{threshold_str}_tokens"
-                    cache_read_tiered_key = f"cache_read_input_token_cost_above_{threshold_str}_tokens"
-
+                    cache_creation_tiered_key = (
+                        f"cache_creation_input_token_cost_above_{threshold_str}_tokens"
+                    )
+                    cache_read_tiered_key = (
+                        f"cache_read_input_token_cost_above_{threshold_str}_tokens"
+                    )
+
                     if cache_creation_tiered_key in model_info:
-                        cache_creation_cost = cast(float, _get_cost_per_unit(
-                            model_info, cache_creation_tiered_key, cache_creation_cost
-                        ))
-
+                        cache_creation_cost = cast(
+                            float,
+                            _get_cost_per_unit(
+                                model_info,
+                                cache_creation_tiered_key,
+                                cache_creation_cost,
+                            ),
+                        )
+
                     if cache_read_tiered_key in model_info:
-                        cache_read_cost = cast(float, _get_cost_per_unit(
-                            model_info, cache_read_tiered_key, cache_read_cost
-                        ))
-
+                        cache_read_cost = cast(
+                            float,
+                            _get_cost_per_unit(
+                                model_info, cache_read_tiered_key, cache_read_cost
+                            ),
+                        )
+
                 break
             except (IndexError, ValueError):
                 continue
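For reference, the `threshold_str` that drives the tiered keys above comes from the pricing key name itself. A small sketch of that parsing, mirroring the `_above_[x]k_tokens` / `_above_[x]_tokens` naming described in the docstring rather than quoting the PR's exact code:

from typing import Tuple

# Derive a numeric threshold and the matching output/cache key names from an
# input-cost key. Illustrative helper, not litellm's implementation.
def parse_threshold_key(key: str) -> Tuple[float, str]:
    threshold_str = key.split("above_")[1].split("_tokens")[0]  # e.g. "200k" or "4096"
    threshold = float(threshold_str.replace("k", "")) * (
        1000 if "k" in threshold_str else 1
    )
    return threshold, threshold_str


threshold, threshold_str = parse_threshold_key("input_cost_per_token_above_200k_tokens")
print(threshold)  # 200000.0
print(f"output_cost_per_token_above_{threshold_str}_tokens")
print(f"cache_creation_input_token_cost_above_{threshold_str}_tokens")
print(f"cache_read_input_token_cost_above_{threshold_str}_tokens")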
@@ -195,7 +222,9 @@ def calculate_cost_component(
     return 0.0


-def _get_cost_per_unit(model_info: ModelInfo, cost_key: str, default_value: Optional[float] = 0.0) -> Optional[float]:
+def _get_cost_per_unit(
+    model_info: ModelInfo, cost_key: str, default_value: Optional[float] = 0.0
+) -> Optional[float]:
     # Sometimes the cost per unit is a string (e.g.: If a value like "3e-7" was read from the config.yaml)
     cost_per_unit = model_info.get(cost_key)
     if isinstance(cost_per_unit, float):
@@ -210,7 +239,6 @@ def _get_cost_per_unit(model_info: ModelInfo, cost_key: str, default_value: Opti
                 f"litellm.litellm_core_utils.llm_cost_calc.utils.py::calculate_cost_per_component(): Exception occured - {cost_per_unit}\nDefaulting to 0.0"
             )
     return default_value
-


 def generic_cost_per_token(
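The `_get_cost_per_unit` helper centralizes these lookups so that string values read from config.yaml (e.g. "3e-7") still work. Its full body is not visible in this diff; the sketch below is a standalone approximation of the behavior implied by the visible branches and comment, not the PR's exact code:

from typing import Optional

# Floats pass through, numeric strings are parsed, anything missing or
# unparseable falls back to the supplied default (the real helper also logs).
def get_cost_per_unit_sketch(
    model_info: dict, cost_key: str, default_value: Optional[float] = 0.0
) -> Optional[float]:
    cost_per_unit = model_info.get(cost_key)
    if isinstance(cost_per_unit, float):
        return cost_per_unit
    if isinstance(cost_per_unit, str):
        try:
            return float(cost_per_unit)
        except ValueError:
            pass  # fall back to the default below
    return default_value


print(get_cost_per_unit_sketch({"input_cost_per_token": "3e-7"}, "input_cost_per_token"))  # 3e-07
print(get_cost_per_unit_sketch({}, "missing_key", None))  # None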
@@ -238,6 +266,7 @@ def generic_cost_per_token(
     ### PROCESSING COST
     text_tokens = usage.prompt_tokens
     cache_hit_tokens = 0
+    cache_creation_tokens = 0
     audio_tokens = 0
     character_count = 0
     image_count = 0
@@ -249,6 +278,13 @@
             )
             or 0
         )
+        cache_creation_tokens = (
+            cast(
+                Optional[int],
+                getattr(usage.prompt_tokens_details, "cache_creation_tokens", 0),
+            )
+            or 0
+        )
         text_tokens = (
             cast(
                 Optional[int], getattr(usage.prompt_tokens_details, "text_tokens", None)
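The added block reads `cache_creation_tokens` from `usage.prompt_tokens_details` via `getattr`, defaulting to 0 when the attribute is absent. A quick illustration with a stand-in object rather than litellm's real Usage / PromptTokensDetails types:

from types import SimpleNamespace

# Stand-in for usage.prompt_tokens_details; field values are made up.
prompt_tokens_details = SimpleNamespace(cached_tokens=40, cache_creation_tokens=10)

cache_hit_tokens = getattr(prompt_tokens_details, "cached_tokens", None) or 0
cache_creation_tokens = getattr(prompt_tokens_details, "cache_creation_tokens", 0) or 0
print(cache_hit_tokens, cache_creation_tokens)  # 40 10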
@@ -279,11 +315,17 @@ def generic_cost_per_token(
         )

     ## EDGE CASE - text tokens not set inside PromptTokensDetails
+
     if text_tokens == 0:
-        text_tokens = usage.prompt_tokens - cache_hit_tokens - audio_tokens
+        text_tokens = (
+            usage.prompt_tokens
+            - cache_hit_tokens
+            - audio_tokens
+            - cache_creation_tokens
+        )

-    prompt_base_cost, completion_base_cost, cache_creation_cost, cache_read_cost = _get_token_base_cost(
-        model_info=model_info, usage=usage
+    prompt_base_cost, completion_base_cost, cache_creation_cost, cache_read_cost = (
+        _get_token_base_cost(model_info=model_info, usage=usage)
     )

     prompt_cost = float(text_tokens) * prompt_base_cost
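With the updated fallback, cache-creation tokens are subtracted from the text-token count, so they are no longer also billed at the base input rate once the cache-write charge is added further down. A worked example with illustrative numbers:

# Edge-case fallback when text_tokens is not set in PromptTokensDetails.
prompt_tokens = 100
cache_hit_tokens = 40
audio_tokens = 0
cache_creation_tokens = 10

text_tokens = prompt_tokens - cache_hit_tokens - audio_tokens - cache_creation_tokens
print(text_tokens)  # 50 (the previous formula would have given 60)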
@@ -297,7 +339,7 @@
     )

     ### CACHE WRITING COST - Now uses tiered pricing
-    prompt_cost += float(usage._cache_creation_input_tokens or 0) * cache_creation_cost
+    prompt_cost += float(cache_creation_tokens) * cache_creation_cost

     ### CHARACTER COST

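The cache-write charge now uses the `cache_creation_tokens` value extracted from `prompt_tokens_details` together with the (possibly tiered) `cache_creation_cost` returned by `_get_token_base_cost`, instead of `usage._cache_creation_input_tokens`. A tiny numeric illustration with a hypothetical per-token cache-write price:

cache_creation_tokens = 10
cache_creation_cost = 3.75e-06  # hypothetical cache-write rate

prompt_cost = 0.0
prompt_cost += float(cache_creation_tokens) * cache_creation_cost
print(prompt_cost)  # 3.75e-05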
@@ -350,8 +392,12 @@ def generic_cost_per_token(
     ## TEXT COST
     completion_cost = float(text_tokens) * completion_base_cost

-    _output_cost_per_audio_token = _get_cost_per_unit(model_info, "output_cost_per_audio_token", None)
-    _output_cost_per_reasoning_token = _get_cost_per_unit(model_info, "output_cost_per_reasoning_token", None)
+    _output_cost_per_audio_token = _get_cost_per_unit(
+        model_info, "output_cost_per_audio_token", None
+    )
+    _output_cost_per_reasoning_token = _get_cost_per_unit(
+        model_info, "output_cost_per_reasoning_token", None
+    )

     ## AUDIO COST
     if not is_text_tokens_total and audio_tokens is not None and audio_tokens > 0:
@@ -397,7 +443,7 @@ def _call_type_has_image_response(call_type: str) -> bool:
         ]:
             return True
         return False
-
+
    @staticmethod
    def route_image_generation_cost_calculator(
        model: str,