
Commit 895c41e

Merge pull request #14619 from BerriAI/litellm_dev_09_16_2025_p1
UI - allow team member to view service account keys they create + Anthropic - include cache creation tokens in prompt token total (separate out during cost tracking)
2 parents: 69c0148 + 08ba38a

File tree: 12 files changed, +510 -213 lines

litellm/litellm_core_utils/llm_cost_calc/utils.py

Lines changed: 79 additions & 33 deletions
@@ -113,20 +113,30 @@ def _generic_cost_per_character(
     return prompt_cost, completion_cost
 
 
-def _get_token_base_cost(model_info: ModelInfo, usage: Usage) -> Tuple[float, float, float, float]:
+def _get_token_base_cost(
+    model_info: ModelInfo, usage: Usage
+) -> Tuple[float, float, float, float]:
     """
     Return prompt cost, completion cost, and cache costs for a given model and usage.
 
     If input_tokens > threshold and `input_cost_per_token_above_[x]k_tokens` or `input_cost_per_token_above_[x]_tokens` is set,
     then we use the corresponding threshold cost for all token types.
-
+
     Returns:
         Tuple[float, float, float, float] - (prompt_cost, completion_cost, cache_creation_cost, cache_read_cost)
     """
-    prompt_base_cost = cast(float, _get_cost_per_unit(model_info, "input_cost_per_token"))
-    completion_base_cost = cast(float, _get_cost_per_unit(model_info, "output_cost_per_token"))
-    cache_creation_cost = cast(float, _get_cost_per_unit(model_info, "cache_creation_input_token_cost"))
-    cache_read_cost = cast(float, _get_cost_per_unit(model_info, "cache_read_input_token_cost"))
+    prompt_base_cost = cast(
+        float, _get_cost_per_unit(model_info, "input_cost_per_token")
+    )
+    completion_base_cost = cast(
+        float, _get_cost_per_unit(model_info, "output_cost_per_token")
+    )
+    cache_creation_cost = cast(
+        float, _get_cost_per_unit(model_info, "cache_creation_input_token_cost")
+    )
+    cache_read_cost = cast(
+        float, _get_cost_per_unit(model_info, "cache_read_input_token_cost")
+    )
 
     ## CHECK IF ABOVE THRESHOLD
     threshold: Optional[float] = None
@@ -140,27 +150,44 @@ def _get_token_base_cost(model_info: ModelInfo, usage: Usage) -> Tuple[float, fl
                 )
                 if usage.prompt_tokens > threshold:
 
-                    prompt_base_cost = cast(float, _get_cost_per_unit(model_info, key, prompt_base_cost))
-                    completion_base_cost = cast(float, _get_cost_per_unit(
-                        model_info,
-                        f"output_cost_per_token_above_{threshold_str}_tokens",
-                        completion_base_cost,
-                    ))
-
+                    prompt_base_cost = cast(
+                        float, _get_cost_per_unit(model_info, key, prompt_base_cost)
+                    )
+                    completion_base_cost = cast(
+                        float,
+                        _get_cost_per_unit(
+                            model_info,
+                            f"output_cost_per_token_above_{threshold_str}_tokens",
+                            completion_base_cost,
+                        ),
+                    )
+
                     # Apply tiered pricing to cache costs
-                    cache_creation_tiered_key = f"cache_creation_input_token_cost_above_{threshold_str}_tokens"
-                    cache_read_tiered_key = f"cache_read_input_token_cost_above_{threshold_str}_tokens"
-
+                    cache_creation_tiered_key = (
+                        f"cache_creation_input_token_cost_above_{threshold_str}_tokens"
+                    )
+                    cache_read_tiered_key = (
+                        f"cache_read_input_token_cost_above_{threshold_str}_tokens"
+                    )
+
                     if cache_creation_tiered_key in model_info:
-                        cache_creation_cost = cast(float, _get_cost_per_unit(
-                            model_info, cache_creation_tiered_key, cache_creation_cost
-                        ))
-
+                        cache_creation_cost = cast(
+                            float,
+                            _get_cost_per_unit(
+                                model_info,
+                                cache_creation_tiered_key,
+                                cache_creation_cost,
+                            ),
+                        )
+
                     if cache_read_tiered_key in model_info:
-                        cache_read_cost = cast(float, _get_cost_per_unit(
-                            model_info, cache_read_tiered_key, cache_read_cost
-                        ))
-
+                        cache_read_cost = cast(
+                            float,
+                            _get_cost_per_unit(
+                                model_info, cache_read_tiered_key, cache_read_cost
+                            ),
+                        )
+
                     break
         except (IndexError, ValueError):
             continue
@@ -195,7 +222,9 @@ def calculate_cost_component(
     return 0.0
 
 
-def _get_cost_per_unit(model_info: ModelInfo, cost_key: str, default_value: Optional[float] = 0.0) -> Optional[float]:
+def _get_cost_per_unit(
+    model_info: ModelInfo, cost_key: str, default_value: Optional[float] = 0.0
+) -> Optional[float]:
     # Sometimes the cost per unit is a string (e.g.: If a value like "3e-7" was read from the config.yaml)
     cost_per_unit = model_info.get(cost_key)
     if isinstance(cost_per_unit, float):
@@ -210,7 +239,6 @@ def _get_cost_per_unit(model_info: ModelInfo, cost_key: str, default_value: Opti
         f"litellm.litellm_core_utils.llm_cost_calc.utils.py::calculate_cost_per_component(): Exception occured - {cost_per_unit}\nDefaulting to 0.0"
     )
     return default_value
-
 
 
 def generic_cost_per_token(
@@ -238,6 +266,7 @@ def generic_cost_per_token(
     ### PROCESSING COST
     text_tokens = usage.prompt_tokens
     cache_hit_tokens = 0
+    cache_creation_tokens = 0
     audio_tokens = 0
     character_count = 0
     image_count = 0
@@ -249,6 +278,13 @@
             )
             or 0
         )
+        cache_creation_tokens = (
+            cast(
+                Optional[int],
+                getattr(usage.prompt_tokens_details, "cache_creation_tokens", 0),
+            )
+            or 0
+        )
         text_tokens = (
             cast(
                 Optional[int], getattr(usage.prompt_tokens_details, "text_tokens", None)
@@ -279,11 +315,17 @@
     )
 
     ## EDGE CASE - text tokens not set inside PromptTokensDetails
+
     if text_tokens == 0:
-        text_tokens = usage.prompt_tokens - cache_hit_tokens - audio_tokens
+        text_tokens = (
+            usage.prompt_tokens
+            - cache_hit_tokens
+            - audio_tokens
+            - cache_creation_tokens
+        )
 
-    prompt_base_cost, completion_base_cost, cache_creation_cost, cache_read_cost = _get_token_base_cost(
-        model_info=model_info, usage=usage
+    prompt_base_cost, completion_base_cost, cache_creation_cost, cache_read_cost = (
+        _get_token_base_cost(model_info=model_info, usage=usage)
     )
 
     prompt_cost = float(text_tokens) * prompt_base_cost
@@ -297,7 +339,7 @@
     )
 
     ### CACHE WRITING COST - Now uses tiered pricing
-    prompt_cost += float(usage._cache_creation_input_tokens or 0) * cache_creation_cost
+    prompt_cost += float(cache_creation_tokens) * cache_creation_cost
 
     ### CHARACTER COST
 
@@ -350,8 +392,12 @@
     ## TEXT COST
     completion_cost = float(text_tokens) * completion_base_cost
 
-    _output_cost_per_audio_token = _get_cost_per_unit(model_info, "output_cost_per_audio_token", None)
-    _output_cost_per_reasoning_token = _get_cost_per_unit(model_info, "output_cost_per_reasoning_token", None)
+    _output_cost_per_audio_token = _get_cost_per_unit(
+        model_info, "output_cost_per_audio_token", None
+    )
+    _output_cost_per_reasoning_token = _get_cost_per_unit(
+        model_info, "output_cost_per_reasoning_token", None
+    )
 
     ## AUDIO COST
     if not is_text_tokens_total and audio_tokens is not None and audio_tokens > 0:
@@ -397,7 +443,7 @@ def _call_type_has_image_response(call_type: str) -> bool:
         ]:
             return True
         return False
-
+
     @staticmethod
     def route_image_generation_cost_calculator(
         model: str,
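
Note: this file also extends the tiered ("above N tokens") pricing lookup to the cache rate keys. A minimal sketch of that selection logic, with made-up model_info entries and a made-up 200k threshold (not taken from litellm's pricing map):

```python
# Sketch of the tiered cache-rate selection (hypothetical model_info entries
# and a hypothetical 200k threshold).
model_info = {
    "cache_creation_input_token_cost": 3.75e-6,
    "cache_read_input_token_cost": 3e-7,
    "cache_creation_input_token_cost_above_200k_tokens": 7.5e-6,
    "cache_read_input_token_cost_above_200k_tokens": 6e-7,
}
threshold = 200_000
prompt_tokens = 250_000

# Base rates apply below the threshold...
cache_creation_cost = model_info["cache_creation_input_token_cost"]
cache_read_cost = model_info["cache_read_input_token_cost"]

# ...and the "*_above_200k_tokens" rates replace them once the prompt crosses it.
if prompt_tokens > threshold:
    cache_creation_cost = model_info.get(
        "cache_creation_input_token_cost_above_200k_tokens", cache_creation_cost
    )
    cache_read_cost = model_info.get(
        "cache_read_input_token_cost_above_200k_tokens", cache_read_cost
    )

print(cache_creation_cost, cache_read_cost)  # 7.5e-06 6e-07
```

If a tiered key is missing, the defaults passed to `_get_cost_per_unit` are kept, mirroring the fallback behavior in the diff above.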

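The net effect on prompt cost: cache-creation tokens now arrive inside `prompt_tokens` (see the Anthropic change below), get subtracted back out of the text-token count, and are billed at their own rate. A standalone arithmetic sketch with hypothetical rates and token counts:

```python
# Standalone sketch of the revised prompt-cost arithmetic.
# Rates and token counts are hypothetical, not from litellm's pricing map.
input_cost_per_token = 3e-6
cache_creation_input_token_cost = 3.75e-6
cache_read_input_token_cost = 3e-7

prompt_tokens = 1_000        # total, now including cache-creation tokens
cache_read_tokens = 200      # "cache hit" tokens
cache_creation_tokens = 300  # cache-write tokens

# Mirrors generic_cost_per_token(): cache tokens are carved out of the text
# total, then billed at their own rates.
text_tokens = prompt_tokens - cache_read_tokens - cache_creation_tokens

prompt_cost = (
    text_tokens * input_cost_per_token
    + cache_read_tokens * cache_read_input_token_cost
    + cache_creation_tokens * cache_creation_input_token_cost
)
print(f"prompt cost: ${prompt_cost:.6f}")  # prompt cost: $0.002685
```
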
litellm/llms/anthropic/chat/transformation.py

Lines changed: 1 addition & 0 deletions
@@ -826,6 +826,7 @@ def calculate_usage(
             and _usage["cache_creation_input_tokens"] is not None
         ):
             cache_creation_input_tokens = _usage["cache_creation_input_tokens"]
+            prompt_tokens += cache_creation_input_tokens
         if (
             "cache_read_input_tokens" in _usage
             and _usage["cache_read_input_tokens"] is not None

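With this one-line change, Anthropic's `cache_creation_input_tokens` are folded into the reported `prompt_tokens` and separated out again during cost tracking. A rough sketch of the mapping with made-up numbers (the helper below is illustrative, not litellm's `calculate_usage`):

```python
# Illustrative mapping of an Anthropic usage block; only the change shown in
# this diff is modeled (cache-creation tokens added to the prompt total).
# This is a sketch, not litellm's calculate_usage().
from typing import Dict


def map_anthropic_usage(_usage: Dict[str, int]) -> Dict[str, int]:
    prompt_tokens = _usage.get("input_tokens", 0)
    cache_creation_input_tokens = _usage.get("cache_creation_input_tokens") or 0
    cache_read_input_tokens = _usage.get("cache_read_input_tokens") or 0
    prompt_tokens += cache_creation_input_tokens  # new behavior in this commit
    return {
        "prompt_tokens": prompt_tokens,
        "cache_creation_input_tokens": cache_creation_input_tokens,
        "cache_read_input_tokens": cache_read_input_tokens,
    }


print(
    map_anthropic_usage(
        {"input_tokens": 100, "cache_creation_input_tokens": 40, "cache_read_input_tokens": 25}
    )
)
# {'prompt_tokens': 140, 'cache_creation_input_tokens': 40, 'cache_read_input_tokens': 25}
```
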
litellm/proxy/management_endpoints/key_management_endpoints.py

Lines changed: 9 additions & 0 deletions
@@ -2462,6 +2462,9 @@ async def list_keys(
     include_team_keys: bool = Query(
         False, description="Include all keys for teams that user is an admin of."
     ),
+    include_created_by_keys: bool = Query(
+        False, description="Include keys created by the user"
+    ),
     sort_by: Optional[str] = Query(
         default=None,
         description="Column to sort by (e.g. 'user_id', 'created_at', 'spend')",
@@ -2524,6 +2527,7 @@
             return_full_object=return_full_object,
             organization_id=organization_id,
             admin_team_ids=admin_team_ids,
+            include_created_by_keys=include_created_by_keys,
             sort_by=sort_by,
             sort_order=sort_order,
         )
@@ -2601,6 +2605,7 @@ async def _list_key_helper(
     admin_team_ids: Optional[
         List[str]
     ] = None,  # New parameter for teams where user is admin
+    include_created_by_keys: bool = False,
     sort_by: Optional[str] = None,
     sort_order: str = "desc",
 ) -> KeyListResponseObject:
@@ -2650,6 +2655,10 @@
         if user_condition:
             or_conditions.append(user_condition)
 
+    # Add condition for created by keys if provided
+    if include_created_by_keys and user_id:
+        or_conditions.append({"created_by": user_id})
+
     # Add condition for admin team keys if provided
     if admin_team_ids:
         or_conditions.append({"team_id": {"in": admin_team_ids}})

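Callers opt in via the new query parameter. A hedged example, assuming a local proxy at http://localhost:4000 that serves `list_keys` at GET `/key/list` and a placeholder bearer token:

```python
# Hedged example: list keys, including keys where created_by == caller.
# Assumes the proxy runs at http://localhost:4000 and serves list_keys at
# GET /key/list; the bearer token below is a placeholder.
import requests

resp = requests.get(
    "http://localhost:4000/key/list",
    headers={"Authorization": "Bearer sk-1234"},
    params={
        "include_created_by_keys": "true",  # new flag from this commit
        "include_team_keys": "false",
    },
    timeout=30,
)
resp.raise_for_status()
print(resp.json())
```
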
litellm/types/utils.py

Lines changed: 18 additions & 0 deletions
@@ -883,6 +883,9 @@ class PromptTokensDetailsWrapper(
     video_length_seconds: Optional[float] = None
     """Length of videos sent to the model. Used for Vertex AI multimodal embeddings."""
 
+    cache_creation_tokens: Optional[int] = None
+    """Number of cache creation tokens sent to the model. Used for Anthropic prompt caching."""
+
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         if self.character_count is None:
@@ -893,6 +896,8 @@ def __init__(self, *args, **kwargs):
             del self.video_length_seconds
         if self.web_search_requests is None:
             del self.web_search_requests
+        if self.cache_creation_tokens is None:
+            del self.cache_creation_tokens
 
 
 class ServerToolUse(BaseModel):
@@ -954,6 +959,7 @@ def __init__(
         # handle prompt_tokens_details
         _prompt_tokens_details: Optional[PromptTokensDetailsWrapper] = None
 
+        # guarantee prompt_token_details is always a PromptTokensDetailsWrapper
         if prompt_tokens_details:
             if isinstance(prompt_tokens_details, dict):
                 _prompt_tokens_details = PromptTokensDetailsWrapper(
@@ -988,6 +994,18 @@ def __init__(
             else:
                 _prompt_tokens_details.cached_tokens = params["cache_read_input_tokens"]
 
+        if "cache_creation_input_tokens" in params and isinstance(
+            params["cache_creation_input_tokens"], int
+        ):
+            if _prompt_tokens_details is None:
+                _prompt_tokens_details = PromptTokensDetailsWrapper(
+                    cache_creation_tokens=params["cache_creation_input_tokens"]
+                )
+            else:
+                _prompt_tokens_details.cache_creation_tokens = params[
+                    "cache_creation_input_tokens"
+                ]
+
         super().__init__(
             prompt_tokens=prompt_tokens or 0,
             completion_tokens=completion_tokens or 0,

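A short sketch of the resulting `Usage` behavior, assuming this commit's constructor (token counts are made up; the expected attribute values follow from the branches shown above):

```python
# Sketch: Usage now mirrors cache_creation_input_tokens into
# prompt_tokens_details.cache_creation_tokens (values are made up).
from litellm.types.utils import Usage

usage = Usage(
    prompt_tokens=140,  # total, already including the 40 cache-creation tokens
    completion_tokens=20,
    total_tokens=160,
    cache_creation_input_tokens=40,
    cache_read_input_tokens=25,
)

print(usage.prompt_tokens_details.cache_creation_tokens)  # expected: 40
print(usage.prompt_tokens_details.cached_tokens)          # expected: 25
```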