Commit e488312

fix(utils.py): log cache_creation_tokens in prompt token details
Closes LIT-907
1 parent 1162c52 commit e488312

5 files changed: +64 -7 lines changed

litellm/litellm_core_utils/llm_cost_calc/utils.py
Lines changed: 9 additions & 3 deletions

@@ -278,6 +278,13 @@ def generic_cost_per_token(
         )
         or 0
     )
+    cache_creation_tokens = (
+        cast(
+            Optional[int],
+            getattr(usage.prompt_tokens_details, "cache_creation_tokens", 0),
+        )
+        or 0
+    )
     text_tokens = (
         cast(
             Optional[int], getattr(usage.prompt_tokens_details, "text_tokens", None)
@@ -307,9 +314,8 @@ def generic_cost_per_token(
         or 0
     )

-    if getattr(usage, "_cache_creation_input_tokens", 0) is not None:
-        cache_creation_tokens = usage._cache_creation_input_tokens
     ## EDGE CASE - text tokens not set inside PromptTokensDetails
+
     if text_tokens == 0:
         text_tokens = (
             usage.prompt_tokens
@@ -333,7 +339,7 @@ def generic_cost_per_token(
     )

     ### CACHE WRITING COST - Now uses tiered pricing
-    prompt_cost += float(usage._cache_creation_input_tokens or 0) * cache_creation_cost
+    prompt_cost += float(cache_creation_tokens) * cache_creation_cost

     ### CHARACTER COST

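Editorial note (not part of the commit): the new lookup relies on getattr returning its default when usage.prompt_tokens_details is None, and on the trailing "or 0" to normalize a None field, which is why the old read of the private usage._cache_creation_input_tokens attribute can be dropped. A minimal, self-contained sketch of that pattern, using hypothetical stand-in classes rather than litellm's real Usage/PromptTokensDetailsWrapper types:

# Sketch only: FakeDetails/FakeUsage are hypothetical stand-ins, not litellm types.
from typing import Optional, cast


class FakeDetails:
    def __init__(self, cache_creation_tokens: Optional[int]) -> None:
        self.cache_creation_tokens = cache_creation_tokens


class FakeUsage:
    def __init__(self, details: Optional[FakeDetails]) -> None:
        self.prompt_tokens_details = details


def read_cache_creation_tokens(usage: FakeUsage) -> int:
    # Same shape as the lookup in the diff above: getattr returns the default (0)
    # when prompt_tokens_details is None, and "or 0" catches a None field value.
    return (
        cast(
            Optional[int],
            getattr(usage.prompt_tokens_details, "cache_creation_tokens", 0),
        )
        or 0
    )


print(read_cache_creation_tokens(FakeUsage(FakeDetails(2000))))  # -> 2000
print(read_cache_creation_tokens(FakeUsage(None)))               # -> 0
print(read_cache_creation_tokens(FakeUsage(FakeDetails(None))))  # -> 0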

litellm/types/utils.py
Lines changed: 21 additions & 1 deletion

@@ -162,7 +162,9 @@ class ModelInfoBase(ProviderSpecificModelInfo, total=False):
         SearchContextCostPerQuery
     ] # Cost for using web search tool
     citation_cost_per_token: Optional[float] # Cost per citation token for Perplexity
-    tiered_pricing: Optional[List[Dict[str, Any]]] # Tiered pricing structure for models like Dashscope
+    tiered_pricing: Optional[
+        List[Dict[str, Any]]
+    ] # Tiered pricing structure for models like Dashscope
     litellm_provider: Required[str]
     mode: Required[
         Literal[
@@ -880,6 +882,9 @@ class PromptTokensDetailsWrapper(
     video_length_seconds: Optional[float] = None
     """Length of videos sent to the model. Used for Vertex AI multimodal embeddings."""

+    cache_creation_tokens: Optional[int] = None
+    """Number of cache creation tokens sent to the model. Used for Anthropic prompt caching."""
+
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         if self.character_count is None:
@@ -890,6 +895,8 @@ def __init__(self, *args, **kwargs):
             del self.video_length_seconds
         if self.web_search_requests is None:
             del self.web_search_requests
+        if self.cache_creation_tokens is None:
+            del self.cache_creation_tokens


 class ServerToolUse(BaseModel):
@@ -951,6 +958,7 @@ def __init__(
         # handle prompt_tokens_details
         _prompt_tokens_details: Optional[PromptTokensDetailsWrapper] = None

+        # guarantee prompt_token_details is always a PromptTokensDetailsWrapper
         if prompt_tokens_details:
             if isinstance(prompt_tokens_details, dict):
                 _prompt_tokens_details = PromptTokensDetailsWrapper(
@@ -985,6 +993,18 @@ def __init__(
             else:
                 _prompt_tokens_details.cached_tokens = params["cache_read_input_tokens"]

+        if "cache_creation_input_tokens" in params and isinstance(
+            params["cache_creation_input_tokens"], int
+        ):
+            if _prompt_tokens_details is None:
+                _prompt_tokens_details = PromptTokensDetailsWrapper(
+                    cache_creation_tokens=params["cache_creation_input_tokens"]
+                )
+            else:
+                _prompt_tokens_details.cache_creation_tokens = params[
+                    "cache_creation_input_tokens"
+                ]
+
         super().__init__(
             prompt_tokens=prompt_tokens or 0,
             completion_tokens=completion_tokens or 0,

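Editorial note: taken together with the utils.py change, this means an Anthropic-style cache_creation_input_tokens keyword passed to Usage should now be mirrored onto prompt_tokens_details.cache_creation_tokens, which is what the cost calculator reads. A hedged usage sketch, assuming a litellm build that includes this commit (import path taken from the diff, not verified independently):

# Assumes litellm at (or after) this commit; otherwise the field will not exist.
from litellm.types.utils import Usage

usage = Usage(
    prompt_tokens=28436,
    completion_tokens=90,
    total_tokens=28526,
    cache_creation_input_tokens=2000,  # Anthropic-style extra param
)

# The new __init__ branch copies the value into prompt_tokens_details, so the
# cost code no longer has to reach for the private _cache_creation_input_tokens.
print(usage.prompt_tokens_details.cache_creation_tokens)  # expected: 2000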

tests/llm_translation/base_llm_unit_tests.py
Lines changed: 3 additions & 2 deletions

@@ -954,6 +954,7 @@ def test_image_url_string(self):

     @pytest.mark.flaky(retries=4, delay=1)
     def test_prompt_caching(self):
+        print("test_prompt_caching")
         litellm.set_verbose = True
         from litellm.utils import supports_prompt_caching

@@ -1049,8 +1050,8 @@ def test_prompt_caching(self):
             assert (
                 response.usage.prompt_tokens_details.cached_tokens > 0
             ), f"cached_tokens={response.usage.prompt_tokens_details.cached_tokens} should be greater than 0. Got usage={response.usage}"
-        except litellm.InternalServerError:
-            pass
+        except litellm.InternalServerError as e:
+            print("InternalServerError", e)

     @pytest.fixture
     def pdf_messages(self):

tests/local_testing/test_anthropic_prompt_caching.py
Lines changed: 2 additions & 1 deletion

@@ -250,7 +250,7 @@ async def test_anthropic_api_prompt_caching_basic():
                     "type": "text",
                     "text": "Here is the full text of a complex legal agreement"
                     * 400,
-                    "cache_control": {"type": "ephemeral"},
+                    : {"type": "ephemeral"},
                 }
             ],
         },
@@ -510,6 +510,7 @@ async def test_anthropic_api_prompt_caching_streaming():
         if hasattr(chunk, "usage") and hasattr(
             chunk.usage, "cache_creation_input_tokens"
         ):
+            print("chunk.usage", chunk.usage)
             is_cache_creation_input_tokens_in_usage = True

         idx += 1

tests/test_litellm/litellm_core_utils/llm_cost_calc/test_llm_cost_calc_utils.py
Lines changed: 29 additions & 0 deletions

@@ -174,6 +174,35 @@ def test_generic_cost_per_token_anthropic_prompt_caching():
     assert prompt_cost < 0.085


+def test_generic_cost_per_token_anthropic_prompt_caching_with_cache_creation():
+    model = "claude-3-5-haiku-20241022"
+    usage = Usage(
+        completion_tokens=90,
+        prompt_tokens=28436,
+        total_tokens=28526,
+        completion_tokens_details=CompletionTokensDetailsWrapper(
+            accepted_prediction_tokens=None,
+            audio_tokens=None,
+            reasoning_tokens=0,
+            rejected_prediction_tokens=None,
+            text_tokens=None,
+        ),
+        prompt_tokens_details=None,
+        cache_creation_input_tokens=2000,
+    )
+
+    custom_llm_provider = "anthropic"
+
+    prompt_cost, completion_cost = generic_cost_per_token(
+        model=model,
+        usage=usage,
+        custom_llm_provider=custom_llm_provider,
+    )
+
+    print(f"prompt_cost: {prompt_cost}")
+    assert round(prompt_cost, 3) == 0.023
+
+
 def test_string_cost_values():
     """Test that cost values defined as strings are properly converted to floats."""
     from unittest.mock import patch

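Editorial note on the asserted value: 0.023 is consistent with cache-creation tokens being priced separately from ordinary input tokens. A rough back-of-the-envelope check, assuming (not verified against litellm's model cost map) claude-3-5-haiku input at $0.80 per million tokens, cache writes at $1.00 per million, and the 2,000 cache-creation tokens excluded from the plain text-token count:

# Rough check only; the per-token prices below are assumptions, and the real
# numbers come from litellm's model cost map.
input_cost_per_token = 0.8e-6           # assumed $0.80 per 1M input tokens
cache_creation_cost_per_token = 1.0e-6  # assumed $1.00 per 1M cache-write tokens

prompt_tokens = 28436
cache_creation_tokens = 2000
text_tokens = prompt_tokens - cache_creation_tokens  # assumes no double counting

prompt_cost = (
    text_tokens * input_cost_per_token
    + cache_creation_tokens * cache_creation_cost_per_token
)
print(round(prompt_cost, 3))  # 0.023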
