Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "agent-eval"
version = "0.1.45"
version = "0.1.46"
description = "Agent evaluation toolkit"
readme = "README.md"
requires-python = ">=3.10"
Expand All @@ -15,7 +15,7 @@ dependencies = [
"inspect-ai>=0.3.104,<0.3.137",
# pin litellm so that we know what model costs we're using
# see the Development.md doc before changing
"litellm>=1.67.4.post1,<=1.75.8",
"litellm>=1.67.4.post1,<=1.82.3",
"pydantic>=2.0.0",
# For leaderboard
"huggingface_hub",
Expand Down
2 changes: 1 addition & 1 deletion src/agenteval/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ def prep_litellm_cost_map():
# This snippet is mostly lifted from
# https://github.com/BerriAI/litellm/blob/b9621c760d3355e06dd17ec89b9eb6776755392e/litellm/litellm_core_utils/get_model_cost_map.py#L16
# See the Development.md before changing.
desired_model_costs_url = "https://raw.githubusercontent.com/BerriAI/litellm/eb66daeef740947c0326826817cf68fb56a8b931/litellm/model_prices_and_context_window_backup.json"
desired_model_costs_url = "https://raw.githubusercontent.com/BerriAI/litellm/9a5c778f1824641fe9f6c8dcc1d096fd9d8ef9f0/litellm/model_prices_and_context_window_backup.json"
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How'd you choose this one? I ended up in the same place when running some other cost calculations. I think we should take whatever the latest is.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the latest release marked "stable".
Screenshot 2026-03-25 at 4 28 44 PM

response = httpx.get(desired_model_costs_url, timeout=5)
response.raise_for_status()
desired_model_costs = response.json()
Expand Down
21 changes: 21 additions & 0 deletions src/agenteval/local_cost.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from litellm.utils import CostPerToken
from pydantic import BaseModel

# even where these exist in https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json
# calling cost_per_token does not return a cost, perhaps due to the associated provider
Expand Down Expand Up @@ -33,3 +34,23 @@
input_cost_per_token=1.8e-07, output_cost_per_token=1.8e-07
),
}


class CostPerTokenWithCache(BaseModel):
    """Per-token pricing for a model whose provider charges a discounted
    rate for prompt tokens served from its prompt cache.

    All rates are per single token (e.g. 6e-07 == $0.60 per 1M tokens,
    assuming USD — confirm against the provider's pricing page).
    """

    # Rate charged for uncached (regular) prompt/input tokens.
    input_cost_per_token: float
    # Rate charged for completion/output tokens.
    output_cost_per_token: float
    # Discounted rate charged for prompt tokens read from the provider's cache.
    cache_read_input_token_cost: float


# Like CUSTOM_PRICING, but for models that also have a cache read discount.
# cost_per_token with usage_object doesn't work for these models in litellm 1.75.8,
# so costs are computed manually in compute_model_cost.
# key represents model name as found in inspect model_usage
CUSTOM_PRICING_WITH_CACHE = {
    # costs from https://platform.moonshot.ai/docs/guide/kimi-k2-5-quickstart
    "moonshotai/kimi-k2.5-0127": CostPerTokenWithCache(
        input_cost_per_token=6e-07,  # uncached prompt tokens ($0.60 / 1M)
        output_cost_per_token=3e-06,  # completion tokens ($3.00 / 1M)
        cache_read_input_token_cost=1e-07,  # cache-hit prompt tokens ($0.10 / 1M)
    ),
}
13 changes: 12 additions & 1 deletion src/agenteval/log.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from litellm.types.utils import PromptTokensDetailsWrapper, Usage
from pydantic import BaseModel

from .local_cost import CUSTOM_PRICING
from .local_cost import CUSTOM_PRICING, CUSTOM_PRICING_WITH_CACHE

logger = getLogger(__name__)

Expand Down Expand Up @@ -113,6 +113,17 @@ def compute_model_cost(model_usages: list[ModelUsageWithName]) -> float | None:
custom_cost_per_token=CUSTOM_PRICING[model_usage.model],
)

elif model_usage.model in CUSTOM_PRICING_WITH_CACHE.keys():

pricing = CUSTOM_PRICING_WITH_CACHE[model_usage.model]
cache_read_tokens = model_usage.usage.input_tokens_cache_read or 0
text_tokens = input_tokens - cache_read_tokens
prompt_cost = (
text_tokens * pricing.input_cost_per_token
+ cache_read_tokens * pricing.cache_read_input_token_cost
)
completion_cost = output_tokens * pricing.output_cost_per_token

else:
total_tokens = model_usage.usage.total_tokens

Expand Down
Loading