Skip to content

Commit eba8758

Browse files
authored
adapt scoring for user-submitted models (#76)
1 parent fe9be85 commit eba8758

File tree

5 files changed

+45
-8
lines changed

5 files changed

+45
-8
lines changed

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "agent-eval"
7-
version = "0.1.45"
7+
version = "0.1.46"
88
description = "Agent evaluation toolkit"
99
readme = "README.md"
1010
requires-python = ">=3.10"
@@ -15,7 +15,7 @@ dependencies = [
1515
"inspect-ai>=0.3.104,<0.3.137",
1616
# pin litellm so that we know what model costs we're using
1717
# see the Development.md doc before changing
18-
"litellm>=1.67.4.post1,<=1.75.8",
18+
"litellm>=1.67.4.post1,<=1.82.3",
1919
"pydantic>=2.0.0",
2020
# For leaderboard
2121
"huggingface_hub",

src/agenteval/cli.py

Lines changed: 7 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -159,7 +159,7 @@ def prep_litellm_cost_map():
159159
# This snippet is mostly lifted from
160160
# https://github.com/BerriAI/litellm/blob/b9621c760d3355e06dd17ec89b9eb6776755392e/litellm/litellm_core_utils/get_model_cost_map.py#L16
161161
# See the Development.md before changing.
162-
desired_model_costs_url = "https://raw.githubusercontent.com/BerriAI/litellm/eb66daeef740947c0326826817cf68fb56a8b931/litellm/model_prices_and_context_window_backup.json"
162+
desired_model_costs_url = "https://raw.githubusercontent.com/BerriAI/litellm/9a5c778f1824641fe9f6c8dcc1d096fd9d8ef9f0/litellm/model_prices_and_context_window_backup.json"
163163
response = httpx.get(desired_model_costs_url, timeout=5)
164164
response.raise_for_status()
165165
desired_model_costs = response.json()
@@ -183,11 +183,11 @@ def prep_litellm_cost_map():
183183
# is incompatible.
184184
click.echo(f"Model costs hash {model_cost_hash}.")
185185

186-
# Between this and the version of the file we pass to register_model()
187-
# I think we can reconstruct the model costs used.
188186
litellm_version = importlib.metadata.version("litellm")
189187
click.echo(f"litellm version: {litellm_version}")
190188

189+
return desired_model_costs_url
190+
191191

192192
@click.group()
193193
def cli():
@@ -207,7 +207,7 @@ def score_command(
207207
):
208208
# so that we know what model costs we're using to score
209209
# more details in the Development.md
210-
prep_litellm_cost_map()
210+
cost_map_url = prep_litellm_cost_map()
211211

212212
hf_url_match = re.match(HF_URL_PATTERN, log_dir)
213213
temp_dir: tempfile.TemporaryDirectory | None = None
@@ -245,7 +245,9 @@ def score_command(
245245
click.echo(f" - {error}")
246246
sys.exit(1)
247247

248-
task_results = TaskResults(results=log_processing_outcome.results)
248+
task_results = TaskResults(
249+
results=log_processing_outcome.results, cost_map_url=cost_map_url
250+
)
249251

250252
# Warn if multiple evaluation specs present
251253
if len(task_results.agent_specs) > 1:

src/agenteval/local_cost.py

Lines changed: 21 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
from litellm.utils import CostPerToken
2+
from pydantic import BaseModel
23

34
# even where these exist in https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json
45
# calling cost_per_token does not return a cost, perhaps due to the associated provider
@@ -33,3 +34,23 @@
3334
input_cost_per_token=1.8e-07, output_cost_per_token=1.8e-07
3435
),
3536
}
37+
38+
39+
class CostPerTokenWithCache(BaseModel):
40+
input_cost_per_token: float
41+
output_cost_per_token: float
42+
cache_read_input_token_cost: float
43+
44+
45+
# Like CUSTOM_PRICING, but for models that also have a cache read discount.
46+
# cost_per_token with usage_object doesn't work for these models in litellm 1.75.8,
47+
# so costs are computed manually in compute_model_cost.
48+
# key represents model name as found in inspect model_usage
49+
CUSTOM_PRICING_WITH_CACHE = {
50+
# costs from https://platform.moonshot.ai/docs/guide/kimi-k2-5-quickstart
51+
"moonshotai/kimi-k2.5-0127": CostPerTokenWithCache(
52+
input_cost_per_token=6e-07,
53+
output_cost_per_token=3e-06,
54+
cache_read_input_token_cost=1e-07,
55+
),
56+
}

src/agenteval/log.py

Lines changed: 12 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,7 @@
1515
from litellm.types.utils import PromptTokensDetailsWrapper, Usage
1616
from pydantic import BaseModel
1717

18-
from .local_cost import CUSTOM_PRICING
18+
from .local_cost import CUSTOM_PRICING, CUSTOM_PRICING_WITH_CACHE
1919

2020
logger = getLogger(__name__)
2121

@@ -113,6 +113,17 @@ def compute_model_cost(model_usages: list[ModelUsageWithName]) -> float | None:
113113
custom_cost_per_token=CUSTOM_PRICING[model_usage.model],
114114
)
115115

116+
elif model_usage.model in CUSTOM_PRICING_WITH_CACHE.keys():
117+
118+
pricing = CUSTOM_PRICING_WITH_CACHE[model_usage.model]
119+
cache_read_tokens = model_usage.usage.input_tokens_cache_read or 0
120+
text_tokens = input_tokens - cache_read_tokens
121+
prompt_cost = (
122+
text_tokens * pricing.input_cost_per_token
123+
+ cache_read_tokens * pricing.cache_read_input_token_cost
124+
)
125+
completion_cost = output_tokens * pricing.output_cost_per_token
126+
116127
else:
117128
total_tokens = model_usage.usage.total_tokens
118129

src/agenteval/models.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -47,6 +47,9 @@ class TaskResults(BaseModel):
4747
"""Scores for all tasks in the suite"""
4848

4949
results: list[TaskResult]
50+
cost_map_url: str | None = None
51+
"""URL of the litellm model pricing JSON used to compute costs.
52+
Points to a specific git commit so the cost basis is exactly reproducible."""
5053

5154
@cached_property
5255
def agent_specs(self) -> set[str]:

0 commit comments

Comments (0)