Skip to content

Commit eba8758

Browse files
authored
adapt scoring for user-submitted models (#76)
1 parent fe9be85 commit eba8758

File tree

5 files changed

+45
-8
lines changed

5 files changed

+45
-8
lines changed

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "agent-eval"
7-
version = "0.1.45"
7+
version = "0.1.46"
88
description = "Agent evaluation toolkit"
99
readme = "README.md"
1010
requires-python = ">=3.10"
@@ -15,7 +15,7 @@ dependencies = [
1515
"inspect-ai>=0.3.104,<0.3.137",
1616
# pin litellm so that we know what model costs we're using
1717
# see the Development.md doc before changing
18-
"litellm>=1.67.4.post1,<=1.75.8",
18+
"litellm>=1.67.4.post1,<=1.82.3",
1919
"pydantic>=2.0.0",
2020
# For leaderboard
2121
"huggingface_hub",

src/agenteval/cli.py

Lines changed: 7 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -159,7 +159,7 @@ def prep_litellm_cost_map():
159159
# This snippet is mostly lifted from
160160
# https://github.com/BerriAI/litellm/blob/b9621c760d3355e06dd17ec89b9eb6776755392e/litellm/litellm_core_utils/get_model_cost_map.py#L16
161161
# See the Development.md before changing.
162-
desired_model_costs_url = "https://raw.githubusercontent.com/BerriAI/litellm/eb66daeef740947c0326826817cf68fb56a8b931/litellm/model_prices_and_context_window_backup.json"
162+
desired_model_costs_url = "https://raw.githubusercontent.com/BerriAI/litellm/9a5c778f1824641fe9f6c8dcc1d096fd9d8ef9f0/litellm/model_prices_and_context_window_backup.json"
163163
response = httpx.get(desired_model_costs_url, timeout=5)
164164
response.raise_for_status()
165165
desired_model_costs = response.json()
@@ -183,11 +183,11 @@ def prep_litellm_cost_map():
183183
# is incompatible.
184184
click.echo(f"Model costs hash {model_cost_hash}.")
185185

186-
# Between this and the version of the file we pass to register_model()
187-
# I think we can reconstruct the model costs used.
188186
litellm_version = importlib.metadata.version("litellm")
189187
click.echo(f"litellm version: {litellm_version}")
190188

189+
return desired_model_costs_url
190+
191191

192192
@click.group()
193193
def cli():
@@ -207,7 +207,7 @@ def score_command(
207207
):
208208
# so that we know what model costs we're using to score
209209
# more details in the Development.md
210-
prep_litellm_cost_map()
210+
cost_map_url = prep_litellm_cost_map()
211211

212212
hf_url_match = re.match(HF_URL_PATTERN, log_dir)
213213
temp_dir: tempfile.TemporaryDirectory | None = None
@@ -245,7 +245,9 @@ def score_command(
245245
click.echo(f" - {error}")
246246
sys.exit(1)
247247

248-
task_results = TaskResults(results=log_processing_outcome.results)
248+
task_results = TaskResults(
249+
results=log_processing_outcome.results, cost_map_url=cost_map_url
250+
)
249251

250252
# Warn if multiple evaluation specs present
251253
if len(task_results.agent_specs) > 1:

src/agenteval/local_cost.py

Lines changed: 21 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
from litellm.utils import CostPerToken
2+
from pydantic import BaseModel
23

34
# even where these exist in https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json
45
# calling cost_per_token does not return a cost, perhaps due to the associated provider
@@ -33,3 +34,23 @@
3334
input_cost_per_token=1.8e-07, output_cost_per_token=1.8e-07
3435
),
3536
}
37+
38+
39+
class CostPerTokenWithCache(BaseModel):
40+
input_cost_per_token: float
41+
output_cost_per_token: float
42+
cache_read_input_token_cost: float
43+
44+
45+
# Like CUSTOM_PRICING, but for models that also have a cache read discount.
46+
# cost_per_token with usage_object doesn't work for these models in litellm 1.75.8,
47+
# so costs are computed manually in compute_model_cost.
48+
# key represents model name as found in inspect model_usage
49+
CUSTOM_PRICING_WITH_CACHE = {
50+
# costs from https://platform.moonshot.ai/docs/guide/kimi-k2-5-quickstart
51+
"moonshotai/kimi-k2.5-0127": CostPerTokenWithCache(
52+
input_cost_per_token=6e-07,
53+
output_cost_per_token=3e-06,
54+
cache_read_input_token_cost=1e-07,
55+
),
56+
}

src/agenteval/log.py

Lines changed: 12 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,7 @@
1515
from litellm.types.utils import PromptTokensDetailsWrapper, Usage
1616
from pydantic import BaseModel
1717

18-
from .local_cost import CUSTOM_PRICING
18+
from .local_cost import CUSTOM_PRICING, CUSTOM_PRICING_WITH_CACHE
1919

2020
logger = getLogger(__name__)
2121

@@ -113,6 +113,17 @@ def compute_model_cost(model_usages: list[ModelUsageWithName]) -> float | None:
113113
custom_cost_per_token=CUSTOM_PRICING[model_usage.model],
114114
)
115115

116+
elif model_usage.model in CUSTOM_PRICING_WITH_CACHE.keys():
117+
118+
pricing = CUSTOM_PRICING_WITH_CACHE[model_usage.model]
119+
cache_read_tokens = model_usage.usage.input_tokens_cache_read or 0
120+
text_tokens = input_tokens - cache_read_tokens
121+
prompt_cost = (
122+
text_tokens * pricing.input_cost_per_token
123+
+ cache_read_tokens * pricing.cache_read_input_token_cost
124+
)
125+
completion_cost = output_tokens * pricing.output_cost_per_token
126+
116127
else:
117128
total_tokens = model_usage.usage.total_tokens
118129

src/agenteval/models.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -47,6 +47,9 @@ class TaskResults(BaseModel):
4747
"""Scores for all tasks in the suite"""
4848

4949
results: list[TaskResult]
50+
cost_map_url: str | None = None
51+
"""URL of the litellm model pricing JSON used to compute costs.
52+
Points to a specific git commit so the cost basis is exactly reproducible."""
5053

5154
@cached_property
5255
def agent_specs(self) -> set[str]:

0 commit comments

Comments (0)