Skip to content

Commit 36bf3fa

Browse files
authored
Drop some stuff for You.com, Elicit, and SciSpace (#55)
1 parent 02899eb commit 36bf3fa

File tree

2 files changed

+23
-2
lines changed

2 files changed

+23
-2
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "agent-eval"
7-
version = "0.1.32"
7+
version = "0.1.33"
88
description = "Agent evaluation toolkit"
99
readme = "README.md"
1010
requires-python = ">=3.10"

src/agenteval/leaderboard/view.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -309,6 +309,15 @@ def create_display_name(row):
309309
return display_df, plots
310310

311311

312+
def _agent_with_probably_incomplete_model_usage_info(agent_name):
313+
# See https://github.com/allenai/astabench-issues/issues/330
314+
lowered_agent_name = agent_name.lower()
315+
is_elicit = lowered_agent_name == "elicit"
316+
is_scispace = lowered_agent_name == "scispace"
317+
is_you_dot_com = ("you" in lowered_agent_name) and ("com" in lowered_agent_name)
318+
return any([is_elicit, is_scispace, is_you_dot_com])
319+
320+
312321
def _get_dataframe(
313322
eval_results: datasets.DatasetDict,
314323
split: str,
@@ -333,10 +342,23 @@ def _get_dataframe(
333342
rows = []
334343
for itm in ds:
335344
ev = LeaderboardSubmission.model_validate(itm)
345+
sub = ev.submission
346+
347+
probably_incomplete_model_info = (
348+
_agent_with_probably_incomplete_model_usage_info(sub.agent_name)
349+
)
336350

337351
model_token_counts: dict[str, int] = {}
338352
if ev.results:
339353
for task_result in ev.results:
354+
355+
if probably_incomplete_model_info:
356+
logger.warning(
357+
f"Dropping model_usages and model_costs for the following submission because model usage info may be incomplete: {sub}."
358+
)
359+
task_result.model_usages = None
360+
task_result.model_costs = None
361+
340362
if task_result.model_usages:
341363
for usage_list in task_result.model_usages:
342364
for model_usage in usage_list:
@@ -357,7 +379,6 @@ def _get_dataframe(
357379
LB_MODEL_NAME_MAPPING.get(name, name) for name in sorted_raw_names
358380
]
359381

360-
sub = ev.submission
361382
# only format if submit_time present, else leave as None
362383
ts = sub.submit_time
363384
if ts is not None:

0 commit comments

Comments
 (0)