Skip to content
Merged
Show file tree
Hide file tree
Changes from 19 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/agenteval/leaderboard/model_name_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,4 +85,6 @@
"gemini/gemini-2.5-pro": "Gemini 2.5 Pro (unpinned)",
"openai/gpt-4o": "GPT-4o (unpinned)",
"gpt-3.5-turbo-0125": "GPT-3.5 Turbo (2025-01)",
"openai/gpt-5": "GPT-5 (unpinned)",
"gpt-5": "GPT-5 (unpinned)",
}
101 changes: 98 additions & 3 deletions src/agenteval/leaderboard/view.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

from .. import compute_summary_statistics
from ..config import SuiteConfig
from ..score import EvalSpec
from .model_name_mapping import LB_MODEL_NAME_MAPPING
from .models import LeaderboardSubmission

Expand Down Expand Up @@ -366,6 +367,79 @@ def construct_reproducibility_url(task_revisions: list[EvalRevision]) -> str | N
return source_url


def adjust_model_name_for_reasoning_effort(model_name: str, effort: str) -> str:
    """Append the reasoning-effort setting to a display model name."""
    return "{} (reasoning_effort={})".format(model_name, effort)


def get_model_name_aliases(raw_name: str) -> set[str]:
    """Return the set of lower-cased aliases for a raw model name.

    Aliases are used to decide whether two model names (e.g. the one in an
    eval spec and one from model usages) refer to the same model: if their
    alias sets overlap, they are treated as the same model.

    The aliases for a name are:
      - the raw name itself;
      - its "pretty" name (the value in LB_MODEL_NAME_MAPPING), if mapped;
      - if the pretty name ends with "(unpinned)", the pretty name without
        that suffix.

    All aliases are lower-cased before being returned.
    """
    aliases = {raw_name}
    # "pretty" just means a value in our LB_MODEL_NAME_MAPPING map
    pretty_name = LB_MODEL_NAME_MAPPING.get(raw_name)
    if pretty_name is not None:
        aliases.add(pretty_name)

        # Pretty names conventionally end with "(<date>)" or "(unpinned)",
        # but nothing enforces that convention, so use rfind (not rindex)
        # to avoid a ValueError on a pretty name with no parentheses.
        open_paren_index = pretty_name.rfind("(")
        if open_paren_index != -1:
            name_suffix = pretty_name[open_paren_index:].strip()
            # if the pretty name suggests it's unpinned, also include the
            # pretty version without the "(unpinned)" part
            if name_suffix == "(unpinned)":
                dateless_pretty_name = pretty_name[:open_paren_index].strip()
                aliases.add(dateless_pretty_name)
    return {a.lower() for a in aliases}


def format_model_names_for_one_result(
    raw_names: set[str], eval_spec: EvalSpec | None
) -> dict[str, str]:
    """Map each raw model name from one result to a display name.

    The display name is the pretty name from LB_MODEL_NAME_MAPPING (falling
    back to the raw name). When the eval spec configures a reasoning_effort
    and a raw name appears to refer to the same model as the eval spec's
    model (their alias sets overlap), the reasoning effort is appended to
    the display name.
    """
    # Precompute the spec's aliases and effort only when the eval spec
    # carries a reasoning_effort in its model args; isinstance rules out a
    # None or non-dict model_args in one check.
    spec_aliases: set[str] | None = None
    effort = None
    if (
        eval_spec is not None
        and isinstance(eval_spec.model_args, dict)
        and "reasoning_effort" in eval_spec.model_args
    ):
        spec_aliases = get_model_name_aliases(eval_spec.model)
        effort = eval_spec.model_args["reasoning_effort"]

    formatted: dict[str, str] = {}
    for name in raw_names:
        display = LB_MODEL_NAME_MAPPING.get(name, name)
        # Any overlap between alias sets means "same model as the spec's".
        if spec_aliases is not None and get_model_name_aliases(name) & spec_aliases:
            display = adjust_model_name_for_reasoning_effort(
                model_name=display,
                effort=effort,
            )
        formatted[name] = display

    return formatted


def merge_in_formatted_names_from_one_result(
    so_far: dict[str, set[str]], from_one_result: dict[str, str]
):
    """Fold one result's raw-name -> formatted-name mapping into the
    running accumulator, collecting every formatted name seen per raw name.
    """
    for raw_name, formatted_name in from_one_result.items():
        so_far.setdefault(raw_name, set()).add(formatted_name)


def _get_dataframe(
eval_results: datasets.DatasetDict,
split: str,
Expand Down Expand Up @@ -397,6 +471,9 @@ def _get_dataframe(
)

model_token_counts: dict[str, int] = {}
# formatted model names
raw_names_to_formatted_names: dict[str, set[str]] = {}

if ev.results:
for task_result in ev.results:

Expand All @@ -407,6 +484,7 @@ def _get_dataframe(
task_result.model_usages = None
task_result.model_costs = None

models_in_this_task = set([])
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
models_in_this_task = set([])
models_in_this_task = set()

if task_result.model_usages:
for usage_list in task_result.model_usages:
for model_usage in usage_list:
Expand All @@ -418,14 +496,31 @@ def _get_dataframe(
else:
model_token_counts[model_name] = total_tokens

models_in_this_task.add(model_name)

merge_in_formatted_names_from_one_result(
so_far=raw_names_to_formatted_names,
from_one_result=format_model_names_for_one_result(
raw_names=models_in_this_task,
eval_spec=task_result.eval_spec,
),
)

# Sort by cumulative token count (descending - most used first)
sorted_raw_names = sorted(
model_token_counts.keys(), key=lambda x: model_token_counts[x], reverse=True
)

model_names = [
LB_MODEL_NAME_MAPPING.get(name, name) for name in sorted_raw_names
]
# use a list because order matters here
model_names = []
for raw_name in sorted_raw_names:
# we might have mapped the same raw name to different formatted names
# e.g. if reasoning effort wasn't at the default for a result
formatted_names = raw_names_to_formatted_names[raw_name]
# in case two raw names map to the same formatted name
for formatted_name in formatted_names:
if formatted_name not in model_names:
model_names.append(formatted_name)

# only format if submit_time present, else leave as None
ts = sub.submit_time
Expand Down