Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
301 changes: 253 additions & 48 deletions src/agenteval/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import subprocess
import sys
import tempfile
from collections import defaultdict
from dataclasses import dataclass
from datetime import datetime, timezone
from io import BytesIO

Expand All @@ -17,12 +17,12 @@
from litellm import model_cost as litellm_model_cost
from litellm import register_model

from agenteval.leaderboard.schema_generator import load_dataset_features
from agenteval.leaderboard.schema_generator import check_submissions_against_readme

from .cli_utils import AliasedChoice, generate_choice_help
from .config import load_suite_config
from .io import atomic_write_file
from .leaderboard.models import LeaderboardSubmission, Readme
from .leaderboard.models import LeaderboardSubmission
from .leaderboard.upload import (
compress_model_usages,
sanitize_path_component,
Expand Down Expand Up @@ -51,6 +51,33 @@
}


@dataclass
class RepoPathsOfInterest:
    """Relative paths that all live in a single Hugging Face repo.

    Pulls out the URL parsing and same-repo validation shared by
    ``publish_lb_command`` and ``copy_command``.
    """

    # Repo that every parsed URL referenced.
    repo_id: str
    # Path component of each URL, in the order the URLs were given.
    relative_paths: list[str]

    @staticmethod
    def from_urls(urls: list[str]) -> "RepoPathsOfInterest":
        """Build an instance from ``hf://<repo_id>/<path>`` URLs.

        Args:
            urls: URLs to parse; they must all reference the same repo.

        Returns:
            The shared repo id and one relative path per URL, in input order.

        Raises:
            ValueError: If ``urls`` is empty, or if the URLs reference more
                than one repo. Errors raised by ``parse_hf_url`` for a
                malformed URL propagate unchanged.
        """
        # Without this guard an empty input fell through to ``set.pop()`` and
        # raised KeyError, which callers (who only catch ValueError) let escape.
        if not urls:
            raise ValueError("At least one URL is required")

        repo_ids = set()
        paths = []
        for url in urls:
            # validates submission_url format "hf://<repo_id>/<path>"
            repo_id, path = parse_hf_url(url)
            repo_ids.add(repo_id)
            paths.append(path)

        if len(repo_ids) > 1:
            raise ValueError("All URLs must reference the same repo")

        # Exactly one repo id remains at this point.
        (repo_id_to_use,) = repo_ids

        return RepoPathsOfInterest(
            repo_id=repo_id_to_use,
            relative_paths=paths,
        )


def parse_hf_url(url: str) -> tuple[str, str]:
hf_url_match = re.match(HF_URL_PATTERN, url)
if not hf_url_match:
Expand Down Expand Up @@ -555,22 +582,14 @@ def publish_lb_command(repo_id: str, submission_urls: tuple[str, ...]):

hf_api = HfApi()

submission_repo_ids = set()
submission_paths = []

# Validate URLs
for submission_url in submission_urls:
submission_repo_id, submission_path = parse_hf_url(
submission_url
) # validates submission_url format "hf://<repo_id>/<submission_path>"
submission_repo_ids.add(submission_repo_id)
submission_paths.append(submission_path)

if len(submission_repo_ids) > 1:
click.echo("All submission URLs must reference the same repo")
try:
paths_of_interest = RepoPathsOfInterest.from_urls(list(submission_urls))
except ValueError as exc:
click.echo(str(exc))
sys.exit(1)

submission_repo_id = submission_repo_ids.pop()
submission_repo_id = paths_of_interest.repo_id
submission_paths = paths_of_interest.relative_paths

eval_config_rel_paths = [
f"{p}/{EVAL_CONFIG_FILENAME}" for p in submission_paths
Expand All @@ -592,10 +611,8 @@ def publish_lb_command(repo_id: str, submission_urls: tuple[str, ...]):
local_dir=local_submissions_dir,
)

lb_submissions = []
# Create results files locally
config_splits = defaultdict(
list
) # Accumulate config names and splits being published
for (
submission_url,
submission_path,
Expand Down Expand Up @@ -636,7 +653,6 @@ def publish_lb_command(repo_id: str, submission_urls: tuple[str, ...]):
eval_config = EvalConfig.model_validate_json(
open(local_eval_config_path).read()
)
config_splits[eval_config.suite_config.version].append(eval_config.split)
results = TaskResults.model_validate_json(
open(local_scores_path).read()
).results
Expand All @@ -657,6 +673,7 @@ def publish_lb_command(repo_id: str, submission_urls: tuple[str, ...]):
submission=submission,
)
lb_submission = compress_model_usages(lb_submission)
lb_submissions.append(lb_submission)
os.makedirs(
os.path.join(local_results_dir, os.path.dirname(submission_path)),
exist_ok=True,
Expand All @@ -669,34 +686,12 @@ def publish_lb_command(repo_id: str, submission_urls: tuple[str, ...]):
f.write(lb_submission.model_dump_json(indent=None))

# Validate the config with the schema in HF
readme = Readme.download_and_parse(repo_id)
missing_configs = list(set(config_splits.keys()) - set(readme.configs.keys()))
if missing_configs:
click.echo(
f"Config name {missing_configs} not present in hf://{repo_id}/README.md"
)
click.echo(
f"Run 'update_readme.py add-config --repo-id {repo_id} --config-name {missing_configs[0]}' to add it"
)
sys.exit(1)
missing_splits = list(
set(((c, s) for c in config_splits.keys() for s in config_splits[c]))
- set(((c, s) for c in readme.configs.keys() for s in readme.configs[c]))
)
if missing_splits:
click.echo(
f"Config/Split {missing_splits} not present in hf://{repo_id}/README.md"
)
click.echo(
f"Run 'update_readme.py add-config --repo-id {repo_id} --config-name {missing_splits[0][0]} --split {missing_splits[0][1]}` to add it"
)
sys.exit(1)
local_features = load_dataset_features()
if local_features.arrow_schema != readme.features.arrow_schema:
click.echo(
"Schema in local dataset_features.yml does not match schema in hf://{repo_id}/README.md"
try:
check_submissions_against_readme(
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

pulling this out so I can use it in copy too

lb_submissions=lb_submissions, repo_id=repo_id
)
click.echo("Run 'update_readme.py sync-schema' to update it")
except ValueError as exc:
click.echo(str(exc))
sys.exit(1)

# Upload all results files in one shot
Expand Down Expand Up @@ -1045,5 +1040,215 @@ def eval_command(

cli.add_command(eval_command)


@dataclass
class WithinRepoPathComponents:
    """The three path components that locate a submission inside a repo.

    Every path this class produces is built from the same
    ``<hf_config>/<split>/<submission_name>`` triple:

    * result file: ``<hf_config>/<split>/<submission_name>.json``
    * submission directory: ``<hf_config>/<split>/<submission_name>``
    * summary directory: ``summaries/<hf_config>/<split>/<submission_name>``

    Keeping the components together makes submission paths easier to reason
    about than passing raw strings around.
    """

    # First path segment (presumably the HF dataset config name — confirm).
    hf_config: str
    # Second path segment, e.g. the dataset split.
    split: str
    # Final path segment, without any ".json" suffix.
    submission_name: str

    def to_within_repo_result_path(self) -> str:
        """Return the path of the result JSON file within a results repo."""
        return f"{self.hf_config}/{self.split}/{self.submission_name}.json"

    def to_within_repo_submission_dir(self) -> str:
        """Return the submission directory within a submissions repo."""
        return f"{self.hf_config}/{self.split}/{self.submission_name}"

    def to_within_repo_submission_summary_dir(self) -> str:
        """Return the submission's directory under ``summaries/``."""
        return f"summaries/{self.hf_config}/{self.split}/{self.submission_name}"

    def to_within_repo_submission_eval_config(self) -> str:
        """Return the path of the eval config file in the submission directory."""
        return f"{self.to_within_repo_submission_dir()}/{EVAL_CONFIG_FILENAME}"

    def to_within_repo_submission_submission_metadata(self) -> str:
        """Return the path of the submission metadata file in the submission directory."""
        return f"{self.to_within_repo_submission_dir()}/{SUBMISSION_METADATA_FILENAME}"

    def to_within_repo_submission_scores(self) -> str:
        """Return the path of the scores file in the summary directory."""
        return f"{self.to_within_repo_submission_summary_dir()}/{SCORES_FILENAME}"

    def within_repo_submission_patterns(self) -> list[str]:
        """Return the paths/glob patterns that make up one submission.

        Covers the eval config, the submission metadata, the scores file and
        every ``*.eval`` file directly inside the submission directory.
        """
        return [
            self.to_within_repo_submission_eval_config(),
            self.to_within_repo_submission_submission_metadata(),
            self.to_within_repo_submission_scores(),
            f"{self.to_within_repo_submission_dir()}/*.eval",
        ]

    @staticmethod
    def _split_components(path: str, expected_form: str) -> list[str]:
        """Split ``path`` on "/" and require exactly three components.

        Raises:
            ValueError: If ``path`` does not have exactly three components.
        """
        parts = path.split("/")
        if len(parts) != 3:
            raise ValueError(
                f"Expected a path of the form '{expected_form}', got {path!r}"
            )
        return parts

    @staticmethod
    def from_within_repo_result_path(result_path: str) -> "WithinRepoPathComponents":
        """Parse ``<hf_config>/<split>/<submission_name>.json``.

        Raises:
            ValueError: If the path does not have exactly three components or
                the file name does not end in ``.json``.
        """
        hf_config, split, filename = WithinRepoPathComponents._split_components(
            result_path, "<hf_config>/<split>/<submission_name>.json"
        )
        suffix = ".json"
        # Explicit check rather than ``assert``: asserts are stripped under -O,
        # which would let a wrongly-named file be silently truncated.
        if not filename.endswith(suffix):
            raise ValueError(
                f"Expected a result path ending in '{suffix}', got {result_path!r}"
            )
        submission_name = filename[: -len(suffix)]
        return WithinRepoPathComponents(
            hf_config=hf_config, split=split, submission_name=submission_name
        )

    @staticmethod
    def from_within_repo_submission_path(
        submission_path: str,
    ) -> "WithinRepoPathComponents":
        """Parse ``<hf_config>/<split>/<submission_name>``.

        Raises:
            ValueError: If the path does not have exactly three components.
        """
        hf_config, split, submission_name = (
            WithinRepoPathComponents._split_components(
                submission_path, "<hf_config>/<split>/<submission_name>"
            )
        )
        return WithinRepoPathComponents(
            hf_config=hf_config, split=split, submission_name=submission_name
        )


@click.command(name="copy", help="Copy a result from one results HF repo to another.")
@click.argument("result_urls", nargs=-1, required=True, type=str)
@click.option(
    "--target-submissions-repo",
    default=None,
    required=False,
    help="Provide this if you also want to copy the underlying submission to another submissions HF repo.",
)
@click.option(
    "--target-results-repo",
    default=None,
    required=True,
    help="HF results repo to copy the results into.",
)
@click.option(
    "--read-public-logs-field",
    is_flag=True,
    default=False,
    help="Provide this if the source results have log urls in logs_url_public.",
)
@click.option(
    "--write-public-logs-field",
    is_flag=True,
    default=False,
    help="Provide this if the target results should have log urls in logs_url_public.",
)
def copy_command(
    target_submissions_repo: str | None,
    target_results_repo: str,
    read_public_logs_field: bool,
    write_public_logs_field: bool,
    result_urls: tuple[str, ...],
):
    r"""Copy result files (and optionally their submissions) between HF repos.

    All ``result_urls`` must reference the same source results repo. Each
    result is downloaded, validated against the target repo's README and
    uploaded to ``target_results_repo``. When ``target_submissions_repo`` is
    given, the submission each result's logs URL points at is copied there as
    well, and the logs URL in the copied result is rewritten to the new
    location.

    Example, copying from the internal repos to the public ones::

        agenteval copy \
            --target-submissions-repo "allenai/asta-bench-submissions" \
            --target-results-repo "allenai/asta-bench-results" \
            --write-public-logs-field \
            "hf://allenai/asta-bench-internal-results/1.0.0/test/aakanksha19_Asta_Table_Synthesis__GPT-4.1__2025-07-11T21-23-03.json" \
            "hf://allenai/asta-bench-internal-results/1.0.0/test/aakanksha19_Asta_Table_Synthesis__Pro-2.5__2025-07-12T06-06-53.json"

    Args:
        target_submissions_repo: Repo to copy the underlying submissions to,
            or ``None`` to copy results only.
        target_results_repo: Repo to copy the results to.
        read_public_logs_field: Read the source logs URL from
            ``logs_url_public`` instead of ``logs_url``.
        write_public_logs_field: Write the rewritten logs URL to
            ``logs_url_public`` instead of ``logs_url``.
        result_urls: ``hf://`` URLs of the result files to copy.

    Exits with status 1 (after printing the reason) when validation fails.
    """
    from huggingface_hub import HfApi, snapshot_download

    try:
        src_result_paths_of_interest = RepoPathsOfInterest.from_urls(list(result_urls))
    except ValueError as exc:
        click.echo(str(exc))
        sys.exit(1)

    src_results_repo = src_result_paths_of_interest.repo_id
    result_paths = src_result_paths_of_interest.relative_paths
    click.echo(f"{len(result_paths)} result files to copy.")

    click.echo(f"source results repo: {src_results_repo}")
    click.echo(f"target results repo: {target_results_repo}")

    source_logs_field = "logs_url_public" if read_public_logs_field else "logs_url"

    with tempfile.TemporaryDirectory() as temp_dir:
        hf_api = HfApi()

        local_src_results_dir = os.path.join(temp_dir, "sourceresults")
        local_target_results_dir = os.path.join(temp_dir, "targetresults")

        snapshot_download(
            repo_id=src_results_repo,
            repo_type="dataset",
            allow_patterns=result_paths,
            local_dir=local_src_results_dir,
        )

        log_urls = []
        lb_submissions = []

        for result_path in result_paths:
            local_src_result_path = os.path.join(local_src_results_dir, result_path)
            # Read as UTF-8 to match the encoding used when writing the copy.
            with open(local_src_result_path, encoding="utf-8") as f_src_result:
                result = LeaderboardSubmission.model_validate(json.load(f_src_result))

            current_logs_url = (
                result.submission.logs_url_public
                if read_public_logs_field
                else result.submission.logs_url
            )
            if current_logs_url is None:
                # Nothing to rewrite or copy for this result. Previously this
                # case reused the path from the previous iteration (or raised
                # NameError on the first one).
                if target_submissions_repo is not None:
                    click.echo(
                        f"Warning: {result_path} has no {source_logs_field}; "
                        "its submission will not be copied."
                    )
            else:
                log_urls.append(current_logs_url)
                _, src_submission_path = parse_hf_url(current_logs_url)

                if target_submissions_repo is not None:
                    new_logs_url = (
                        f"hf://datasets/{target_submissions_repo}/{src_submission_path}"
                    )
                    # NOTE(review): rewriting these fields means this is not a
                    # strict copy. The leaderboard reads logs_url in internal
                    # mode and logs_url_public otherwise, which is why the
                    # target field is selectable. Whether both fields are
                    # still needed is an open question from code review.
                    if write_public_logs_field:
                        result.submission.logs_url_public = new_logs_url
                        result.submission.logs_url = None
                    else:
                        result.submission.logs_url_public = None
                        result.submission.logs_url = new_logs_url

            lb_submissions.append(result)

            os.makedirs(
                os.path.join(local_target_results_dir, os.path.dirname(result_path)),
                exist_ok=True,
            )
            with open(
                os.path.join(local_target_results_dir, result_path),
                "w",
                encoding="utf-8",
            ) as f_target_result:
                f_target_result.write(result.model_dump_json(indent=None))

        # Work out what to pull from the submissions repo *before* uploading
        # anything, so a bad logs URL cannot leave the target results repo
        # pointing at submissions that were never copied.
        src_submissions_repo = None
        submission_paths = []
        paths_to_pull = []
        if target_submissions_repo is not None and log_urls:
            try:
                src_submission_paths_of_interest = RepoPathsOfInterest.from_urls(
                    log_urls
                )
                src_submissions_repo = src_submission_paths_of_interest.repo_id
                submission_paths = src_submission_paths_of_interest.relative_paths
                for submission_path in submission_paths:
                    paths_to_pull.extend(
                        WithinRepoPathComponents.from_within_repo_submission_path(
                            submission_path
                        ).within_repo_submission_patterns()
                    )
            except ValueError as exc:
                click.echo(str(exc))
                sys.exit(1)

        # Validate the config with the schema in HF
        try:
            check_submissions_against_readme(
                lb_submissions=lb_submissions, repo_id=target_results_repo
            )
        except ValueError as exc:
            click.echo(str(exc))
            sys.exit(1)

        click.echo(
            f"Uploading {len(lb_submissions)} results to {target_results_repo}..."
        )
        hf_api.upload_folder(
            folder_path=local_target_results_dir,
            path_in_repo="",
            repo_id=target_results_repo,
            repo_type="dataset",
        )

        if target_submissions_repo is not None:
            if src_submissions_repo is None:
                click.echo("No submissions to copy.")
            else:
                # One local dir serves as both download target and upload source.
                local_submissions_dir = os.path.join(temp_dir, "submissions")

                snapshot_download(
                    repo_id=src_submissions_repo,
                    repo_type="dataset",
                    allow_patterns=paths_to_pull,
                    local_dir=local_submissions_dir,
                )
                click.echo(
                    f"Uploading {len(submission_paths)} submissions to {target_submissions_repo}..."
                )
                hf_api.upload_folder(
                    folder_path=local_submissions_dir,
                    path_in_repo="",
                    repo_id=target_submissions_repo,
                    repo_type="dataset",
                )

    click.echo("Done")


# Register the ``copy`` subcommand on the top-level CLI group.
cli.add_command(copy_command)


# Run the CLI when this module is executed as a script.
if __name__ == "__main__":
    cli()
Loading