From 39e9fe59e67a73052e6bee21a92c5691dd0a6c65 Mon Sep 17 00:00:00 2001 From: Chloe Alcestes Anastasiades Date: Sat, 16 Aug 2025 17:29:21 -0700 Subject: [PATCH 01/14] save --- src/agenteval/cli.py | 11 +++++++++++ src/agenteval/repairs.py | 19 +++++++++++++++++++ 2 files changed, 30 insertions(+) create mode 100644 src/agenteval/repairs.py diff --git a/src/agenteval/cli.py b/src/agenteval/cli.py index 130476a..7f82e25 100644 --- a/src/agenteval/cli.py +++ b/src/agenteval/cli.py @@ -24,6 +24,7 @@ upload_folder_to_hf, ) from .models import EvalConfig, SubmissionMetadata, TaskResults +from .repairs import repair from .score import process_eval_logs from .summary import compute_summary_statistics @@ -257,6 +258,16 @@ def score_command( cli.add_command(score_command) +@click.command(name="repair", help="TODO.") +@click.option("--name", type=str) +def repair_command(name: str): + print("in repair command") + print(f"name arg: {name}") + repair(name) + + +cli.add_command(repair_command) + @click.command( name="publish", diff --git a/src/agenteval/repairs.py b/src/agenteval/repairs.py new file mode 100644 index 0000000..e80be19 --- /dev/null +++ b/src/agenteval/repairs.py @@ -0,0 +1,19 @@ +from importlib import import_module +import sys + + +def repair(name: str): + """ + """ + private_astabench_repairs = import_module("astabench.private_repairs") + astabench_repairs = import_module("astabench.repairs") + + for v in private_astabench_repairs.REPAIRS.values(): + print("private") + v("private") + v(name) + + for v in astabench_repairs.REPAIRS.values(): + print("public") + v("public") + v(name) From e3b7ccded75f75c9a0231e6641e8ee83972e1078 Mon Sep 17 00:00:00 2001 From: Chloe Alcestes Anastasiades Date: Sat, 16 Aug 2025 18:19:45 -0700 Subject: [PATCH 02/14] checkpoint 2 --- src/agenteval/cli.py | 11 ++++-- src/agenteval/leaderboard/models.py | 15 +++++++- src/agenteval/repairs.py | 55 +++++++++++++++++++++-------- 3 files changed, 63 insertions(+), 18 deletions(-) diff --git 
a/src/agenteval/cli.py b/src/agenteval/cli.py index 7f82e25..4d6b0b0 100644 --- a/src/agenteval/cli.py +++ b/src/agenteval/cli.py @@ -259,11 +259,16 @@ def score_command( cli.add_command(score_command) @click.command(name="repair", help="TODO.") -@click.option("--name", type=str) -def repair_command(name: str): +@click.option("--name", type=str, help="TODO.") +@click.option("--registry", type=str, multiple=True, help="TODO.") +@click.option("--intervention", type=str, multiple=True, help="TODO.") +def repair_command(name: str, registry: tuple, intervention: tuple): + """ + # uv run astabench repair --name dog --intervention agenteval:say-hi --intervention astabench:say-hello --intervention astabenchprivate:say-hey --registry astabench:astabench.repairs --registry astabenchprivate:astabench.private_repairs + """ print("in repair command") print(f"name arg: {name}") - repair(name) + repair(name, intervention_pointer_strs=intervention, registry_pointer_strs=registry) cli.add_command(repair_command) diff --git a/src/agenteval/leaderboard/models.py b/src/agenteval/leaderboard/models.py index 93a8751..1ce4a4b 100644 --- a/src/agenteval/leaderboard/models.py +++ b/src/agenteval/leaderboard/models.py @@ -4,11 +4,24 @@ import yaml from datasets import Features -from pydantic import BaseModel, Field +from pydantic import BaseModel, ConfigDict, Field from ..models import SubmissionMetadata, SuiteConfig, TaskResult +class InterventionInfo(BaseModel): + model_config = ConfigDict(frozen=True) + + registry: str + name: str + + @staticmethod + def from_str(a_str: str): + sep = ":" + [registry, name] = a_str.split(sep) + return InterventionInfo(registry=registry, name=name) + + class LeaderboardSubmission(BaseModel): suite_config: SuiteConfig """Task configuration for the results.""" diff --git a/src/agenteval/repairs.py b/src/agenteval/repairs.py index e80be19..3fec647 100644 --- a/src/agenteval/repairs.py +++ b/src/agenteval/repairs.py @@ -1,19 +1,46 @@ from importlib import 
import_module -import sys +from pydantic import BaseModel -def repair(name: str): +from agenteval.leaderboard.models import InterventionInfo + + +CHANGES = {"say-hi": lambda x: print(f"from agent-eval (inner): hi {x}")} + + +class RegistryEntry(BaseModel): + registry: str + name: str + + @staticmethod + def from_str(a_str): + sep = ":" + [registry, name] = a_str.split(sep) + return RegistryEntry(registry=registry, name=name) + + +class Registry: + def __init__(self, registry_pointer_strs: list[str]): + self.registry = {"agenteval": CHANGES} + + registry_entries = [RegistryEntry.from_str(p) for p in registry_pointer_strs] + for entry in registry_entries: + assert entry.registry not in self.registry, "Multiple change registry entries with the same name." + self.registry[entry.registry] = import_module(entry.name).CHANGES + + def find_change(self, change_pointer: InterventionInfo): + return self.registry.get(change_pointer.registry, {}).get(change_pointer.name) + + + +def repair(name: str, intervention_pointer_strs: list[str], registry_pointer_strs: list[str]): """ """ - private_astabench_repairs = import_module("astabench.private_repairs") - astabench_repairs = import_module("astabench.repairs") - - for v in private_astabench_repairs.REPAIRS.values(): - print("private") - v("private") - v(name) - - for v in astabench_repairs.REPAIRS.values(): - print("public") - v("public") - v(name) + registry = Registry(registry_pointer_strs) + intervention_pointers = [InterventionInfo.from_str(p) for p in intervention_pointer_strs] + for intervention_pointer in intervention_pointers: + maybe_change = registry.find_change(intervention_pointer) + if maybe_change is not None: + maybe_change(name) + else: + print(f"Unable to find change {intervention_pointer}.") From 79c653b0a59a37d288f8d588e5ce46414c9b1fad Mon Sep 17 00:00:00 2001 From: Chloe Alcestes Anastasiades Date: Sun, 17 Aug 2025 15:27:18 -0700 Subject: [PATCH 03/14] closer to repair --- src/agenteval/cli.py | 67 
++++++++++++-- src/agenteval/leaderboard/models.py | 37 +++++++- src/agenteval/repairs.py | 133 +++++++++++++++++++++++----- 3 files changed, 203 insertions(+), 34 deletions(-) diff --git a/src/agenteval/cli.py b/src/agenteval/cli.py index 4d6b0b0..e1fa6f0 100644 --- a/src/agenteval/cli.py +++ b/src/agenteval/cli.py @@ -24,7 +24,7 @@ upload_folder_to_hf, ) from .models import EvalConfig, SubmissionMetadata, TaskResults -from .repairs import repair +from .repairs import edit_lb_submissions, LbSubmissionWithDetails from .score import process_eval_logs from .summary import compute_summary_statistics @@ -258,20 +258,71 @@ def score_command( cli.add_command(score_command) -@click.command(name="repair", help="TODO.") -@click.option("--name", type=str, help="TODO.") +@click.command(name="edit", help="TODO.") @click.option("--registry", type=str, multiple=True, help="TODO.") @click.option("--intervention", type=str, multiple=True, help="TODO.") -def repair_command(name: str, registry: tuple, intervention: tuple): +@click.argument("result_urls", nargs=-1, required=True, type=str) +def edit_command( + registry: tuple, + intervention: tuple, + result_urls: tuple[str, ...], +): """ # uv run astabench repair --name dog --intervention agenteval:say-hi --intervention astabench:say-hello --intervention astabenchprivate:say-hey --registry astabench:astabench.repairs --registry astabenchprivate:astabench.private_repairs """ - print("in repair command") - print(f"name arg: {name}") - repair(name, intervention_pointer_strs=intervention, registry_pointer_strs=registry) + if not result_urls: + click.echo("At least one result URL is required.") + sys.exit(1) + + with tempfile.TemporaryDirectory() as temp_dir: + from huggingface_hub import HfApi, snapshot_download + + local_current_results_dir = os.path.join(temp_dir, "current") + local_edited_results_dir = os.path.join(temp_dir, "edited") + + hf_api = HfApi() + + result_repo_ids = set() + result_paths = [] + + # Validate URLs + for 
result_url in result_urls: + result_repo_id, result_path = parse_hf_url( + result_url + ) # validates result_url format "hf:///" + result_repo_ids.add(result_repo_id) + result_paths.append(result_path) + + if len(result_repo_ids) > 1: + click.echo("All result URLs must reference the same repo") + sys.exit(1) + + result_repo_id = result_repo_ids.pop() + + # Download all input files in one shot + snapshot_download( + repo_id=result_repo_id, + repo_type="dataset", + allow_patterns=result_paths, + local_dir=local_current_results_dir, + ) + + all_lb_submissions_with_details = [] + for result_path in result_paths: + local_current_result_path = os.path.join(local_current_results_dir, result_path) + with open(local_current_result_path) as f_current: + lb_submission = LeaderboardSubmission.model_validate(json.load(f_current)) + lb_submission_with_details = LbSubmissionWithDetails.mk(lb_submission, result_path) + all_lb_submissions_with_details.append(lb_submission_with_details) + + edit_lb_submissions( + lb_submissions_with_details=all_lb_submissions_with_details, + intervention_pointer_strs=list(intervention), + registry_pointer_strs=(list(registry)), + ) -cli.add_command(repair_command) +cli.add_command(edit_command) @click.command( diff --git a/src/agenteval/leaderboard/models.py b/src/agenteval/leaderboard/models.py index 1ce4a4b..335c5fe 100644 --- a/src/agenteval/leaderboard/models.py +++ b/src/agenteval/leaderboard/models.py @@ -1,5 +1,6 @@ import re from dataclasses import dataclass +from datetime import datetime, timezone from io import BytesIO import yaml @@ -9,9 +10,7 @@ from ..models import SubmissionMetadata, SuiteConfig, TaskResult -class InterventionInfo(BaseModel): - model_config = ConfigDict(frozen=True) - +class InterventionPointer(BaseModel): registry: str name: str @@ -19,7 +18,29 @@ class InterventionInfo(BaseModel): def from_str(a_str: str): sep = ":" [registry, name] = a_str.split(sep) - return InterventionInfo(registry=registry, name=name) + return 
InterventionPointer(registry=registry, name=name) + + +class AppliedIntervention(BaseModel): + pointer: InterventionPointer + applied: datetime + + +class Interventions(BaseModel): + edits: list[AppliedIntervention] | None + conversions: list[AppliedIntervention] | None + + def add_edit(self, pointer: InterventionPointer): + self.edits.append(AppliedIntervention(pointer=pointer, applied=datetime.now(timezone.utc))) + + def has_edits(self): + return (self.edits is not None) and (len(self.edits) > 0) + + def add_conversion(self, pointer: InterventionPointer): + self.conversions.append(AppliedIntervention(pointer=pointer, applied=datetime.now(timezone.utc))) + + def has_conversions(self): + return (self.conversions is not None) and (len(self.conversions) > 0) class LeaderboardSubmission(BaseModel): @@ -32,6 +53,14 @@ class LeaderboardSubmission(BaseModel): results: list[TaskResult] | None = None submission: SubmissionMetadata = Field(default_factory=SubmissionMetadata) + interventions: Interventions | None = None + + def has_edits(self): + return (self.interventions is not None) and self.interventions.has_edits() + + def has_conversions(self): + return (self.interventions is not None) and self.interventions.has_conversions() + @dataclass class Readme: diff --git a/src/agenteval/repairs.py b/src/agenteval/repairs.py index 3fec647..d4b5bda 100644 --- a/src/agenteval/repairs.py +++ b/src/agenteval/repairs.py @@ -1,14 +1,80 @@ +from dataclasses import dataclass from importlib import import_module +from typing import Callable from pydantic import BaseModel -from agenteval.leaderboard.models import InterventionInfo +from agenteval.leaderboard.models import InterventionPointer, LeaderboardSubmission -CHANGES = {"say-hi": lambda x: print(f"from agent-eval (inner): hi {x}")} +EDIT_INTERVENTION_KIND = "edit" +CONVERSION_INTERVENTION_KIND = "conversion" -class RegistryEntry(BaseModel): +@dataclass +class WithinRepoPath: + hf_config: str + split: str + end: str + + def 
submission_name(self) -> str: + suffix = ".json" + if self.end.endswith(suffix): + submission_name = self.end[:-len(suffix)] + else: + submission_name = self.end + return submission_name + + @staticmethod + def from_path(path: str, sep: str = "/"): + [hf_config, split, end] = path.split(sep) + return WithinRepoPath( + hf_config=hf_config, + split=split, + end=end, + ) + + def to_path(self, sep: str = "/") -> str: + return sep.join([self.hf_config, self.split, self.end]) + + +class LbSubmissionWithDetails(BaseModel): + lb_submission: LeaderboardSubmission + submission_path: WithinRepoPath + + @staticmethod + def mk(lb_submission: LeaderboardSubmission, submission_path: str): + return LbSubmissionWithDetails( + lb_submission=lb_submission, + submission_path=WithinRepoPath.from_path(submission_path), + ) + + +class Intervention: + def __init__( + self, + eligible: Callable[[LbSubmissionWithDetails], bool], + transform: Callable[[LeaderboardSubmission], bool] + ): + self._eligible = eligible + self._transform = transform + + def eligible(self, submission_with_details: LbSubmissionWithDetails) -> bool: + return self._eligible(submission_with_details) + + def transform(self, lb_submission: LeaderboardSubmission) -> bool: + self._transform(lb_submission) + + +# intervention kind -> config name -> intervention name -> Intervention +INTERVENTIONS: dict[str, dict[str, dict[str, Intervention]]] = { + EDIT_INTERVENTION_KIND: {}, + CONVERSION_INTERVENTION_KIND: {}, +} + + +@dataclass +class RegistryPointer: registry: str name: str @@ -16,31 +82,54 @@ class RegistryEntry(BaseModel): def from_str(a_str): sep = ":" [registry, name] = a_str.split(sep) - return RegistryEntry(registry=registry, name=name) + return RegistryPointer(registry=registry, name=name) class Registry: def __init__(self, registry_pointer_strs: list[str]): - self.registry = {"agenteval": CHANGES} + self.registry = {"agenteval": INTERVENTIONS} - registry_entries = [RegistryEntry.from_str(p) for p in 
registry_pointer_strs] - for entry in registry_entries: - assert entry.registry not in self.registry, "Multiple change registry entries with the same name." - self.registry[entry.registry] = import_module(entry.name).CHANGES + registry_pointers = [RegistryPointer.from_str(p) for p in registry_pointer_strs] + for pointer in registry_pointers: + assert pointer.registry not in self.registry, "Multiple registry entries with the same name." + self.registry[pointer.registry] = import_module(pointer.name).INTERVENTIONS - def find_change(self, change_pointer: InterventionInfo): - return self.registry.get(change_pointer.registry, {}).get(change_pointer.name) + def find_intervention(self, intervention_kind: str, config_name: str, pointer: InterventionPointer): + return self.registry.get(pointer.registry, {}).get(intervention_kind, {}).get(config_name, {}).get(pointer.name) - -def repair(name: str, intervention_pointer_strs: list[str], registry_pointer_strs: list[str]): - """ - """ +def edit_lb_submissions( + lb_submissions_with_details: list[LbSubmissionWithDetails], + intervention_pointer_strs: list[str], + registry_pointer_strs: list[str], +): registry = Registry(registry_pointer_strs) - intervention_pointers = [InterventionInfo.from_str(p) for p in intervention_pointer_strs] - for intervention_pointer in intervention_pointers: - maybe_change = registry.find_change(intervention_pointer) - if maybe_change is not None: - maybe_change(name) - else: - print(f"Unable to find change {intervention_pointer}.") + intervention_pointers = [InterventionPointer.from_str(p) for p in intervention_pointer_strs] + + edited_any_lb_submissions = False + for lb_submission_with_details in lb_submissions_with_details: + + edited_this_lb_submission = False + for intervention_pointer in intervention_pointers: + + maybe_edit = registry.find_intervention( + intervention_kind="edit", + config_name=lb_submission_with_details.lb_submission.suite_config.version, + pointer=intervention_pointer, + ) + if 
(maybe_edit is not None) : + if maybe_edit.eligible(lb_submission_with_details): + applied_one_edit = maybe_edit.transform(lb_submission_with_details.lb_submission) + edited_this_lb_submission = edited_this_lb_submission or applied_one_edit + else: + print(f"{lb_submission_with_details.submission_path} is not eligble for the {intervention_pointer} change.") + + else: + print(f"Unable to find {intervention_pointer}.") + + if edited_this_lb_submission: + lb_submission_with_details.lb_submission.interventions.add_edit(intervention_pointer) + + edited_any_lb_submissions = edited_any_lb_submissions or edited_this_lb_submission + + return edited_any_lb_submissions From 48ae5afd5323d9112cd89e4017ecb3f0c09f0fd8 Mon Sep 17 00:00:00 2001 From: Chloe Alcestes Anastasiades Date: Sun, 17 Aug 2025 15:38:49 -0700 Subject: [PATCH 04/14] rename repairs --- src/agenteval/cli.py | 5 +---- src/agenteval/{repairs.py => interventions.py} | 0 2 files changed, 1 insertion(+), 4 deletions(-) rename src/agenteval/{repairs.py => interventions.py} (100%) diff --git a/src/agenteval/cli.py b/src/agenteval/cli.py index e1fa6f0..c027017 100644 --- a/src/agenteval/cli.py +++ b/src/agenteval/cli.py @@ -24,7 +24,7 @@ upload_folder_to_hf, ) from .models import EvalConfig, SubmissionMetadata, TaskResults -from .repairs import edit_lb_submissions, LbSubmissionWithDetails +from .interventions import edit_lb_submissions, LbSubmissionWithDetails from .score import process_eval_logs from .summary import compute_summary_statistics @@ -267,9 +267,6 @@ def edit_command( intervention: tuple, result_urls: tuple[str, ...], ): - """ - # uv run astabench repair --name dog --intervention agenteval:say-hi --intervention astabench:say-hello --intervention astabenchprivate:say-hey --registry astabench:astabench.repairs --registry astabenchprivate:astabench.private_repairs - """ if not result_urls: click.echo("At least one result URL is required.") sys.exit(1) diff --git a/src/agenteval/repairs.py 
b/src/agenteval/interventions.py similarity index 100% rename from src/agenteval/repairs.py rename to src/agenteval/interventions.py From fd75be36252162a117a19617b4a8acbdfdaca9ec Mon Sep 17 00:00:00 2001 From: Chloe Alcestes Anastasiades Date: Sun, 17 Aug 2025 19:02:42 -0700 Subject: [PATCH 05/14] just one at a time --- src/agenteval/cli.py | 17 ++++++----- src/agenteval/interventions.py | 54 +++++++++++++++------------------- 2 files changed, 33 insertions(+), 38 deletions(-) diff --git a/src/agenteval/cli.py b/src/agenteval/cli.py index c027017..bd03749 100644 --- a/src/agenteval/cli.py +++ b/src/agenteval/cli.py @@ -17,14 +17,14 @@ from .cli_utils import AliasedChoice, generate_choice_help from .config import load_suite_config from .io import atomic_write_file -from .leaderboard.models import LeaderboardSubmission, Readme +from .leaderboard.models import InterventionPointer, LeaderboardSubmission, Readme from .leaderboard.upload import ( compress_model_usages, sanitize_path_component, upload_folder_to_hf, ) from .models import EvalConfig, SubmissionMetadata, TaskResults -from .interventions import edit_lb_submissions, LbSubmissionWithDetails +from .interventions import edit_lb_submission, LbSubmissionWithDetails, Registry from .score import process_eval_logs from .summary import compute_summary_statistics @@ -271,6 +271,9 @@ def edit_command( click.echo("At least one result URL is required.") sys.exit(1) + registry = Registry(list(registry)) + intervention_pointers = [InterventionPointer.from_str(p) for p in list(intervention)] + with tempfile.TemporaryDirectory() as temp_dir: from huggingface_hub import HfApi, snapshot_download @@ -312,11 +315,11 @@ def edit_command( lb_submission_with_details = LbSubmissionWithDetails.mk(lb_submission, result_path) all_lb_submissions_with_details.append(lb_submission_with_details) - edit_lb_submissions( - lb_submissions_with_details=all_lb_submissions_with_details, - intervention_pointer_strs=list(intervention), - 
registry_pointer_strs=(list(registry)), - ) + edit_lb_submission( + lb_submission_with_details=lb_submission_with_details, + intervention_pointers=intervention_pointers, + registry=registry, + ) cli.add_command(edit_command) diff --git a/src/agenteval/interventions.py b/src/agenteval/interventions.py index d4b5bda..775d983 100644 --- a/src/agenteval/interventions.py +++ b/src/agenteval/interventions.py @@ -98,38 +98,30 @@ def find_intervention(self, intervention_kind: str, config_name: str, pointer: I return self.registry.get(pointer.registry, {}).get(intervention_kind, {}).get(config_name, {}).get(pointer.name) -def edit_lb_submissions( - lb_submissions_with_details: list[LbSubmissionWithDetails], - intervention_pointer_strs: list[str], - registry_pointer_strs: list[str], -): - registry = Registry(registry_pointer_strs) - intervention_pointers = [InterventionPointer.from_str(p) for p in intervention_pointer_strs] - - edited_any_lb_submissions = False - for lb_submission_with_details in lb_submissions_with_details: - - edited_this_lb_submission = False - for intervention_pointer in intervention_pointers: - - maybe_edit = registry.find_intervention( - intervention_kind="edit", - config_name=lb_submission_with_details.lb_submission.suite_config.version, - pointer=intervention_pointer, - ) - if (maybe_edit is not None) : - if maybe_edit.eligible(lb_submission_with_details): - applied_one_edit = maybe_edit.transform(lb_submission_with_details.lb_submission) - edited_this_lb_submission = edited_this_lb_submission or applied_one_edit - else: - print(f"{lb_submission_with_details.submission_path} is not eligble for the {intervention_pointer} change.") - +def edit_lb_submission( + lb_submission_with_details: LbSubmissionWithDetails, + intervention_pointers: list[InterventionPointer], + registry: Registry, +) -> bool: + edited_this_lb_submission = False + for intervention_pointer in intervention_pointers: + + maybe_edit = registry.find_intervention( + 
intervention_kind="edit", + config_name=lb_submission_with_details.lb_submission.suite_config.version, + pointer=intervention_pointer, + ) + if (maybe_edit is not None) : + if maybe_edit.eligible(lb_submission_with_details): + applied_one_edit = maybe_edit.transform(lb_submission_with_details.lb_submission) + edited_this_lb_submission = edited_this_lb_submission or applied_one_edit else: - print(f"Unable to find {intervention_pointer}.") + print(f"{lb_submission_with_details.submission_path} is not eligble for the {intervention_pointer} change.") - if edited_this_lb_submission: - lb_submission_with_details.lb_submission.interventions.add_edit(intervention_pointer) + else: + print(f"Unable to find {intervention_pointer}.") - edited_any_lb_submissions = edited_any_lb_submissions or edited_this_lb_submission + if edited_this_lb_submission: + lb_submission_with_details.lb_submission.interventions.add_edit(intervention_pointer) - return edited_any_lb_submissions + return edited_this_lb_submission From 6ba520111b208d6163dfa365511a8ff6f6dc5694 Mon Sep 17 00:00:00 2001 From: Chloe Alcestes Anastasiades Date: Sun, 17 Aug 2025 19:11:51 -0700 Subject: [PATCH 06/14] pull out repo from paths --- src/agenteval/cli.py | 71 +++++++++++++++++++++++--------------------- 1 file changed, 37 insertions(+), 34 deletions(-) diff --git a/src/agenteval/cli.py b/src/agenteval/cli.py index bd03749..da5cb07 100644 --- a/src/agenteval/cli.py +++ b/src/agenteval/cli.py @@ -6,6 +6,7 @@ import sys import tempfile from collections import defaultdict +from dataclasses import dataclass from datetime import datetime, timezone from io import BytesIO @@ -58,6 +59,33 @@ def parse_hf_url(url: str) -> tuple[str, str]: return hf_url_match.group("repo_id"), hf_url_match.group("path") +@dataclass +class RepoPathsOfInterest: + repo_id: str + relative_paths: list[str] + + @staticmethod + def from_urls(urls: list[str]): + repo_ids = set() + paths = [] + + for url in urls: + # validates submission_url format 
"hf:///" + repo_id, path = parse_hf_url(url) + repo_ids.add(repo_id) + paths.append(path) + + if len(repo_ids) > 1: + raise Exception("All URLs must reference the same repo") + + repo_id_to_use = repo_ids.pop() + + return RepoPathsOfInterest( + repo_id=repo_id_to_use, + relative_paths=list(set(paths)), + ) + + def verify_git_reproducibility() -> None: try: # Get current commit SHA and origin @@ -280,24 +308,9 @@ def edit_command( local_current_results_dir = os.path.join(temp_dir, "current") local_edited_results_dir = os.path.join(temp_dir, "edited") - hf_api = HfApi() - - result_repo_ids = set() - result_paths = [] - - # Validate URLs - for result_url in result_urls: - result_repo_id, result_path = parse_hf_url( - result_url - ) # validates result_url format "hf:///" - result_repo_ids.add(result_repo_id) - result_paths.append(result_path) - - if len(result_repo_ids) > 1: - click.echo("All result URLs must reference the same repo") - sys.exit(1) - - result_repo_id = result_repo_ids.pop() + result_paths_of_interest = RepoPathsOfInterest.from_urls(result_urls) + result_repo_id = result_paths_of_interest.repo_id + result_paths = result_paths_of_interest.relative_paths # Download all input files in one shot snapshot_download( @@ -321,6 +334,9 @@ def edit_command( registry=registry, ) + hf_api = HfApi() + + cli.add_command(edit_command) @@ -565,22 +581,9 @@ def publish_lb_command(repo_id: str, submission_urls: tuple[str, ...]): hf_api = HfApi() - submission_repo_ids = set() - submission_paths = [] - - # Validate URLs - for submission_url in submission_urls: - submission_repo_id, submission_path = parse_hf_url( - submission_url - ) # validates submission_url format "hf:///" - submission_repo_ids.add(submission_repo_id) - submission_paths.append(submission_path) - - if len(submission_repo_ids) > 1: - click.echo("All submission URLs must reference the same repo") - sys.exit(1) - - submission_repo_id = submission_repo_ids.pop() + submission_paths_of_interest = 
RepoPathsOfInterest.from_urls(submission_urls) + submission_repo_id = submission_paths_of_interest.repo_id + submission_paths = submission_paths_of_interest.relative_paths eval_config_rel_paths = [ f"{p}/{EVAL_CONFIG_FILENAME}" for p in submission_paths From 21e7eb6901c4987fcf48ae8775a0158522b8e84f Mon Sep 17 00:00:00 2001 From: Chloe Alcestes Anastasiades Date: Sun, 17 Aug 2025 19:20:12 -0700 Subject: [PATCH 07/14] pull out readme check --- src/agenteval/cli.py | 40 ++++--------------- src/agenteval/leaderboard/schema_generator.py | 38 ++++++++++++++++++ 2 files changed, 45 insertions(+), 33 deletions(-) diff --git a/src/agenteval/cli.py b/src/agenteval/cli.py index da5cb07..b12d67f 100644 --- a/src/agenteval/cli.py +++ b/src/agenteval/cli.py @@ -13,7 +13,7 @@ import click import datasets -from agenteval.leaderboard.schema_generator import load_dataset_features +from agenteval.leaderboard.schema_generator import check_lb_submissions_against_readme, load_dataset_features from .cli_utils import AliasedChoice, generate_choice_help from .config import load_suite_config @@ -605,10 +605,8 @@ def publish_lb_command(repo_id: str, submission_urls: tuple[str, ...]): local_dir=local_submissions_dir, ) + all_lb_submissions = [] # Create results files locally - config_splits = defaultdict( - list - ) # Accumulate config names and splits being published for ( submission_url, submission_path, @@ -649,7 +647,6 @@ def publish_lb_command(repo_id: str, submission_urls: tuple[str, ...]): eval_config = EvalConfig.model_validate_json( open(local_eval_config_path).read() ) - config_splits[eval_config.suite_config.version].append(eval_config.split) results = TaskResults.model_validate_json( open(local_scores_path).read() ).results @@ -664,6 +661,7 @@ def publish_lb_command(repo_id: str, submission_urls: tuple[str, ...]): submission=submission, ) lb_submission = compress_model_usages(lb_submission) + all_lb_submissions.append(lb_submission) os.makedirs( os.path.join(local_results_dir, 
os.path.dirname(submission_path)), exist_ok=True, @@ -676,34 +674,10 @@ def publish_lb_command(repo_id: str, submission_urls: tuple[str, ...]): f.write(lb_submission.model_dump_json(indent=None)) # Validate the config with the schema in HF - readme = Readme.download_and_parse(repo_id) - missing_configs = list(set(config_splits.keys()) - set(readme.configs.keys())) - if missing_configs: - click.echo( - f"Config name {missing_configs} not present in hf://{repo_id}/README.md" - ) - click.echo( - f"Run 'update_readme.py add-config --repo-id {repo_id} --config-name {missing_configs[0]}' to add it" - ) - sys.exit(1) - missing_splits = list( - set(((c, s) for c in config_splits.keys() for s in config_splits[c])) - - set(((c, s) for c in readme.configs.keys() for s in readme.configs[c])) - ) - if missing_splits: - click.echo( - f"Config/Split {missing_splits} not present in hf://{repo_id}/README.md" - ) - click.echo( - f"Run 'update_readme.py add-config --repo-id {repo_id} --config-name {missing_splits[0][0]} --split {missing_splits[0][1]}` to add it" - ) - sys.exit(1) - local_features = load_dataset_features() - if local_features.arrow_schema != readme.features.arrow_schema: - click.echo( - "Schema in local dataset_features.yml does not match schema in hf://{repo_id}/README.md" - ) - click.echo("Run 'update_readme.py sync-schema' to update it") + try: + check_lb_submissions_against_readme(all_lb_submissions, repo_id) + except Exception as exc: + click.echo(str(exc)) sys.exit(1) # Upload all results files in one shot diff --git a/src/agenteval/leaderboard/schema_generator.py b/src/agenteval/leaderboard/schema_generator.py index a2dcd42..baabb7c 100644 --- a/src/agenteval/leaderboard/schema_generator.py +++ b/src/agenteval/leaderboard/schema_generator.py @@ -125,3 +125,41 @@ def load_dataset_features(input_path: str | None = None) -> Features: with open(input_path, "r", encoding="utf-8") as f: yaml_values = yaml.safe_load(f) return Features._from_yaml_list(yaml_values) + + 
+def check_lb_submissions_against_readme( +    lb_submissions: list[LeaderboardSubmission], +    repo_id: str, +): +    config_splits = defaultdict( +        list +    )  # Accumulate config names and splits being published +    for lb_submission in lb_submissions: +        config_splits[lb_submission.suite_config.version].append(lb_submission.split) + +    readme = Readme.download_and_parse(repo_id) +    missing_configs = list(set(config_splits.keys()) - set(readme.configs.keys())) +    if missing_configs: +        message_for_exc = ( +            f"Config name {missing_configs} not present in hf://{repo_id}/README.md" +            f"Run 'update_readme.py add-config --repo-id {repo_id} --config-name {missing_configs[0]}' to add it") +        raise Exception(message_for_exc) + +    missing_splits = list( +        set(((c, s) for c in config_splits.keys() for s in config_splits[c])) +        - set(((c, s) for c in readme.configs.keys() for s in readme.configs[c])) +    ) +    if missing_splits: +        message_for_exc = ( +            f"Config/Split {missing_splits} not present in hf://{repo_id}/README.md" +            f"Run 'update_readme.py add-config --repo-id {repo_id} --config-name {missing_splits[0][0]} --split {missing_splits[0][1]}` to add it" +        ) +        raise Exception(message_for_exc) + +    local_features = load_dataset_features() +    if local_features.arrow_schema != readme.features.arrow_schema: +        message_for_exc = ( +            f"Schema in local dataset_features.yml does not match schema in hf://{repo_id}/README.md" +            "Run 'update_readme.py sync-schema' to update it" +        ) +        raise Exception(message_for_exc) From 48d7472f9870b946efc2ab15ed8a0a91e0e61bd7 Mon Sep 17 00:00:00 2001 From: Chloe Alcestes Anastasiades Date: Sun, 17 Aug 2025 20:05:15 -0700 Subject: [PATCH 08/14] updates for edit --- src/agenteval/cli.py | 38 ++++++++++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/src/agenteval/cli.py b/src/agenteval/cli.py index b12d67f..ba4bf64 100644 --- a/src/agenteval/cli.py +++ b/src/agenteval/cli.py @@ -286,6 +286,7 @@ def score_command( 
cli.add_command(score_command) + @click.command(name="edit", help="TODO.") @click.option("--registry", type=str, multiple=True, help="TODO.") @click.option("--intervention", type=str, multiple=True, help="TODO.") @@ -320,22 +321,51 @@ def edit_command( local_dir=local_current_results_dir, ) - all_lb_submissions_with_details = [] + all_edited_lb_submissions = [] for result_path in result_paths: local_current_result_path = os.path.join(local_current_results_dir, result_path) with open(local_current_result_path) as f_current: lb_submission = LeaderboardSubmission.model_validate(json.load(f_current)) lb_submission_with_details = LbSubmissionWithDetails.mk(lb_submission, result_path) - all_lb_submissions_with_details.append(lb_submission_with_details) - edit_lb_submission( + # edits the lb submission in place + edited_this_submission = edit_lb_submission( lb_submission_with_details=lb_submission_with_details, intervention_pointers=intervention_pointers, registry=registry, ) + if edited_this_submission: + all_edited_lb_submissions.append(lb_submission) - hf_api = HfApi() + os.makedirs( + os.path.join(local_edited_results_dir, os.path.dirname(result_path)), + exist_ok=True, + ) + with open( + os.path.join(local_edited_results_dir, result_path), + "w", + encoding="utf-8", + ) as f_edited: + f_edited.write(lb_submission.model_dump_json(indent=None)) + # Validate the config with the schema in HF + if len(all_edited_lb_submissions): + try: + check_lb_submissions_against_readme(all_edited_lb_submissions, result_repo_id) + except Exception as exc: + click.echo(str(exc)) + sys.exit(1) + + # Upload all results files in one shot + click.echo(f"Uploading {len(result_paths)} results to {result_repo_id}...") + hf_api = HfApi() + hf_api.upload_folder( + folder_path=local_edited_results_dir, + path_in_repo="", + repo_id=result_repo_id, + repo_type="dataset", + ) + click.echo("Done") cli.add_command(edit_command) From a01cd9e4cf3bfe7c9a6c7cb299417ade6b9d6fea Mon Sep 17 00:00:00 2001 
From: Chloe Alcestes Anastasiades Date: Sun, 17 Aug 2025 20:50:45 -0700 Subject: [PATCH 09/14] fix stuff --- src/agenteval/cli.py | 1 - src/agenteval/interventions.py | 7 +++---- src/agenteval/leaderboard/models.py | 14 ++++++++++++++ src/agenteval/leaderboard/schema_generator.py | 4 +++- 4 files changed, 20 insertions(+), 6 deletions(-) diff --git a/src/agenteval/cli.py b/src/agenteval/cli.py index ba4bf64..d1d2755 100644 --- a/src/agenteval/cli.py +++ b/src/agenteval/cli.py @@ -5,7 +5,6 @@ import subprocess import sys import tempfile -from collections import defaultdict from dataclasses import dataclass from datetime import datetime, timezone from io import BytesIO diff --git a/src/agenteval/interventions.py b/src/agenteval/interventions.py index 775d983..8032a95 100644 --- a/src/agenteval/interventions.py +++ b/src/agenteval/interventions.py @@ -63,7 +63,7 @@ def eligible(self, submission_with_details: LbSubmissionWithDetails) -> bool: return self._eligible(submission_with_details) def transform(self, lb_submission: LeaderboardSubmission) -> bool: - self._transform(lb_submission) + return self._transform(lb_submission) # intervention kind -> config name -> intervention name -> Intervention @@ -114,6 +114,8 @@ def edit_lb_submission( if (maybe_edit is not None) : if maybe_edit.eligible(lb_submission_with_details): applied_one_edit = maybe_edit.transform(lb_submission_with_details.lb_submission) + if applied_one_edit: + lb_submission_with_details.lb_submission.add_edit(intervention_pointer) edited_this_lb_submission = edited_this_lb_submission or applied_one_edit else: print(f"{lb_submission_with_details.submission_path} is not eligble for the {intervention_pointer} change.") @@ -121,7 +123,4 @@ def edit_lb_submission( else: print(f"Unable to find {intervention_pointer}.") - if edited_this_lb_submission: - lb_submission_with_details.lb_submission.interventions.add_edit(intervention_pointer) - return edited_this_lb_submission diff --git 
a/src/agenteval/leaderboard/models.py b/src/agenteval/leaderboard/models.py index 335c5fe..2257f0d 100644 --- a/src/agenteval/leaderboard/models.py +++ b/src/agenteval/leaderboard/models.py @@ -31,12 +31,16 @@ class Interventions(BaseModel): conversions: list[AppliedIntervention] | None def add_edit(self, pointer: InterventionPointer): + if self.edits is None: + self.edits = [] self.edits.append(AppliedIntervention(pointer=pointer, applied=datetime.now(timezone.utc))) def has_edits(self): return (self.edits is not None) and (len(self.edits) > 0) def add_conversion(self, pointer: InterventionPointer): + if self.conversions is None: + self.conversions = [] self.conversions.append(AppliedIntervention(pointer=pointer, applied=datetime.now(timezone.utc))) def has_conversions(self): @@ -55,9 +59,19 @@ class LeaderboardSubmission(BaseModel): interventions: Interventions | None = None + def add_edit(self, pointer: InterventionPointer): + if self.interventions is None: + self.interventions = Interventions(edits=[], conversions=None) + self.interventions.add_edit(pointer) + def has_edits(self): return (self.interventions is not None) and self.interventions.has_edits() + def add_conversion(self, pointer: InterventionPointer): + if self.interventions is None: + self.interventions = Interventions(edits=None, conversions=[]) + self.interventions.add_conversion(pointer) + def has_conversions(self): return (self.interventions is not None) and self.interventions.has_conversions() diff --git a/src/agenteval/leaderboard/schema_generator.py b/src/agenteval/leaderboard/schema_generator.py index baabb7c..4e654f1 100644 --- a/src/agenteval/leaderboard/schema_generator.py +++ b/src/agenteval/leaderboard/schema_generator.py @@ -4,6 +4,7 @@ import datetime import types +from collections import defaultdict from importlib import resources from typing import Any, Literal, Union, get_args, get_origin @@ -12,7 +13,7 @@ from datasets import Features from pydantic import BaseModel -from .models 
import LeaderboardSubmission +from .models import LeaderboardSubmission, Readme def _pa_type_for_annotation(anno) -> pa.DataType: @@ -143,6 +144,7 @@ def check_lb_submissions_against_readme( message_for_exc = ( f"Config name {missing_configs} not present in hf://{repo_id}/README.md" f"Run 'update_readme.py add-config --repo-id {repo_id} --config-name {missing_configs[0]}' to add it" + ) raise Exception(message_for_exc) missing_splits = list( From d24b3657c9a4a6719bb82fa7ec74cb59bb92a99a Mon Sep 17 00:00:00 2001 From: Chloe Alcestes Anastasiades Date: Sun, 17 Aug 2025 21:08:23 -0700 Subject: [PATCH 10/14] better comment --- src/agenteval/interventions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/agenteval/interventions.py b/src/agenteval/interventions.py index 8032a95..6d8e328 100644 --- a/src/agenteval/interventions.py +++ b/src/agenteval/interventions.py @@ -121,6 +121,6 @@ def edit_lb_submission( print(f"{lb_submission_with_details.submission_path} is not eligble for the {intervention_pointer} change.") else: - print(f"Unable to find {intervention_pointer}.") + print(f"Unable to find intervention {intervention_pointer}.") return edited_this_lb_submission From 03fde9c5d4c9d46e70973ead65ea1b312f928d99 Mon Sep 17 00:00:00 2001 From: Chloe Alcestes Anastasiades Date: Sun, 17 Aug 2025 22:28:49 -0700 Subject: [PATCH 11/14] convert first pass --- src/agenteval/cli.py | 105 ++++++++++++++++++++++++++++++--- src/agenteval/config.py | 8 +++ src/agenteval/interventions.py | 37 +++++++++++- src/agenteval/score.py | 3 + 4 files changed, 143 insertions(+), 10 deletions(-) diff --git a/src/agenteval/cli.py b/src/agenteval/cli.py index d1d2755..b9f29a3 100644 --- a/src/agenteval/cli.py +++ b/src/agenteval/cli.py @@ -24,7 +24,7 @@ upload_folder_to_hf, ) from .models import EvalConfig, SubmissionMetadata, TaskResults -from .interventions import edit_lb_submission, LbSubmissionWithDetails, Registry +from .interventions import convert_lb_submission, 
@click.command(name="convert", help="TODO.")
@click.option("--registry", type=str, multiple=True, help="TODO.")
@click.option("--intervention", type=str, help="TODO.")
@click.argument("result_urls", nargs=-1, required=True, type=str)
def convert_command(
    registry: tuple,
    intervention: str,
    result_urls: tuple[str, ...],
):
    """Convert published leaderboard result files to a new suite config.

    Downloads the given result files, applies the named conversion
    intervention to each eligible submission (in place), and writes the
    converted files locally under their new config paths.

    NOTE(review): the actual upload is currently commented out while the
    command is under development — confirm before relying on this.
    """
    if not result_urls:
        click.echo("At least one result URL is required.")
        sys.exit(1)

    registry = Registry(list(registry))
    intervention_pointer = InterventionPointer.from_str(intervention)

    with tempfile.TemporaryDirectory() as temp_dir:
        # Imported lazily so the CLI loads even without huggingface_hub.
        # BUG FIX: HfApi was imported but unused (the upload is disabled).
        from huggingface_hub import snapshot_download

        local_current_config_results_dir = os.path.join(temp_dir, "current")
        local_new_config_results_dir = os.path.join(temp_dir, "new")

        result_paths_of_interest = RepoPathsOfInterest.from_urls(result_urls)
        result_repo_id = result_paths_of_interest.repo_id
        result_paths = result_paths_of_interest.relative_paths

        # Download all input files in one shot.
        snapshot_download(
            repo_id=result_repo_id,
            repo_type="dataset",
            allow_patterns=result_paths,
            local_dir=local_current_config_results_dir,
        )

        all_converted_lb_submissions = []
        for current_config_result_path in result_paths:
            local_current_path = os.path.join(
                local_current_config_results_dir, current_config_result_path
            )
            with open(local_current_path) as f_current:
                lb_submission = LeaderboardSubmission.model_validate(json.load(f_current))
            lb_submission_with_details = LbSubmissionWithDetails.mk(
                lb_submission, current_config_result_path
            )

            # convert_lb_submission mutates the submission in place.
            if not convert_lb_submission(
                lb_submission_with_details=lb_submission_with_details,
                intervention_pointer=intervention_pointer,
                registry=registry,
            ):
                continue

            all_converted_lb_submissions.append(lb_submission)
            # After conversion the submission carries its new config version.
            new_config_result_path = (
                lb_submission_with_details.submission_path
                .with_different_hf_config(lb_submission.suite_config.version)
                .to_path()
            )

            os.makedirs(
                os.path.join(
                    local_new_config_results_dir,
                    os.path.dirname(new_config_result_path),
                ),
                exist_ok=True,
            )
            with open(
                os.path.join(local_new_config_results_dir, new_config_result_path),
                "w",
                encoding="utf-8",
            ) as f_new:
                f_new.write(lb_submission.model_dump_json(indent=None))

            print(f"{current_config_result_path} -> {new_config_result_path}")
            print(lb_submission.model_dump_json(indent=2))

        # Validate the converted submissions against the schema in HF.
        if all_converted_lb_submissions:
            try:
                check_lb_submissions_against_readme(
                    all_converted_lb_submissions, result_repo_id
                )
            except Exception as exc:
                click.echo(str(exc))
                sys.exit(1)

        # Upload all results files in one shot
        click.echo(f"Uploading {len(result_paths)} results to {result_repo_id}...")
        # hf_api = HfApi()
        # hf_api.upload_folder(
        #     folder_path=local_new_config_results_dir,
        #     path_in_repo="",
        #     repo_id=result_repo_id,
        #     repo_type="dataset",
        # )
        click.echo("Done")


cli.add_command(convert_command)
def convert_lb_submission(
    lb_submission_with_details: LbSubmissionWithDetails,
    intervention_pointer: InterventionPointer,
    registry: Registry,
) -> bool:
    """Apply a single "conversion" intervention to a leaderboard submission.

    Looks up the conversion in *registry* for the submission's current
    suite-config version. If found and the submission is eligible, the
    submission is transformed in place and the applied conversion is
    recorded on it.

    Args:
        lb_submission_with_details: submission plus its repo path details.
        intervention_pointer: which conversion to apply.
        registry: where interventions are looked up.

    Returns:
        True if the conversion was applied, False otherwise.
    """
    maybe_conversion = registry.find_intervention(
        intervention_kind="conversion",
        config_name=lb_submission_with_details.lb_submission.suite_config.version,
        pointer=intervention_pointer,
    )
    if maybe_conversion is None:
        print(f"Unable to find conversion {intervention_pointer}.")
        return False

    if not maybe_conversion.eligible(lb_submission_with_details):
        # BUG FIX: "eligble" -> "eligible" in the user-facing message.
        print(
            f"{lb_submission_with_details.submission_path} is not eligible "
            f"for the {intervention_pointer} conversion."
        )
        return False

    # transform() mutates the submission in place and reports success.
    converted = maybe_conversion.transform(lb_submission_with_details.lb_submission)
    if converted:
        lb_submission_with_details.lb_submission.add_conversion(intervention_pointer)
    return converted
Computed from `model_usages`.""" + + def available_metrics(self) -> set[str]: + return set([m.name for m in self.metrics]) def get_metrics(log: EvalLog) -> list[Metric]: From 8982c3ea93275cab290287f7333b712134d01555 Mon Sep 17 00:00:00 2001 From: Chloe Alcestes Anastasiades Date: Sun, 17 Aug 2025 22:48:22 -0700 Subject: [PATCH 12/14] chck first pass --- src/agenteval/cli.py | 46 +++++++++++++++++++++++++++++++++- src/agenteval/interventions.py | 17 +++++++++++++ 2 files changed, 62 insertions(+), 1 deletion(-) diff --git a/src/agenteval/cli.py b/src/agenteval/cli.py index b9f29a3..063299e 100644 --- a/src/agenteval/cli.py +++ b/src/agenteval/cli.py @@ -24,7 +24,7 @@ upload_folder_to_hf, ) from .models import EvalConfig, SubmissionMetadata, TaskResults -from .interventions import convert_lb_submission, edit_lb_submission, LbSubmissionWithDetails, Registry +from .interventions import check_lb_submission_for_edit_eligibility, convert_lb_submission, edit_lb_submission, LbSubmissionWithDetails, Registry from .score import process_eval_logs from .summary import compute_summary_statistics @@ -459,6 +459,50 @@ def convert_command( cli.add_command(convert_command) +@click.command(name="check", help="TODO.") +@click.option("--registry", type=str, multiple=True, help="TODO.") +@click.option("--intervention", type=str, multiple=True, help="TODO.") +@click.argument("result_urls", nargs=-1, required=True, type=str) +def check_command( + registry: tuple, + intervention: tuple, + result_urls: tuple[str, ...], +): + if not result_urls: + click.echo("At least one result URL is required.") + sys.exit(1) + + registry = Registry(list(registry)) + intervention_pointers = [InterventionPointer.from_str(p) for p in list(intervention)] + + with tempfile.TemporaryDirectory() as temp_dir: + from huggingface_hub import HfApi, snapshot_download + + local_results_dir = os.path.join(temp_dir, "results") + + result_paths_of_interest = RepoPathsOfInterest.from_urls(result_urls) + result_repo_id = 
def check_lb_submission_for_edit_eligibility(
    lb_submission_with_details: LbSubmissionWithDetails,
    intervention_pointers: list[InterventionPointer],
    registry: Registry,
) -> bool:
    """Report which "edit" interventions would apply to a submission.

    For each pointer, looks up the edit in *registry* for the submission's
    suite-config version and prints whether the submission is eligible.
    Nothing is modified — this is a dry-run used by the `check` command.

    Returns:
        True if at least one edit was found and the submission is eligible
        for it, False otherwise.
        (BUG FIX: the original was annotated ``-> bool`` but returned None.)
    """
    any_eligible = False
    for intervention_pointer in intervention_pointers:
        maybe_edit = registry.find_intervention(
            intervention_kind="edit",
            config_name=lb_submission_with_details.lb_submission.suite_config.version,
            pointer=intervention_pointer,
        )
        if maybe_edit is None:
            print(f"Unable to find edit {intervention_pointer}.")
        elif maybe_edit.eligible(lb_submission_with_details):
            any_eligible = True
            print(
                f"{lb_submission_with_details.submission_path} is eligible "
                f"for the {intervention_pointer} edit."
            )
        else:
            # BUG FIX: previously nothing was printed for a found-but-
            # ineligible edit, leaving the check command silent.
            print(
                f"{lb_submission_with_details.submission_path} is not eligible "
                f"for the {intervention_pointer} edit."
            )
    return any_eligible
+++++++++++++++++++++++---------- src/agenteval/interventions.py | 95 ++++++++++++++++++++ 2 files changed, 205 insertions(+), 47 deletions(-) diff --git a/src/agenteval/cli.py b/src/agenteval/cli.py index 063299e..16d6b93 100644 --- a/src/agenteval/cli.py +++ b/src/agenteval/cli.py @@ -5,7 +5,6 @@ import subprocess import sys import tempfile -from dataclasses import dataclass from datetime import datetime, timezone from io import BytesIO @@ -14,7 +13,7 @@ from agenteval.leaderboard.schema_generator import check_lb_submissions_against_readme, load_dataset_features -from .cli_utils import AliasedChoice, generate_choice_help +from .cli_utils import AliasedChoice, RepoPathsOfInterest, generate_choice_help from .config import load_suite_config from .io import atomic_write_file from .leaderboard.models import InterventionPointer, LeaderboardSubmission, Readme @@ -58,33 +57,6 @@ def parse_hf_url(url: str) -> tuple[str, str]: return hf_url_match.group("repo_id"), hf_url_match.group("path") -@dataclass -class RepoPathsOfInterest: - repo_id: str - relative_paths: list[str] - - @staticmethod - def from_urls(urls: list[str]): - repo_ids = set() - paths = [] - - for url in urls: - # validates submission_url format "hf:///" - repo_id, path = parse_hf_url(url) - repo_ids.add(repo_id) - paths.append(path) - - if len(repo_ids) > 1: - raise Exception("All URLs must reference the same repo") - - repo_id_to_use = repo_ids.pop() - - return RepoPathsOfInterest( - repo_id=repo_id_to_use, - relative_paths=list(set(paths)), - ) - - def verify_git_reproducibility() -> None: try: # Get current commit SHA and origin @@ -349,7 +321,7 @@ def edit_command( print(lb_submission.model_dump_json(indent=2)) # Validate the config with the schema in HF - if len(all_edited_lb_submissions): + if len(all_edited_lb_submissions) > 0: try: check_lb_submissions_against_readme(all_edited_lb_submissions, result_repo_id) except Exception as exc: @@ -437,7 +409,7 @@ def convert_command( 
print(lb_submission.model_dump_json(indent=2)) # Validate the config with the schema in HF - if len(all_converted_lb_submissions): + if len(all_converted_lb_submissions) > 0: try: check_lb_submissions_against_readme(all_converted_lb_submissions, result_repo_id) except Exception as exc: @@ -719,6 +691,99 @@ def backfill_command(results_repo_id, submissions_repo_id, submission_path): cli.add_command(backfill_command) +def publish_lb_results_helper( + repo_paths_of_interest: RepoPathsOfInterest, + local_new_results_dir: str, + temp_dir: str, + registry: Registry, + counter: int +): + # The idea is, when you're about to publish a result file, + # look at whether there's already a result file under the same path, + # and take into account any interventions that were applied to + # that result file. + from huggingface_hub import HfApi, snapshot_download + + # local_new_results_dir starting off has results we want to upload + # that may need still need to have edits applied first, and that + # also may indicate that corresponding results converted to different + # configs should also be uploaded + # local_existing_results_dir will have any existing corresponding + # result files under the same configs as files in local_new_results_dir + local_existing_results_dir = os.path.join(temp_dir, f"existingresults{counter}") + + # download any existing result files that correspond to the new ones we want to + # upload, so we can see if any edits or conversions were applied to them, + # so we can apply the same things to the new result files + snapshot_download( + repo_id=repo_paths_of_interest.repo_id, + repo_type="dataset", + allow_patterns=repo_paths_of_interest.relative_paths, + local_dir=local_existing_results_dir, + ) + + # we'll update files in local_new_results_dir to have edited results + all_current_config_lb_submissions = apply_existing_edits_to_result_files( + repo_paths_of_interest=repo_paths_of_interest, + local_new_results_dir=local_new_results_dir, + 
local_existing_results_dir=local_existing_results_dir, + registry=registry, + ) + + # then we'll push them to the results repo + if len(all_current_config_lb_submissions) > 0: + try: + check_lb_submissions_against_readme(all_current_config_lb_submissions, repo_paths_of_interest.repo_id) + except Exception as exc: + click.echo(str(exc)) + sys.exit(1) + + # Upload all results files in one shot + click.echo(f"Uploading {len(repo_paths_of_interest.relative_paths)} results to {repo_paths_of_interest.repo_id}...") + # hf_api = HfApi() + # hf_api.upload_folder( + # folder_path=local_new_results_dir, + # path_in_repo="", + # repo_id=repo_paths_of_interest.repo_id, + # repo_type="dataset", + # ) + + # local_converted_results_dir is where we'll put corresponding + # result files under new configs when needed + local_converted_results_dir = os.path.join(temp_dir, f"convertedresults{counter}") + + # We'll do conversions off edited results. + # Converted results will go under local_converted_results_dir, under new paths. + # We figure out if we need conversions based on the corresponding + # existing results that we already pulled. + new_config_paths_of_interest = apply_existing_conversions_to_result_files( + repo_paths_of_interest=repo_paths_of_interest, + local_new_results_dir=local_new_results_dir, + local_existing_results_dir=local_existing_results_dir, + local_converted_results_dir=local_converted_results_dir, + registry=registry, + ) + + # Call publish_lb_results() instead of writing converted results directly, + # because we want to make sure that we take into account any interventions + # applied to existing converted result files. 
+ if len(new_config_paths_of_interest): + publish_lb_results_helper( + repo_paths_of_interest=RepoPathsOfInterest( + repo_id=repo_paths_of_interest.repo_id, + relative_paths=new_config_paths_of_interest + ), + local_new_results_dir=local_converted_results_dir, + temp_dir=temp_dir, + registry=registry, + counter=counter+1, + ) + + else: + # If there are no conversions to do, we stop here. + click.echo("Done") + + @click.command( name="publish", help="Publish scored results in log_dir to HuggingFace leaderboard.", @@ -767,7 +832,7 @@ def publish_lb_command(repo_id: str, submission_urls: tuple[str, ...]): local_dir=local_submissions_dir, ) - all_lb_submissions = [] + all_result_paths = [] # Create results files locally for ( submission_url, @@ -823,7 +888,6 @@ def publish_lb_command(repo_id: str, submission_urls: tuple[str, ...]): submission=submission, ) lb_submission = compress_model_usages(lb_submission) - all_lb_submissions.append(lb_submission) os.makedirs( os.path.join(local_results_dir, os.path.dirname(submission_path)), exist_ok=True, @@ -835,22 +899,21 @@ def publish_lb_command(repo_id: str, submission_urls: tuple[str, ...]): ) as f: f.write(lb_submission.model_dump_json(indent=None)) - # Validate the config with the schema in HF - try: - check_lb_submissions_against_readme(all_lb_submissions, repo_id) - except Exception as exc: - click.echo(str(exc)) - sys.exit(1) + all_result_paths.append(f"{submission_path}.json") - # Upload all results files in one shot - click.echo(f"Uploading {len(submission_paths)} results to {repo_id}...") - hf_api.upload_folder( - folder_path=local_results_dir, - path_in_repo="", - repo_id=repo_id, - repo_type="dataset", + # This will upload the results, taking into account any + # interventions that have been applied to exisitng result files + # under the same paths. 
def apply_existing_edits_to_result_files(
    repo_paths_of_interest: RepoPathsOfInterest,
    local_new_results_dir: str,
    local_existing_results_dir: str,
    registry: Registry,
):
    """Re-apply, to each new result file, the edits recorded on the
    previously published file at the same repo path.

    Files in *local_new_results_dir* are rewritten in place when an edit
    applies.

    Returns:
        The LeaderboardSubmission for every new result file (edited or
        not), since all of them are uploaded afterwards.
    """
    all_current_config_lb_submissions = []
    for current_config_result_path in repo_paths_of_interest.relative_paths:
        local_new_path = os.path.join(local_new_results_dir, current_config_result_path)
        with open(local_new_path) as f_new:
            lb_submission_new = LeaderboardSubmission.model_validate(json.load(f_new))
        lb_submission_with_details_new = LbSubmissionWithDetails.mk(
            lb_submission_new, current_config_result_path
        )

        local_existing_path = os.path.join(
            local_existing_results_dir, current_config_result_path
        )
        if os.path.isfile(local_existing_path):
            with open(local_existing_path) as f_existing:
                lb_submission_existing = LeaderboardSubmission.model_validate(
                    json.load(f_existing)
                )

            if lb_submission_existing.has_edits():
                edit_pointers = [
                    e.pointer for e in lb_submission_existing.interventions.edits
                ]
                # edit_lb_submission edits the submission in place.
                if edit_lb_submission(
                    lb_submission_with_details=lb_submission_with_details_new,
                    intervention_pointers=edit_pointers,
                    registry=registry,
                ):
                    with open(local_new_path, "w", encoding="utf-8") as f_out:
                        f_out.write(lb_submission_new.model_dump_json(indent=None))
                    print(lb_submission_new.model_dump_json(indent=2))

        # Whether we applied edits or not, all new result files get uploaded.
        all_current_config_lb_submissions.append(lb_submission_new)

    return all_current_config_lb_submissions


def apply_existing_conversions_to_result_files(
    repo_paths_of_interest: RepoPathsOfInterest,
    local_new_results_dir: str,
    local_existing_results_dir: str,
    local_converted_results_dir: str,
    registry: Registry,
):
    """Re-apply, to each new result file, the conversions recorded on the
    previously published file at the same repo path.

    Converted copies are written under *local_converted_results_dir* at
    their new config paths.

    BUG FIXES vs. the original:
      * the ``registry=registry`` default in the signature referenced an
        undefined name (NameError at import time) — now a plain parameter;
      * ``if converted_this_submission::`` was a SyntaxError;
      * conversion pointers were read from ``interventions.edits`` instead
        of ``interventions.conversions``.

    Returns:
        The list of new-config relative paths that were produced.
    """
    new_config_paths_of_interest = []
    for current_config_result_path in repo_paths_of_interest.relative_paths:
        local_existing_path = os.path.join(
            local_existing_results_dir, current_config_result_path
        )
        if not os.path.isfile(local_existing_path):
            continue
        with open(local_existing_path) as f_existing:
            lb_submission_existing = LeaderboardSubmission.model_validate(
                json.load(f_existing)
            )
        if not lb_submission_existing.has_conversions():
            continue

        conversion_pointers = [
            c.pointer for c in lb_submission_existing.interventions.conversions
        ]

        local_new_path = os.path.join(local_new_results_dir, current_config_result_path)
        for conversion_pointer in conversion_pointers:
            # Reopen every time: conversions mutate the submission's config
            # in place, so instances must not be reused across pointers.
            with open(local_new_path) as f_new:
                lb_submission_new = LeaderboardSubmission.model_validate(
                    json.load(f_new)
                )
            lb_submission_with_details_new = LbSubmissionWithDetails.mk(
                lb_submission_new, current_config_result_path
            )

            if not convert_lb_submission(
                lb_submission_with_details=lb_submission_with_details_new,
                intervention_pointer=conversion_pointer,
                registry=registry,
            ):
                continue

            # After conversion the submission carries its new config version.
            new_config_result_path = (
                lb_submission_with_details_new.submission_path
                .with_different_hf_config(lb_submission_new.suite_config.version)
                .to_path()
            )
            new_config_paths_of_interest.append(new_config_result_path)

            out_path = os.path.join(local_converted_results_dir, new_config_result_path)
            os.makedirs(os.path.dirname(out_path), exist_ok=True)
            with open(out_path, "w", encoding="utf-8") as f_converted:
                f_converted.write(lb_submission_new.model_dump_json(indent=None))

            print(f"{current_config_result_path} -> {new_config_result_path}")
            print(lb_submission_new.model_dump_json(indent=2))

    return new_config_paths_of_interest
@dataclass
class RepoPathsOfInterest:
    """A HF dataset repo id plus the relative paths of interest within it."""

    repo_id: str
    relative_paths: list[str]

    @staticmethod
    def from_urls(urls: list[str]):
        """Build from ``hf://<repo>/<path>`` URLs; all must share one repo.

        Raises:
            Exception: if the URLs reference more than one repo.
        """
        # BUG FIX: parse_hf_url lives in cli.py and was never imported here,
        # so this method raised NameError. Import lazily to avoid the
        # circular dependency (cli.py imports cli_utils at module load).
        from .cli import parse_hf_url

        repo_ids = set()
        paths = []
        for url in urls:
            # validates submission_url format "hf://<repo>/<path>"
            repo_id, path = parse_hf_url(url)
            repo_ids.add(repo_id)
            paths.append(path)

        if len(repo_ids) > 1:
            raise Exception("All URLs must reference the same repo")

        return RepoPathsOfInterest(
            repo_id=repo_ids.pop(),
            relative_paths=list(set(paths)),
        )