diff --git a/src/agenteval/cli.py b/src/agenteval/cli.py index 130476a..16d6b93 100644 --- a/src/agenteval/cli.py +++ b/src/agenteval/cli.py @@ -5,25 +5,25 @@ import subprocess import sys import tempfile -from collections import defaultdict from datetime import datetime, timezone from io import BytesIO import click import datasets -from agenteval.leaderboard.schema_generator import load_dataset_features +from agenteval.leaderboard.schema_generator import check_lb_submissions_against_readme, load_dataset_features -from .cli_utils import AliasedChoice, generate_choice_help +from .cli_utils import AliasedChoice, RepoPathsOfInterest, generate_choice_help from .config import load_suite_config from .io import atomic_write_file -from .leaderboard.models import LeaderboardSubmission, Readme +from .leaderboard.models import InterventionPointer, LeaderboardSubmission, Readme from .leaderboard.upload import ( compress_model_usages, sanitize_path_component, upload_folder_to_hf, ) from .models import EvalConfig, SubmissionMetadata, TaskResults +from .interventions import check_lb_submission_for_edit_eligibility, convert_lb_submission, edit_lb_submission, LbSubmissionWithDetails, Registry from .score import process_eval_logs from .summary import compute_summary_statistics @@ -258,6 +258,223 @@ def score_command( cli.add_command(score_command) +@click.command(name="edit", help="TODO.") +@click.option("--registry", type=str, multiple=True, help="TODO.") +@click.option("--intervention", type=str, multiple=True, help="TODO.") +@click.argument("result_urls", nargs=-1, required=True, type=str) +def edit_command( + registry: tuple, + intervention: tuple, + result_urls: tuple[str, ...], +): + if not result_urls: + click.echo("At least one result URL is required.") + sys.exit(1) + + registry = Registry(list(registry)) + intervention_pointers = [InterventionPointer.from_str(p) for p in list(intervention)] + + with tempfile.TemporaryDirectory() as temp_dir: + from huggingface_hub import 
HfApi, snapshot_download + + local_current_results_dir = os.path.join(temp_dir, "current") + local_edited_results_dir = os.path.join(temp_dir, "edited") + + result_paths_of_interest = RepoPathsOfInterest.from_urls(result_urls) + result_repo_id = result_paths_of_interest.repo_id + result_paths = result_paths_of_interest.relative_paths + + # Download all input files in one shot + snapshot_download( + repo_id=result_repo_id, + repo_type="dataset", + allow_patterns=result_paths, + local_dir=local_current_results_dir, + ) + + all_edited_lb_submissions = [] + for result_path in result_paths: + local_current_result_path = os.path.join(local_current_results_dir, result_path) + with open(local_current_result_path) as f_current: + lb_submission = LeaderboardSubmission.model_validate(json.load(f_current)) + lb_submission_with_details = LbSubmissionWithDetails.mk(lb_submission, result_path) + + # edits the lb submission in place + edited_this_submission = edit_lb_submission( + lb_submission_with_details=lb_submission_with_details, + intervention_pointers=intervention_pointers, + registry=registry, + ) + if edited_this_submission: + all_edited_lb_submissions.append(lb_submission) + + os.makedirs( + os.path.join(local_edited_results_dir, os.path.dirname(result_path)), + exist_ok=True, + ) + with open( + os.path.join(local_edited_results_dir, result_path), + "w", + encoding="utf-8", + ) as f_edited: + f_edited.write(lb_submission.model_dump_json(indent=None)) + print(lb_submission.model_dump_json(indent=2)) + + # Validate the config with the schema in HF + if len(all_edited_lb_submissions) > 0: + try: + check_lb_submissions_against_readme(all_edited_lb_submissions, result_repo_id) + except Exception as exc: + click.echo(str(exc)) + sys.exit(1) + + # Upload all results files in one shot + click.echo(f"Uploading {len(result_paths)} results to {result_repo_id}...") + # hf_api = HfApi() + # hf_api.upload_folder( + # folder_path=local_edited_results_dir, + # path_in_repo="", + # 
repo_id=result_repo_id, + # repo_type="dataset", + # ) + click.echo("Done") + + +cli.add_command(edit_command) + + +@click.command(name="convert", help="TODO.") +@click.option("--registry", type=str, multiple=True, help="TODO.") +@click.option("--intervention", type=str, help="TODO.") +@click.argument("result_urls", nargs=-1, required=True, type=str) +def convert_command( + registry: tuple, + intervention: str, + result_urls: tuple[str, ...], +): + if not result_urls: + click.echo("At least one result URL is required.") + sys.exit(1) + + registry = Registry(list(registry)) + intervention_pointer = InterventionPointer.from_str(intervention) + + with tempfile.TemporaryDirectory() as temp_dir: + from huggingface_hub import HfApi, snapshot_download + + local_current_config_results_dir = os.path.join(temp_dir, "current") + local_new_config_results_dir = os.path.join(temp_dir, "new") + + result_paths_of_interest = RepoPathsOfInterest.from_urls(result_urls) + result_repo_id = result_paths_of_interest.repo_id + result_paths = result_paths_of_interest.relative_paths + + # Download all input files in one shot + snapshot_download( + repo_id=result_repo_id, + repo_type="dataset", + allow_patterns=result_paths, + local_dir=local_current_config_results_dir, + ) + + all_converted_lb_submissions = [] + for current_config_result_path in result_paths: + local_current_config_result_path = os.path.join(local_current_config_results_dir, current_config_result_path) + with open(local_current_config_result_path) as f_current: + lb_submission = LeaderboardSubmission.model_validate(json.load(f_current)) + lb_submission_with_details = LbSubmissionWithDetails.mk(lb_submission, current_config_result_path) + + # edits the lb submission in place + converted_this_submission = convert_lb_submission( + lb_submission_with_details=lb_submission_with_details, + intervention_pointer=intervention_pointer, + registry=registry, + ) + if converted_this_submission: + 
all_converted_lb_submissions.append(lb_submission) + new_config_result_path = lb_submission_with_details.submission_path.with_different_hf_config(lb_submission.suite_config.version).to_path() + + os.makedirs( + os.path.join(local_new_config_results_dir, os.path.dirname(new_config_result_path)), + exist_ok=True, + ) + with open( + os.path.join(local_new_config_results_dir, new_config_result_path), + "w", + encoding="utf-8", + ) as f_new: + f_new.write(lb_submission.model_dump_json(indent=None)) + + print(f"{current_config_result_path} -> {new_config_result_path}") + print(lb_submission.model_dump_json(indent=2)) + + # Validate the config with the schema in HF + if len(all_converted_lb_submissions) > 0: + try: + check_lb_submissions_against_readme(all_converted_lb_submissions, result_repo_id) + except Exception as exc: + click.echo(str(exc)) + sys.exit(1) + + # Upload all results files in one shot + click.echo(f"Uploading {len(result_paths)} results to {result_repo_id}...") + # hf_api = HfApi() + # hf_api.upload_folder( + # folder_path=local_new_config_results_dir, + # path_in_repo="", + # repo_id=result_repo_id, + # repo_type="dataset", + # ) + click.echo("Done") + + +cli.add_command(convert_command) + + +@click.command(name="check", help="TODO.") +@click.option("--registry", type=str, multiple=True, help="TODO.") +@click.option("--intervention", type=str, multiple=True, help="TODO.") +@click.argument("result_urls", nargs=-1, required=True, type=str) +def check_command( + registry: tuple, + intervention: tuple, + result_urls: tuple[str, ...], +): + if not result_urls: + click.echo("At least one result URL is required.") + sys.exit(1) + + registry = Registry(list(registry)) + intervention_pointers = [InterventionPointer.from_str(p) for p in list(intervention)] + + with tempfile.TemporaryDirectory() as temp_dir: + from huggingface_hub import HfApi, snapshot_download + + local_results_dir = os.path.join(temp_dir, "results") + + result_paths_of_interest = 
RepoPathsOfInterest.from_urls(result_urls) + result_repo_id = result_paths_of_interest.repo_id + result_paths = result_paths_of_interest.relative_paths + + # Download all input files in one shot + snapshot_download( + repo_id=result_repo_id, + repo_type="dataset", + allow_patterns=result_paths, + local_dir=local_results_dir, + ) + + for result_path in result_paths: + local_result_path = os.path.join(local_results_dir, result_path) + with open(local_result_path) as f_current: + lb_submission = LeaderboardSubmission.model_validate(json.load(f_current)) + lb_submission_with_details = LbSubmissionWithDetails.mk(lb_submission, result_path) + check_lb_submission_for_edit_eligibility(lb_submission_with_details, intervention_pointers, registry) + + +cli.add_command(check_command) + + @click.command( name="publish", help="Upload Inspect logs to HuggingFace for official scoring", @@ -474,6 +691,99 @@ def backfill_command(results_repo_id, submissions_repo_id, submission_path): cli.add_command(backfill_command) +def publish_lb_results_helper( + repo_paths_of_interest: RepoPathsOfInterest, + local_new_results_dir: str, + temp_dir: str, + registry: Registry, + counter: int +): + # The idea is, when you're about to publish a result file, + # look at whether there's already a result file under the same path, + # and take into account any interventions that were applied to + # that result file. 
+ from huggingface_hub import HfApi, snapshot_download + + # local_new_results_dir starting off has results we want to upload + # that may need still need to have edits applied first, and that + # also may indicate that corresponding results converted to different + # configs should also be uploaded + # local_existing_results_dir will have any existing corresponding + # result files under the same configs as files in local_new_results_dir + local_existing_results_dir = os.path.join(temp_dir, f"existingresults{counter}") + + # download any existing result files that correspond to the new ones we want to + # upload, so we can see if any edits or conversions were applied to them, + # so we can apply the same things to the new result files + snapshot_download( + repo_id=repo_paths_of_interest.repo_id, + repo_type="dataset", + allow_patterns=repo_paths_of_interest.relative_paths, + local_dir=local_existing_results_dir, + ) + + # we'll update files in local_new_results_dir to have edited results + all_current_config_lb_submissions = apply_existing_edits_to_result_files( + repo_paths_of_interest=repo_paths_of_interest, + local_new_results_dir=local_new_results_dir, + local_existing_results_dir=local_existing_results_dir, + registry=registry, + ) + + # then we'll push them to the results repo + if len(all_current_config_lb_submissions) > 0: + try: + check_lb_submissions_against_readme(all_current_config_lb_submissions, repo_paths_of_interest.repo_id) + except Exception as exc: + click.echo(str(exc)) + sys.exit(1) + + # Upload all results files in one shot + click.echo(f"Uploading {len(repo_paths_of_interest.relative_paths)} results to {repo_paths_of_interest.repo_id}...") + # hf_api = HfApi() + # hf_api.upload_folder( + # folder_path=local_new_results_dir, + # path_in_repo="", + # repo_id=repo_paths_of_interest.repo_id, + # repo_type="dataset", + # ) + + # local_converted_results_dir is where we'll put corresponding + # result files under new configs when needed + 
local_converted_results_dir = os.path.join(temp_dir, f"convertedresults{counter}") + + # We'll do conversions off edited results. + # Converted results will go under local_converted_results_dir, under new paths. + # We figure out if we need conversions based on the corresponding + # existing results that we already pulled. + new_config_paths_of_interest = apply_existing_conversions_to_result_files( + repo_paths_of_interest=repo_paths_of_interest, + local_new_results_dir=local_new_results_dir, + local_existing_results_dir=local_existing_results_dir, + local_converted_results_dir=local_converted_results_dir, + registry=registry, + ) + + # Call publish_lb_results() instead of writing converted results directly, + # because we want to make sure that we take into account any interventions + # applied to existing converted result files. + if len(new_config_paths_of_interest): + publish_lb_results_helper( + repo_paths_of_interest=RepoPathsOfInterest( + repo_id=repo_paths_of_interest.repo_id, + relative_paths=new_config_paths_of_interest + ), + local_new_results_dir=local_converted_results_dir, + temp_dir=temp_dir, + registry=registry, + counter=counter+1, + ) + + else: + # If there are no conversions to do, we stop here. 
+ click.echo("Done") + + @click.command( name="publish", help="Publish scored results in log_dir to HuggingFace leaderboard.", @@ -498,22 +808,9 @@ def publish_lb_command(repo_id: str, submission_urls: tuple[str, ...]): hf_api = HfApi() - submission_repo_ids = set() - submission_paths = [] - - # Validate URLs - for submission_url in submission_urls: - submission_repo_id, submission_path = parse_hf_url( - submission_url - ) # validates submission_url format "hf:///" - submission_repo_ids.add(submission_repo_id) - submission_paths.append(submission_path) - - if len(submission_repo_ids) > 1: - click.echo("All submission URLs must reference the same repo") - sys.exit(1) - - submission_repo_id = submission_repo_ids.pop() + submission_paths_of_interest = RepoPathsOfInterest.from_urls(submission_urls) + submission_repo_id = submission_paths_of_interest.repo_id + submission_paths = submission_paths_of_interest.relative_paths eval_config_rel_paths = [ f"{p}/{EVAL_CONFIG_FILENAME}" for p in submission_paths @@ -535,10 +832,8 @@ def publish_lb_command(repo_id: str, submission_urls: tuple[str, ...]): local_dir=local_submissions_dir, ) + all_result_paths = [] # Create results files locally - config_splits = defaultdict( - list - ) # Accumulate config names and splits being published for ( submission_url, submission_path, @@ -579,7 +874,6 @@ def publish_lb_command(repo_id: str, submission_urls: tuple[str, ...]): eval_config = EvalConfig.model_validate_json( open(local_eval_config_path).read() ) - config_splits[eval_config.suite_config.version].append(eval_config.split) results = TaskResults.model_validate_json( open(local_scores_path).read() ).results @@ -605,46 +899,21 @@ def publish_lb_command(repo_id: str, submission_urls: tuple[str, ...]): ) as f: f.write(lb_submission.model_dump_json(indent=None)) - # Validate the config with the schema in HF - readme = Readme.download_and_parse(repo_id) - missing_configs = list(set(config_splits.keys()) - set(readme.configs.keys())) - if 
missing_configs: - click.echo( - f"Config name {missing_configs} not present in hf://{repo_id}/README.md" - ) - click.echo( - f"Run 'update_readme.py add-config --repo-id {repo_id} --config-name {missing_configs[0]}' to add it" - ) - sys.exit(1) - missing_splits = list( - set(((c, s) for c in config_splits.keys() for s in config_splits[c])) - - set(((c, s) for c in readme.configs.keys() for s in readme.configs[c])) - ) - if missing_splits: - click.echo( - f"Config/Split {missing_splits} not present in hf://{repo_id}/README.md" - ) - click.echo( - f"Run 'update_readme.py add-config --repo-id {repo_id} --config-name {missing_splits[0][0]} --split {missing_splits[0][1]}` to add it" - ) - sys.exit(1) - local_features = load_dataset_features() - if local_features.arrow_schema != readme.features.arrow_schema: - click.echo( - "Schema in local dataset_features.yml does not match schema in hf://{repo_id}/README.md" - ) - click.echo("Run 'update_readme.py sync-schema' to update it") - sys.exit(1) + all_result_paths.append(f"{submission_path}.json") - # Upload all results files in one shot - click.echo(f"Uploading {len(submission_paths)} results to {repo_id}...") - hf_api.upload_folder( - folder_path=local_results_dir, - path_in_repo="", - repo_id=repo_id, - repo_type="dataset", + # This will upload the results, taking into account any + # interventions that have been applied to exisitng result files + # under the same paths. 
+ publish_lb_results_helper( + repo_paths_of_interest=RepoPathsOfInterest( + repo_id=repo_id, + relative_paths=all_result_paths, + ), + local_new_results_dir=local_results_dir, + temp_dir=temp_dir, + registry=registry, + counter=0, ) - click.echo("Done") @click.group(name="lb", help="Leaderboard related commands") diff --git a/src/agenteval/cli_utils.py b/src/agenteval/cli_utils.py index 0dfe8c7..bd27ba6 100644 --- a/src/agenteval/cli_utils.py +++ b/src/agenteval/cli_utils.py @@ -1,4 +1,5 @@ import click +from dataclasses import dataclass def generate_choice_help(mapping, base_help=""): @@ -33,3 +34,30 @@ def convert(self, value, param, ctx): def get_missing_message(self, param): formatted_choices = ", ".join(f"{k} ({v})" for k, v in self.choices_map.items()) return f"Choose from: {formatted_choices}" + + +@dataclass +class RepoPathsOfInterest: + repo_id: str + relative_paths: list[str] + + @staticmethod + def from_urls(urls: list[str]): + repo_ids = set() + paths = [] + + for url in urls: + # validates submission_url format "hf:///" + repo_id, path = parse_hf_url(url) + repo_ids.add(repo_id) + paths.append(path) + + if len(repo_ids) > 1: + raise Exception("All URLs must reference the same repo") + + repo_id_to_use = repo_ids.pop() + + return RepoPathsOfInterest( + repo_id=repo_id_to_use, + relative_paths=list(set(paths)), + ) diff --git a/src/agenteval/config.py b/src/agenteval/config.py index df51e5b..abd450d 100644 --- a/src/agenteval/config.py +++ b/src/agenteval/config.py @@ -86,6 +86,14 @@ def get_split(self, split_name: str) -> Split: f"Split '{split_name}' not found. 
Available splits: {available_splits}" ) + def get_tasks_by_name(self, split_name: str) -> dict[str, Task]: + tasks = {} + for task in self.get_tasks(split_name): + task_name = task.name + assert task_name not in tasks + tasks[task_name] = task + return tasks + def load_suite_config(file_path: str) -> SuiteConfig: """ diff --git a/src/agenteval/interventions.py b/src/agenteval/interventions.py new file mode 100644 index 0000000..0447669 --- /dev/null +++ b/src/agenteval/interventions.py @@ -0,0 +1,273 @@ +import json +import os +from dataclasses import dataclass +from importlib import import_module +from typing import Callable + +from pydantic import BaseModel + +from agenteval.leaderboard.models import InterventionPointer, LeaderboardSubmission +from agenteval.cli_utils import RepoPathsOfInterest + + +EDIT_INTERVENTION_KIND = "edit" +CONVERSION_INTERVENTION_KIND = "conversion" + + +@dataclass +class WithinRepoPath: + hf_config: str + split: str + end: str + + def submission_name(self) -> str: + suffix = ".json" + if self.end.endswith(suffix): + submission_name = self.end[:-len(suffix)] + else: + submission_name = self.end + return submission_name + + @staticmethod + def from_path(path: str, sep: str = "/"): + [hf_config, split, end] = path.split(sep) + return WithinRepoPath( + hf_config=hf_config, + split=split, + end=end, + ) + + def to_path(self, sep: str = "/") -> str: + return sep.join([self.hf_config, self.split, self.end]) + + def with_different_hf_config(self, new_hf_config: str): + return WithinRepoPath( + hf_config=new_hf_config, + split=self.split, + end=self.end, + ) + + +class LbSubmissionWithDetails(BaseModel): + lb_submission: LeaderboardSubmission + submission_path: WithinRepoPath + + @staticmethod + def mk(lb_submission: LeaderboardSubmission, submission_path: str): + return LbSubmissionWithDetails( + lb_submission=lb_submission, + submission_path=WithinRepoPath.from_path(submission_path), + ) + + +class Intervention: + def __init__( + self, + eligible: 
Callable[[LbSubmissionWithDetails], bool], + transform: Callable[[LeaderboardSubmission], bool] + ): + self._eligible = eligible + self._transform = transform + + def eligible(self, submission_with_details: LbSubmissionWithDetails) -> bool: + return self._eligible(submission_with_details) + + def transform(self, lb_submission: LeaderboardSubmission) -> bool: + return self._transform(lb_submission) + + +# intervention kind -> config name -> intervention name -> Intervention +INTERVENTIONS: dict[str, dict[str, dict[str, Intervention]]] = { + EDIT_INTERVENTION_KIND: {}, + CONVERSION_INTERVENTION_KIND: {}, +} + + +@dataclass +class RegistryPointer: + registry: str + name: str + + @staticmethod + def from_str(a_str): + sep = ":" + [registry, name] = a_str.split(sep) + return RegistryPointer(registry=registry, name=name) + + +class Registry: + def __init__(self, registry_pointer_strs: list[str]): + self.registry = {"agenteval": INTERVENTIONS} + + registry_pointers = [RegistryPointer.from_str(p) for p in registry_pointer_strs] + for pointer in registry_pointers: + assert pointer.registry not in self.registry, "Multiple registry entries with the same name." 
+ self.registry[pointer.registry] = import_module(pointer.name).INTERVENTIONS + + def find_intervention(self, intervention_kind: str, config_name: str, pointer: InterventionPointer): + return self.registry.get(pointer.registry, {}).get(intervention_kind, {}).get(config_name, {}).get(pointer.name) + + +def edit_lb_submission( + lb_submission_with_details: LbSubmissionWithDetails, + intervention_pointers: list[InterventionPointer], + registry: Registry, +) -> bool: + edited_this_lb_submission = False + for intervention_pointer in intervention_pointers: + + maybe_edit = registry.find_intervention( + intervention_kind="edit", + config_name=lb_submission_with_details.lb_submission.suite_config.version, + pointer=intervention_pointer, + ) + if (maybe_edit is not None) : + if maybe_edit.eligible(lb_submission_with_details): + applied_one_edit = maybe_edit.transform(lb_submission_with_details.lb_submission) + if applied_one_edit: + lb_submission_with_details.lb_submission.add_edit(intervention_pointer) + edited_this_lb_submission = edited_this_lb_submission or applied_one_edit + else: + print(f"{lb_submission_with_details.submission_path} is not eligble for the {intervention_pointer} edit.") + + else: + print(f"Unable to find edit {intervention_pointer}.") + + return edited_this_lb_submission + + +def convert_lb_submission( + lb_submission_with_details: LbSubmissionWithDetails, + intervention_pointer: InterventionPointer, + registry: Registry, +) -> bool: + converted_this_lb_submission = False + maybe_conversion = registry.find_intervention( + intervention_kind="conversion", + config_name=lb_submission_with_details.lb_submission.suite_config.version, + pointer=intervention_pointer, + ) + if (maybe_conversion is not None) : + if maybe_conversion.eligible(lb_submission_with_details): + converted_this_lb_submission = maybe_conversion.transform(lb_submission_with_details.lb_submission) + if converted_this_lb_submission: + 
lb_submission_with_details.lb_submission.add_conversion(intervention_pointer) + else: + print(f"{lb_submission_with_details.submission_path} is not eligble for the {intervention_pointer} conversion.") + + else: + print(f"Unable to find conversion {intervention_pointer}.") + + return converted_this_lb_submission + + +def check_lb_submission_for_edit_eligibility( + lb_submission_with_details: LbSubmissionWithDetails, + intervention_pointers: list[InterventionPointer], + registry: Registry, +) -> bool: + for intervention_pointer in intervention_pointers: + maybe_edit = registry.find_intervention( + intervention_kind="edit", + config_name=lb_submission_with_details.lb_submission.suite_config.version, + pointer=intervention_pointer, + ) + if (maybe_edit is not None) : + if maybe_edit.eligible(lb_submission_with_details): + print(f"{lb_submission_with_details.submission_path} is eligble for the {intervention_pointer} edit.") + else: + print(f"Unable to find edit {intervention_pointer}.") + + +def apply_existing_edits_to_result_files( + repo_paths_of_interest: RepoPathsOfInterest, + local_new_results_dir: str, + local_existing_results_dir: str, + registry: Registry, +): + all_current_config_lb_submissions = [] + for current_config_result_path in repo_paths_of_interest.relative_paths: + local_current_config_new_result_path = os.path.join(local_new_results_dir, current_config_result_path) + with open(local_current_config_new_result_path) as f_current_config_new: + lb_submission_current_config_new = LeaderboardSubmission.model_validate(json.load(f_current_config_new)) + lb_submission_with_details_current_config_new = LbSubmissionWithDetails.mk(lb_submission_current_config_new, current_config_result_path) + + local_current_config_existing_result_path = os.path.join(local_existing_results_dir, current_config_result_path) + if os.path.isfile(local_current_config_existing_result_path): + with open(local_current_config_existing_result_path) as f_current_config_existing: + 
lb_submission_current_config_existing = LeaderboardSubmission.model_validate(json.load(f_current_config_existing)) + + if lb_submission_current_config_existing.has_edits(): + edit_pointers = [e.pointer for e in lb_submission_current_config_existing.interventions.edits] + + # edits the lb submission in place + edited_this_submission = edit_lb_submission( + lb_submission_with_details=lb_submission_with_details_current_config_new, + intervention_pointers=edit_pointers, + registry=registry, + ) + if edited_this_submission: + with open( + local_current_config_new_result_path, + "w", + encoding="utf-8", + ) as f_current_config_new_post_edits: + f_current_config_new_post_edits.write(lb_submission_current_config_new.model_dump_json(indent=None)) + print(lb_submission_current_config_new.model_dump_json(indent=2)) + + # whether we applied edits or not, we still want to upload all these new result files + all_current_config_lb_submissions.append(lb_submission_current_config_new) + + return all_current_config_lb_submissions + + +def apply_existing_conversions_to_result_files( + repo_paths_of_interest: RepoPathsOfInterest, + local_new_results_dir: str, + local_existing_results_dir: str, + local_converted_results_dir: str, + registry: Registry, +): + new_config_paths_of_interest = [] + for current_config_result_path in repo_paths_of_interest.relative_paths: + local_current_config_existing_result_path = os.path.join(local_existing_results_dir, current_config_result_path) + if os.path.isfile(local_current_config_existing_result_path): + with open(local_current_config_existing_result_path) as f_current_config_existing: + lb_submission_current_config_existing = LeaderboardSubmission.model_validate(json.load(f_current_config_existing)) + + if lb_submission_current_config_existing.has_conversions(): + conversion_pointers = [c.pointer for c in lb_submission_current_config_existing.interventions.conversions] + + for conversion_pointer in conversion_pointers: + # reopen every time. 
don't reuse lb_submission_new_with_edits instances + because they are converted to a different config in place + local_current_config_new_result_path = os.path.join(local_new_results_dir, current_config_result_path) + with open(local_current_config_new_result_path) as f_current_config_new_with_edits: + lb_submission_new_with_edits = LeaderboardSubmission.model_validate(json.load(f_current_config_new_with_edits)) + lb_submission_with_details_new_with_edits = LbSubmissionWithDetails.mk(lb_submission_new_with_edits, current_config_result_path) + + # edits the lb submission in place + converted_this_submission = convert_lb_submission( + lb_submission_with_details=lb_submission_with_details_new_with_edits, + intervention_pointer=conversion_pointer, + registry=registry, + ) + if converted_this_submission: + new_config_result_path = lb_submission_with_details_new_with_edits.submission_path.with_different_hf_config(lb_submission_new_with_edits.suite_config.version).to_path() + new_config_paths_of_interest.append(new_config_result_path) + + os.makedirs( + os.path.join(local_converted_results_dir, os.path.dirname(new_config_result_path)), + exist_ok=True, + ) + with open( + os.path.join(local_converted_results_dir, new_config_result_path), + "w", + encoding="utf-8", + ) as f_new_config: + f_new_config.write(lb_submission_new_with_edits.model_dump_json(indent=None)) + + print(f"{current_config_result_path} -> {new_config_result_path}") + print(lb_submission_new_with_edits.model_dump_json(indent=2)) + + return new_config_paths_of_interest diff --git a/src/agenteval/leaderboard/models.py b/src/agenteval/leaderboard/models.py index 93a8751..2257f0d 100644 --- a/src/agenteval/leaderboard/models.py +++ b/src/agenteval/leaderboard/models.py @@ -1,14 +1,52 @@ import re from dataclasses import dataclass +from datetime import datetime, timezone from io import BytesIO import yaml from datasets import Features -from pydantic import BaseModel, Field +from pydantic import BaseModel, 
ConfigDict, Field from ..models import SubmissionMetadata, SuiteConfig, TaskResult +class InterventionPointer(BaseModel): + registry: str + name: str + + @staticmethod + def from_str(a_str: str): + sep = ":" + [registry, name] = a_str.split(sep) + return InterventionPointer(registry=registry, name=name) + + +class AppliedIntervention(BaseModel): + pointer: InterventionPointer + applied: datetime + + +class Interventions(BaseModel): + edits: list[AppliedIntervention] | None + conversions: list[AppliedIntervention] | None + + def add_edit(self, pointer: InterventionPointer): + if self.edits is None: + self.edits = [] + self.edits.append(AppliedIntervention(pointer=pointer, applied=datetime.now(timezone.utc))) + + def has_edits(self): + return (self.edits is not None) and (len(self.edits) > 0) + + def add_conversion(self, pointer: InterventionPointer): + if self.conversions is None: + self.conversions = [] + self.conversions.append(AppliedIntervention(pointer=pointer, applied=datetime.now(timezone.utc))) + + def has_conversions(self): + return (self.conversions is not None) and (len(self.conversions) > 0) + + class LeaderboardSubmission(BaseModel): suite_config: SuiteConfig """Task configuration for the results.""" @@ -19,6 +57,24 @@ class LeaderboardSubmission(BaseModel): results: list[TaskResult] | None = None submission: SubmissionMetadata = Field(default_factory=SubmissionMetadata) + interventions: Interventions | None = None + + def add_edit(self, pointer: InterventionPointer): + if self.interventions is None: + self.interventions = Interventions(edits=[], conversions=None) + self.interventions.add_edit(pointer) + + def has_edits(self): + return (self.interventions is not None) and self.interventions.has_edits() + + def add_conversion(self, pointer: InterventionPointer): + if self.interventions is None: + self.interventions = Interventions(edits=None, conversions=[]) + self.interventions.add_conversion(pointer) + + def has_conversions(self): + return 
(self.interventions is not None) and self.interventions.has_conversions() + @dataclass class Readme: diff --git a/src/agenteval/leaderboard/schema_generator.py b/src/agenteval/leaderboard/schema_generator.py index a2dcd42..4e654f1 100644 --- a/src/agenteval/leaderboard/schema_generator.py +++ b/src/agenteval/leaderboard/schema_generator.py @@ -4,6 +4,7 @@ import datetime import types +from collections import defaultdict from importlib import resources from typing import Any, Literal, Union, get_args, get_origin @@ -12,7 +13,7 @@ from datasets import Features from pydantic import BaseModel -from .models import LeaderboardSubmission +from .models import LeaderboardSubmission, Readme def _pa_type_for_annotation(anno) -> pa.DataType: @@ -125,3 +126,42 @@ def load_dataset_features(input_path: str | None = None) -> Features: with open(input_path, "r", encoding="utf-8") as f: yaml_values = yaml.safe_load(f) return Features._from_yaml_list(yaml_values) + + +def check_lb_submissions_against_readme( + lb_submissions: list[LeaderboardSubmission], + repo_id: str, +): + config_splits = defaultdict( + list + ) # Accumulate config names and splits being published + for lb_submission in lb_submissions: + config_splits[lb_submission.suite_config.version].append(lb_submission.split) + + readme = Readme.download_and_parse(repo_id) + missing_configs = list(set(config_splits.keys()) - set(readme.configs.keys())) + if missing_configs: + message_for_exc = ( + f"Config name {missing_configs} not present in hf://{repo_id}/README.md\n" + f"Run 'update_readme.py add-config --repo-id {repo_id} --config-name {missing_configs[0]}' to add it" + ) + raise Exception(message_for_exc) + + missing_splits = list( + set(((c, s) for c in config_splits.keys() for s in config_splits[c])) + - set(((c, s) for c in readme.configs.keys() for s in readme.configs[c])) + ) + if missing_splits: + message_for_exc = ( + f"Config/Split {missing_splits} not present in hf://{repo_id}/README.md\n" + f"Run 'update_readme.py 
add-config --repo-id {repo_id} --config-name {missing_splits[0][0]} --split {missing_splits[0][1]}' to add it" + ) + raise Exception(message_for_exc) + + local_features = load_dataset_features() + if local_features.arrow_schema != readme.features.arrow_schema: + message_for_exc = ( + f"Schema in local dataset_features.yml does not match schema in hf://{repo_id}/README.md\n" + "Run 'update_readme.py sync-schema' to update it" + ) + raise Exception(message_for_exc) diff --git a/src/agenteval/score.py b/src/agenteval/score.py index f4fa2d8..76e9978 100644 --- a/src/agenteval/score.py +++ b/src/agenteval/score.py @@ -96,6 +96,9 @@ class TaskResult(BaseModel): model_costs: list[float | None] | None = None """List of model costs per sample. Computed from `model_usages`.""" + + def available_metrics(self) -> set[str]: + return set([m.name for m in self.metrics]) def get_metrics(log: EvalLog) -> list[Metric]: