diff --git a/codegen-examples/examples/swebench_agent_run/README.md b/codegen-examples/examples/swebench_agent_run/README.md index c6dc48676..277578f16 100644 --- a/codegen-examples/examples/swebench_agent_run/README.md +++ b/codegen-examples/examples/swebench_agent_run/README.md @@ -27,7 +27,8 @@ Options: --use-existing-preds TEXT The run ID of the existing predictions to use. - --dataset [lite|full|verified] The dataset to use. + --dataset [lite|full|verified|lite_small|lite_medium|lite_large] + The dataset to use. --length INTEGER The number of examples to process. --instance-id TEXT The instance ID of the example to process. --repo TEXT The repo to use. diff --git a/codegen-examples/examples/swebench_agent_run/constants.py b/codegen-examples/examples/swebench_agent_run/constants.py new file mode 100644 index 000000000..138669e96 --- /dev/null +++ b/codegen-examples/examples/swebench_agent_run/constants.py @@ -0,0 +1,12 @@ +from codegen.extensions.swebench.enums import SWEBenchDataset +from codegen.extensions.swebench.enums import SWEBenchLiteSubset + + +DATASET_DICT = { + "lite": SWEBenchDataset.LITE, + "full": SWEBenchDataset.FULL, + "verified": SWEBenchDataset.VERIFIED, + "lite_small": SWEBenchLiteSubset.LITE_SMALL, + "lite_medium": SWEBenchLiteSubset.LITE_MEDIUM, + "lite_large": SWEBenchLiteSubset.LITE_LARGE, +} diff --git a/codegen-examples/examples/swebench_agent_run/run_eval.py b/codegen-examples/examples/swebench_agent_run/run_eval.py index 9a86135d8..b38041c88 100644 --- a/codegen-examples/examples/swebench_agent_run/run_eval.py +++ b/codegen-examples/examples/swebench_agent_run/run_eval.py @@ -6,8 +6,10 @@ import modal import click import time +from codegen.extensions.swebench.enums import SWEBenchDataset, SWEBenchLiteSubset +from constants import DATASET_DICT from codegen.extensions.swebench.harness import run_agent_on_entry -from codegen.extensions.swebench.utils import SWEBenchDataset, SweBenchExample, get_swe_bench_examples +from codegen.extensions.swebench.utils import SweBenchExample, get_swe_bench_examples from codegen.extensions.swebench.report import generate_report from codegen.sdk.core.codebase import Codebase @@ -280,13 +282,8 @@ async def run_eval( run_id = use_existing_preds or str(uuid.uuid4()) print(f"Run ID: {run_id}") predictions_dir = PREDS_DNAME / f"results_{run_id}" - dataset_dict = { - "lite": SWEBenchDataset.LITE, - "full": SWEBenchDataset.FULL, - "verified": SWEBenchDataset.VERIFIED, - } - dataset_enum = dataset_dict[dataset] + dataset_enum = DATASET_DICT[dataset] examples = get_swe_bench_examples(dataset=dataset_enum, length=length, instance_id=instance_id, repo=repo) try: @@ -345,6 +342,8 @@ async def run_eval( for error_type, count in summary["error_types"].items(): print(f" {error_type}: {count}") + if isinstance(dataset_enum, SWEBenchLiteSubset): + dataset_enum = SWEBenchDataset.LITE # Generate Report on Modal generate_report(predictions_dir, LOG_DIR, dataset_enum, run_id) except Exception: @@ -355,7 +354,7 @@ async def run_eval( @click.command() @click.option("--use-existing-preds", help="The run ID of the existing predictions to use.", type=str, default=None) -@click.option("--dataset", help="The dataset to use.", type=click.Choice(["lite", "full", "verified"]), default="lite") +@click.option("--dataset", help="The dataset to use.", type=click.Choice(["lite", "full", "verified", "lite_small", "lite_medium", "lite_large"]), default="lite") @click.option("--length", help="The number of examples to process.", type=int, default=10) @click.option("--instance-id", help="The instance ID of the example to process.", type=str, default=None) @click.option("--local", help="Run the evaluation locally.", is_flag=True, default=False) diff --git a/src/codegen/agents/code_agent.py b/src/codegen/agents/code_agent.py index 157df294d..d02cd08a2 100644 --- a/src/codegen/agents/code_agent.py +++ b/src/codegen/agents/code_agent.py @@ -35,6 +35,7 @@ def __init__( tools: Optional[list[BaseTool]] = None, run_id: Optional[str] = None, instance_id: Optional[str] = None, + difficulty: Optional[int] = None, **kwargs, ): """Initialize a CodeAgent. @@ -64,6 +65,7 @@ def __init__( self.langsmith_client = Client() self.run_id = run_id self.instance_id = instance_id + self.difficulty = difficulty # Get project name from environment variable or use a default self.project_name = os.environ.get("LANGCHAIN_PROJECT", "RELACE") @@ -93,16 +95,7 @@ def run(self, prompt: str, thread_id: Optional[str] = None) -> str: # this message has a reducer which appends the current message to the existing history # see more https://langchain-ai.github.io/langgraph/concepts/low_level/#reducers input = {"messages": [("user", prompt)]} - metadata = {"project": self.project_name, "model": self.model_name} - tags = [self.model_name] - # Add SWEBench run ID and instance ID to the metadata and tags for filtering - if self.run_id is not None: - metadata["swebench_run_id"] = self.run_id - tags.append(self.run_id) - - if self.instance_id is not None: - metadata["swebench_instance_id"] = self.instance_id - tags.append(self.instance_id) + tags, metadata = self.get_tags_metadata() config = RunnableConfig(configurable={"thread_id": thread_id}, tags=tags, metadata=metadata, recursion_limit=100) # we stream the steps instead of invoke because it allows us to access intermediate nodes @@ -165,3 +158,21 @@ def get_tools(self) -> list[BaseTool]: def get_state(self) -> dict: return self.agent.get_state(self.config) + + def get_tags_metadata(self) -> tuple[list[str], dict]: + tags = [self.model_name] + metadata = {"project": self.project_name, "model": self.model_name} + # Add SWEBench run ID and instance ID to the metadata and tags for filtering + if self.run_id is not None: + metadata["swebench_run_id"] = self.run_id + tags.append(self.run_id) + + if self.instance_id is not None: + metadata["swebench_instance_id"] = self.instance_id + tags.append(self.instance_id) + + if self.difficulty is not None: + metadata["swebench_difficulty"] = self.difficulty + tags.append(f"difficulty_{self.difficulty}") + + return tags, metadata diff --git a/src/codegen/extensions/swebench/enums.py b/src/codegen/extensions/swebench/enums.py new file mode 100644 index 000000000..0cf3a484a --- /dev/null +++ b/src/codegen/extensions/swebench/enums.py @@ -0,0 +1,13 @@ +from enum import Enum + + +class SWEBenchDataset(Enum): + LITE = "princeton-nlp/SWE-bench_Lite" + FULL = "princeton-nlp/SWE-bench" + VERIFIED = "princeton-nlp/SWE-bench-verified" + + +class SWEBenchLiteSubset(Enum): + LITE_SMALL = "lite_small" + LITE_MEDIUM = "lite_medium" + LITE_LARGE = "lite_large" diff --git a/src/codegen/extensions/swebench/harness.py b/src/codegen/extensions/swebench/harness.py index 90cc9e017..dbf9f6b9b 100644 --- a/src/codegen/extensions/swebench/harness.py +++ b/src/codegen/extensions/swebench/harness.py @@ -70,7 +70,7 @@ def run_agent_on_entry(entry: SweBenchExample, model: str, codebase: Codebase | ) codebase = Codebase.from_repo(repo_full_name=entry.repo, commit=base_commit, language="python", config=config) # check out the repo - agent = CodeAgent(codebase=codebase, run_id=run_id, instance_id=instance_id, model_name=model) + agent = CodeAgent(codebase=codebase, run_id=run_id, instance_id=instance_id, model_name=model, difficulty=entry.difficulty) pprint.pprint(instance_id) pprint.pprint(gold_files) diff --git a/src/codegen/extensions/swebench/report.py b/src/codegen/extensions/swebench/report.py index a373e87e1..f8100e36d 100755 --- a/src/codegen/extensions/swebench/report.py +++ b/src/codegen/extensions/swebench/report.py @@ -5,8 +5,8 @@ from collections import defaultdict from pathlib import Path +from codegen.extensions.swebench.enums import SWEBenchDataset from codegen.extensions.swebench.tests import remove_patches_to_tests -from codegen.extensions.swebench.utils import SWEBenchDataset NUM_EVAL_PROCS = 5 diff --git a/src/codegen/extensions/swebench/subsets.py b/src/codegen/extensions/swebench/subsets.py new file mode 100644 index 000000000..a2f522ffe --- /dev/null +++ b/src/codegen/extensions/swebench/subsets.py @@ -0,0 +1,146 @@ +from codegen.extensions.swebench.enums import SWEBenchLiteSubset + +SMALL_LITE_SUBSET = [ + "mwaskom__seaborn-2848", + "sphinx-doc__sphinx-8627", + "sphinx-doc__sphinx-7975", + "django__django-17087", + "sympy__sympy-17655", + "matplotlib__matplotlib-26020", + "sympy__sympy-20154", + "scikit-learn__scikit-learn-13439", + "pytest-dev__pytest-7373", + "django__django-16527", +] + +MEDIUM_LITE_SUBSET = [ + "sympy__sympy-15346", + "sympy__sympy-16281", + "sympy__sympy-22840", + "pytest-dev__pytest-7220", + "django__django-12284", + "pytest-dev__pytest-7490", + "matplotlib__matplotlib-25442", + "django__django-13757", + "django__django-15790", + "sympy__sympy-18532", + "sympy__sympy-13471", + "scikit-learn__scikit-learn-15535", + "django__django-13447", + "django__django-15789", + "scikit-learn__scikit-learn-14894", + "django__django-14238", + "django__django-10914", + "pytest-dev__pytest-11143", + "django__django-16255", + "django__django-13658", +] + +LARGE_LITE_SUBSET = [ + "pytest-dev__pytest-5495", + "django__django-11797", + "django__django-14730", + "scikit-learn__scikit-learn-25500", + "sphinx-doc__sphinx-8506", + "django__django-16408", + "django__django-16910", + "sympy__sympy-12236", + "matplotlib__matplotlib-24265", + "django__django-15320", + "matplotlib__matplotlib-25311", + "django__django-12125", + "django__django-12747", + "matplotlib__matplotlib-24334", + "scikit-learn__scikit-learn-14983", + "scikit-learn__scikit-learn-13497", + "django__django-14580", + "pylint-dev__pylint-6506", + "matplotlib__matplotlib-23987", + "scikit-learn__scikit-learn-13497", + "django__django-14017", + "django__django-15213", + "django__django-12284", + "pylint-dev__pylint-7114", + "django__django-11422", + "django__django-11620", + "django__django-12284", + "sympy__sympy-13971", + "django__django-12284", + "sphinx-doc__sphinx-7975", + "scikit-learn__scikit-learn-15512", + "scikit-learn__scikit-learn-15512", + "pylint-dev__pylint-7993", + "django__django-12184", + "django__django-13315", + "sympy__sympy-15609", + "pylint-dev__pylint-7993", + "sympy__sympy-17022", + "pylint-dev__pylint-7993", + "sympy__sympy-15678", + "sympy__sympy-18057", + "sympy__sympy-17655", + "sympy__sympy-17655", + "django__django-13028", + "sympy__sympy-17139", + "django__django-14999", + "django__django-15790", + "scikit-learn__scikit-learn-11281", + "astropy__astropy-12907", + "django__django-11815", + "sympy__sympy-18621", + "django__django-11999", + "sphinx-doc__sphinx-8721", + "matplotlib__matplotlib-23314", + "sphinx-doc__sphinx-8721", + "sympy__sympy-18621", + "django__django-12497", + "scikit-learn__scikit-learn-13584", + "matplotlib__matplotlib-24970", + "scikit-learn__scikit-learn-13584", + "django__django-12453", + "sympy__sympy-20154", + "django__django-13447", + "sphinx-doc__sphinx-8595", + "sympy__sympy-20154", + "sympy__sympy-20154", + "django__django-12700", + "psf__requests-2317", + "django__django-16046", + "sympy__sympy-20154", + "sympy__sympy-20212", + "django__django-13710", + "sympy__sympy-13647", + "django__django-15851", + "scikit-learn__scikit-learn-14894", + "sympy__sympy-24213", + "scikit-learn__scikit-learn-13779", + "django__django-13710", + "django__django-13933", + "sympy__sympy-20212", + "django__django-14855", + "django__django-11039", + "django__django-16379", + "pydata__xarray-5131", + "pytest-dev__pytest-7373", + "django__django-16139", + "django__django-14382", + "pytest-dev__pytest-5227", + "django__django-16595", + "django__django-16379", + "django__django-16527", + "django__django-13658", + "django__django-16255", + "django__django-16527", + "django__django-13658", + "django__django-13658", + "django__django-13658", + "django__django-11099", + "django__django-16527", + "django__django-11099", +] + +LITE_SUBSETS = { + SWEBenchLiteSubset.LITE_SMALL: SMALL_LITE_SUBSET, + SWEBenchLiteSubset.LITE_MEDIUM: MEDIUM_LITE_SUBSET, + SWEBenchLiteSubset.LITE_LARGE: LARGE_LITE_SUBSET, +} diff --git a/src/codegen/extensions/swebench/success_rates.py b/src/codegen/extensions/swebench/success_rates.py new file mode 100644 index 000000000..2d3cbbdf1 --- /dev/null +++ b/src/codegen/extensions/swebench/success_rates.py @@ -0,0 +1,302 @@ +LITE_SUCCESS_RATES = { + "pallets__flask-5063": 0.0, + "sphinx-doc__sphinx-8282": 0.0, + "django__django-14667": 0.0, + "sphinx-doc__sphinx-8474": 0.0, + "sympy__sympy-11400": 0.0, + "sympy__sympy-11870": 0.0, + "sympy__sympy-11897": 0.0, + "sympy__sympy-12171": 0.0, + "sympy__sympy-12236": 0.0, + "sympy__sympy-13146": 0.0, + "sympy__sympy-13773": 0.0, + "sympy__sympy-13895": 0.0, + "django__django-13220": 0.0, + "sympy__sympy-13915": 0.0, + "sympy__sympy-14024": 0.0, + "sympy__sympy-14308": 0.0, + "django__django-14730": 0.0, + "sphinx-doc__sphinx-7738": 0.0, + "sphinx-doc__sphinx-7686": 0.0, + "django__django-14997": 0.0, + "matplotlib__matplotlib-25079": 0.0, + "pydata__xarray-4493": 0.0, + "matplotlib__matplotlib-22835": 0.0, + "matplotlib__matplotlib-18869": 0.0, + "pylint-dev__pylint-7228": 0.0, + "pytest-dev__pytest-5103": 0.0, + "pytest-dev__pytest-5221": 0.0, + "sympy__sympy-14317": 0.0, + "django__django-16820": 0.0, + "django__django-16229": 0.0, + "pytest-dev__pytest-9359": 0.0, + "scikit-learn__scikit-learn-10508": 0.0, + "scikit-learn__scikit-learn-10949": 0.0, + "scikit-learn__scikit-learn-11040": 0.0, + "django__django-15695": 0.0, + "scikit-learn__scikit-learn-25638": 0.0, + "django__django-16816": 0.0, + "sympy__sympy-15308": 0.0, + "matplotlib__matplotlib-25433": 0.0, + "sympy__sympy-18087": 0.0, + "astropy__astropy-7746": 0.0, + "django__django-11630": 0.0, + "sympy__sympy-18199": 0.0, + "sympy__sympy-23191": 0.0, + "sympy__sympy-17630": 0.0, + "sympy__sympy-19254": 0.0, + "sympy__sympy-21627": 0.0, + "sympy__sympy-16281": 0.0, + "sympy__sympy-16106": 0.0, + "sympy__sympy-24102": 0.0, + "django__django-11905": 0.0, + "sympy__sympy-21171": 0.0, + "sympy__sympy-20639": 0.0, + "django__django-12589": 0.0, + "sympy__sympy-20322": 0.0, + "django__django-11564": 0.0, + "django__django-11019": 0.0, + "django__django-16910": 0.02, + "django__django-15252": 0.02, + "pytest-dev__pytest-5413": 0.02, + "django__django-11742": 0.02, + "sphinx-doc__sphinx-8273": 0.02, + "pytest-dev__pytest-8906": 0.02, + "django__django-15996": 0.02, + "sympy__sympy-19007": 0.02, + "django__django-11910": 0.02, + "matplotlib__matplotlib-22711": 0.02, + "django__django-13768": 0.02, + "astropy__astropy-14182": 0.02, + "mwaskom__seaborn-3407": 0.02, + "pallets__flask-4045": 0.02, + "django__django-12908": 0.02, + "pallets__flask-4992": 0.02, + "pydata__xarray-3364": 0.02, + "sympy__sympy-16503": 0.02, + "django__django-15738": 0.02, + "pydata__xarray-4248": 0.02, + "django__django-13265": 0.02, + "sympy__sympy-13177": 0.02, + "django__django-13448": 0.02, + "django__django-12113": 0.02, + "sympy__sympy-13043": 0.02, + "sympy__sympy-12454": 0.02, + "sympy__sympy-13437": 0.02, + "django__django-16408": 0.03, + "pytest-dev__pytest-6116": 0.03, + "pytest-dev__pytest-8365": 0.03, + "psf__requests-2148": 0.03, + "sympy__sympy-21612": 0.03, + "astropy__astropy-14365": 0.03, + "matplotlib__matplotlib-23299": 0.03, + "django__django-11283": 0.03, + "django__django-14155": 0.03, + "sphinx-doc__sphinx-8506": 0.03, + "django__django-11797": 0.03, + "sympy__sympy-18698": 0.03, + "django__django-15320": 0.03, + "sphinx-doc__sphinx-10451": 0.03, + "django__django-15388": 0.03, + "sympy__sympy-20049": 0.03, + "django__django-15781": 0.05, + "django__django-13321": 0.05, + "sympy__sympy-18835": 0.05, + "django__django-14534": 0.05, + "matplotlib__matplotlib-24265": 0.05, + "django__django-15202": 0.05, + "django__django-12856": 0.05, + "matplotlib__matplotlib-23476": 0.05, + "django__django-15061": 0.05, + "sphinx-doc__sphinx-11445": 0.06, + "django__django-12470": 0.06, + "django__django-16400": 0.06, + "sympy__sympy-15346": 0.06, + "pytest-dev__pytest-5495": 0.06, + "sphinx-doc__sphinx-8801": 0.08, + "matplotlib__matplotlib-23563": 0.08, + "sympy__sympy-21379": 0.08, + "django__django-15819": 0.08, + "mwaskom__seaborn-2848": 0.08, + "scikit-learn__scikit-learn-25500": 0.08, + "sympy__sympy-12419": 0.08, + "django__django-12308": 0.09, + "sympy__sympy-14396": 0.09, + "sympy__sympy-15345": 0.09, + "sympy__sympy-19487": 0.09, + "pytest-dev__pytest-7168": 0.09, + "scikit-learn__scikit-learn-25747": 0.09, + "matplotlib__matplotlib-25498": 0.11, + "sympy__sympy-22840": 0.11, + "sphinx-doc__sphinx-8627": 0.11, + "pydata__xarray-4094": 0.11, + "pytest-dev__pytest-7220": 0.11, + "django__django-12747": 0.11, + "sympy__sympy-13031": 0.12, + "django__django-13660": 0.12, + "scikit-learn__scikit-learn-14983": 0.12, + "sphinx-doc__sphinx-8435": 0.14, + "sympy__sympy-20590": 0.14, + "scikit-learn__scikit-learn-14087": 0.14, + "sympy__sympy-24909": 0.14, + "django__django-15400": 0.14, + "matplotlib__matplotlib-25311": 0.14, + "pylint-dev__pylint-6506": 0.15, + "django__django-12125": 0.15, + "matplotlib__matplotlib-24334": 0.15, + "scikit-learn__scikit-learn-13497": 0.17, + "sympy__sympy-16792": 0.17, + "django__django-14580": 0.17, + "pylint-dev__pylint-7080": 0.18, + "matplotlib__matplotlib-25332": 0.18, + "sympy__sympy-22005": 0.18, + "sympy__sympy-20442": 0.2, + "django__django-13551": 0.2, + "sympy__sympy-14817": 0.2, + "matplotlib__matplotlib-23987": 0.2, + "django__django-13033": 0.21, + "sphinx-doc__sphinx-7975": 0.21, + "django__django-13925": 0.23, + "sphinx-doc__sphinx-10325": 0.23, + "sympy__sympy-16988": 0.23, + "pytest-dev__pytest-7490": 0.24, + "django__django-15213": 0.24, + "django__django-12284": 0.24, + "pytest-dev__pytest-11148": 0.24, + "django__django-11964": 0.24, + "pylint-dev__pylint-7114": 0.26, + "django__django-11422": 0.26, + "django__django-14017": 0.27, + "django__django-15902": 0.27, + "django__django-10924": 0.27, + "django__django-13158": 0.29, + "django__django-11620": 0.29, + "sympy__sympy-13971": 0.29, + "django__django-15498": 0.3, + "django__django-12184": 0.3, + "django__django-13964": 0.3, + "psf__requests-1963": 0.3, + "matplotlib__matplotlib-25442": 0.3, + "django__django-13757": 0.32, + "scikit-learn__scikit-learn-15512": 0.32, + "sympy__sympy-21614": 0.33, + "sympy__sympy-15609": 0.33, + "matplotlib__matplotlib-23562": 0.33, + "django__django-13315": 0.33, + "django__django-11848": 0.35, + "django__django-17087": 0.35, + "matplotlib__matplotlib-26011": 0.36, + "sympy__sympy-21055": 0.36, + "sympy__sympy-17022": 0.36, + "pylint-dev__pylint-7993": 0.36, + "astropy__astropy-6938": 0.38, + "sympy__sympy-15678": 0.38, + "django__django-17051": 0.38, + "scikit-learn__scikit-learn-14092": 0.38, + "pylint-dev__pylint-5859": 0.39, + "django__django-14411": 0.39, + "django__django-11001": 0.41, + "astropy__astropy-12907": 0.41, + "sympy__sympy-18057": 0.42, + "sympy__sympy-23262": 0.44, + "sympy__sympy-18189": 0.44, + "sympy__sympy-17139": 0.45, + "django__django-15790": 0.45, + "django__django-14999": 0.45, + "sympy__sympy-18532": 0.47, + "scikit-learn__scikit-learn-11281": 0.47, + "django__django-12915": 0.47, + "sympy__sympy-12481": 0.47, + "sympy__sympy-24066": 0.48, + "django__django-11815": 0.48, + "django__django-13028": 0.48, + "sympy__sympy-17655": 0.48, + "django__django-12708": 0.48, + "matplotlib__matplotlib-24970": 0.5, + "mwaskom__seaborn-3190": 0.52, + "scikit-learn__scikit-learn-13142": 0.52, + "matplotlib__matplotlib-26020": 0.53, + "scikit-learn__scikit-learn-15535": 0.53, + "sympy__sympy-13471": 0.53, + "sympy__sympy-15011": 0.53, + "psf__requests-3362": 0.55, + "matplotlib__matplotlib-24149": 0.55, + "matplotlib__matplotlib-23314": 0.55, + "django__django-14608": 0.56, + "scikit-learn__scikit-learn-13241": 0.56, + "scikit-learn__scikit-learn-25570": 0.56, + "sympy__sympy-18621": 0.56, + "scikit-learn__scikit-learn-13584": 0.56, + "django__django-13401": 0.58, + "pytest-dev__pytest-5692": 0.58, + "django__django-14787": 0.58, + "django__django-15814": 0.58, + "sphinx-doc__sphinx-8721": 0.58, + "django__django-14016": 0.58, + "django__django-11999": 0.59, + "django__django-12497": 0.59, + "psf__requests-2674": 0.59, + "matplotlib__matplotlib-23913": 0.59, + "pytest-dev__pytest-7432": 0.59, + "django__django-11049": 0.59, + "sympy__sympy-22714": 0.62, + "scikit-learn__scikit-learn-12471": 0.62, + "psf__requests-863": 0.62, + "django__django-14672": 0.62, + "sympy__sympy-20154": 0.62, + "django__django-13590": 0.64, + "django__django-12700": 0.64, + "sphinx-doc__sphinx-8595": 0.64, + "django__django-15789": 0.65, + "django__django-12453": 0.68, + "django__django-13447": 0.68, + "psf__requests-2317": 0.7, + "django__django-11583": 0.7, + "django__django-16046": 0.7, + "django__django-14238": 0.71, + "django__django-15851": 0.71, + "django__django-13710": 0.73, + "sympy__sympy-21847": 0.73, + "sympy__sympy-23117": 0.73, + "django__django-12983": 0.73, + "scikit-learn__scikit-learn-13779": 0.74, + "sympy__sympy-13647": 0.74, + "django__django-16041": 0.74, + "scikit-learn__scikit-learn-10297": 0.74, + "django__django-15347": 0.74, + "scikit-learn__scikit-learn-13496": 0.74, + "sympy__sympy-20212": 0.76, + "scikit-learn__scikit-learn-13439": 0.76, + "django__django-13933": 0.76, + "django__django-12286": 0.76, + "django__django-13230": 0.77, + "astropy__astropy-14995": 0.77, + "django__django-11179": 0.77, + "sphinx-doc__sphinx-8713": 0.77, + "sympy__sympy-24213": 0.77, + "matplotlib__matplotlib-23964": 0.79, + "scikit-learn__scikit-learn-14894": 0.79, + "django__django-10914": 0.8, + "pydata__xarray-5131": 0.8, + "django__django-11039": 0.82, + "pytest-dev__pytest-7373": 0.82, + "django__django-14915": 0.82, + "django__django-16595": 0.83, + "pytest-dev__pytest-11143": 0.85, + "sympy__sympy-14774": 0.85, + "pytest-dev__pytest-5227": 0.85, + "django__django-16873": 0.85, + "django__django-16139": 0.85, + "mwaskom__seaborn-3010": 0.86, + "django__django-14382": 0.86, + "django__django-14752": 0.86, + "sympy__sympy-13480": 0.86, + "django__django-16379": 0.86, + "sympy__sympy-24152": 0.88, + "django__django-14855": 0.88, + "django__django-11133": 0.88, + "django__django-11099": 0.91, + "django__django-13658": 0.91, + "django__django-16255": 0.91, + "django__django-16527": 0.91, +} diff --git a/src/codegen/extensions/swebench/utils.py b/src/codegen/extensions/swebench/utils.py index cd1ded36e..d977dce4e 100644 --- a/src/codegen/extensions/swebench/utils.py +++ b/src/codegen/extensions/swebench/utils.py @@ -1,17 +1,14 @@ import json from dataclasses import dataclass -from enum import Enum from pathlib import Path from pprint import pprint from typing import Literal, Optional from datasets import load_dataset - -class SWEBenchDataset(Enum): - LITE = "princeton-nlp/SWE-bench_Lite" - FULL = "princeton-nlp/SWE-bench" - VERIFIED = "princeton-nlp/SWE-bench-verified" +from codegen.extensions.swebench.enums import SWEBenchDataset, SWEBenchLiteSubset +from codegen.extensions.swebench.subsets import LITE_SUBSETS +from codegen.extensions.swebench.success_rates import LITE_SUCCESS_RATES @dataclass @@ -30,6 +27,7 @@ class SweBenchExample: fail_to_pass: str pass_to_pass: Optional[str] environment_setup_commit: Optional[str] + difficulty: Optional[int] def load_predictions(paths): @@ -64,11 +62,16 @@ def load_predictions(paths): return predictions +def get_difficulty(instance_id: str) -> int | None: + if instance_id in LITE_SUCCESS_RATES: + return 10 - int(LITE_SUCCESS_RATES[instance_id] * 10) + return None + + def get_swe_bench_examples( - dataset: SWEBenchDataset = SWEBenchDataset.LITE, + dataset: SWEBenchDataset | SWEBenchLiteSubset = SWEBenchLiteSubset.LITE_SMALL, split: Literal["train", "dev", "test"] = "test", - offset: int = 0, - length: int = 100, + length: int | None = None, instance_id: str | None = None, repo: str | None = None, ) -> list[SweBenchExample]: @@ -87,31 +90,26 @@ def get_swe_bench_examples( # Convert string dataset name to enum # Load the dataset with caching enabled - swe_bench_dataset = load_dataset(dataset.value, download_mode="reuse_dataset_if_exists") + instance_ids = [] + if isinstance(dataset, SWEBenchLiteSubset): + swe_bench_dataset = load_dataset(SWEBenchDataset.LITE.value, download_mode="reuse_dataset_if_exists") + instance_ids = LITE_SUBSETS[dataset] + else: + swe_bench_dataset = load_dataset(dataset.value, download_mode="reuse_dataset_if_exists") # Get the requested split split_data = swe_bench_dataset[split] - # Apply offset and length - if instance_id or repo: - offset = 0 - end_idx = len(split_data) - else: - end_idx = min(offset + length, len(split_data)) - if offset >= len(split_data): - return [] - - # Use the select method instead of slicing - # This ensures we get dictionary-like objects - selected_rows = split_data.select(range(offset, end_idx)) - # Convert to SweBenchExample objects examples = [] - for row in selected_rows: + for row in split_data: if instance_id and row["instance_id"] != instance_id: continue if repo and row["repo"] != repo: continue + if instance_ids and row["instance_id"] not in instance_ids: + continue + example = SweBenchExample( repo=row["repo"], instance_id=row["instance_id"], @@ -125,7 +123,11 @@ def get_swe_bench_examples( fail_to_pass=row["FAIL_TO_PASS"], pass_to_pass=row.get("PASS_TO_PASS"), environment_setup_commit=row.get("environment_setup_commit"), + difficulty=get_difficulty(row["instance_id"]), ) examples.append(example) - return examples[:length] + if length: + examples = examples[:length] + + return examples