codegen-sh · jemeza-codegen · Mar 8, 2025 · Mar 8, 2025 · Mar 8, 2025
@@ -27,7 +27,8 @@
   Options:
   --use-existing-preds TEXT       The run ID of the existing predictions to
                                   use.
-  --dataset [lite|full|verified]  The dataset to use.
+  --dataset [lite|full|verified|lite_small|lite_medium|lite_large]
+                                  The dataset to use.
   --length INTEGER                The number of examples to process.
   --instance-id TEXT              The instance ID of the example to process.
   --repo TEXT                     The repo to use.

@@ -0,0 +1,12 @@
+from codegen.extensions.swebench.enums import SWEBenchDataset
+from codegen.extensions.swebench.enums import SWEBenchLiteSubset
+
+
+# Maps each `--dataset` CLI choice string to the enum member that run_eval
+# passes to get_swe_bench_examples. The "lite_*" keys resolve to
+# SWEBenchLiteSubset members rather than SWEBenchDataset members, so code that
+# reads this mapping receives either enum type.
+DATASET_DICT = {
+    "lite": SWEBenchDataset.LITE,
+    "full": SWEBenchDataset.FULL,
+    "verified": SWEBenchDataset.VERIFIED,
+    "lite_small": SWEBenchLiteSubset.LITE_SMALL,
+    "lite_medium": SWEBenchLiteSubset.LITE_MEDIUM,
+    "lite_large": SWEBenchLiteSubset.LITE_LARGE,
+}
@@ -6,8 +6,10 @@
 import modal
 import click
 import time
+from codegen.extensions.swebench.enums import SWEBenchDataset, SWEBenchLiteSubset
+from constants import DATASET_DICT
 from codegen.extensions.swebench.harness import run_agent_on_entry
-from codegen.extensions.swebench.utils import SWEBenchDataset, SweBenchExample, get_swe_bench_examples
+from codegen.extensions.swebench.utils import SweBenchExample, get_swe_bench_examples
 from codegen.extensions.swebench.report import generate_report
 from codegen.sdk.core.codebase import Codebase
 
@@ -280,13 +282,8 @@ async def run_eval(
     run_id = use_existing_preds or str(uuid.uuid4())
     print(f"Run ID: {run_id}")
     predictions_dir = PREDS_DNAME / f"results_{run_id}"
-    dataset_dict = {
-        "lite": SWEBenchDataset.LITE,
-        "full": SWEBenchDataset.FULL,
-        "verified": SWEBenchDataset.VERIFIED,
-    }
-    dataset_enum = dataset_dict[dataset]
 
+    dataset_enum = DATASET_DICT[dataset]
     examples = get_swe_bench_examples(dataset=dataset_enum, length=length, instance_id=instance_id, repo=repo)
 
     try:
@@ -345,6 +342,8 @@ async def run_eval(
                 for error_type, count in summary["error_types"].items():
                     print(f"  {error_type}: {count}")
 
+        if isinstance(dataset_enum, SWEBenchLiteSubset):
+            dataset_enum = SWEBenchDataset.LITE
         # Generate Report on Modal
         generate_report(predictions_dir, LOG_DIR, dataset_enum, run_id)
     except Exception:
@@ -355,7 +354,7 @@ async def run_eval(
 
 @click.command()
 @click.option("--use-existing-preds", help="The run ID of the existing predictions to use.", type=str, default=None)
-@click.option("--dataset", help="The dataset to use.", type=click.Choice(["lite", "full", "verified"]), default="lite")
+@click.option("--dataset", help="The dataset to use.", type=click.Choice(["lite", "full", "verified", "lite_small", "lite_medium", "lite_large"]), default="lite")
 @click.option("--length", help="The number of examples to process.", type=int, default=10)
 @click.option("--instance-id", help="The instance ID of the example to process.", type=str, default=None)
 @click.option("--local", help="Run the evaluation locally.", is_flag=True, default=False)

@@ -35,6 +35,7 @@ def __init__(
         tools: Optional[list[BaseTool]] = None,
         run_id: Optional[str] = None,
         instance_id: Optional[str] = None,
+        difficulty: Optional[int] = None,
         **kwargs,
     ):
         """Initialize a CodeAgent.
@@ -64,6 +65,7 @@ def __init__(
         self.langsmith_client = Client()
         self.run_id = run_id
         self.instance_id = instance_id
+        self.difficulty = difficulty
 
         # Get project name from environment variable or use a default
         self.project_name = os.environ.get("LANGCHAIN_PROJECT", "RELACE")
@@ -93,16 +95,7 @@ def run(self, prompt: str, thread_id: Optional[str] = None) -> str:
         # this message has a reducer which appends the current message to the existing history
         # see more https://langchain-ai.github.io/langgraph/concepts/low_level/#reducers
         input = {"messages": [("user", prompt)]}
-        metadata = {"project": self.project_name, "model": self.model_name}
-        tags = [self.model_name]
-        # Add SWEBench run ID and instance ID to the metadata and tags for filtering
-        if self.run_id is not None:
-            metadata["swebench_run_id"] = self.run_id
-            tags.append(self.run_id)
-
-        if self.instance_id is not None:
-            metadata["swebench_instance_id"] = self.instance_id
-            tags.append(self.instance_id)
+        tags, metadata = self.get_tags_metadata()
 
         config = RunnableConfig(configurable={"thread_id": thread_id}, tags=tags, metadata=metadata, recursion_limit=100)
         # we stream the steps instead of invoke because it allows us to access intermediate nodes
@@ -165,3 +158,21 @@ def get_tools(self) -> list[BaseTool]:
 
     def get_state(self) -> dict:
         return self.agent.get_state(self.config)
+
+    def get_tags_metadata(self) -> tuple[list[str], dict]:
+        """Build the tags and metadata that `run` places on its RunnableConfig.
+
+        Both always carry the model name; the metadata also carries the project
+        name. The run ID, instance ID and difficulty are each added only when
+        the corresponding attribute is not None.
+
+        Returns:
+            A `(tags, metadata)` pair: a list of tag strings and a metadata dict.
+        """
+        tags = [self.model_name]
+        metadata = {"project": self.project_name, "model": self.model_name}
+        # Add SWEBench run ID and instance ID to the metadata and tags for filtering
+        if self.run_id is not None:
+            metadata["swebench_run_id"] = self.run_id
+            tags.append(self.run_id)
+
+        if self.instance_id is not None:
+            metadata["swebench_instance_id"] = self.instance_id
+            tags.append(self.instance_id)
+
+        # The difficulty tag is prefixed ("difficulty_<n>"); the metadata keeps the raw value.
+        if self.difficulty is not None:
+            metadata["swebench_difficulty"] = self.difficulty
+            tags.append(f"difficulty_{self.difficulty}")
+
+        return tags, metadata
@@ -0,0 +1,13 @@
+from enum import Enum
+
+
+class SWEBenchDataset(Enum):
+    """Full SWE-bench datasets that an evaluation run can target.
+
+    NOTE(review): the values look like Hugging Face dataset identifiers --
+    confirm against the code that loads them.
+    """
+    LITE = "princeton-nlp/SWE-bench_Lite"
+    FULL = "princeton-nlp/SWE-bench"
+    VERIFIED = "princeton-nlp/SWE-bench-verified"
+
+
+class SWEBenchLiteSubset(Enum):
+    """Named subsets offered as `--dataset` choices; each value equals its CLI choice string."""
+    LITE_SMALL = "lite_small"
+    LITE_MEDIUM = "lite_medium"
+    LITE_LARGE = "lite_large"
@@ -70,7 +70,7 @@ def run_agent_on_entry(entry: SweBenchExample, model: str, codebase: Codebase |
         )
         codebase = Codebase.from_repo(repo_full_name=entry.repo, commit=base_commit, language="python", config=config)  # check out the repo
 
-    agent = CodeAgent(codebase=codebase, run_id=run_id, instance_id=instance_id, model_name=model)
+    agent = CodeAgent(codebase=codebase, run_id=run_id, instance_id=instance_id, model_name=model, difficulty=entry.difficulty)
 
     pprint.pprint(instance_id)
     pprint.pprint(gold_files)

@@ -5,8 +5,8 @@
 from collections import defaultdict
 from pathlib import Path
 
+from codegen.extensions.swebench.enums import SWEBenchDataset
 from codegen.extensions.swebench.tests import remove_patches_to_tests
-from codegen.extensions.swebench.utils import SWEBenchDataset
 
 NUM_EVAL_PROCS = 5
 

@@ -0,0 +1,146 @@
+from codegen.extensions.swebench.enums import SWEBenchLiteSubset
+
+# Instance IDs registered under SWEBenchLiteSubset.LITE_SMALL (10 entries).
+# Presumably drawn from SWE-bench Lite -- verify against the dataset.
+SMALL_LITE_SUBSET = [
+    "mwaskom__seaborn-2848",
+    "sphinx-doc__sphinx-8627",
+    "sphinx-doc__sphinx-7975",
+    "django__django-17087",
+    "sympy__sympy-17655",
+    "matplotlib__matplotlib-26020",
+    "sympy__sympy-20154",
+    "scikit-learn__scikit-learn-13439",
+    "pytest-dev__pytest-7373",
+    "django__django-16527",
+]
+
+# Instance IDs registered under SWEBenchLiteSubset.LITE_MEDIUM (20 entries).
+# Presumably drawn from SWE-bench Lite -- verify against the dataset.
+MEDIUM_LITE_SUBSET = [
+    "sympy__sympy-15346",
+    "sympy__sympy-16281",
+    "sympy__sympy-22840",
+    "pytest-dev__pytest-7220",
+    "django__django-12284",
+    "pytest-dev__pytest-7490",
+    "matplotlib__matplotlib-25442",
+    "django__django-13757",
+    "django__django-15790",
+    "sympy__sympy-18532",
+    "sympy__sympy-13471",
+    "scikit-learn__scikit-learn-15535",
+    "django__django-13447",
+    "django__django-15789",
+    "scikit-learn__scikit-learn-14894",
+    "django__django-14238",
+    "django__django-10914",
+    "pytest-dev__pytest-11143",
+    "django__django-16255",
+    "django__django-13658",
+]
+
+# Instance IDs registered under SWEBenchLiteSubset.LITE_LARGE (100 entries).
+# NOTE(review): many IDs are repeated in this list (e.g. "django__django-12284"
+# appears three times; "sympy__sympy-20154" and "django__django-13658" appear
+# four times each), and some also occur in SMALL_LITE_SUBSET or
+# MEDIUM_LITE_SUBSET. Confirm whether the repeats are intentional; if the list
+# is only used for membership filtering, the number of distinct examples is
+# smaller than 100.
+LARGE_LITE_SUBSET = [
+    "pytest-dev__pytest-5495",
+    "django__django-11797",
+    "django__django-14730",
+    "scikit-learn__scikit-learn-25500",
+    "sphinx-doc__sphinx-8506",
+    "django__django-16408",
+    "django__django-16910",
+    "sympy__sympy-12236",
+    "matplotlib__matplotlib-24265",
+    "django__django-15320",
+    "matplotlib__matplotlib-25311",
+    "django__django-12125",
+    "django__django-12747",
+    "matplotlib__matplotlib-24334",
+    "scikit-learn__scikit-learn-14983",
+    "scikit-learn__scikit-learn-13497",
+    "django__django-14580",
+    "pylint-dev__pylint-6506",
+    "matplotlib__matplotlib-23987",
+    "scikit-learn__scikit-learn-13497",
+    "django__django-14017",
+    "django__django-15213",
+    "django__django-12284",
+    "pylint-dev__pylint-7114",
+    "django__django-11422",
+    "django__django-11620",
+    "django__django-12284",
+    "sympy__sympy-13971",
+    "django__django-12284",
+    "sphinx-doc__sphinx-7975",
+    "scikit-learn__scikit-learn-15512",
+    "scikit-learn__scikit-learn-15512",
+    "pylint-dev__pylint-7993",
+    "django__django-12184",
+    "django__django-13315",
+    "sympy__sympy-15609",
+    "pylint-dev__pylint-7993",
+    "sympy__sympy-17022",
+    "pylint-dev__pylint-7993",
+    "sympy__sympy-15678",
+    "sympy__sympy-18057",
+    "sympy__sympy-17655",
+    "sympy__sympy-17655",
+    "django__django-13028",
+    "sympy__sympy-17139",
+    "django__django-14999",
+    "django__django-15790",
+    "scikit-learn__scikit-learn-11281",
+    "astropy__astropy-12907",
+    "django__django-11815",
+    "sympy__sympy-18621",
+    "django__django-11999",
+    "sphinx-doc__sphinx-8721",
+    "matplotlib__matplotlib-23314",
+    "sphinx-doc__sphinx-8721",
+    "sympy__sympy-18621",
+    "django__django-12497",
+    "scikit-learn__scikit-learn-13584",
+    "matplotlib__matplotlib-24970",
+    "scikit-learn__scikit-learn-13584",
+    "django__django-12453",
+    "sympy__sympy-20154",
+    "django__django-13447",
+    "sphinx-doc__sphinx-8595",
+    "sympy__sympy-20154",
+    "sympy__sympy-20154",
+    "django__django-12700",
+    "psf__requests-2317",
+    "django__django-16046",
+    "sympy__sympy-20154",
+    "sympy__sympy-20212",
+    "django__django-13710",
+    "sympy__sympy-13647",
+    "django__django-15851",
+    "scikit-learn__scikit-learn-14894",
+    "sympy__sympy-24213",
+    "scikit-learn__scikit-learn-13779",
+    "django__django-13710",
+    "django__django-13933",
+    "sympy__sympy-20212",
+    "django__django-14855",
+    "django__django-11039",
+    "django__django-16379",
+    "pydata__xarray-5131",
+    "pytest-dev__pytest-7373",
+    "django__django-16139",
+    "django__django-14382",
+    "pytest-dev__pytest-5227",
+    "django__django-16595",
+    "django__django-16379",
+    "django__django-16527",
+    "django__django-13658",
+    "django__django-16255",
+    "django__django-16527",
+    "django__django-13658",
+    "django__django-13658",
+    "django__django-13658",
+    "django__django-11099",
+    "django__django-16527",
+    "django__django-11099",
+]
+
+# Lookup from each SWEBenchLiteSubset member to its list of instance IDs.
+LITE_SUBSETS = {
+    SWEBenchLiteSubset.LITE_SMALL: SMALL_LITE_SUBSET,
+    SWEBenchLiteSubset.LITE_MEDIUM: MEDIUM_LITE_SUBSET,
+    SWEBenchLiteSubset.LITE_LARGE: LARGE_LITE_SUBSET,
+}