Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion codegen-examples/examples/swebench_agent_run/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@
Options:
--use-existing-preds TEXT The run ID of the existing predictions to
use.
--dataset [lite|full|verified] The dataset to use.
--dataset [lite|full|verified|lite_small|lite_medium|lite_large]
The dataset to use.
--length INTEGER The number of examples to process.
--instance-id TEXT The instance ID of the example to process.
--repo TEXT The repo to use.
Expand Down
12 changes: 12 additions & 0 deletions codegen-examples/examples/swebench_agent_run/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from codegen.extensions.swebench.enums import SWEBenchDataset
from codegen.extensions.swebench.enums import SWEBenchLiteSubset


# Lookup table from a `--dataset` CLI choice to the enum member it selects.
# Each key is the lower-cased member name, so iterating both enums in
# definition order yields "lite", "full", "verified", "lite_small",
# "lite_medium", "lite_large" -- full datasets first, Lite subsets after.
DATASET_DICT = {
    member.name.lower(): member
    for member in (*SWEBenchDataset, *SWEBenchLiteSubset)
}
15 changes: 7 additions & 8 deletions codegen-examples/examples/swebench_agent_run/run_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,10 @@
import modal
import click
import time
from codegen.extensions.swebench.enums import SWEBenchDataset, SWEBenchLiteSubset
from constants import DATASET_DICT
from codegen.extensions.swebench.harness import run_agent_on_entry
from codegen.extensions.swebench.utils import SWEBenchDataset, SweBenchExample, get_swe_bench_examples
from codegen.extensions.swebench.utils import SweBenchExample, get_swe_bench_examples
from codegen.extensions.swebench.report import generate_report
from codegen.sdk.core.codebase import Codebase

Expand Down Expand Up @@ -280,13 +282,8 @@ async def run_eval(
run_id = use_existing_preds or str(uuid.uuid4())
print(f"Run ID: {run_id}")
predictions_dir = PREDS_DNAME / f"results_{run_id}"
dataset_dict = {
"lite": SWEBenchDataset.LITE,
"full": SWEBenchDataset.FULL,
"verified": SWEBenchDataset.VERIFIED,
}
dataset_enum = dataset_dict[dataset]

dataset_enum = DATASET_DICT[dataset]
examples = get_swe_bench_examples(dataset=dataset_enum, length=length, instance_id=instance_id, repo=repo)

try:
Expand Down Expand Up @@ -345,6 +342,8 @@ async def run_eval(
for error_type, count in summary["error_types"].items():
print(f" {error_type}: {count}")

if isinstance(dataset_enum, SWEBenchLiteSubset):
dataset_enum = SWEBenchDataset.LITE
# Generate Report on Modal
generate_report(predictions_dir, LOG_DIR, dataset_enum, run_id)
except Exception:
Expand All @@ -355,7 +354,7 @@ async def run_eval(

@click.command()
@click.option("--use-existing-preds", help="The run ID of the existing predictions to use.", type=str, default=None)
@click.option("--dataset", help="The dataset to use.", type=click.Choice(["lite", "full", "verified"]), default="lite")
@click.option("--dataset", help="The dataset to use.", type=click.Choice(["lite", "full", "verified", "lite_small", "lite_medium", "lite_large"]), default="lite")
@click.option("--length", help="The number of examples to process.", type=int, default=10)
@click.option("--instance-id", help="The instance ID of the example to process.", type=str, default=None)
@click.option("--local", help="Run the evaluation locally.", is_flag=True, default=False)
Expand Down
31 changes: 21 additions & 10 deletions src/codegen/agents/code_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ def __init__(
tools: Optional[list[BaseTool]] = None,
run_id: Optional[str] = None,
instance_id: Optional[str] = None,
difficulty: Optional[int] = None,
**kwargs,
):
"""Initialize a CodeAgent.
Expand Down Expand Up @@ -64,6 +65,7 @@ def __init__(
self.langsmith_client = Client()
self.run_id = run_id
self.instance_id = instance_id
self.difficulty = difficulty

# Get project name from environment variable or use a default
self.project_name = os.environ.get("LANGCHAIN_PROJECT", "RELACE")
Expand Down Expand Up @@ -93,16 +95,7 @@ def run(self, prompt: str, thread_id: Optional[str] = None) -> str:
# this message has a reducer which appends the current message to the existing history
# see more https://langchain-ai.github.io/langgraph/concepts/low_level/#reducers
input = {"messages": [("user", prompt)]}
metadata = {"project": self.project_name, "model": self.model_name}
tags = [self.model_name]
# Add SWEBench run ID and instance ID to the metadata and tags for filtering
if self.run_id is not None:
metadata["swebench_run_id"] = self.run_id
tags.append(self.run_id)

if self.instance_id is not None:
metadata["swebench_instance_id"] = self.instance_id
tags.append(self.instance_id)
tags, metadata = self.get_tags_metadata()

config = RunnableConfig(configurable={"thread_id": thread_id}, tags=tags, metadata=metadata, recursion_limit=100)
# we stream the steps instead of invoke because it allows us to access intermediate nodes
Expand Down Expand Up @@ -165,3 +158,21 @@ def get_tools(self) -> list[BaseTool]:

def get_state(self) -> dict:
    """Return whatever the wrapped agent reports as its state for ``self.config``."""
    fetch_state = self.agent.get_state
    return fetch_state(self.config)

def get_tags_metadata(self) -> tuple[list[str], dict]:
    """Assemble the tags and metadata that describe this agent's run.

    The model name is always the first tag, and the project and model names
    are always present in the metadata. The SWEBench run ID, instance ID and
    difficulty are added (in that order) only when they are not ``None``, so
    runs can be filtered on them.

    Returns:
        A ``(tags, metadata)`` pair.
    """
    metadata = {"project": self.project_name, "model": self.model_name}
    tags = [self.model_name]

    # (metadata key, stored value, tag text) for each optional SWEBench field.
    optional_fields = (
        ("swebench_run_id", self.run_id, self.run_id),
        ("swebench_instance_id", self.instance_id, self.instance_id),
        ("swebench_difficulty", self.difficulty, f"difficulty_{self.difficulty}"),
    )
    for key, value, tag in optional_fields:
        if value is None:
            continue
        metadata[key] = value
        tags.append(tag)

    return tags, metadata
13 changes: 13 additions & 0 deletions src/codegen/extensions/swebench/enums.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from enum import Enum


class SWEBenchDataset(Enum):
    """SWE-bench dataset variants; each value is the dataset identifier string."""

    LITE = "princeton-nlp/SWE-bench_Lite"
    FULL = "princeton-nlp/SWE-bench"
    # NOTE(review): the other identifiers use an underscore suffix ("_Lite");
    # confirm that this hyphenated, lower-case spelling resolves as intended.
    VERIFIED = "princeton-nlp/SWE-bench-verified"


class SWEBenchLiteSubset(Enum):
    """Named subsets of the Lite dataset; each value is the subset's lower-case name."""

    LITE_SMALL = "lite_small"
    LITE_MEDIUM = "lite_medium"
    LITE_LARGE = "lite_large"
2 changes: 1 addition & 1 deletion src/codegen/extensions/swebench/harness.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def run_agent_on_entry(entry: SweBenchExample, model: str, codebase: Codebase |
)
codebase = Codebase.from_repo(repo_full_name=entry.repo, commit=base_commit, language="python", config=config) # check out the repo

agent = CodeAgent(codebase=codebase, run_id=run_id, instance_id=instance_id, model_name=model)
agent = CodeAgent(codebase=codebase, run_id=run_id, instance_id=instance_id, model_name=model, difficulty=entry.difficulty)

pprint.pprint(instance_id)
pprint.pprint(gold_files)
Expand Down
2 changes: 1 addition & 1 deletion src/codegen/extensions/swebench/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
from collections import defaultdict
from pathlib import Path

from codegen.extensions.swebench.enums import SWEBenchDataset
from codegen.extensions.swebench.tests import remove_patches_to_tests
from codegen.extensions.swebench.utils import SWEBenchDataset

NUM_EVAL_PROCS = 5

Expand Down
146 changes: 146 additions & 0 deletions src/codegen/extensions/swebench/subsets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
from codegen.extensions.swebench.enums import SWEBenchLiteSubset

# Instance IDs selected by SWEBenchLiteSubset.LITE_SMALL (10 entries).
# Each ID has the form "<owner>__<repo>-<number>".
SMALL_LITE_SUBSET = [
    "mwaskom__seaborn-2848",
    "sphinx-doc__sphinx-8627",
    "sphinx-doc__sphinx-7975",
    "django__django-17087",
    "sympy__sympy-17655",
    "matplotlib__matplotlib-26020",
    "sympy__sympy-20154",
    "scikit-learn__scikit-learn-13439",
    "pytest-dev__pytest-7373",
    "django__django-16527",
]

# Instance IDs selected by SWEBenchLiteSubset.LITE_MEDIUM (20 entries).
# Each ID has the form "<owner>__<repo>-<number>".
MEDIUM_LITE_SUBSET = [
    "sympy__sympy-15346",
    "sympy__sympy-16281",
    "sympy__sympy-22840",
    "pytest-dev__pytest-7220",
    "django__django-12284",
    "pytest-dev__pytest-7490",
    "matplotlib__matplotlib-25442",
    "django__django-13757",
    "django__django-15790",
    "sympy__sympy-18532",
    "sympy__sympy-13471",
    "scikit-learn__scikit-learn-15535",
    "django__django-13447",
    "django__django-15789",
    "scikit-learn__scikit-learn-14894",
    "django__django-14238",
    "django__django-10914",
    "pytest-dev__pytest-11143",
    "django__django-16255",
    "django__django-13658",
]

# Instance IDs selected by SWEBenchLiteSubset.LITE_LARGE.
# Each ID has the form "<owner>__<repo>-<number>".
#
# The list previously held 100 entries but only 78 distinct IDs (several were
# repeated up to five times). Repeats add nothing to a membership filter and
# would schedule the same instance more than once if the list is iterated, so
# each ID is now listed exactly once, in order of first appearance.
LARGE_LITE_SUBSET = [
    "pytest-dev__pytest-5495",
    "django__django-11797",
    "django__django-14730",
    "scikit-learn__scikit-learn-25500",
    "sphinx-doc__sphinx-8506",
    "django__django-16408",
    "django__django-16910",
    "sympy__sympy-12236",
    "matplotlib__matplotlib-24265",
    "django__django-15320",
    "matplotlib__matplotlib-25311",
    "django__django-12125",
    "django__django-12747",
    "matplotlib__matplotlib-24334",
    "scikit-learn__scikit-learn-14983",
    "scikit-learn__scikit-learn-13497",
    "django__django-14580",
    "pylint-dev__pylint-6506",
    "matplotlib__matplotlib-23987",
    "django__django-14017",
    "django__django-15213",
    "django__django-12284",
    "pylint-dev__pylint-7114",
    "django__django-11422",
    "django__django-11620",
    "sympy__sympy-13971",
    "sphinx-doc__sphinx-7975",
    "scikit-learn__scikit-learn-15512",
    "pylint-dev__pylint-7993",
    "django__django-12184",
    "django__django-13315",
    "sympy__sympy-15609",
    "sympy__sympy-17022",
    "sympy__sympy-15678",
    "sympy__sympy-18057",
    "sympy__sympy-17655",
    "django__django-13028",
    "sympy__sympy-17139",
    "django__django-14999",
    "django__django-15790",
    "scikit-learn__scikit-learn-11281",
    "astropy__astropy-12907",
    "django__django-11815",
    "sympy__sympy-18621",
    "django__django-11999",
    "sphinx-doc__sphinx-8721",
    "matplotlib__matplotlib-23314",
    "django__django-12497",
    "scikit-learn__scikit-learn-13584",
    "matplotlib__matplotlib-24970",
    "django__django-12453",
    "sympy__sympy-20154",
    "django__django-13447",
    "sphinx-doc__sphinx-8595",
    "django__django-12700",
    "psf__requests-2317",
    "django__django-16046",
    "sympy__sympy-20212",
    "django__django-13710",
    "sympy__sympy-13647",
    "django__django-15851",
    "scikit-learn__scikit-learn-14894",
    "sympy__sympy-24213",
    "scikit-learn__scikit-learn-13779",
    "django__django-13933",
    "django__django-14855",
    "django__django-11039",
    "django__django-16379",
    "pydata__xarray-5131",
    "pytest-dev__pytest-7373",
    "django__django-16139",
    "django__django-14382",
    "pytest-dev__pytest-5227",
    "django__django-16595",
    "django__django-16527",
    "django__django-13658",
    "django__django-16255",
    "django__django-11099",
]

# Maps each SWEBenchLiteSubset member to the list of instance IDs it selects.
LITE_SUBSETS = {
    SWEBenchLiteSubset.LITE_SMALL: SMALL_LITE_SUBSET,
    SWEBenchLiteSubset.LITE_MEDIUM: MEDIUM_LITE_SUBSET,
    SWEBenchLiteSubset.LITE_LARGE: LARGE_LITE_SUBSET,
}
Loading
Loading