diff --git a/codegen-examples/examples/swebench_agent_run/entry_point.py b/codegen-examples/examples/swebench_agent_run/entry_point.py index 79fd19575..bd7b6ec2b 100644 --- a/codegen-examples/examples/swebench_agent_run/entry_point.py +++ b/codegen-examples/examples/swebench_agent_run/entry_point.py @@ -14,6 +14,6 @@ @app.function(timeout=43200) -async def run_agent_modal(entry: SweBenchExample): +async def run_agent_modal(entry: SweBenchExample, run_id: str): """Modal function to process a single example from the SWE-bench dataset.""" - return run_agent_on_entry(entry) + return run_agent_on_entry(entry, run_id=run_id) diff --git a/codegen-examples/examples/swebench_agent_run/run_eval.py b/codegen-examples/examples/swebench_agent_run/run_eval.py index 92752d976..0c0e04f65 100644 --- a/codegen-examples/examples/swebench_agent_run/run_eval.py +++ b/codegen-examples/examples/swebench_agent_run/run_eval.py @@ -17,7 +17,7 @@ run_agent_modal = modal.Function.from_name(app_name="swebench-agent-run", name="run_agent_modal") -async def process_batch_modal(examples: list[SweBenchExample], num_workers=5, min_workers=1, max_retries=3): +async def process_batch_modal(examples: list[SweBenchExample], run_id: str, num_workers=5, min_workers=1, max_retries=3): """Process a batch of examples concurrently using a queue system with incremental worker scaling. Args: @@ -110,7 +110,7 @@ async def is_rate_limit_error(error): async def process_example(example, attempt, current_task): try: - result = await run_agent_modal.remote.aio(example) + result = await run_agent_modal.remote.aio(example, run_id=run_id) if result is None: print(f"Warning: Null result for {example.instance_id}") @@ -222,7 +222,7 @@ async def worker(): return [results.get(example.instance_id, {"instance_id": example.instance_id, "status": "missing"}) for example in examples] -def process_batch_local(examples: list[SweBenchExample], num_workers=5, codebases: dict[str, Codebase] = {}): +def process_batch_local(examples: list[SweBenchExample], num_workers=5, codebases: dict[str, Codebase] = {}, run_id: str | None = None): """Process a batch of examples synchronously. 
Args: @@ -242,9 +242,9 @@ def process_batch_local(examples: list[SweBenchExample], num_workers=5, codebase try: # Run the agent locally instead of using modal if codebases and example.instance_id in codebases: - result = run_agent_on_entry(example, codebase=codebases[example.instance_id]) + result = run_agent_on_entry(example, codebase=codebases[example.instance_id], run_id=run_id) else: - result = run_agent_on_entry(example) + result = run_agent_on_entry(example, run_id=run_id) results.append(result) except Exception as e: @@ -294,9 +294,9 @@ async def run_eval( # Process all examples in parallel batches if local: - results = process_batch_local(examples, codebases=codebases) + results = process_batch_local(examples, codebases=codebases, run_id=run_id) else: - results = await process_batch_modal(examples, num_workers=num_workers) + results = await process_batch_modal(examples, num_workers=num_workers, run_id=run_id) # Save individual results for result in results: diff --git a/src/codegen/agents/code_agent.py b/src/codegen/agents/code_agent.py index cb7be3ffa..a7c500c90 100644 --- a/src/codegen/agents/code_agent.py +++ b/src/codegen/agents/code_agent.py @@ -4,6 +4,7 @@ from langchain.tools import BaseTool from langchain_core.messages import AIMessage +from langchain_core.runnables.config import RunnableConfig from langsmith import Client from codegen.extensions.langchain.agent import create_codebase_agent @@ -16,7 +17,17 @@ class CodeAgent: """Agent for interacting with a codebase.""" - def __init__(self, codebase: "Codebase", model_provider: str = "anthropic", model_name: str = "claude-3-5-sonnet-latest", memory: bool = True, tools: Optional[list[BaseTool]] = None, **kwargs): + def __init__( + self, + codebase: "Codebase", + model_provider: str = "anthropic", + model_name: str = "claude-3-7-sonnet-latest", + memory: bool = True, + tools: Optional[list[BaseTool]] = None, + run_id: Optional[str] = None, + instance_id: Optional[str] = None, + **kwargs, + ): """Initialize a CodeAgent. 
Args: @@ -34,6 +45,8 @@ def __init__(self, codebase: "Codebase", model_provider: str = "anthropic", mode self.codebase = codebase self.agent = create_codebase_agent(self.codebase, model_provider=model_provider, model_name=model_name, memory=memory, additional_tools=tools, **kwargs) self.langsmith_client = Client() + self.run_id = run_id + self.instance_id = instance_id # Get project name from environment variable or use a default self.project_name = os.environ.get("LANGCHAIN_PROJECT", "RELACE") @@ -55,9 +68,20 @@ def run(self, prompt: str, thread_id: Optional[str] = None) -> str: # this message has a reducer which appends the current message to the existing history # see more https://langchain-ai.github.io/langgraph/concepts/low_level/#reducers input = {"messages": [("user", prompt)]} - + metadata = {"project": self.project_name} + tags = [] + # Add SWEBench run ID and instance ID to the metadata and tags for filtering + if self.run_id is not None: + metadata["swebench_run_id"] = self.run_id + tags.append(self.run_id) + + if self.instance_id is not None: + metadata["swebench_instance_id"] = self.instance_id + tags.append(self.instance_id) + + config = RunnableConfig(configurable={"thread_id": thread_id}, tags=tags, metadata=metadata, recursion_limit=100) # we stream the steps instead of invoke because it allows us to access intermediate nodes - stream = self.agent.stream(input, config={"configurable": {"thread_id": thread_id, "metadata": {"project": self.project_name}}, "recursion_limit": 100}, stream_mode="values") + stream = self.agent.stream(input, config=config, stream_mode="values") # Keep track of run IDs from the stream run_ids = [] diff --git a/src/codegen/extensions/swebench/harness.py b/src/codegen/extensions/swebench/harness.py index 5f4055d9c..5316b109a 100644 --- a/src/codegen/extensions/swebench/harness.py +++ b/src/codegen/extensions/swebench/harness.py @@ -49,7 +49,7 @@ def show_problems(dataset): print(f"{inst}: {problem}") -def run_agent_on_entry(entry: SweBenchExample, codebase: Codebase | None = None): +def run_agent_on_entry(entry: SweBenchExample, codebase: Codebase | None = None, run_id: str | None = None): """Process one `entry` from SWE Bench using the LLM `models` at the given `temperature`. Set `model_name_or_path` in the result json. """ @@ -70,7 +70,7 @@ def run_agent_on_entry(entry: SweBenchExample, codebase: Codebase | None = None) ) codebase = Codebase.from_repo(repo_full_name=entry.repo, commit=base_commit, language="python", config=config) # check out the repo - agent = CodeAgent(codebase=codebase) + agent = CodeAgent(codebase=codebase, run_id=run_id, instance_id=instance_id) pprint.pprint(instance_id) pprint.pprint(gold_files)
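
With `run_id` threaded end to end (Modal entry point → batch processors → `run_agent_on_entry` → `CodeAgent`), every LangSmith trace from an eval run now carries the run ID and SWE-bench instance ID as both tags and metadata, which is what makes per-run and per-instance filtering possible. A minimal sketch of querying those traces back, assuming the `langsmith` SDK already imported in the diff; the run ID value and project name below are illustrative, not taken from this change:

    from langsmith import Client

    client = Client()
    eval_run_id = "swebench-eval-001"  # hypothetical; whatever ID was passed to run_eval

    # CodeAgent.run() tags each trace with the run ID and also stores it under
    # metadata["swebench_run_id"], so either field can drive the filter.
    runs = client.list_runs(
        project_name="RELACE",  # CodeAgent's default when LANGCHAIN_PROJECT is unset
        filter=f'has(tags, "{eval_run_id}")',
    )
    for run in runs:
        metadata = (run.extra or {}).get("metadata", {})
        print(run.id, metadata.get("swebench_instance_id"))

Filtering by tag keeps the query cheap; the `swebench_instance_id` metadata then distinguishes individual SWE-bench problems within the same eval run.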