diff --git a/codegen-examples/examples/swebench_agent_run/README.md b/codegen-examples/examples/swebench_agent_run/README.md index 4ca8db548..c6dc48676 100644 --- a/codegen-examples/examples/swebench_agent_run/README.md +++ b/codegen-examples/examples/swebench_agent_run/README.md @@ -25,10 +25,11 @@ Usage: run_eval.py [OPTIONS] Options: - --use-existing-preds TEXT The run ID of the existing predictions to use. - --dataset [princeton-nlp/SWE-bench_Lite|princeton-nlp/SWE-bench|princeton-nlp/SWE-bench-verified] - The dataset to use. + --use-existing-preds TEXT The run ID of the existing predictions to + use. + --dataset [lite|full|verified] The dataset to use. --length INTEGER The number of examples to process. --instance-id TEXT The instance ID of the example to process. + --repo TEXT The repo to use. --help Show this message and exit. ``` diff --git a/codegen-examples/examples/swebench_agent_run/entry_point.py b/codegen-examples/examples/swebench_agent_run/entry_point.py index cde50bbba..411b09d3b 100644 --- a/codegen-examples/examples/swebench_agent_run/entry_point.py +++ b/codegen-examples/examples/swebench_agent_run/entry_point.py @@ -1,8 +1,6 @@ from codegen.extensions.swebench.utils import SweBenchExample from codegen.extensions.swebench.harness import run_agent_on_entry import modal -import sys -from codegen.sdk.core.codebase import Codebase image = ( modal.Image.debian_slim(python_version="3.13") @@ -15,26 +13,7 @@ app = modal.App(name="swebench-agent-run", image=image, secrets=[modal.Secret.from_dotenv()]) -@app.function(timeout=5 * 60) +@app.function(timeout=10 * 60) async def run_agent_modal(entry: SweBenchExample): """Modal function to process a single example from the SWE-bench dataset.""" return run_agent_on_entry(entry) - - -@app.cls(image=image, secrets=[modal.Secret.from_dotenv()], enable_memory_snapshot=True) -class SwebenchAgentRun: - repo_full_name: str = modal.parameter() - commit: str = modal.parameter() - codebase: Codebase | None = None - - @modal.enter(snap=True) - def load(self): - self.codebase = Codebase.from_repo(repo_full_name=self.repo_full_name, commit=self.commit, language="python") - - @modal.exit() - def exit(self): - sys.exit(0) - - @modal.method() - async def run(self, entry: SweBenchExample): - return run_agent_on_entry(entry, codebase=self.codebase) diff --git a/codegen-examples/examples/swebench_agent_run/local_run.ipynb b/codegen-examples/examples/swebench_agent_run/local_run.ipynb new file mode 100644 index 000000000..1f27f470c --- /dev/null +++ b/codegen-examples/examples/swebench_agent_run/local_run.ipynb @@ -0,0 +1,90 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from codegen.sdk.core.codebase import Codebase\n", + "from codegen.extensions.swebench.utils import SWEBenchDataset, get_swe_bench_examples\n", + "from run_eval import run_eval" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "examples = get_swe_bench_examples(dataset=SWEBenchDataset.LITE, split=\"test\", offset=0, length=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "codebase = Codebase.from_repo(examples[0].repo, commit=examples[0].base_commit, tmp_dir=f\"/tmp/{examples[0].instance_id}\")\n", + "# this will allow us to 
reuse the codebase for multiple examples\n", + "codebases = {examples[0].instance_id: codebase}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "await run_eval(use_existing_preds=None, dataset=\"lite\", length=None, instance_id=examples[0].instance_id, local=True, codebases=codebases)\n", + "codebases[examples[0].instance_id].reset()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/codegen-examples/examples/swebench_agent_run/pyproject.toml b/codegen-examples/examples/swebench_agent_run/pyproject.toml index 22b7f2e9b..fc612d4b1 100644 --- a/codegen-examples/examples/swebench_agent_run/pyproject.toml +++ b/codegen-examples/examples/swebench_agent_run/pyproject.toml @@ -4,7 +4,7 @@ version = "0.1.0" description = "Add your description here" readme = "README.md" requires-python = ">=3.12, <3.14" -dependencies = ["swebench>=3.0.0", "modal>=0.73.25"] +dependencies = ["modal>=0.73.25"] [tool.setuptools] py-modules = ["entry_point", "run_eval"] diff --git a/codegen-examples/examples/swebench_agent_run/run_eval.py b/codegen-examples/examples/swebench_agent_run/run_eval.py index 0c2132694..cc01b9b94 100644 --- a/codegen-examples/examples/swebench_agent_run/run_eval.py +++ b/codegen-examples/examples/swebench_agent_run/run_eval.py @@ -6,13 +6,15 @@ import modal import click from datetime import datetime +from codegen.extensions.swebench.harness import run_agent_on_entry from codegen.extensions.swebench.utils import SWEBenchDataset, SweBenchExample, get_swe_bench_examples from codegen.extensions.swebench.report import generate_report +from codegen.sdk.core.codebase import Codebase PREDS_DNAME = Path(__file__).parent / "predictions" LOG_DIR = Path(__file__).parent / "logs" -SwebenchAgentRun = modal.Cls.from_name(app_name="swebench-agent-run", name="SwebenchAgentRun") +run_agent_modal = modal.Function.from_name(app_name="swebench-agent-run", name="run_agent_modal") async def process_batch(examples: list[SweBenchExample], batch_size=10): @@ -31,7 +33,7 @@ async def process_batch(examples: list[SweBenchExample], batch_size=10): batch = examples[i : i + batch_size] # Create tasks for this batch - batch_tasks = [SwebenchAgentRun(repo_full_name=example.repo, commit=example.base_commit).run.remote.aio(example) for example in batch] + batch_tasks = [run_agent_modal.remote.aio(example) for example in batch] # Wait for all tasks in this batch to complete print(f"Processing batch {i // batch_size + 1}/{len(examples) // batch_size + 1} (examples {i + 1}-{min(i + batch_size, len(examples))})") @@ -88,11 +90,63 @@ async def process_batch(examples: list[SweBenchExample], batch_size=10): return results -async def run_eval(use_existing_preds: str | None, dataset: str, length: int, instance_id: str | None = None): +def process_batch_sync(examples: list[SweBenchExample], batch_size=10, codebases: dict[str, Codebase] = {}): + """Process a batch of examples 
synchronously. + + Args: + examples: List of SweBenchExample objects to process + batch_size: Number of examples to process in each batch. + Default is 10 to avoid overwhelming the system. + """ + results = [] + + # Process examples in batches + for i in range(0, len(examples), batch_size): + batch = examples[i : i + batch_size] + print(f"Processing batch {i // batch_size + 1}/{len(examples) // batch_size + 1} (examples {i + 1}-{min(i + batch_size, len(examples))})") + + # Process each example in the batch + for example in batch: + try: + # Run the agent locally instead of using modal + if codebases and example.instance_id in codebases: + result = run_agent_on_entry(example, codebase=codebases[example.instance_id]) + else: + result = run_agent_on_entry(example) + results.append(result) + + except Exception as e: + error_type = type(e).__name__ + error_info = { + "error_type": error_type, + "error_message": str(e), + "traceback": traceback.format_exc(), + } + + print(f"Error processing {example.instance_id}:") + print(f"Type: {error_type}") + print(f"Message: {str(e)}") + print("Traceback:") + print(error_info["traceback"]) + + results.append({"instance_id": example.instance_id, "status": "error", "error_info": error_info}) + + return results + + +async def run_eval(use_existing_preds: str | None, dataset: str, length: int, instance_id: str | None = None, local: bool = False, codebases: dict[str, Codebase] = {}, repo: str | None = None): run_id = use_existing_preds or str(uuid.uuid4()) + print(f"Run ID: {run_id}") predictions_dir = PREDS_DNAME / f"results_{run_id}" - dataset = SWEBenchDataset(dataset) - examples = get_swe_bench_examples(dataset=dataset, length=length, instance_id=instance_id) + dataset_dict = { + "lite": SWEBenchDataset.LITE, + "full": SWEBenchDataset.FULL, + "verified": SWEBenchDataset.VERIFIED, + } + dataset_enum = dataset_dict[dataset] + print(repo) + examples = get_swe_bench_examples(dataset=dataset_enum, length=length, instance_id=instance_id, repo=repo) + print(f"Examples:\n{'\n'.join([f'{e.instance_id} - {e.repo} - {e.base_commit}' for e in examples])}") try: if use_existing_preds is None: @@ -105,7 +159,10 @@ async def run_eval(use_existing_preds: str | None, dataset: str, length: int, in timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") # Process all examples in parallel batches - results = await process_batch(examples) + if local: + results = process_batch_sync(examples, codebases=codebases) + else: + results = await process_batch(examples) # Save individual results for result in results: @@ -147,7 +204,7 @@ async def run_eval(use_existing_preds: str | None, dataset: str, length: int, in print(f" {error_type}: {count}") # Generate Report on Modal - generate_report(predictions_dir, LOG_DIR, dataset, run_id) + generate_report(predictions_dir, LOG_DIR, dataset_enum, run_id) except Exception: print("Fatal error in run_eval:") traceback.print_exc() @@ -156,11 +213,14 @@ async def run_eval(use_existing_preds: str | None, dataset: str, length: int, in @click.command() @click.option("--use-existing-preds", help="The run ID of the existing predictions to use.", type=str, default=None) -@click.option("--dataset", help="The dataset to use.", type=click.Choice([dataset.value for dataset in SWEBenchDataset]), default=SWEBenchDataset.LITE.value) +@click.option("--dataset", help="The dataset to use.", type=click.Choice(["lite", "full", "verified"]), default="lite") @click.option("--length", help="The number of examples to process.", type=int, default=10) 
@click.option("--instance-id", help="The instance ID of the example to process.", type=str, default=None) -def run_eval_command(use_existing_preds, dataset, length, instance_id): - asyncio.run(run_eval(use_existing_preds, dataset, length, instance_id)) +@click.option("--local", help="Run the evaluation locally.", is_flag=True, default=False) +@click.option("--repo", help="The repo to use.", type=str, default=None) +def run_eval_command(use_existing_preds, dataset, length, instance_id, local, repo): + print(f"Repo: {repo}") + asyncio.run(run_eval(use_existing_preds=use_existing_preds, dataset=dataset, length=length, instance_id=instance_id, codebases=None, local=local, repo=repo)) if __name__ == "__main__": diff --git a/pyproject.toml b/pyproject.toml index 4a092d1a6..e3fed08fe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -77,6 +77,7 @@ dependencies = [ "urllib3>=2.0.0", "datasets", "colorlog>=6.9.0", + "langsmith", ] license = { text = "Apache-2.0" } diff --git a/src/codegen/agents/code_agent.py b/src/codegen/agents/code_agent.py index a234a185f..c83c43f41 100644 --- a/src/codegen/agents/code_agent.py +++ b/src/codegen/agents/code_agent.py @@ -19,7 +19,7 @@ class CodeAgent: """Agent for interacting with a codebase.""" - def __init__(self, codebase: "Codebase", model_provider: str = "anthropic", model_name: str = "claude-3-7-sonnet-latest", memory: bool = True, tools: Optional[list[BaseTool]] = None, **kwargs): + def __init__(self, codebase: "Codebase", model_provider: str = "anthropic", model_name: str = "claude-3-5-sonnet-latest", memory: bool = True, tools: Optional[list[BaseTool]] = None, **kwargs): """Initialize a CodeAgent. Args: diff --git a/src/codegen/extensions/swebench/utils.py b/src/codegen/extensions/swebench/utils.py index 05b4c4617..cd1ded36e 100644 --- a/src/codegen/extensions/swebench/utils.py +++ b/src/codegen/extensions/swebench/utils.py @@ -7,9 +7,6 @@ from datasets import load_dataset -# Add constant for cache directory -CACHE_DIR = Path.home() / ".cache" / "swebench" - class SWEBenchDataset(Enum): LITE = "princeton-nlp/SWE-bench_Lite" @@ -73,30 +70,30 @@ def get_swe_bench_examples( offset: int = 0, length: int = 100, instance_id: str | None = None, + repo: str | None = None, ) -> list[SweBenchExample]: """Fetch examples from the SWE-bench dataset using the datasets library. 
Args: - dataset: The dataset to use (LITE, FULL, or VERIFIED) + dataset: The dataset to use ("lite", "full", or "verified") split: The dataset split to use offset: Starting index for examples length: Number of examples to fetch + instance_id: Optional specific instance ID to fetch Returns: List of SweBenchExample objects """ - # Ensure cache directory exists - CACHE_DIR.mkdir(parents=True, exist_ok=True) + # Convert string dataset name to enum # Load the dataset with caching enabled - dataset_name = dataset.value - swe_bench_dataset = load_dataset(dataset_name, cache_dir=str(CACHE_DIR), download_mode="reuse_dataset_if_exists") + swe_bench_dataset = load_dataset(dataset.value, download_mode="reuse_dataset_if_exists") # Get the requested split split_data = swe_bench_dataset[split] # Apply offset and length - if instance_id: + if instance_id or repo: offset = 0 end_idx = len(split_data) else: @@ -113,6 +110,8 @@ def get_swe_bench_examples( for row in selected_rows: if instance_id and row["instance_id"] != instance_id: continue + if repo and row["repo"] != repo: + continue example = SweBenchExample( repo=row["repo"], instance_id=row["instance_id"], @@ -129,4 +128,4 @@ def get_swe_bench_examples( ) examples.append(example) - return examples + return examples[:length] diff --git a/src/codegen/extensions/tools/reflection.py b/src/codegen/extensions/tools/reflection.py index 57ac371fd..4c50bf68f 100644 --- a/src/codegen/extensions/tools/reflection.py +++ b/src/codegen/extensions/tools/reflection.py @@ -61,12 +61,16 @@ def render(self) -> str: REFLECTION_SYSTEM_PROMPT = """You are an expert AI assistant specialized in reflection and strategic planning. Your task is to help organize thoughts, identify knowledge gaps, and create a strategic plan based on the information provided. +**YOU MUST ABSTAIN FROM SUGGESTING THE AGENT WRITES NEW TESTS OR MODIFIES EXISTING TESTS.** + You will be given: 1. A summary of the current context and problem being solved 2. Key information and insights gathered so far 3. Current obstacles or questions that need to be addressed (if any) 4. A specific aspect to focus the reflection on (if any) +**YOU MUST ABSTAIN FROM SUGGESTING THE AGENT WRITES NEW TESTS OR MODIFIES EXISTING TESTS.** + Your response should be structured into the following sections: 1. Current Understanding - Summarize what you understand about the problem and context 2. Key Insights - Highlight the most important findings and their implications @@ -74,8 +78,12 @@ def render(self) -> str: 4. Action Plan - Recommend specific next steps to move forward 5. Alternative Approaches - Suggest other ways to tackle the problem +**YOU MUST ABSTAIN FROM SUGGESTING THE AGENT WRITES NEW TESTS OR MODIFIES EXISTING TESTS.** + Your reflection should be clear, insightful, and actionable. Focus on helping the agent make progress and double check its own work. You will not suggest the agent writes new tests or modifies existing tests. 
+ +**YOU MUST ABSTAIN FROM SUGGESTING THE AGENT WRITES NEW TESTS OR MODIFIES EXISTING TESTS.** """ diff --git a/uv.lock b/uv.lock index eec36911b..81c55a07c 100644 --- a/uv.lock +++ b/uv.lock @@ -550,7 +550,6 @@ dependencies = [ { name = "docker" }, { name = "docstring-parser" }, { name = "fastapi", extra = ["standard"] }, - { name = "fastmcp" }, { name = "gitpython" }, { name = "giturlparse" }, { name = "hatch-vcs" }, @@ -678,7 +677,6 @@ requires-dist = [ { name = "docker", specifier = ">=6.1.3" }, { name = "docstring-parser", specifier = ">=0.16,<1.0" }, { name = "fastapi", extras = ["standard"], specifier = ">=0.115.2,<1.0.0" }, - { name = "fastmcp" }, { name = "gitpython", specifier = "==3.1.44" }, { name = "giturlparse" }, { name = "hatch-vcs", specifier = ">=0.4.0" }, @@ -1264,23 +1262,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/90/2b/0817a2b257fe88725c25589d89aec060581aabf668707a8d03b2e9e0cb2a/fastjsonschema-2.21.1-py3-none-any.whl", hash = "sha256:c9e5b7e908310918cf494a434eeb31384dd84a98b57a30bcb1f535015b554667", size = 23924 }, ] -[[package]] -name = "fastmcp" -version = "0.4.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "httpx" }, - { name = "mcp" }, - { name = "pydantic" }, - { name = "pydantic-settings" }, - { name = "python-dotenv" }, - { name = "typer" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/6f/84/17b549133263d7ee77141970769bbc401525526bf1af043ea6842bce1a55/fastmcp-0.4.1.tar.gz", hash = "sha256:713ad3b8e4e04841c9e2f3ca022b053adb89a286ceffad0d69ae7b56f31cbe64", size = 785575 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/79/0b/008a340435fe8f0879e9d608f48af2737ad48440e09bd33b83b3fd03798b/fastmcp-0.4.1-py3-none-any.whl", hash = "sha256:664b42c376fb89ec90a50c9433f5a1f4d24f36696d6c41b024b427ae545f9619", size = 35282 }, -] - [[package]] name = "filelock" version = "3.17.0"
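The core change in this diff replaces the memory-snapshot `SwebenchAgentRun` class with a plain Modal function: `entry_point.py` now exposes `run_agent_modal`, and `run_eval.py` looks it up by name and fans calls out with `.remote.aio(...)`. A minimal sketch of that call pattern, assuming the app has already been deployed under the name `swebench-agent-run` (the `main` wrapper here is illustrative):

```python
import asyncio

import modal

from codegen.extensions.swebench.utils import SWEBenchDataset, get_swe_bench_examples

# Look up the deployed function by app and function name, as run_eval.py now does.
run_agent_modal = modal.Function.from_name(app_name="swebench-agent-run", name="run_agent_modal")


async def main() -> None:
    examples = get_swe_bench_examples(dataset=SWEBenchDataset.LITE, split="test", length=2)
    # .remote.aio(...) returns an awaitable for each remote call, so a batch
    # can be awaited concurrently, mirroring process_batch above.
    results = await asyncio.gather(*[run_agent_modal.remote.aio(example) for example in examples])
    for result in results:
        print(result)


if __name__ == "__main__":
    asyncio.run(main())
```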
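The new `--local` flag routes evaluation through `process_batch_sync`, which calls `run_agent_on_entry` in-process and reuses pre-built `Codebase` objects keyed by instance ID. A condensed sketch of that flow as a script, mirroring `local_run.ipynb` above (the notebook uses top-level `await` where this uses `asyncio.run`):

```python
import asyncio

from codegen.extensions.swebench.utils import SWEBenchDataset, get_swe_bench_examples
from codegen.sdk.core.codebase import Codebase
from run_eval import run_eval

examples = get_swe_bench_examples(dataset=SWEBenchDataset.LITE, split="test", offset=0, length=1)

# Build the codebase once so it can be reused across multiple runs of the same example.
codebase = Codebase.from_repo(
    examples[0].repo,
    commit=examples[0].base_commit,
    tmp_dir=f"/tmp/{examples[0].instance_id}",
)
codebases = {examples[0].instance_id: codebase}

# local=True selects process_batch_sync instead of the Modal path.
asyncio.run(
    run_eval(
        use_existing_preds=None,
        dataset="lite",
        length=None,
        instance_id=examples[0].instance_id,
        local=True,
        codebases=codebases,
    )
)

# Reset the codebase to its base commit before reusing it for another run.
codebases[examples[0].instance_id].reset()
```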
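`get_swe_bench_examples` also gains a `repo` filter, surfaced as `--repo` on the CLI. When `repo` (or `instance_id`) is set, the offset is ignored and the whole split is scanned for matching rows, with the result truncated to `length` at the end. A short sketch; `"django/django"` is just an illustrative value from the dataset's `repo` column:

```python
from codegen.extensions.swebench.utils import SWEBenchDataset, get_swe_bench_examples

# Fetch up to 5 examples from a single repository.
examples = get_swe_bench_examples(
    dataset=SWEBenchDataset.LITE,
    split="test",
    length=5,
    repo="django/django",
)

for example in examples:
    print(example.instance_id, example.repo, example.base_commit)
```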